- Published on
LLM Rate Limiting and Cost Controls — Per-User Token Budgets at Scale
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
Request-based rate limiting is insufficient for LLMs. A single request can use 50K tokens. Token-based budgets with sliding windows are essential for cost control and fairness at scale.
- Token-Based Rate Limiting
- Sliding Window Token Counter in Redis
- Per-User Daily/Monthly Token Budget
- Burst Allowance for Premium Users
- Cost-Per-Request Tracking
- Budget Exhaustion Handling
- Admin Override
- Cost Anomaly Detection
- Checklist
- Conclusion
Token-Based Rate Limiting
Track token consumption per user per time window, not just request counts.
import { Redis } from 'ioredis';
/**
 * Snapshot of one user's token consumption for the current window, as
 * returned by TokenBasedRateLimiter.getUserUsage.
 */
interface TokenBudget {
// User the snapshot belongs to.
user_id: string;
// Tokens recorded against the user's counter key so far.
tokens_used: number;
// Tier limit for the window (the burst multiplier is not applied here).
tokens_limit: number;
// Start of the window, built from the stored epoch-millisecond value.
window_start: Date;
// window_start plus the limiter's window length.
window_end: Date;
}
/**
 * Fixed-window, per-user token budget backed by Redis.
 *
 * Two keys are kept per user: `tokens:user:<id>` holds the tokens consumed in
 * the current window and `tokens:user:<id>:window` holds the epoch-millisecond
 * start of that window. A request is admitted while usage stays within the
 * tier limit multiplied by `burst_multiplier`.
 *
 * NOTE: the check and the increment are separate Redis round trips, so two
 * concurrent requests for the same user can both pass the check. Move the
 * check-and-increment into a Lua script if strict enforcement is required.
 */
class TokenBasedRateLimiter {
  private redis: Redis;
  private window_seconds = 86400; // 24-hour window
  private burst_multiplier = 1.5; // Allow 150% burst

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Checks whether `requestedTokens` fits in the user's remaining budget and,
   * if so, records the usage.
   *
   * @param userId - User whose budget is checked.
   * @param requestedTokens - Tokens the pending request is expected to consume.
   * @returns `allowed` plus the remaining burst budget; when denied,
   *   `retry_after_seconds` is the time until the current window ends (min 1).
   */
  async checkTokenBudget(
    userId: string,
    requestedTokens: number
  ): Promise<{ allowed: boolean; remaining: number; retry_after_seconds?: number }> {
    const key = `tokens:user:${userId}`;
    const windowKey = `${key}:window`;
    const now = Date.now();
    const windowMs = this.window_seconds * 1000;

    let windowStart = this.toInt(await this.redis.get(windowKey));
    if (now - windowStart > windowMs) {
      // Reset budget for the new window.
      await this.redis.del(key);
      await this.redis.set(windowKey, now.toString());
      // Keep the local copy in sync; otherwise the retry hint below is
      // computed from the expired window and collapses to 1 second.
      windowStart = now;
    }

    const currentUsage = this.toInt(await this.redis.get(key));
    const userLimit = await this.getUserTokenLimit(userId);
    const burstLimit = userLimit * this.burst_multiplier;

    if (currentUsage + requestedTokens > burstLimit) {
      const retryAfter = Math.ceil((windowStart + windowMs - now) / 1000);
      return {
        allowed: false,
        remaining: 0,
        retry_after_seconds: Math.max(1, retryAfter)
      };
    }

    // Record usage.
    await this.redis.incrby(key, requestedTokens);
    await this.redis.expire(key, this.window_seconds);

    return {
      allowed: true,
      remaining: Math.floor(burstLimit - (currentUsage + requestedTokens))
    };
  }

  /** Resolves the per-window token limit for the user's tier; unknown tiers get the free limit. */
  private async getUserTokenLimit(userId: string): Promise<number> {
    const freeLimit = 100000; // 100K tokens/day
    const limits: Record<string, number> = {
      'free': freeLimit,
      'pro': 1000000, // 1M tokens/day
      'enterprise': 10000000, // 10M tokens/day
      'unlimited': Number.MAX_SAFE_INTEGER
    };
    const tier = (await this.redis.get(`user:tier:${userId}`)) || 'free';
    return limits[tier] ?? freeLimit;
  }

  /** Stores the user's tier under `user:tier:<id>`. */
  async setUserTier(userId: string, tier: 'free' | 'pro' | 'enterprise' | 'unlimited'): Promise<void> {
    await this.redis.set(`user:tier:${userId}`, tier);
  }

  /**
   * Returns the user's current usage snapshot. When no window has been
   * started yet, `window_start` is the Unix epoch.
   */
  async getUserUsage(userId: string): Promise<TokenBudget> {
    const key = `tokens:user:${userId}`;
    const tokensUsed = this.toInt(await this.redis.get(key));
    const windowStart = this.toInt(await this.redis.get(`${key}:window`));
    const limit = await this.getUserTokenLimit(userId);

    return {
      user_id: userId,
      tokens_used: tokensUsed,
      tokens_limit: limit,
      window_start: new Date(windowStart),
      window_end: new Date(windowStart + this.window_seconds * 1000)
    };
  }

  /** Parses a Redis string as a base-10 integer, treating missing or malformed values as 0. */
  private toInt(raw: string | null | undefined): number {
    const parsed = Number.parseInt(raw ?? '', 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  }
}
Sliding Window Token Counter in Redis
Accurate token counting with sliding windows for per-minute, per-hour, and per-day limits.
/** One sliding window tracked by SlidingWindowTokenCounter. */
interface RateLimitWindow {
// Length of the window in milliseconds.
duration_ms: number;
// Token total above which the window reports the user as not allowed.
limit: number;
// Label used in the Redis key and as the key of the usage report.
name: string;
}
/**
 * Sliding-window token counter stored in Redis sorted sets.
 *
 * Each usage event is a sorted-set member scored by its epoch-millisecond
 * timestamp, one set per (user, window) under `sliding:<userId>:<window>`.
 * Members have the form `<timestamp>:<tokens>:<nonce>`; the nonce keeps two
 * events with the same timestamp and token count from overwriting each other.
 */
class SlidingWindowTokenCounter {
  private redis: Redis;
  private windows: RateLimitWindow[] = [
    { name: 'per-minute', duration_ms: 60000, limit: 50000 },
    { name: 'per-hour', duration_ms: 3600000, limit: 500000 },
    { name: 'per-day', duration_ms: 86400000, limit: 5000000 }
  ];
  // Per-instance sequence folded into the member nonce.
  private sequence = 0;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Records `tokens` against every configured window for the user, prunes
   * entries that have fallen out of each window, and refreshes the key TTL.
   */
  async recordTokenUsage(userId: string, tokens: number): Promise<void> {
    const now = Date.now();
    const member = `${now}:${tokens}:${this.nextNonce()}`;

    for (const window of this.windows) {
      const key = `sliding:${userId}:${window.name}`;
      // Add to sorted set with current timestamp as score.
      await this.redis.zadd(key, now, member);
      // Remove entries older than the window.
      await this.redis.zremrangebyscore(key, 0, now - window.duration_ms);
      // Let idle keys disappear once the whole window has elapsed.
      await this.redis.expire(key, Math.ceil(window.duration_ms / 1000));
    }
  }

  /**
   * Reports usage for every window. `allowed` is false when any window's
   * in-window total exceeds its limit.
   */
  async checkRateLimit(userId: string): Promise<{
    allowed: boolean;
    usage: Record<string, { current: number; limit: number; percent: number }>;
  }> {
    const now = Date.now();
    const usage: Record<string, { current: number; limit: number; percent: number }> = {};
    let allowed = true;

    for (const window of this.windows) {
      const key = `sliding:${userId}:${window.name}`;
      const current = await this.sumTokens(key, now - window.duration_ms);
      usage[window.name] = {
        current,
        limit: window.limit,
        percent: (current / window.limit) * 100
      };
      if (current > window.limit) {
        allowed = false;
      }
    }

    return { allowed, usage };
  }

  /**
   * Returns the token total for one window. For a configured window name only
   * entries still inside the window are counted; for an unknown name every
   * stored entry under that key is summed.
   */
  async getWindowUsage(userId: string, windowName: string): Promise<number> {
    const key = `sliding:${userId}:${windowName}`;
    const window = this.windows.find((w) => w.name === windowName);
    const cutoff = window ? Date.now() - window.duration_ms : null;
    return this.sumTokens(key, cutoff);
  }

  /**
   * Sums the token counts of members scored strictly after `cutoff`
   * (or of all members when `cutoff` is null). Pruning only happens on
   * writes, so reads must filter by score to avoid counting stale entries.
   */
  private async sumTokens(key: string, cutoff: number | null): Promise<number> {
    const entries: string[] =
      cutoff === null
        ? await this.redis.zrange(key, 0, -1)
        : await this.redis.zrangebyscore(key, `(${cutoff}`, '+inf');

    let total = 0;
    for (const entry of entries) {
      const tokens = Number.parseInt(entry.split(':')[1] ?? '', 10);
      if (!Number.isNaN(tokens)) {
        total += tokens;
      }
    }
    return total;
  }

  /** Produces a short suffix that makes each sorted-set member unique. */
  private nextNonce(): string {
    this.sequence = (this.sequence + 1) % Number.MAX_SAFE_INTEGER;
    return `${this.sequence.toString(36)}${Math.random().toString(36).slice(2, 10)}`;
  }
}
Per-User Daily/Monthly Token Budget
Separate daily and monthly budgets for different user tiers.
/** Daily and monthly budget state for one user, as returned by PerUserBudgetManager.getBudget. */
interface UserBudgetAllocation {
// User the allocation belongs to.
user_id: string;
// Token limits taken from the user's tier.
daily_limit: number;
monthly_limit: number;
// Tokens consumed so far, read from the daily and monthly counter keys.
daily_used: number;
monthly_used: number;
// Limit minus used, floored at 0.
daily_remaining: number;
monthly_remaining: number;
// Next UTC midnight.
reset_daily_at: Date;
// Start (UTC) of the next calendar month.
reset_monthly_at: Date;
}
/**
 * Per-user daily and monthly token budgets keyed by tier.
 *
 * Counters live in `budget:daily:<id>` and `budget:monthly:<id>`. The daily
 * counter expires at the next UTC midnight and the monthly counter at the
 * start of the next UTC month, so both reset on calendar boundaries. Tokens
 * stored under `bonus:monthly:<id>` are added to the monthly limit.
 *
 * NOTE: the limit check and the increments are separate Redis calls, so
 * concurrent requests can overshoot a limit slightly.
 */
class PerUserBudgetManager {
  private redis: Redis;
  // Limits applied to the free tier and to any tier name not listed below.
  private readonly defaultLimits = { daily: 100000, monthly: 2000000 };
  private budgetTiers: Record<string, { daily: number; monthly: number }> = {
    'free': this.defaultLimits,
    'pro': { daily: 500000, monthly: 10000000 },
    'enterprise': { daily: 5000000, monthly: 100000000 },
    'unlimited': { daily: Number.MAX_SAFE_INTEGER, monthly: Number.MAX_SAFE_INTEGER }
  };

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Deducts `tokens` from the user's daily and monthly budgets when both have
   * room.
   *
   * @returns `{ allowed: true }` on success, otherwise `allowed: false` with a
   *   `reason` naming the exceeded quota.
   */
  async consumeTokens(userId: string, tokens: number): Promise<{ allowed: boolean; reason?: string }> {
    const budget = await this.getBudget(userId);

    if (budget.daily_used + tokens > budget.daily_limit) {
      return {
        allowed: false,
        reason: `Daily quota exceeded: ${budget.daily_used} + ${tokens} > ${budget.daily_limit}`
      };
    }
    if (budget.monthly_used + tokens > budget.monthly_limit) {
      return {
        allowed: false,
        reason: `Monthly quota exceeded: ${budget.monthly_used} + ${tokens} > ${budget.monthly_limit}`
      };
    }

    const dailyKey = `budget:daily:${userId}`;
    const monthlyKey = `budget:monthly:${userId}`;
    await this.redis.incrby(dailyKey, tokens);
    await this.redis.incrby(monthlyKey, tokens);

    // Expire each counter at its calendar boundary so the budget resets.
    // Without a TTL on the monthly key the monthly budget would never reset.
    const now = Date.now();
    await this.redis.expire(dailyKey, this.secondsUntil(budget.reset_daily_at, now));
    await this.redis.expire(monthlyKey, this.secondsUntil(budget.reset_monthly_at, now));

    return { allowed: true };
  }

  /** Returns the user's limits, usage, remaining tokens and next reset times. */
  async getBudget(userId: string): Promise<UserBudgetAllocation> {
    const tier = await this.getUserTier(userId);
    const limits = this.budgetTiers[tier] ?? this.defaultLimits;

    const dailyUsed = this.toInt(await this.redis.get(`budget:daily:${userId}`));
    const monthlyUsed = this.toInt(await this.redis.get(`budget:monthly:${userId}`));
    const bonus = Math.max(0, this.toInt(await this.redis.get(`bonus:monthly:${userId}`)));
    // Cap so that adding a bonus to the unlimited tier stays a safe integer.
    const monthlyLimit = Math.min(Number.MAX_SAFE_INTEGER, limits.monthly + bonus);

    // Build the boundaries from UTC components. Adding one month to a date
    // such as Jan 31 via setUTCMonth would overflow into March.
    const today = new Date(Date.now());
    const year = today.getUTCFullYear();
    const month = today.getUTCMonth();
    const tomorrow = new Date(Date.UTC(year, month, today.getUTCDate() + 1));
    const nextMonth = new Date(Date.UTC(year, month + 1, 1));

    return {
      user_id: userId,
      daily_limit: limits.daily,
      monthly_limit: monthlyLimit,
      daily_used: dailyUsed,
      monthly_used: monthlyUsed,
      daily_remaining: Math.max(0, limits.daily - dailyUsed),
      monthly_remaining: Math.max(0, monthlyLimit - monthlyUsed),
      reset_daily_at: tomorrow,
      reset_monthly_at: nextMonth
    };
  }

  /** Reads the user's tier, defaulting to 'free' when none is stored. */
  private async getUserTier(userId: string): Promise<string> {
    return (await this.redis.get(`user:tier:${userId}`)) || 'free';
  }

  /** Whole seconds from `nowMs` until `target`, never less than 1. */
  private secondsUntil(target: Date, nowMs: number): number {
    return Math.max(1, Math.ceil((target.getTime() - nowMs) / 1000));
  }

  /** Parses a Redis string as a base-10 integer, treating missing or malformed values as 0. */
  private toInt(raw: string | null | undefined): number {
    const parsed = Number.parseInt(raw ?? '', 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  }
}
Burst Allowance for Premium Users
Allow temporary bursts above the normal limit for premium users.
/** Per-tier burst settings used by BurstAllowanceManager. */
interface BurstConfig {
// Multiplier reported while a burst is active; 1.0 means the tier cannot burst.
burst_multiplier: number;
// How long an activated burst lasts, in minutes.
burst_duration_minutes: number;
// Minimum time between burst activations, in minutes, measured from the last activation.
cooldown_minutes: number;
}
/**
 * Time-boxed burst allowance per user tier.
 *
 * `burst:last:<id>` stores the epoch-millisecond time of the last activation
 * (used for the cooldown) and `burst:end:<id>` stores when the active burst
 * ends. Tiers without a config entry, or with a multiplier of 1.0, never
 * burst.
 */
class BurstAllowanceManager {
  private redis: Redis;
  private burstConfigs: Record<string, BurstConfig> = {
    'free': { burst_multiplier: 1.0, burst_duration_minutes: 0, cooldown_minutes: 0 },
    'pro': { burst_multiplier: 1.5, burst_duration_minutes: 5, cooldown_minutes: 60 },
    'enterprise': { burst_multiplier: 2.0, burst_duration_minutes: 15, cooldown_minutes: 30 },
    'unlimited': { burst_multiplier: Number.MAX_VALUE, burst_duration_minutes: Number.MAX_VALUE, cooldown_minutes: 0 }
  };

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Reports whether the user may start a burst now.
   *
   * @returns `can_burst: false` with a `reason` when the tier has no burst
   *   allowance or the cooldown since the last activation has not elapsed.
   */
  async canBurst(userId: string): Promise<{ can_burst: boolean; reason?: string }> {
    const config = await this.configFor(userId);
    if (!config || config.burst_multiplier === 1.0) {
      return { can_burst: false, reason: 'User tier does not support burst' };
    }

    const lastBurst = await this.redis.get(`burst:last:${userId}`);
    if (lastBurst) {
      const elapsedMs = Date.now() - parseInt(lastBurst, 10);
      const cooldownMs = config.cooldown_minutes * 60 * 1000;
      if (elapsedMs < cooldownMs) {
        return {
          can_burst: false,
          reason: `Cooldown active. Wait ${Math.ceil((cooldownMs - elapsedMs) / 1000)}s`
        };
      }
    }

    return { can_burst: true };
  }

  /**
   * Starts a burst for the user. Does nothing for tiers that cannot burst
   * (no config, multiplier of at most 1, or zero duration). This method does
   * not enforce the cooldown; call `canBurst` first.
   */
  async activateBurst(userId: string): Promise<void> {
    const config = await this.configFor(userId);
    if (!config || config.burst_multiplier <= 1 || config.burst_duration_minutes <= 0) {
      return;
    }

    const now = Date.now();
    const durationSeconds = config.burst_duration_minutes * 60;
    const cooldownSeconds = config.cooldown_minutes * 60;
    const burstEnd = now + durationSeconds * 1000;

    await this.setWithTtl(`burst:last:${userId}`, now.toString(), cooldownSeconds + durationSeconds);
    // The 'unlimited' tier has an infinite duration; store a finite sentinel
    // so the value can be read back as a number.
    await this.setWithTtl(
      `burst:end:${userId}`,
      (Number.isFinite(burstEnd) ? burstEnd : Number.MAX_SAFE_INTEGER).toString(),
      durationSeconds
    );
  }

  /**
   * Returns the tier's burst multiplier while a burst is active, otherwise 1.0.
   */
  async getBurstMultiplier(userId: string): Promise<number> {
    const burstEnd = await this.redis.get(`burst:end:${userId}`);
    if (!burstEnd) {
      return 1.0;
    }
    const endsAt = Number(burstEnd);
    if (Number.isNaN(endsAt) || endsAt < Date.now()) {
      return 1.0;
    }
    const config = await this.configFor(userId);
    return config?.burst_multiplier ?? 1.0;
  }

  /** Looks up the burst config for the user's stored tier (default 'free'). */
  private async configFor(userId: string): Promise<BurstConfig | undefined> {
    const tier = (await this.redis.get(`user:tier:${userId}`)) || 'free';
    return this.burstConfigs[tier];
  }

  /**
   * Writes the key and applies a TTL only when the TTL is a finite, positive
   * number of seconds. EXPIRE rejects non-integer arguments, so an infinite
   * TTL is stored as a key with no expiry.
   */
  private async setWithTtl(key: string, value: string, ttlSeconds: number): Promise<void> {
    await this.redis.set(key, value);
    if (Number.isFinite(ttlSeconds) && ttlSeconds > 0) {
      await this.redis.expire(key, Math.ceil(ttlSeconds));
    }
  }
}
Cost-Per-Request Tracking
Track actual cost of each request for billing and monitoring.
/** Cost record for a single request, as produced by CostPerRequestTracker.recordRequest. */
interface RequestCostMetrics {
// Caller-supplied request identifier; also used in the per-request Redis key.
request_id: string;
// User the request is billed to.
user_id: string;
// Model name used to look up pricing.
model: string;
// Input and output token counts for the request.
tokens_input: number;
tokens_output: number;
// Cost computed from the token counts and the model's per-1000-token rates.
cost_usd: number;
// Time the record was created.
timestamp: Date;
}
/**
 * Records the dollar cost of each request and aggregates spend per user.
 *
 * Keys written per request:
 *  - `request:<requestId>`: the metrics JSON, kept for 24 hours.
 *  - `user:requests:<userId>`: list of the user's 100 most recent requests.
 *  - `spend:daily:<userId>`: running spend for the current UTC day; expires
 *    at the next UTC midnight.
 *  - `spend:daily:<userId>:<YYYY-MM-DD>`: the same spend under a dated key,
 *    kept for `dailyHistoryDays` days. CostAnomalyDetector reads keys of this
 *    shape to build its baseline.
 */
class CostPerRequestTracker {
  private redis: Redis;
  // USD per 1000 tokens.
  private pricing: Record<string, { input: number; output: number }> = {
    'gpt-4o': { input: 0.015, output: 0.06 },
    'gpt-3.5-turbo': { input: 0.0005, output: 0.0015 },
    'claude-3-opus': { input: 0.015, output: 0.075 }
  };
  // Retention for dated daily-spend keys; must cover the anomaly baseline window.
  private dailyHistoryDays = 8;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Computes the USD cost of a request from the pricing table.
   *
   * @returns The cost, or 0 when the model has no pricing entry.
   */
  calculateCost(model: string, inputTokens: number, outputTokens: number): number {
    const rates = this.pricing[model];
    if (!rates) return 0;
    return (inputTokens / 1000) * rates.input + (outputTokens / 1000) * rates.output;
  }

  /**
   * Stores the request's cost metrics and adds the cost to the user's
   * daily spend.
   *
   * @returns The metrics record that was stored.
   */
  async recordRequest(
    requestId: string,
    userId: string,
    model: string,
    inputTokens: number,
    outputTokens: number
  ): Promise<RequestCostMetrics> {
    const now = Date.now();
    const cost = this.calculateCost(model, inputTokens, outputTokens);
    const metrics: RequestCostMetrics = {
      request_id: requestId,
      user_id: userId,
      model,
      tokens_input: inputTokens,
      tokens_output: outputTokens,
      cost_usd: cost,
      timestamp: new Date(now)
    };
    const payload = JSON.stringify(metrics);

    // Store in Redis for recent query access.
    await this.redis.setex(`request:${requestId}`, 86400, payload);

    // Add to the user's request history (keep last 100).
    const historyKey = `user:requests:${userId}`;
    await this.redis.lpush(historyKey, payload);
    await this.redis.ltrim(historyKey, 0, 99);

    // Track daily spend per user. The TTL targets the next UTC midnight:
    // refreshing a rolling 24h TTL on every request would keep an active
    // user's counter alive, and growing, indefinitely.
    const today = new Date(now);
    const nextMidnight = Date.UTC(today.getUTCFullYear(), today.getUTCMonth(), today.getUTCDate() + 1);
    const dailySpendKey = `spend:daily:${userId}`;
    await this.redis.incrbyfloat(dailySpendKey, cost);
    await this.redis.expire(dailySpendKey, Math.max(1, Math.ceil((nextMidnight - now) / 1000)));

    // Keep a dated copy so that per-day history is available after the reset.
    const datedKey = `${dailySpendKey}:${today.toISOString().slice(0, 10)}`;
    await this.redis.incrbyfloat(datedKey, cost);
    await this.redis.expire(datedKey, 86400 * this.dailyHistoryDays);

    return metrics;
  }

  /** Returns the user's spend for the current UTC day, or 0 when none is recorded. */
  async getUserDailySpend(userId: string): Promise<number> {
    const raw = await this.redis.get(`spend:daily:${userId}`);
    const spend = Number.parseFloat(raw ?? '0');
    return Number.isNaN(spend) ? 0 : spend;
  }

  /**
   * Returns up to `limit` of the user's most recent requests, newest first.
   * A non-positive `limit` yields an empty list.
   */
  async getUserRequestHistory(userId: string, limit: number = 20): Promise<RequestCostMetrics[]> {
    // LRANGE 0 -1 means "everything", so limit = 0 must be handled here.
    if (!(limit > 0)) {
      return [];
    }
    const rows: string[] = await this.redis.lrange(`user:requests:${userId}`, 0, Math.floor(limit) - 1);
    return rows.map((row) => {
      const parsed = JSON.parse(row) as RequestCostMetrics;
      // JSON serialises Date as an ISO string; restore the declared type.
      return { ...parsed, timestamp: new Date(parsed.timestamp) };
    });
  }
}
Budget Exhaustion Handling
Graceful degradation when budget is exhausted.
/** How BudgetExhaustionHandler should respond when a user's budget is exhausted. */
interface DegradationStrategy {
// 'queue' enqueues the request, 'fallback' reports a fallback model, 'reject' refuses the request.
type: 'queue' | 'fallback' | 'reject';
// Model named in the response for the 'fallback' strategy.
fallback_model?: string;
// Queue TTL in seconds for the 'queue' strategy; the handler uses 3600 when omitted.
queue_ttl_seconds?: number;
}
/**
 * Applies a degradation strategy when a user's token budget is exhausted:
 * queue the request, point at a fallback model, or reject.
 *
 * Queued requests are stored as JSON in the Redis list `queue:<userId>`;
 * new entries go on the head and `processQueue` pops from the tail, so the
 * queue is first-in, first-out.
 */
class BudgetExhaustionHandler {
  private redis: Redis;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Handles an exhausted budget according to `strategy`.
   *
   * @returns `handled: true` when the request was queued or a fallback model
   *   is available; `handled: false` when it is rejected. A 'fallback'
   *   strategy without a `fallback_model` is treated as a rejection because
   *   there is nothing to fall back to.
   */
  async handleExhaustion(
    userId: string,
    requestedTokens: number,
    strategy: DegradationStrategy
  ): Promise<{ handled: boolean; response?: string; retry_after?: number }> {
    const rejection = {
      handled: false,
      response: 'Budget limit exceeded. Upgrade your plan.'
    };

    switch (strategy.type) {
      case 'queue':
        return this.handleWithQueuing(userId, requestedTokens, strategy.queue_ttl_seconds || 3600);
      case 'fallback':
        if (!strategy.fallback_model) {
          return rejection;
        }
        return {
          handled: true,
          response: `Budget exhausted. Upgrade to Pro for unlimited requests. Fallback model: ${strategy.fallback_model}`
        };
      case 'reject':
        return rejection;
      default:
        return { handled: false };
    }
  }

  /**
   * Enqueues the request and estimates the wait from its queue position,
   * capped at the queue TTL.
   */
  private async handleWithQueuing(
    userId: string,
    tokens: number,
    ttl: number
  ): Promise<{ handled: boolean; retry_after: number }> {
    const queueKey = `queue:${userId}`;
    // LPUSH returns the list length after the push, i.e. this entry's position.
    const position: number = await this.redis.lpush(
      queueKey,
      JSON.stringify({ tokens, timestamp: Date.now() })
    );
    await this.redis.expire(queueKey, ttl);

    // Estimate wait time (assume 1 request per minute when budget resets).
    const estimatedWaitSeconds = Math.ceil(position * 60);
    return {
      handled: true,
      retry_after: Math.min(estimatedWaitSeconds, ttl)
    };
  }

  /**
   * Pops the oldest queued request for the user and returns its token count.
   * Entries that are not valid JSON or lack a numeric `tokens` field are
   * discarded and the next entry is tried.
   *
   * @returns The token count, or 0 when the queue is empty.
   */
  async processQueue(userId: string): Promise<number> {
    const queueKey = `queue:${userId}`;

    for (;;) {
      const raw: string | null = await this.redis.rpop(queueKey);
      if (!raw) return 0;

      const tokens = this.readTokens(raw);
      if (tokens !== undefined) return tokens;
    }
  }

  /** Extracts a finite numeric `tokens` field from a queue entry, if present. */
  private readTokens(raw: string): number | undefined {
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      return undefined;
    }
    if (typeof parsed !== 'object' || parsed === null) return undefined;
    const tokens = (parsed as { tokens?: unknown }).tokens;
    return typeof tokens === 'number' && Number.isFinite(tokens) ? tokens : undefined;
  }
}
Admin Override
Allow admins to adjust user budgets and bypass limits.
/** Audit record of one administrative change made through AdminBudgetOverride. */
interface AdminAction {
// Kind of change performed.
action: 'reset_daily' | 'reset_monthly' | 'increase_quota' | 'set_tier' | 'grant_tokens';
// User whose budget or tier was changed.
user_id: string;
// Action-specific payload: the token count for 'grant_tokens', the tier name for 'set_tier'.
value?: number | string;
// Free-text justification supplied by the admin.
reason: string;
// Admin who performed the action; also part of the audit key.
admin_id: string;
// When the action was logged.
timestamp: Date;
}
/**
 * Administrative overrides for user budgets and tiers, with an audit trail.
 *
 * Every action is appended to an in-memory log and to the Redis list
 * `audit:<adminId>:<YYYY-MM-DD>`, which is kept for 90 days. Each public
 * method resolves only after its audit entry has been written; if the audit
 * write fails the method rejects, even though the override itself has
 * already been applied.
 *
 * NOTE: the in-memory log grows for the lifetime of the instance.
 */
class AdminBudgetOverride {
  private redis: Redis;
  private actionLog: AdminAction[] = [];

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /** Clears the user's daily usage counter. */
  async resetDailyBudget(userId: string, adminId: string, reason: string): Promise<void> {
    await this.redis.del(`budget:daily:${userId}`);
    await this.logAction({
      action: 'reset_daily',
      user_id: userId,
      reason,
      admin_id: adminId,
      timestamp: new Date()
    });
  }

  /**
   * Grants `tokens` bonus tokens by incrementing `bonus:monthly:<userId>`;
   * the bonus key expires 30 days after the most recent grant.
   */
  async increaseQuota(userId: string, tokens: number, adminId: string, reason: string): Promise<void> {
    const bonusKey = `bonus:monthly:${userId}`;
    await this.redis.incrby(bonusKey, tokens);
    await this.redis.expire(bonusKey, 86400 * 30); // 30 days
    await this.logAction({
      action: 'grant_tokens',
      user_id: userId,
      value: tokens,
      reason,
      admin_id: adminId,
      timestamp: new Date()
    });
  }

  /** Sets the user's tier under `user:tier:<userId>`. */
  async upgradeTier(userId: string, newTier: string, adminId: string, reason: string): Promise<void> {
    await this.redis.set(`user:tier:${userId}`, newTier);
    await this.logAction({
      action: 'set_tier',
      user_id: userId,
      value: newTier,
      reason,
      admin_id: adminId,
      timestamp: new Date()
    });
  }

  /**
   * Records the action in memory and in Redis. The Redis calls are awaited
   * so that a failed audit write surfaces to the caller instead of becoming
   * an unhandled promise rejection.
   */
  private async logAction(action: AdminAction): Promise<void> {
    this.actionLog.push(action);
    const auditKey = `audit:${action.admin_id}:${new Date().toISOString().split('T')[0]}`;
    await this.redis.lpush(auditKey, JSON.stringify(action));
    await this.redis.expire(auditKey, 86400 * 90); // Keep 90 days
  }

  /** Returns a copy of the actions logged by this instance. */
  getAuditLog(): AdminAction[] {
    return [...this.actionLog];
  }
}
Cost Anomaly Detection
Alert when a user's spending spikes abnormally.
/** Alert produced by CostAnomalyDetector when a user's daily spend exceeds a multiple of their baseline. */
interface AnomalyAlert {
// User whose spend triggered the alert.
user_id: string;
// Ratio of daily_spend to expected_spend (e.g. 2.5 means 2.5x the baseline), not a percentage.
spike_percent: number;
// Spend read from the user's current daily-spend key.
daily_spend: number;
// Baseline: average spend over the prior days that have recorded spend.
expected_spend: number;
// 'critical' at or above the critical threshold, otherwise 'warning'.
alert_level: 'warning' | 'critical';
}
/**
 * Flags users whose spend for the current day is a large multiple of their
 * recent average.
 *
 * Current spend is read from `spend:daily:<userId>`; the baseline is the
 * average of the dated keys `spend:daily:<userId>:<YYYY-MM-DD>` for the
 * previous `baselineWindow` UTC days. Users with no dated history are
 * skipped, so whatever records spend must also write those dated keys.
 */
class CostAnomalyDetector {
  private redis: Redis;
  private baselineWindow = 7; // 7-day baseline
  private spikeThreshold = 2.0; // 2x normal = alert
  private criticalThreshold = 5.0; // 5x normal = critical
  // Matches the dated history keys, which share the current-spend prefix.
  private readonly datedKeyPattern = /:\d{4}-\d{2}-\d{2}$/;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Scans all users with spend recorded today and returns an alert for each
   * one whose spend is at least `spikeThreshold` times their baseline.
   */
  async detectAnomalies(): Promise<AnomalyAlert[]> {
    const alerts: AnomalyAlert[] = [];

    for (const key of await this.scanCurrentSpendKeys()) {
      const userId = key.replace('spend:daily:', '');
      const dailySpend = this.toFloat(await this.redis.get(key));
      const expectedSpend = await this.getBaselineSpend(userId);
      if (expectedSpend === 0) continue; // Not enough history

      const ratio = dailySpend / expectedSpend;
      if (ratio < this.spikeThreshold) continue;

      alerts.push({
        user_id: userId,
        spike_percent: ratio,
        daily_spend: dailySpend,
        expected_spend: expectedSpend,
        alert_level: ratio >= this.criticalThreshold ? 'critical' : 'warning'
      });
    }

    return alerts;
  }

  /**
   * Collects the current-day spend keys with SCAN. KEYS walks the whole
   * keyspace in a single blocking command, which stalls other clients on a
   * large database; SCAN does the same work in small batches.
   */
  private async scanCurrentSpendKeys(): Promise<string[]> {
    // SCAN may return a key more than once, hence the Set.
    const found = new Set<string>();
    let cursor = '0';
    do {
      const [next, batch]: [string, string[]] = await this.redis.scan(
        cursor,
        'MATCH',
        'spend:daily:*',
        'COUNT',
        500
      );
      cursor = next;
      for (const key of batch) {
        // Dated history keys match the same pattern but are not users.
        if (!this.datedKeyPattern.test(key)) {
          found.add(key);
        }
      }
    } while (cursor !== '0');
    return [...found];
  }

  /**
   * Averages the user's spend over the previous `baselineWindow` UTC days,
   * counting only days with recorded spend.
   *
   * @returns The average, or 0 when no day in the window has spend.
   */
  private async getBaselineSpend(userId: string): Promise<number> {
    const dayMs = 86400000;
    const now = Date.now();
    // Derive the dates in UTC so they line up with the ISO date in the key.
    const keys = Array.from({ length: this.baselineWindow }, (_, offset) => {
      const dateStr = new Date(now - (offset + 1) * dayMs).toISOString().slice(0, 10);
      return `spend:daily:${userId}:${dateStr}`;
    });

    const values = await Promise.all(keys.map((key) => this.redis.get(key)));
    const spends = values.map((raw) => this.toFloat(raw)).filter((spend) => spend > 0);
    if (spends.length === 0) return 0;
    return spends.reduce((sum, spend) => sum + spend, 0) / spends.length;
  }

  /** Logs the alert and caches it under `anomaly:<userId>` for one hour. */
  async triggerAlert(alert: AnomalyAlert): Promise<void> {
    console.warn(`ANOMALY ALERT: ${alert.user_id} spending ${(alert.spike_percent * 100).toFixed(0)}% of baseline`);
    // Would send email/Slack notification here
    const alertKey = `anomaly:${alert.user_id}`;
    await this.redis.setex(alertKey, 3600, JSON.stringify(alert)); // Cache for 1 hour
  }

  /** Parses a Redis string as a float, treating missing or malformed values as 0. */
  private toFloat(raw: string | null | undefined): number {
    const parsed = Number.parseFloat(raw ?? '0');
    return Number.isNaN(parsed) ? 0 : parsed;
  }
}
Checklist
- Track tokens, not requests, for rate limiting
- Implement per-minute, per-hour, and per-day windows
- Allow 1.5-2x burst for premium users with cooldown
- Use sliding window for accurate token tracking
- Track cost per request for billing and monitoring
- Detect cost anomalies (2-5x normal spending)
- Queue requests gracefully when budget exhausted
- Log all admin overrides for audit trails
- Reset daily budgets at midnight UTC
- Alert on critical spending spikes immediately
Conclusion
Token-based rate limiting with per-user budgets is non-negotiable for LLM products. Pair it with burst allowances, graceful degradation, and anomaly detection, and you've got a system that protects your margin while keeping users happy.