Published on

LLM Rate Limiting and Cost Controls — Per-User Token Budgets at Scale

Authors

Introduction

Request-based rate limiting is insufficient for LLMs: a single request can consume 50K tokens, so request counts say little about actual cost. Token-based budgets enforced over time windows are essential for cost control and fairness at scale.

Token-Based Rate Limiting

Track token consumption per user per time window, not just request counts.

import { Redis } from 'ioredis';

/**
 * Snapshot of a user's token consumption for the current rate-limit window,
 * as returned by `TokenBasedRateLimiter.getUserUsage`.
 */
interface TokenBudget {
  /** User the snapshot belongs to. */
  user_id: string;
  /** Tokens recorded against the user in the current window. */
  tokens_used: number;
  /** Tier limit for the window; `getUserUsage` reports it without the burst multiplier applied. */
  tokens_limit: number;
  /** Start of the current window; epoch 0 when no window timestamp is stored. */
  window_start: Date;
  /** `window_start` plus the limiter's window length. */
  window_end: Date;
}

/**
 * Fixed-window, per-user token budget stored in Redis.
 *
 * Keys used per user:
 *  - `tokens:user:<id>`         tokens consumed in the current window
 *  - `tokens:user:<id>:window`  epoch-ms timestamp at which the window began
 *  - `user:tier:<id>`           tier name that selects the per-window limit
 *
 * NOTE(review): the budget check and the increment are separate Redis
 * commands, so concurrent requests for the same user can both pass the check.
 * Move the check-and-increment into a Lua script / MULTI if strict enforcement
 * is required.
 */
class TokenBasedRateLimiter {
  private redis: Redis;
  private window_seconds = 86400; // 24-hour window
  private burst_multiplier = 1.5; // Requests are admitted up to 150% of the tier limit

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Checks whether `requestedTokens` fits in the user's current window and,
   * if it does, records the usage.
   *
   * @param userId - User whose budget is charged.
   * @param requestedTokens - Tokens the request wants to consume.
   * @returns `allowed` plus the tokens left under the burst limit. When the
   *   request is denied, `remaining` is 0 and `retry_after_seconds` is the
   *   time until the current window ends (at least 1).
   */
  async checkTokenBudget(
    userId: string,
    requestedTokens: number
  ): Promise<{ allowed: boolean; remaining: number; retry_after_seconds?: number }> {
    const key = `tokens:user:${userId}`;
    const windowKey = `${key}:window`;
    const windowMs = this.window_seconds * 1000;
    const now = Date.now();

    let windowStart = TokenBasedRateLimiter.parseCounter(await this.redis.get(windowKey));

    if (now - windowStart > windowMs) {
      // Start a fresh window. The local windowStart must follow the reset,
      // otherwise retry_after below is computed from the stale (or missing)
      // timestamp and collapses to 1 second.
      await this.redis.del(key);
      await this.redis.set(windowKey, now.toString());
      windowStart = now;
    }

    const currentUsage = TokenBasedRateLimiter.parseCounter(await this.redis.get(key));
    const userLimit = await this.getUserTokenLimit(userId);
    const burstLimit = userLimit * this.burst_multiplier;
    const projectedUsage = currentUsage + requestedTokens;

    if (projectedUsage > burstLimit) {
      const resetTime = windowStart + windowMs;
      const retryAfter = Math.ceil((resetTime - now) / 1000);

      return {
        allowed: false,
        remaining: 0,
        retry_after_seconds: Math.max(1, retryAfter)
      };
    }

    // Record usage; the TTL only garbage-collects counters of idle users,
    // the window key above is what actually drives resets.
    await this.redis.incrby(key, requestedTokens);
    await this.redis.expire(key, this.window_seconds);

    return {
      allowed: true,
      remaining: Math.floor(burstLimit - projectedUsage)
    };
  }

  /** Parses an integer counter read from Redis; missing or malformed values count as 0. */
  private static parseCounter(raw: string | null): number {
    const value = parseInt(raw ?? '0', 10);
    return Number.isNaN(value) ? 0 : value;
  }

  /**
   * Resolves the per-window token limit for the user's tier.
   * Users without a stored tier, or with an unrecognised one, get the free limit.
   */
  private async getUserTokenLimit(userId: string): Promise<number> {
    const tierKey = `user:tier:${userId}`;
    const tier = (await this.redis.get(tierKey)) || 'free';

    const freeLimit = 100000; // 100K tokens/day
    const limits: Record<string, number> = {
      'free': freeLimit,
      'pro': 1000000, // 1M tokens/day
      'enterprise': 10000000, // 10M tokens/day
      'unlimited': Number.MAX_SAFE_INTEGER
    };

    // Own-property check so tier strings such as "constructor" cannot resolve
    // to inherited Object.prototype members.
    const known = Object.prototype.hasOwnProperty.call(limits, tier) ? limits[tier] : undefined;
    return known || freeLimit;
  }

  /** Persists the tier that selects the user's token limit. */
  async setUserTier(userId: string, tier: 'free' | 'pro' | 'enterprise' | 'unlimited'): Promise<void> {
    const tierKey = `user:tier:${userId}`;
    await this.redis.set(tierKey, tier);
  }

  /**
   * Returns the user's usage for the current window.
   * `tokens_limit` is the plain tier limit (the burst multiplier is not applied).
   */
  async getUserUsage(userId: string): Promise<TokenBudget> {
    const key = `tokens:user:${userId}`;
    const windowKey = `${key}:window`;

    const tokensUsed = TokenBasedRateLimiter.parseCounter(await this.redis.get(key));
    const windowStart = TokenBasedRateLimiter.parseCounter(await this.redis.get(windowKey));
    const limit = await this.getUserTokenLimit(userId);

    return {
      user_id: userId,
      tokens_used: tokensUsed,
      tokens_limit: limit,
      window_start: new Date(windowStart),
      window_end: new Date(windowStart + this.window_seconds * 1000)
    };
  }
}

Sliding Window Token Counter in Redis

Accurate token counting with sliding windows for per-minute, per-hour, and per-day limits.

/** One sliding window enforced by `SlidingWindowTokenCounter`. */
interface RateLimitWindow {
  /** Length of the window in milliseconds. */
  duration_ms: number;
  /** Token total above which `checkRateLimit` reports the user as not allowed. */
  limit: number;
  /** Window label; used as the suffix of the Redis key and as the key in the usage report. */
  name: string;
}

/**
 * Sliding-window token counter backed by one Redis sorted set per
 * (user, window). Each usage event is a member scored by its epoch-ms
 * timestamp; the member text is `<timestamp>:<tokens>:<nonce>`.
 */
class SlidingWindowTokenCounter {
  private redis: Redis;
  private windows: RateLimitWindow[] = [
    { name: 'per-minute', duration_ms: 60000, limit: 50000 },
    { name: 'per-hour', duration_ms: 3600000, limit: 500000 },
    { name: 'per-day', duration_ms: 86400000, limit: 5000000 }
  ];
  // Per-instance counter that keeps members written in the same millisecond distinct.
  private sequence = 0;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Records `tokens` consumed by `userId` in every configured window and
   * prunes entries that have fallen out of each window.
   */
  async recordTokenUsage(userId: string, tokens: number): Promise<void> {
    const now = Date.now();
    // Sorted-set members are unique: without a nonce, two usages with the same
    // timestamp and token count would overwrite each other and be counted once.
    // The random part keeps members distinct across processes as well.
    const nonce = `${(this.sequence++).toString(36)}${Math.random().toString(36).slice(2, 10)}`;
    const member = `${now}:${tokens}:${nonce}`;

    for (const window of this.windows) {
      const key = `sliding:${userId}:${window.name}`;

      await this.redis.zadd(key, now, member);

      // Remove entries older than the window.
      await this.redis.zremrangebyscore(key, 0, now - window.duration_ms);

      // Let idle users' sets expire on their own.
      await this.redis.expire(key, Math.ceil(window.duration_ms / 1000));
    }
  }

  /**
   * Reports current usage for every window.
   *
   * @returns `allowed` is false when any window's total exceeds its limit;
   *   `usage` maps window name to the current total, the limit and the
   *   percentage of the limit consumed.
   */
  async checkRateLimit(userId: string): Promise<{
    allowed: boolean;
    usage: Record<string, { current: number; limit: number; percent: number }>;
  }> {
    const usage: Record<string, { current: number; limit: number; percent: number }> = {};
    let allowed = true;
    const now = Date.now();

    for (const window of this.windows) {
      const current = await this.sumWindow(userId, window, now);

      usage[window.name] = {
        current,
        limit: window.limit,
        percent: (current / window.limit) * 100
      };

      if (current > window.limit) {
        allowed = false;
      }
    }

    return { allowed, usage };
  }

  /**
   * Returns the token total for one window.
   * For a window name that is not configured, every entry stored under that
   * name is summed (there is no duration to filter by).
   */
  async getWindowUsage(userId: string, windowName: string): Promise<number> {
    const window = this.windows.find((w) => w.name === windowName);
    if (window) {
      return this.sumWindow(userId, window, Date.now());
    }

    const entries: string[] = await this.redis.zrange(`sliding:${userId}:${windowName}`, 0, -1);
    return SlidingWindowTokenCounter.sumTokens(entries);
  }

  /**
   * Sums the entries whose score lies inside the window ending at `now`.
   * Filtering by score on read matters because pruning only happens on write:
   * entries can outlive their window until the next recordTokenUsage call.
   */
  private async sumWindow(userId: string, window: RateLimitWindow, now: number): Promise<number> {
    const key = `sliding:${userId}:${window.name}`;
    // Exclusive lower bound mirrors the inclusive cutoff used when pruning.
    const entries: string[] = await this.redis.zrangebyscore(key, `(${now - window.duration_ms}`, '+inf');
    return SlidingWindowTokenCounter.sumTokens(entries);
  }

  /** Adds up the token field (second `:`-separated part) of each member, skipping malformed ones. */
  private static sumTokens(entries: string[]): number {
    let total = 0;
    for (const entry of entries) {
      const tokens = parseInt(entry.split(':')[1] ?? '', 10);
      if (!Number.isNaN(tokens)) {
        total += tokens;
      }
    }
    return total;
  }
}

Per-User Daily/Monthly Token Budget

Separate daily and monthly budgets for different user tiers.

/** Daily and monthly budget status for one user, as returned by `PerUserBudgetManager.getBudget`. */
interface UserBudgetAllocation {
  /** User the allocation belongs to. */
  user_id: string;
  /** Daily token limit of the user's tier. */
  daily_limit: number;
  /** Monthly token limit of the user's tier. */
  monthly_limit: number;
  /** Tokens recorded under the user's daily budget key. */
  daily_used: number;
  /** Tokens recorded under the user's monthly budget key. */
  monthly_used: number;
  /** `daily_limit - daily_used`, clamped at 0. */
  daily_remaining: number;
  /** `monthly_limit - monthly_used`, clamped at 0. */
  monthly_remaining: number;
  /** Next midnight UTC. */
  reset_daily_at: Date;
  /** Intended to be midnight UTC on the first day of the next month. */
  reset_monthly_at: Date;
}

/**
 * Per-user daily and monthly token budgets stored in Redis.
 *
 * Keys used per user:
 *  - `budget:daily:<id>`    tokens used today; expires at the next midnight UTC
 *  - `budget:monthly:<id>`  tokens used this month; expires at the start of the next UTC month
 *  - `user:tier:<id>`       tier name selecting the limits
 *
 * NOTE(review): the quota check and the increments are separate Redis
 * commands, so concurrent requests can overshoot a limit slightly.
 */
class PerUserBudgetManager {
  private redis: Redis;
  private budgetTiers: Record<string, { daily: number; monthly: number }> = {
    'free': { daily: 100000, monthly: 2000000 },
    'pro': { daily: 500000, monthly: 10000000 },
    'enterprise': { daily: 5000000, monthly: 100000000 },
    'unlimited': { daily: Number.MAX_SAFE_INTEGER, monthly: Number.MAX_SAFE_INTEGER }
  };

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Charges `tokens` against the user's daily and monthly budgets.
   *
   * @returns `{ allowed: true }` when both budgets have room; otherwise
   *   `allowed: false` with a `reason` naming the exhausted quota. Nothing is
   *   charged when the request is denied.
   */
  async consumeTokens(userId: string, tokens: number): Promise<{ allowed: boolean; reason?: string }> {
    const dailyKey = `budget:daily:${userId}`;
    const monthlyKey = `budget:monthly:${userId}`;

    const budget = await this.getBudget(userId);

    if (budget.daily_used + tokens > budget.daily_limit) {
      return {
        allowed: false,
        reason: `Daily quota exceeded: ${budget.daily_used} + ${tokens} > ${budget.daily_limit}`
      };
    }

    if (budget.monthly_used + tokens > budget.monthly_limit) {
      return {
        allowed: false,
        reason: `Monthly quota exceeded: ${budget.monthly_used} + ${tokens} > ${budget.monthly_limit}`
      };
    }

    await this.redis.incrby(dailyKey, tokens);
    await this.redis.incrby(monthlyKey, tokens);

    // Both counters must expire at their reset instant; without a TTL on the
    // monthly key the monthly quota would never reset.
    await this.redis.expire(dailyKey, PerUserBudgetManager.secondsUntil(budget.reset_daily_at));
    await this.redis.expire(monthlyKey, PerUserBudgetManager.secondsUntil(budget.reset_monthly_at));

    return { allowed: true };
  }

  /** Returns the user's limits, usage, remaining tokens and next reset times. */
  async getBudget(userId: string): Promise<UserBudgetAllocation> {
    const tier = await this.getUserTier(userId);
    const limits = this.limitsForTier(tier);

    const dailyUsed = PerUserBudgetManager.parseCounter(await this.redis.get(`budget:daily:${userId}`));
    const monthlyUsed = PerUserBudgetManager.parseCounter(await this.redis.get(`budget:monthly:${userId}`));

    const now = new Date();
    // Date.UTC normalises overflowing day/month values, so this is correct on
    // the last day of a month and in December. (Bumping the month first on an
    // existing date lands in the wrong month on the 29th-31st.)
    const resetDailyAt = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate() + 1));
    const resetMonthlyAt = new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth() + 1, 1));

    return {
      user_id: userId,
      daily_limit: limits.daily,
      monthly_limit: limits.monthly,
      daily_used: dailyUsed,
      monthly_used: monthlyUsed,
      daily_remaining: Math.max(0, limits.daily - dailyUsed),
      monthly_remaining: Math.max(0, limits.monthly - monthlyUsed),
      reset_daily_at: resetDailyAt,
      reset_monthly_at: resetMonthlyAt
    };
  }

  /** Limits for `tier`; unrecognised tiers are treated as free rather than crashing. */
  private limitsForTier(tier: string): { daily: number; monthly: number } {
    const known = Object.prototype.hasOwnProperty.call(this.budgetTiers, tier)
      ? this.budgetTiers[tier]
      : undefined;
    return known ?? { daily: 100000, monthly: 2000000 };
  }

  /** Whole seconds from now until `at`, never less than 1 (EXPIRE 0 would delete the key). */
  private static secondsUntil(at: Date): number {
    return Math.max(1, Math.ceil((at.getTime() - Date.now()) / 1000));
  }

  /** Parses an integer counter read from Redis; missing or malformed values count as 0. */
  private static parseCounter(raw: string | null): number {
    const value = parseInt(raw ?? '0', 10);
    return Number.isNaN(value) ? 0 : value;
  }

  /** Stored tier for the user, defaulting to 'free'. */
  private async getUserTier(userId: string): Promise<string> {
    return (await this.redis.get(`user:tier:${userId}`)) || 'free';
  }
}

Burst Allowance for Premium Users

Allow temporary bursts above normal limit for premium users.

/** Per-tier burst settings used by `BurstAllowanceManager`. */
interface BurstConfig {
  /** Multiplier reported while a burst is active; exactly 1.0 is treated as "tier has no burst". */
  burst_multiplier: number;
  /** How long an activated burst lasts, in minutes. */
  burst_duration_minutes: number;
  /** Minimum minutes between burst activations, measured from the last activation timestamp. */
  cooldown_minutes: number;
}

/**
 * Manages temporary burst windows per user.
 *
 * Keys used per user:
 *  - `burst:last:<id>`  epoch-ms timestamp of the last activation (drives the cooldown)
 *  - `burst:end:<id>`   epoch-ms timestamp at which the active burst ends
 *  - `user:tier:<id>`   tier name selecting the burst configuration
 */
class BurstAllowanceManager {
  private redis: Redis;
  private burstConfigs: Record<string, BurstConfig> = {
    'free': { burst_multiplier: 1.0, burst_duration_minutes: 0, cooldown_minutes: 0 },
    'pro': { burst_multiplier: 1.5, burst_duration_minutes: 5, cooldown_minutes: 60 },
    'enterprise': { burst_multiplier: 2.0, burst_duration_minutes: 15, cooldown_minutes: 30 },
    'unlimited': { burst_multiplier: Number.MAX_VALUE, burst_duration_minutes: Number.MAX_VALUE, cooldown_minutes: 0 }
  };

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Tells whether the user may start a burst now.
   *
   * @returns `can_burst: false` with a `reason` when the tier has no burst
   *   allowance or the cooldown since the last activation has not elapsed.
   */
  async canBurst(userId: string): Promise<{ can_burst: boolean; reason?: string }> {
    const config = await this.getConfig(userId);

    if (!config || config.burst_multiplier <= 1.0) {
      return { can_burst: false, reason: 'User tier does not support burst' };
    }

    const lastBurst = await this.redis.get(`burst:last:${userId}`);

    if (lastBurst) {
      const elapsedMs = Date.now() - parseInt(lastBurst, 10);
      const cooldownMs = config.cooldown_minutes * 60 * 1000;

      if (elapsedMs < cooldownMs) {
        const waitMs = cooldownMs - elapsedMs;
        return {
          can_burst: false,
          reason: `Cooldown active. Wait ${Math.ceil(waitMs / 1000)}s`
        };
      }
    }

    return { can_burst: true };
  }

  /**
   * Starts a burst for the user. Does nothing for tiers without a burst
   * allowance (including unknown tiers). The cooldown is not checked here;
   * callers are expected to consult `canBurst` first.
   */
  async activateBurst(userId: string): Promise<void> {
    const config = await this.getConfig(userId);
    if (!config || config.burst_multiplier <= 1.0 || config.burst_duration_minutes <= 0) {
      return;
    }

    const now = Date.now();
    const durationSeconds = config.burst_duration_minutes * 60;
    const cooldownSeconds = config.cooldown_minutes * 60;

    await this.setWithTtl(`burst:last:${userId}`, now.toString(), cooldownSeconds + durationSeconds);

    // For the unlimited tier this is Infinity, stored as the string "Infinity".
    const burstEndTime = now + durationSeconds * 1000;
    await this.setWithTtl(`burst:end:${userId}`, burstEndTime.toString(), durationSeconds);
  }

  /** Returns the tier's burst multiplier while a burst is active, otherwise 1.0. */
  async getBurstMultiplier(userId: string): Promise<number> {
    const burstEnd = await this.redis.get(`burst:end:${userId}`);
    if (!burstEnd) {
      return 1.0;
    }

    // Number() rather than parseInt so a stored "Infinity" is understood.
    const burstEndTime = Number(burstEnd);
    if (Number.isNaN(burstEndTime) || burstEndTime < Date.now()) {
      return 1.0;
    }

    const config = await this.getConfig(userId);
    return config ? config.burst_multiplier : 1.0;
  }

  /** Burst configuration for the user's stored tier ('free' when unset); undefined for unknown tiers. */
  private async getConfig(userId: string): Promise<BurstConfig | undefined> {
    const tier = (await this.redis.get(`user:tier:${userId}`)) || 'free';
    return Object.prototype.hasOwnProperty.call(this.burstConfigs, tier)
      ? this.burstConfigs[tier]
      : undefined;
  }

  /**
   * Writes `value` and applies a TTL when the TTL is a usable number.
   * The unlimited tier's durations overflow to Infinity, which EXPIRE rejects,
   * so such keys are stored without an expiry.
   */
  private async setWithTtl(key: string, value: string, ttlSeconds: number): Promise<void> {
    await this.redis.set(key, value);
    if (Number.isFinite(ttlSeconds) && ttlSeconds > 0) {
      await this.redis.expire(key, Math.ceil(ttlSeconds));
    }
  }
}

Cost-Per-Request Tracking

Track actual cost of each request for billing and monitoring.

/** Cost record for a single model request, produced by `CostPerRequestTracker.recordRequest`. */
interface RequestCostMetrics {
  /** Caller-supplied request identifier; also used in the per-request Redis key. */
  request_id: string;
  /** User the request is attributed to. */
  user_id: string;
  /** Model name used to look up pricing. */
  model: string;
  /** Input (prompt) tokens of the request. */
  tokens_input: number;
  /** Output (completion) tokens of the request. */
  tokens_output: number;
  /** Cost from the tracker's per-1000-token price table; 0 when the model has no pricing entry. */
  cost_usd: number;
  /** Time the record was created. */
  timestamp: Date;
}

/**
 * Computes and records the cost of individual model requests.
 *
 * Keys written:
 *  - `request:<requestId>`          JSON record, kept 24 hours
 *  - `user:requests:<userId>`       list of the user's 100 most recent records
 *  - `spend:daily:<userId>`         running spend for the current UTC day
 *  - `spend:daily:<userId>:<date>`  the same spend filed under its UTC date
 *                                   (the per-day history that
 *                                   CostAnomalyDetector reads as its baseline)
 */
class CostPerRequestTracker {
  private redis: Redis;
  // Prices in USD per 1000 tokens.
  private pricing: Record<string, { input: number; output: number }> = {
    'gpt-4o': { input: 0.015, output: 0.06 },
    'gpt-3.5-turbo': { input: 0.0005, output: 0.0015 },
    'claude-3-opus': { input: 0.015, output: 0.075 }
  };

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Cost in USD for a request.
   * NOTE(review): models without a pricing entry cost 0, which silently
   * under-reports spend for unlisted models.
   */
  calculateCost(model: string, inputTokens: number, outputTokens: number): number {
    const rates = Object.prototype.hasOwnProperty.call(this.pricing, model)
      ? this.pricing[model]
      : undefined;
    if (!rates) return 0;

    return (inputTokens / 1000) * rates.input + (outputTokens / 1000) * rates.output;
  }

  /**
   * Computes the cost of a request and stores the record, the user's request
   * history and the user's daily spend.
   *
   * @returns The metrics record that was stored.
   */
  async recordRequest(
    requestId: string,
    userId: string,
    model: string,
    inputTokens: number,
    outputTokens: number
  ): Promise<RequestCostMetrics> {
    const now = new Date();
    const cost = this.calculateCost(model, inputTokens, outputTokens);
    const metrics: RequestCostMetrics = {
      request_id: requestId,
      user_id: userId,
      model,
      tokens_input: inputTokens,
      tokens_output: outputTokens,
      cost_usd: cost,
      timestamp: now
    };
    const serialized = JSON.stringify(metrics);

    // Per-request record for recent lookups.
    await this.redis.setex(`request:${requestId}`, 86400, serialized);

    // User's request history (keep last 100).
    const historyKey = `user:requests:${userId}`;
    await this.redis.lpush(historyKey, serialized);
    await this.redis.ltrim(historyKey, 0, 99);

    // Running spend for today. The key expires at the next midnight UTC;
    // refreshing a rolling 24h TTL on every request would keep an active
    // user's counter alive forever and it would stop being a daily figure.
    const dailySpendKey = `spend:daily:${userId}`;
    const nextMidnight = Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate() + 1);
    const secondsUntilMidnight = Math.max(1, Math.ceil((nextMidnight - now.getTime()) / 1000));
    await this.redis.incrbyfloat(dailySpendKey, cost);
    await this.redis.expire(dailySpendKey, secondsUntilMidnight);

    // Per-date history, retained a little longer than the 7-day baseline
    // that CostAnomalyDetector looks back over.
    const datedSpendKey = `${dailySpendKey}:${now.toISOString().split('T')[0]}`;
    await this.redis.incrbyfloat(datedSpendKey, cost);
    await this.redis.expire(datedSpendKey, 86400 * 8);

    return metrics;
  }

  /** The user's spend so far in the current UTC day, in USD. */
  async getUserDailySpend(userId: string): Promise<number> {
    const key = `spend:daily:${userId}`;
    return parseFloat((await this.redis.get(key)) || '0');
  }

  /**
   * Returns up to `limit` of the user's most recent request records, newest first.
   * A `limit` below 1 yields an empty list (passed through to LRANGE it would
   * become a stop index of -1 and return the whole list).
   */
  async getUserRequestHistory(userId: string, limit: number = 20): Promise<RequestCostMetrics[]> {
    const count = Math.floor(limit);
    if (!(count > 0)) return [];

    const key = `user:requests:${userId}`;
    const requests: string[] = await this.redis.lrange(key, 0, count - 1);

    return requests.map((raw) => {
      const parsed = JSON.parse(raw) as RequestCostMetrics;
      // JSON stores the timestamp as an ISO string; restore the Date the type promises.
      return { ...parsed, timestamp: new Date(parsed.timestamp) };
    });
  }
}

Budget Exhaustion Handling

Graceful degradation when budget is exhausted.

/** How `BudgetExhaustionHandler` should respond when a user's budget is exhausted. */
interface DegradationStrategy {
  /** 'queue' defers the request, 'fallback' points at a fallback model, 'reject' refuses it. */
  type: 'queue' | 'fallback' | 'reject';
  /** Model named in the fallback response; read only when `type` is 'fallback'. */
  fallback_model?: string;
  /** Lifetime of the user's queue in seconds; the handler uses 3600 when this is unset or 0. */
  queue_ttl_seconds?: number;
}

/**
 * Decides what happens to a request once the user's budget is exhausted:
 * queue it (Redis list `queue:<userId>`), point at a fallback model, or reject it.
 */
class BudgetExhaustionHandler {
  private redis: Redis;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Applies the degradation strategy.
   *
   * @param userId - User whose budget is exhausted.
   * @param requestedTokens - Tokens the deferred/denied request needed.
   * @param strategy - How to degrade.
   * @returns `handled` is true when the request was queued or a fallback is
   *   available; `retry_after` (seconds) is set for queued requests.
   */
  async handleExhaustion(
    userId: string,
    requestedTokens: number,
    strategy: DegradationStrategy
  ): Promise<{ handled: boolean; response?: string; retry_after?: number }> {
    switch (strategy.type) {
      case 'queue': {
        // A missing, zero or negative TTL falls back to one hour: EXPIRE with
        // a non-positive value would delete the queue immediately.
        const ttl = strategy.queue_ttl_seconds;
        return this.handleWithQueuing(userId, requestedTokens, ttl !== undefined && ttl > 0 ? ttl : 3600);
      }

      case 'fallback':
        if (!strategy.fallback_model) {
          // There is nothing to fall back to, so the request is not handled.
          return {
            handled: false,
            response: 'Budget exhausted and no fallback model is configured.'
          };
        }
        return {
          handled: true,
          response: `Budget exhausted. Upgrade to Pro for unlimited requests. Fallback model: ${strategy.fallback_model}`
        };

      case 'reject':
        return {
          handled: false,
          response: 'Budget limit exceeded. Upgrade your plan.'
        };

      default:
        // Reachable only from untyped callers passing an unknown strategy type.
        return { handled: false };
    }
  }

  /**
   * Appends the request to the user's queue and estimates when to retry.
   * The estimate assumes one queued request is served per minute and is
   * capped at the queue's TTL.
   */
  private async handleWithQueuing(
    userId: string,
    tokens: number,
    ttl: number
  ): Promise<{ handled: boolean; retry_after: number }> {
    const queueKey = `queue:${userId}`;

    // LPUSH returns the new queue length, i.e. this request's position.
    const position: number = await this.redis.lpush(queueKey, JSON.stringify({ tokens, timestamp: Date.now() }));
    await this.redis.expire(queueKey, ttl);

    const estimatedWaitSeconds = Math.ceil(position * 60);

    return {
      handled: true,
      retry_after: Math.min(estimatedWaitSeconds, ttl)
    };
  }

  /**
   * Removes the oldest queued request for the user (entries are pushed on the
   * left and popped from the right).
   *
   * @returns The token count of that request, or 0 when the queue is empty.
   * @throws Error when the stored entry does not carry a numeric token count.
   */
  async processQueue(userId: string): Promise<number> {
    const queueKey = `queue:${userId}`;
    const raw: string | null = await this.redis.rpop(queueKey);

    if (!raw) return 0;

    const parsed: unknown = JSON.parse(raw);
    const tokens =
      typeof parsed === 'object' && parsed !== null
        ? (parsed as { tokens?: unknown }).tokens
        : undefined;

    if (typeof tokens !== 'number' || !Number.isFinite(tokens)) {
      throw new Error(`Malformed queue entry for user ${userId}`);
    }
    return tokens;
  }
}

Admin Override

Allow admins to adjust user budgets and bypass limits.

/** One audited admin operation, as logged by `AdminBudgetOverride`. */
interface AdminAction {
  /** Kind of operation performed. */
  action: 'reset_daily' | 'reset_monthly' | 'increase_quota' | 'set_tier' | 'grant_tokens';
  /** User the operation was applied to. */
  user_id: string;
  /** Operation payload: a token count for grants, a tier name for tier changes; absent for resets. */
  value?: number | string;
  /** Free-text justification supplied by the admin. */
  reason: string;
  /** Admin who performed the operation; also part of the audit key in Redis. */
  admin_id: string;
  /** When the operation was logged. */
  timestamp: Date;
}

/**
 * Admin operations on user budgets. Every operation is appended to an
 * in-memory log and to a per-admin, per-day Redis list
 * (`audit:<adminId>:<YYYY-MM-DD>`, kept 90 days).
 *
 * NOTE(review): the in-memory log grows for the lifetime of the instance.
 */
class AdminBudgetOverride {
  private redis: Redis;
  private actionLog: AdminAction[] = [];

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /** Clears the user's daily usage counter and records the action. */
  async resetDailyBudget(userId: string, adminId: string, reason: string): Promise<void> {
    await this.redis.del(`budget:daily:${userId}`);

    await this.logAction({
      action: 'reset_daily',
      user_id: userId,
      reason,
      admin_id: adminId,
      timestamp: new Date()
    });
  }

  /**
   * Grants the user bonus tokens by incrementing `bonus:monthly:<userId>`
   * (30-day expiry, refreshed on every grant) and records the action.
   */
  async increaseQuota(userId: string, tokens: number, adminId: string, reason: string): Promise<void> {
    const bonusKey = `bonus:monthly:${userId}`;
    await this.redis.incrby(bonusKey, tokens);
    await this.redis.expire(bonusKey, 86400 * 30); // 30 days

    await this.logAction({
      action: 'grant_tokens',
      user_id: userId,
      value: tokens,
      reason,
      admin_id: adminId,
      timestamp: new Date()
    });
  }

  /**
   * Sets the user's tier and records the action.
   *
   * @throws Error when `newTier` is not one of free / pro / enterprise /
   *   unlimited. Persisting any other value would leave the user on a tier
   *   that the budget classes in this file have no limits for.
   */
  async upgradeTier(userId: string, newTier: string, adminId: string, reason: string): Promise<void> {
    const knownTiers = ['free', 'pro', 'enterprise', 'unlimited'];
    if (!knownTiers.includes(newTier)) {
      throw new Error(`Unknown tier: ${newTier}`);
    }

    await this.redis.set(`user:tier:${userId}`, newTier);

    await this.logAction({
      action: 'set_tier',
      user_id: userId,
      value: newTier,
      reason,
      admin_id: adminId,
      timestamp: new Date()
    });
  }

  /**
   * Appends the action to the in-memory log and to the Redis audit list.
   * The Redis writes are awaited so that a failed audit write surfaces to the
   * caller instead of becoming an unhandled rejection, and so that the
   * public methods only resolve once the audit entry exists.
   */
  private async logAction(action: AdminAction): Promise<void> {
    this.actionLog.push(action);

    const day = action.timestamp.toISOString().split('T')[0];
    const auditKey = `audit:${action.admin_id}:${day}`;
    await this.redis.lpush(auditKey, JSON.stringify(action));
    await this.redis.expire(auditKey, 86400 * 90); // Keep 90 days
  }

  /** Returns a copy of the actions logged by this instance. */
  getAuditLog(): AdminAction[] {
    return [...this.actionLog];
  }
}

Cost Anomaly Detection

Alert when a user's spending spikes abnormally.

/** A spending anomaly reported by `CostAnomalyDetector`. */
interface AnomalyAlert {
  /** User whose spend spiked. */
  user_id: string;
  /** Ratio of `daily_spend` to `expected_spend` (e.g. 2.5 means 2.5x baseline); despite the name it is not scaled by 100. */
  spike_percent: number;
  /** The user's current daily spend. */
  daily_spend: number;
  /** Baseline: average spend over the recent days that have recorded spend. */
  expected_spend: number;
  /** 'critical' at or above the detector's critical threshold, otherwise 'warning'. */
  alert_level: 'warning' | 'critical';
}

/**
 * Flags users whose current daily spend (`spend:daily:<userId>`) is a
 * multiple of their recent average. The average is taken over the per-day
 * keys `spend:daily:<userId>:<YYYY-MM-DD>` (UTC dates) of the previous
 * `baselineWindow` days.
 */
class CostAnomalyDetector {
  private redis: Redis;
  private baselineWindow = 7; // 7-day baseline
  private spikeThreshold = 2.0; // 2x normal = alert
  private criticalThreshold = 5.0; // 5x normal = critical

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /**
   * Scans all users with a daily-spend key and returns an alert for each one
   * at or above the spike threshold. Users without any baseline history are skipped.
   */
  async detectAnomalies(): Promise<AnomalyAlert[]> {
    const prefix = 'spend:daily:';
    const datedSuffix = /:\d{4}-\d{2}-\d{2}$/;
    const keys = await this.scanKeys(`${prefix}*`);
    const alerts: AnomalyAlert[] = [];

    for (const key of keys) {
      const userId = key.slice(prefix.length);
      // Per-day history keys share the prefix; they are baseline data, not users.
      if (datedSuffix.test(userId)) continue;

      const dailySpend = parseFloat((await this.redis.get(key)) || '0');
      const expectedSpend = await this.getBaselineSpend(userId);

      if (expectedSpend === 0) continue; // Not enough history

      const ratio = dailySpend / expectedSpend;
      if (ratio < this.spikeThreshold) continue;

      alerts.push({
        user_id: userId,
        spike_percent: ratio,
        daily_spend: dailySpend,
        expected_spend: expectedSpend,
        alert_level: ratio >= this.criticalThreshold ? 'critical' : 'warning'
      });
    }

    return alerts;
  }

  /**
   * Collects every key matching `pattern` with cursor-based SCAN.
   * KEYS walks the whole keyspace in a single blocking command; SCAN does the
   * same work incrementally. SCAN may return a key more than once, hence the Set.
   */
  private async scanKeys(pattern: string): Promise<string[]> {
    const found = new Set<string>();
    let cursor = '0';

    do {
      const [nextCursor, batch]: [string, string[]] = await this.redis.scan(
        cursor,
        'MATCH',
        pattern,
        'COUNT',
        500
      );
      for (const key of batch) {
        found.add(key);
      }
      cursor = nextCursor;
    } while (cursor !== '0');

    return [...found];
  }

  /**
   * Average spend over the previous `baselineWindow` UTC days, counting only
   * days with recorded spend. Returns 0 when there is no history.
   */
  private async getBaselineSpend(userId: string): Promise<number> {
    const dayMs = 86400000;
    const now = Date.now();

    // Build the date strings from UTC arithmetic: the key format comes from
    // toISOString(), which is UTC, so local-time date math could skip or
    // repeat a day around midnight.
    const keys = Array.from({ length: this.baselineWindow }, (_, i) => {
      const day = new Date(now - (i + 1) * dayMs).toISOString().split('T')[0];
      return `spend:daily:${userId}:${day}`;
    });

    const values: (string | null)[] = await Promise.all(keys.map((k) => this.redis.get(k)));

    let totalSpend = 0;
    let dayCount = 0;
    for (const raw of values) {
      const spend = parseFloat(raw || '0');
      if (spend > 0) {
        totalSpend += spend;
        dayCount++;
      }
    }

    return dayCount > 0 ? totalSpend / dayCount : 0;
  }

  /** Logs the alert and caches it under `anomaly:<userId>` for one hour. */
  async triggerAlert(alert: AnomalyAlert): Promise<void> {
    console.warn(`ANOMALY ALERT: ${alert.user_id} spending ${(alert.spike_percent * 100).toFixed(0)}% of baseline`);

    // Would send email/Slack notification here
    const alertKey = `anomaly:${alert.user_id}`;
    await this.redis.setex(alertKey, 3600, JSON.stringify(alert)); // Cache for 1 hour
  }
}

Checklist

  • Track tokens, not requests, for rate limiting
  • Implement per-minute, per-hour, and per-day windows
  • Allow 1.5-2x burst for premium users with cooldown
  • Use sliding window for accurate token tracking
  • Track cost per request for billing and monitoring
  • Detect cost anomalies (2-5x normal spending)
  • Queue requests gracefully when budget exhausted
  • Log all admin overrides for audit trails
  • Reset daily budgets at midnight UTC
  • Alert on critical spending spikes immediately

Conclusion

Token-based rate limiting with per-user budgets is non-negotiable for LLM products. Pair it with burst allowances, graceful degradation, and anomaly detection, and you've got a system that protects your margin while keeping users happy.