Published on

LLM API Integration Patterns — Timeouts, Retries, Fallbacks, and Cost Control

Authors

Introduction

Raw LLM API calls are fragile. Rate limits, timeouts, cost overruns, and model unavailability happen in production. This post covers streaming responses with Server-Sent Events, exponential backoff for rate limits, model fallback chains, token budget enforcement, prompt caching, cost tracking, and circuit breaker patterns for bulletproof LLM integration.

Streaming Responses with Server-Sent Events

Streaming improves perceived latency by sending tokens as they arrive instead of waiting for the full response.

import { OpenAI } from 'openai';
import { Express, Request, Response } from 'express';

// Shared OpenAI client; reads the API key from the environment at module load.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

async function streamLLMResponse(req: Express.Request, res: Response): Promise<void> {
  const { prompt, model = 'gpt-4-turbo-preview' } = req.body;

  // Set SSE headers
  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-cache');
  res.setHeader('Connection', 'keep-alive');

  try {
    const stream = await openai.chat.completions.create({
      model,
      messages: [{ role: 'user', content: prompt }],
      stream: true,
      temperature: 0.7,
      max_tokens: 2048,
    });

    let tokenCount = 0;
    const startTime = Date.now();

    for await (const chunk of stream) {
      if (chunk.choices[0]?.delta?.content) {
        const content = chunk.choices[0].delta.content;
        tokenCount += Math.ceil(content.length / 4); // Rough token estimate

        // Send SSE event
        res.write(`data: ${JSON.stringify({ content, tokens: tokenCount })}\n\n`);

        // Safety check: abort if response exceeds token limit
        if (tokenCount > 4000) {
          res.write(`data: ${JSON.stringify({ error: 'Token limit exceeded' })}\n\n`);
          break;
        }
      }
    }

    const duration = Date.now() - startTime;
    res.write(`data: ${JSON.stringify({ done: true, duration, tokenCount })}\n\n`);
    res.end();
  } catch (error) {
    if (error instanceof Error) {
      res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`);
    }
    res.end();
  }
}

// Client-side handling
/**
 * Client-side consumer for the SSE endpoint above.
 *
 * Buffers partial lines between network reads: SSE events can be split
 * across chunks, so parsing each chunk independently (the naive approach)
 * throws on truncated JSON.
 *
 * @throws Error when the response has no readable body.
 */
async function consumeStream(prompt: string): Promise<void> {
  const response = await fetch('/api/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt }),
  });

  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error('Response has no readable body');
  }

  const decoder = new TextDecoder();
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // stream: true keeps multi-byte UTF-8 sequences that straddle
    // chunk boundaries intact.
    buffer += decoder.decode(value, { stream: true });

    const lines = buffer.split('\n');
    buffer = lines.pop() ?? ''; // retain the trailing partial line for the next read

    for (const line of lines) {
      if (line.startsWith('data: ')) {
        const data = JSON.parse(line.slice(6));
        if (data.content) {
          console.log(data.content); // Render in UI
        }
      }
    }
  }
}

Exponential Backoff for Rate Limits

OpenAI enforces rate limits. Smart retries multiply wait time: 1s → 2s → 4s → 8s.

// Tuning knobs for exponential-backoff retry behavior.
interface RetryConfig {
  maxRetries: number; // Max retry attempts after the initial call
  initialDelayMs: number; // Backoff delay before the first retry
  maxDelayMs: number; // Ceiling on any single backoff delay
  backoffMultiplier: number; // Delay growth factor per attempt (2 = doubling)
  jitterFactor: number; // 0.1 = ±10% random jitter
}

/**
 * OpenAI chat client with exponential-backoff retries on rate limits (429).
 * Non-rate-limit errors are rethrown immediately; rate-limit exhaustion
 * throws a descriptive error. Aggregate counters are exposed via getMetrics().
 */
class ResilientLLMClient {
  private readonly config: RetryConfig;
  private openai: OpenAI;
  // Observability counters; never reset for the lifetime of the client.
  private requestMetrics = {
    totalRequests: 0,
    totalRetries: 0,
    totalErrors: 0,
    rateLimitHits: 0,
  };

  constructor(apiKey: string, config?: Partial<RetryConfig>) {
    this.openai = new OpenAI({ apiKey });
    this.config = {
      maxRetries: 5,
      initialDelayMs: 1000,
      maxDelayMs: 60000,
      backoffMultiplier: 2,
      jitterFactor: 0.1,
      ...config,
    };
  }

  private async delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /** Exponential backoff with symmetric jitter, capped at maxDelayMs, floored at 100ms. */
  private calculateBackoffMs(attempt: number): number {
    let delayMs = this.config.initialDelayMs * Math.pow(this.config.backoffMultiplier, attempt);
    delayMs = Math.min(delayMs, this.config.maxDelayMs);

    // Add jitter to avoid thundering herd
    const jitterRange = delayMs * this.config.jitterFactor;
    const jitter = (Math.random() - 0.5) * 2 * jitterRange;
    return Math.max(100, delayMs + jitter);
  }

  /**
   * Call the chat completions API, retrying only on HTTP 429.
   * @throws The original error for non-rate-limit failures, or a summary
   *         Error after all rate-limit retries are exhausted.
   */
  async callLLM(
    messages: Array<{ role: 'user' | 'assistant' | 'system'; content: string }>,
    model: string = 'gpt-4-turbo-preview'
  ): Promise<string> {
    this.requestMetrics.totalRequests++;
    let lastError: Error | null = null;

    for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
      try {
        const response = await this.openai.chat.completions.create({
          model,
          messages,
          temperature: 0.7,
          max_tokens: 2048,
        });

        return response.choices[0]?.message?.content || '';
      } catch (error) {
        lastError = error as Error;
        this.requestMetrics.totalErrors++;

        // Only rate-limit errors are retriable; everything else fails fast.
        if (!(error instanceof OpenAI.APIError && error.status === 429)) {
          throw error;
        }

        this.requestMetrics.rateLimitHits++;
        if (attempt < this.config.maxRetries) {
          const backoffMs = this.calculateBackoffMs(attempt);
          this.requestMetrics.totalRetries++;
          console.log(`Rate limited. Retrying after ${backoffMs}ms (attempt ${attempt + 1}/${this.config.maxRetries})`);
          await this.delay(backoffMs);
        }
        // On the final attempt, fall through so the summary error below is
        // thrown (previously this rethrew the raw error, making it unreachable).
      }
    }

    throw new Error(`Failed after ${this.config.maxRetries} retries: ${lastError?.message}`);
  }

  /** Snapshot of counters plus derived rates; rates are "0.00" before any request (no NaN). */
  getMetrics() {
    const { totalRequests, totalRetries, totalErrors } = this.requestMetrics;
    const rate = (n: number) => (totalRequests > 0 ? (n / totalRequests).toFixed(2) : '0.00');
    return {
      ...this.requestMetrics,
      retryRate: rate(totalRetries),
      errorRate: rate(totalErrors),
    };
  }
}

Model Fallback Chain

Primary model fails? Fall back to a cheaper/faster alternative automatically.

// Static configuration for one model in the fallback chain.
interface ModelConfig {
  name: string; // Provider model identifier
  maxCostPerQuery: number; // Per-call budget ceiling in USD (overruns are logged)
  maxLatencyMs: number; // Soft latency SLO (overruns are logged)
  disabled: boolean; // Manually remove a model from rotation
}

/**
 * Tries each enabled model in order until one succeeds, skipping models
 * that have failed repeatedly within the last few seconds.
 */
class LLMFallbackChain {
  private models: ModelConfig[];
  // Per-model failure bookkeeping used to temporarily skip flapping models.
  private modelMetrics: Map<string, { failureCount: number; lastFailedAt: number }> = new Map();

  constructor(models: ModelConfig[]) {
    this.models = models;
  }

  /**
   * Run the fallback chain for the given conversation.
   * @param messages Conversation messages (no system message).
   * @param systemPrompt Optional system message prepended to `messages`.
   * @returns The response text, the model that produced it, and its cost.
   * @throws Error when every model is disabled, skipped, or fails.
   */
  async call(
    messages: Array<{ role: string; content: string }>,
    systemPrompt?: string
  ): Promise<{ response: string; modelUsed: string; costUSD: number }> {
    const fullMessages = systemPrompt
      ? [{ role: 'system' as const, content: systemPrompt }, ...messages]
      : messages;

    for (const model of this.models) {
      if (model.disabled) continue;

      // Skip models that failed more than twice within the last 5 seconds.
      const metrics = this.modelMetrics.get(model.name);
      if (metrics && Date.now() - metrics.lastFailedAt < 5000 && metrics.failureCount > 2) {
        console.log(`Skipping ${model.name} (failed recently)`);
        continue;
      }

      try {
        const startTime = Date.now();
        const response = await this.callModel(model.name, fullMessages);
        const latency = Date.now() - startTime;

        if (latency > model.maxLatencyMs) {
          console.warn(`${model.name} slow: ${latency}ms > ${model.maxLatencyMs}ms`);
        }

        // Surface cost overruns instead of silently ignoring maxCostPerQuery.
        if (response.costUSD > model.maxCostPerQuery) {
          console.warn(`${model.name} cost $${response.costUSD} exceeds budget $${model.maxCostPerQuery}`);
        }

        // Success: clear the failure record so an old burst of errors does
        // not keep penalizing a now-healthy model (previously it never reset).
        this.modelMetrics.delete(model.name);

        return {
          response: response.text,
          modelUsed: model.name,
          costUSD: response.costUSD,
        };
      } catch (error) {
        const metric = this.modelMetrics.get(model.name) || { failureCount: 0, lastFailedAt: 0 };
        metric.failureCount++;
        metric.lastFailedAt = Date.now();
        this.modelMetrics.set(model.name, metric);

        console.error(`${model.name} failed: ${error}. Trying fallback...`);
        continue;
      }
    }

    throw new Error('All models in fallback chain exhausted');
  }

  // Model-specific implementation (stubbed in this post).
  private async callModel(
    modelName: string,
    messages: any[]
  ): Promise<{ text: string; costUSD: number; tokens: number }> {
    return { text: '', costUSD: 0, tokens: 0 };
  }
}

// Usage example
const fallbackChain = new LLMFallbackChain([
  { name: 'gpt-4-turbo-preview', maxCostPerQuery: 0.10, maxLatencyMs: 3000, disabled: false },
  { name: 'gpt-3.5-turbo', maxCostPerQuery: 0.02, maxLatencyMs: 5000, disabled: false },
  { name: 'claude-3-sonnet', maxCostPerQuery: 0.05, maxLatencyMs: 4000, disabled: false },
  { name: 'local-llm', maxCostPerQuery: 0.00, maxLatencyMs: 8000, disabled: false },
]);

const { response, modelUsed, costUSD } = await fallbackChain.call([
  { role: 'user', content: 'Explain quantum computing' },
]);

console.log(`Used ${modelUsed}, cost: $${costUSD}`);

Token Budget Enforcement

Prevent runaway costs by enforcing per-request and per-user token limits.

// Token allowances enforced per request and per user.
interface TokenBudget {
  perRequestTokens: number; // Hard cap on a single request's estimated input tokens
  perUserTokensPerDay: number; // Daily allowance per user (resets at calendar-day change)
  perUserTokensPerMonth: number; // Monthly allowance per user (resets at calendar-month change)
}

/**
 * Wraps a model call with per-request, daily, and monthly token budgets.
 * Counters are kept in memory per user and lazily reset on access.
 */
class TokenBudgetEnforcer {
  // Per-user running totals plus the day/month they were last reset in.
  private userTokenCounters: Map<
    string,
    { day: number; month: number; lastResetDay: number; lastResetMonth: number }
  > = new Map();
  private readonly budget: TokenBudget;

  constructor(budget: TokenBudget) {
    this.budget = budget;
  }

  /**
   * Enforce budgets around `modelCallFn`, then record actual usage.
   * @throws Error when the request, daily, or monthly limit would be exceeded.
   */
  async callWithBudget(
    userId: string,
    prompt: string,
    modelCallFn: (prompt: string) => Promise<{ response: string; tokens: number }>
  ): Promise<{ response: string; tokensUsed: number; budgetRemaining: number }> {
    // Estimate input tokens (rough: 1 token per 4 chars)
    const estimatedInputTokens = Math.ceil(prompt.length / 4);

    if (estimatedInputTokens > this.budget.perRequestTokens) {
      throw new Error(
        `Request exceeds token limit: ${estimatedInputTokens} > ${this.budget.perRequestTokens}`
      );
    }

    // Check daily/monthly limits using an estimated total (input + half the
    // per-request cap as a pessimistic output guess).
    const userBudget = this.getUserBudget(userId);
    const estimatedTotalTokens = estimatedInputTokens + Math.ceil(this.budget.perRequestTokens / 2);

    if (userBudget.day + estimatedTotalTokens > this.budget.perUserTokensPerDay) {
      throw new Error(
        `Daily token limit exceeded. Used: ${userBudget.day}, Remaining: ${this.budget.perUserTokensPerDay - userBudget.day}`
      );
    }

    if (userBudget.month + estimatedTotalTokens > this.budget.perUserTokensPerMonth) {
      throw new Error(`Monthly token limit exceeded. Used: ${userBudget.month}`);
    }

    // Call model, then record actual usage (input estimate + reported output).
    const result = await modelCallFn(prompt);
    const totalTokens = estimatedInputTokens + result.tokens;

    userBudget.day += totalTokens;
    userBudget.month += totalTokens;

    return {
      response: result.response,
      tokensUsed: totalTokens,
      budgetRemaining: this.budget.perUserTokensPerDay - userBudget.day,
    };
  }

  /** Fetch (or create) a user's counters, resetting stale day/month totals. */
  private getUserBudget(userId: string) {
    const now = new Date();
    const todayDay = now.getDate();
    // Monotonic month index so December -> January still triggers a reset.
    const thisMonth = now.getFullYear() * 12 + now.getMonth();

    let budget = this.userTokenCounters.get(userId);
    if (!budget) {
      budget = { day: 0, month: 0, lastResetDay: todayDay, lastResetMonth: thisMonth };
      this.userTokenCounters.set(userId, budget);
    }

    // Reset the monthly counter on a month change (previously it was NEVER
    // reset, permanently exhausting users after their first month of usage).
    if (budget.lastResetMonth !== thisMonth) {
      budget.month = 0;
      budget.lastResetMonth = thisMonth;
      budget.day = 0;
      budget.lastResetDay = todayDay;
    } else if (budget.lastResetDay !== todayDay) {
      budget.day = 0;
      budget.lastResetDay = todayDay;
    }

    return budget;
  }
}

Prompt Caching to Reduce Costs

Cache common prompt prefixes to avoid reprocessing.

import crypto from 'crypto';

// Record of a system prompt that has already been sent once.
interface CachedPrompt {
  hash: string; // SHA-256 hex digest of the system prompt
  content: string; // The system prompt text itself
  cacheControl: string; // Advisory cache-lifetime hint
  tokens: number; // Token count reported by the priming call
  costUSD: number; // Cost of the priming call
}

/**
 * Tracks which system prompts have been sent before, so repeat calls can
 * mark them for provider-side prompt caching and estimate the cost saved.
 */
class PromptCache {
  private cache: Map<string, CachedPrompt> = new Map();

  /** Stable cache key: hex SHA-256 digest of the prompt text. */
  private hashPrompt(prompt: string): string {
    return crypto.createHash('sha256').update(prompt).digest('hex');
  }

  /**
   * Issue a model call; on repeat system prompts, annotate the system message
   * with a cache-control hint and report the estimated savings.
   */
  async callWithCache(
    systemPrompt: string,
    userPrompt: string,
    modelCallFn: (messages: any[]) => Promise<{ response: string; tokens: number; costUSD: number }>
  ): Promise<{ response: string; fromCache: boolean; costSaved: number }> {
    const key = this.hashPrompt(systemPrompt);
    const entry = this.cache.get(key);

    if (entry) {
      // Warm path: the provider has seen this prefix — request reuse.
      const outcome = await modelCallFn([
        { role: 'system' as const, content: systemPrompt, cache_control: { type: 'ephemeral' } },
        { role: 'user' as const, content: userPrompt },
      ]);

      // OpenAI cache hit: 90% cost reduction on cached tokens
      const discounted = entry.costUSD * 0.1;
      return {
        response: outcome.response,
        fromCache: true,
        costSaved: entry.costUSD - discounted,
      };
    }

    // Cold path: priming call; remember its cost/token stats for later hits.
    const outcome = await modelCallFn([
      { role: 'system' as const, content: systemPrompt },
      { role: 'user' as const, content: userPrompt },
    ]);

    this.cache.set(key, {
      hash: key,
      content: systemPrompt,
      cacheControl: 'max-age=86400', // Cache for 1 day
      tokens: outcome.tokens,
      costUSD: outcome.costUSD,
    });

    return { response: outcome.response, fromCache: false, costSaved: 0 };
  }
}

Cost Per Request Tracking

Monitor and alert on cost anomalies.

// Per-request cost record fed into the tracker.
interface RequestCostMetrics {
  requestId: string;
  userId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  totalCostUSD: number;
  timestamp: Date;
}

/**
 * In-memory cost ledger with expensive-request and hourly-spend alerts.
 * NOTE(review): `requests` grows without bound — a production version needs
 * periodic pruning or an external store.
 */
class CostTracker {
  private requests: RequestCostMetrics[] = [];
  private dailyCost = 0; // Lifetime running total (never reset here)
  private hourlyThreshold = 50; // Alert if hourly cost exceeds $50

  /** Record one request and fire cost alerts if thresholds are crossed. */
  recordRequest(metrics: RequestCostMetrics): void {
    this.requests.push(metrics);
    this.dailyCost += metrics.totalCostUSD;

    if (metrics.totalCostUSD > 1.0) {
      console.warn(`Expensive request: ${metrics.requestId} cost $${metrics.totalCostUSD}`);
    }

    // Check hourly threshold
    const oneHourAgo = new Date(Date.now() - 3600000);
    const recentCost = this.requests
      .filter(r => r.timestamp > oneHourAgo)
      .reduce((sum, r) => sum + r.totalCostUSD, 0);

    if (recentCost > this.hourlyThreshold) {
      console.error(`ALERT: Hourly cost $${recentCost.toFixed(2)} exceeds threshold`);
    }
  }

  /** Total spend by one user over the trailing `days` window. */
  getCostByUser(userId: string, days: number = 30): number {
    const cutoff = new Date(Date.now() - days * 86400000);
    return this.requests
      .filter(r => r.userId === userId && r.timestamp > cutoff)
      .reduce((sum, r) => sum + r.totalCostUSD, 0);
  }

  /** Total spend on one model over the trailing `days` window. */
  getCostByModel(model: string, days: number = 30): number {
    const cutoff = new Date(Date.now() - days * 86400000);
    return this.requests
      .filter(r => r.model === model && r.timestamp > cutoff)
      .reduce((sum, r) => sum + r.totalCostUSD, 0);
  }

  /** Per-model cost/request totals for the last 24 hours. */
  getDailyReport(): { model: string; cost: number; requests: number }[] {
    // Only aggregate the trailing 24 hours — previously this summed the
    // entire history, which contradicted the "daily" name.
    const cutoff = new Date(Date.now() - 86400000);
    const byModel = new Map<string, { cost: number; count: number }>();

    for (const r of this.requests) {
      if (r.timestamp <= cutoff) continue;
      const existing = byModel.get(r.model) || { cost: 0, count: 0 };
      existing.cost += r.totalCostUSD;
      existing.count++;
      byModel.set(r.model, existing);
    }

    return Array.from(byModel, ([model, { cost, count }]) => ({
      model,
      cost,
      requests: count,
    }));
  }
}

Circuit Breaker for LLM APIs

Fail fast when LLM service degrades.

// Breaker lifecycle: CLOSED (normal) -> OPEN (failing fast) -> HALF_OPEN (probing).
enum CircuitState {
  CLOSED = 'CLOSED',
  OPEN = 'OPEN',
  HALF_OPEN = 'HALF_OPEN',
}

interface CircuitBreakerConfig {
  failureThreshold: number; // Consecutive failures to trip
  successThreshold: number; // Consecutive successes to close
  timeout: number; // How long to wait before half-open (ms)
}

/**
 * Circuit breaker for LLM API calls: trips OPEN after consecutive failures,
 * fails fast while OPEN, and probes with HALF_OPEN after `timeout` ms.
 */
class LLMCircuitBreaker {
  private state: CircuitState = CircuitState.CLOSED;
  private failureCount = 0;
  private successCount = 0;
  private lastFailureTime = 0;

  constructor(private config: CircuitBreakerConfig) {}

  /**
   * Execute `fn` through the breaker.
   * @throws Error('Circuit breaker is OPEN') when failing fast, otherwise
   *         rethrows whatever `fn` threw.
   */
  async call<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === CircuitState.OPEN) {
      if (Date.now() - this.lastFailureTime > this.config.timeout) {
        this.state = CircuitState.HALF_OPEN;
        this.successCount = 0;
      } else {
        throw new Error('Circuit breaker is OPEN');
      }
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  private onSuccess(): void {
    this.failureCount = 0;

    if (this.state === CircuitState.HALF_OPEN) {
      this.successCount++;
      if (this.successCount >= this.config.successThreshold) {
        this.state = CircuitState.CLOSED;
        console.log('Circuit breaker CLOSED');
      }
    }
  }

  private onFailure(): void {
    this.lastFailureTime = Date.now();

    // Any failure during a HALF_OPEN probe re-trips immediately. Previously,
    // a probe success reset failureCount, so a following probe failure could
    // leave the breaker stuck HALF_OPEN below the threshold.
    if (this.state === CircuitState.HALF_OPEN) {
      this.state = CircuitState.OPEN;
      this.failureCount = this.config.failureThreshold;
      console.error('Circuit breaker OPEN due to repeated failures');
      return;
    }

    this.failureCount++;
    if (this.failureCount >= this.config.failureThreshold) {
      this.state = CircuitState.OPEN;
      console.error('Circuit breaker OPEN due to repeated failures');
    }
  }

  getState(): CircuitState {
    return this.state;
  }
}

LLM Integration Checklist

  • Implement SSE streaming for TTFB optimization
  • Use exponential backoff with jitter for rate limit handling
  • Build model fallback chain (expensive → cheap → local)
  • Enforce per-request and per-user token budgets
  • Implement prompt caching for system prompts
  • Track cost per request, user, and model
  • Set up cost anomaly alerts
  • Implement circuit breaker for API degradation
  • Monitor p95 latency per model
  • Test timeout handling at network layer

Conclusion

Resilient LLM integration requires multiple layers: streaming for UX, retries for reliability, fallbacks for cost, budgets for financial safety, caching for efficiency, and circuit breakers for degradation. These patterns together create production systems that survive API outages, rate limits, and cost spikes.