Published on

LLM Fallback Strategies — What Happens When OpenAI Is Down

Authors

Introduction

Dependence on a single LLM provider is a business risk. When OpenAI goes down, your app goes down. This guide covers fallback architectures, circuit breakers, and cost-based routing for production reliability.

Multi-Provider Abstraction Layer

Abstract away provider-specific APIs behind a unified interface.

// Provider-agnostic chat request (OpenAI-style message list).
interface LLMRequest {
  model: string; // target model id; providers substitute their own default when falsy
  messages: Array<{ role: string; content: string }>;
  max_tokens?: number; // optional cap on generated tokens
  temperature?: number; // optional sampling temperature (not all providers forward it)
}

// Normalized result returned by every provider implementation.
interface LLMResponse {
  content: string; // assistant message text
  tokens_used: { input: number; output: number }; // prompt / completion token counts
  provider: string; // name of the provider that served the call
  model: string; // model that produced the response
}

// Contract each backend must implement to participate in failover.
interface LLMProvider {
  name: string; // unique provider key, e.g. 'openai'
  call(request: LLMRequest): Promise<LLMResponse>; // perform one completion
  health(): Promise<boolean>; // lightweight availability probe; false (not throw) when down
  cost(tokens: { input: number; output: number }): number; // dollar cost for a call
}

/**
 * LLMProvider backed by the OpenAI chat-completions API.
 * Falls back to the 'gpt-4o' model when the request does not name one.
 */
class OpenAIProvider implements LLMProvider {
  name = 'openai';
  private client: any; // OpenAI client

  /** Send one chat request and normalize the result into an LLMResponse. */
  async call(request: LLMRequest): Promise<LLMResponse> {
    const model = request.model || 'gpt-4o';
    const raw = await this.client.chat.completions.create({
      model,
      messages: request.messages,
      max_tokens: request.max_tokens,
      temperature: request.temperature
    });

    return {
      content: raw.choices[0].message.content,
      tokens_used: {
        input: raw.usage.prompt_tokens,
        output: raw.usage.completion_tokens
      },
      provider: this.name,
      model
    };
  }

  /** Probe availability by fetching model metadata; any error means unhealthy. */
  async health(): Promise<boolean> {
    // Simple health check via API
    return this.client.models.retrieve('gpt-4o').then(
      () => true,
      () => false
    );
  }

  /** Dollar cost of a call, priced per 1K input/output tokens. */
  cost(tokens: { input: number; output: number }): number {
    const inputCost = (tokens.input / 1000) * 0.015;
    const outputCost = (tokens.output / 1000) * 0.06;
    return inputCost + outputCost;
  }
}

/**
 * LLMProvider backed by the Anthropic Messages API.
 * Falls back to 'claude-3-opus-20240229' when the request does not name a model.
 */
class AnthropicProvider implements LLMProvider {
  name = 'anthropic';
  private client: any; // Anthropic client

  /** Send one message request and normalize the result into an LLMResponse. */
  async call(request: LLMRequest): Promise<LLMResponse> {
    const model = request.model || 'claude-3-opus-20240229';
    const raw = await this.client.messages.create({
      model,
      messages: request.messages,
      max_tokens: request.max_tokens || 1024
    });

    return {
      content: raw.content[0].text,
      tokens_used: {
        input: raw.usage.input_tokens,
        output: raw.usage.output_tokens
      },
      provider: this.name,
      model
    };
  }

  /** Probe availability with a tiny throwaway completion; any error means unhealthy. */
  async health(): Promise<boolean> {
    return this.client.messages
      .create({
        model: 'claude-3-opus-20240229',
        messages: [{ role: 'user', content: 'ping' }],
        max_tokens: 10
      })
      .then(
        () => true,
        () => false
      );
  }

  /** Dollar cost of a call, priced per 1K input/output tokens. */
  cost(tokens: { input: number; output: number }): number {
    const inputCost = (tokens.input / 1000) * 0.015;
    const outputCost = (tokens.output / 1000) * 0.075;
    return inputCost + outputCost;
  }
}

/**
 * Routes requests to the most recently working provider and fails over
 * to the remaining ones when it errors.
 *
 * Fixes vs. the original:
 * - the failover pass no longer immediately re-tries the provider that
 *   just failed for this request
 * - calling with no registered providers throws a clear Error instead of
 *   a TypeError on an undefined array element
 */
class ProviderAbstractionLayer {
  private providers: LLMProvider[] = [];
  private activeIndex = 0; // index of the last provider known to work

  /** Add a provider to the pool; registration order defines failover priority. */
  register(provider: LLMProvider): void {
    this.providers.push(provider);
  }

  /**
   * Call the active provider first; on failure, try the others.
   * @throws Error when no provider is registered or all providers fail.
   */
  async call(request: LLMRequest): Promise<LLMResponse> {
    if (this.providers.length === 0) {
      throw new Error('No LLM providers registered');
    }

    // Try active provider first
    try {
      return await this.providers[this.activeIndex].call(request);
    } catch (error) {
      console.warn(`Provider ${this.providers[this.activeIndex].name} failed: ${error}`);
      return this.callWithFailover(request, this.activeIndex);
    }
  }

  /**
   * Try every provider except `failedIndex` in registration order,
   * promoting the first one that succeeds to active.
   */
  private async callWithFailover(
    request: LLMRequest,
    failedIndex = -1
  ): Promise<LLMResponse> {
    for (let i = 0; i < this.providers.length; i++) {
      if (i === failedIndex) continue; // already failed this request — skip

      try {
        const response = await this.providers[i].call(request);
        this.activeIndex = i; // Switch to working provider
        return response;
      } catch (error) {
        console.warn(`Provider ${this.providers[i].name} failed: ${error}`);
      }
    }

    throw new Error('All LLM providers exhausted');
  }
}

Failover Chain

Define a cascade of providers to try in order.

// A named, ordered cascade of providers to try during failover.
interface ProviderChainConfig {
  name: string; // chain identifier (also the lookup key)
  providers: string[]; // Provider names in order
  // NOTE(review): `strategy` is declared but never read by executeChain — confirm intent.
  strategy: 'fastest' | 'cost' | 'capability';
}

/**
 * Walks a named provider cascade, skipping unhealthy or missing providers,
 * and returns the first successful response.
 */
class FailoverChainExecutor {
  // Built-in cascades, keyed by chain name.
  private chains: Record<string, ProviderChainConfig> = {
    'balanced': {
      name: 'balanced',
      providers: ['openai', 'anthropic', 'gemini', 'ollama-local'],
      strategy: 'fastest'
    },
    'cost-conscious': {
      name: 'cost-conscious',
      providers: ['gemini', 'llama2', 'mistral', 'gpt-4'],
      strategy: 'cost'
    },
    'quality-first': {
      name: 'quality-first',
      providers: ['gpt-4', 'claude-opus', 'gemini-pro', 'llama2'],
      strategy: 'capability'
    }
  };

  /**
   * Try each provider in the chain until one succeeds.
   * Unknown names are skipped; unhealthy providers are skipped after a probe.
   * @throws Error listing per-provider failures when the chain is exhausted.
   */
  async executeChain(
    chain: ProviderChainConfig,
    request: LLMRequest,
    providers: Map<string, LLMProvider>
  ): Promise<LLMResponse> {
    const errors: Array<{ provider: string; error: string }> = [];

    for (const providerName of chain.providers) {
      const candidate = providers.get(providerName);
      if (!candidate) continue; // not registered — skip silently

      try {
        if (!(await candidate.health())) {
          console.warn(`Provider ${providerName} is unhealthy`);
          continue;
        }

        console.log(`Trying provider: ${providerName}`);
        return await candidate.call(request);
      } catch (error) {
        errors.push({
          provider: providerName,
          error: (error as Error).message
        });
        console.warn(`Provider ${providerName} failed: ${error}`);
      }
    }

    throw new Error(
      `Failover chain exhausted. Errors: ${JSON.stringify(errors)}`
    );
  }

  /** Look up a chain by name; throws on unknown names. */
  getChain(chainName: string): ProviderChainConfig {
    const found = this.chains[chainName];
    if (!found) throw new Error(`Unknown chain: ${chainName}`);
    return found;
  }
}

Circuit Breaker for LLM Providers

Prevent cascading failures by stopping requests to failing providers temporarily.

// Observable snapshot of a circuit breaker.
interface CircuitBreakerState {
  status: 'closed' | 'open' | 'half-open'; // closed = normal, open = rejecting calls
  failures: number; // rolling failure count
  last_failure: Date | null; // timestamp of the most recent failure
  success_count: number; // successes accumulated while half-open
}

/**
 * Circuit breaker: after repeated failures it "opens" and rejects calls
 * immediately, then lets a few trial calls through (half-open) once a
 * cool-down has elapsed; enough trial successes close it again.
 */
class CircuitBreaker {
  private state: CircuitBreakerState = {
    status: 'closed',
    failures: 0,
    last_failure: null,
    success_count: 0
  };

  private readonly failure_threshold = 5;
  private readonly reset_timeout_ms = 60000; // 1 minute
  private readonly half_open_threshold = 3; // Successes needed to close

  /**
   * Run `fn` under the breaker.
   * @throws Error('Circuit breaker is OPEN') when calls are being rejected;
   *         otherwise rethrows whatever `fn` throws.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state.status === 'open') {
      if (!this.shouldHalfOpen()) {
        throw new Error('Circuit breaker is OPEN');
      }
      // Cool-down elapsed: allow trial traffic.
      this.state.status = 'half-open';
      this.state.success_count = 0;
    }

    try {
      const result = await fn();
      this.recordSuccess();
      return result;
    } catch (error) {
      this.recordFailure();
      throw error;
    }
  }

  /** Bookkeeping after a successful call. */
  private recordSuccess(): void {
    if (this.state.status === 'half-open') {
      this.state.success_count += 1;
      if (this.state.success_count >= this.half_open_threshold) {
        this.state.status = 'closed';
        this.state.failures = 0;
      }
    } else {
      // Healthy traffic slowly forgives past failures.
      this.state.failures = Math.max(0, this.state.failures - 1);
    }
  }

  /** Bookkeeping after a failed call; trips the breaker at the threshold. */
  private recordFailure(): void {
    this.state.failures += 1;
    this.state.last_failure = new Date();
    if (this.state.failures >= this.failure_threshold) {
      this.state.status = 'open';
    }
  }

  /** True when the open breaker's cool-down has elapsed. */
  private shouldHalfOpen(): boolean {
    const last = this.state.last_failure;
    if (last === null) return true;
    return Date.now() - last.getTime() > this.reset_timeout_ms;
  }

  /** Shallow copy of the current state, for monitoring. */
  getStatus(): CircuitBreakerState {
    return { ...this.state };
  }
}

LiteLLM as Unified Gateway

Use the LiteLLM library to unify provider APIs and simplify fallback logic.

import { completion, embedding } from 'litellm';

/**
 * Thin fallback wrapper over LiteLLM's unified completion/embedding calls:
 * tries each model in order and returns the first success.
 */
class LiteLLMGateway {
  /**
   * Try each chat model until one returns a completion.
   * @throws Error when every model in the list fails.
   */
  async callWithFallback(
    models: string[], // e.g., ['gpt-4', 'claude-3-opus', 'gemini-pro']
    messages: Array<{ role: string; content: string }>,
    options?: any
  ): Promise<string> {
    for (const model of models) {
      try {
        // Caller options override the defaults.
        const callOptions = { temperature: 0.7, max_tokens: 2000, ...options };
        const result = await completion(model, messages, callOptions);
        return result.choices[0].message.content;
      } catch (error) {
        console.warn(`Model ${model} failed: ${error}`);
        // Continue to next model
      }
    }

    throw new Error(`All models exhausted: ${models.join(', ')}`);
  }

  /**
   * Try each embedding model until one returns a vector for `text`.
   * @throws Error when every embedding model fails.
   */
  async embedWithFallback(
    text: string,
    models: string[] = ['text-embedding-3-small', 'voyage-large-2']
  ): Promise<number[]> {
    for (const model of models) {
      try {
        const result = await embedding(model, [text]);
        return result.data[0].embedding;
      } catch (error) {
        console.warn(`Embedding model ${model} failed: ${error}`);
      }
    }

    throw new Error(`All embedding models exhausted`);
  }
}

Cost-Based Routing

Route queries to cheaper models for simple tasks, expensive models for complex ones.

// Static per-model metadata used to trade off price, quality, and speed.
interface ModelCapability {
  model: string;
  cost_per_1k_input: number;
  capability_score: number; // 1-10
  typical_latency_ms: number;
}

/**
 * Routes queries by a cheap text heuristic: simple queries go to the
 * cheapest model, complex ones to the most capable, and everything in
 * between to the best cost/capability balance.
 */
class CostBasedRouter {
  // Candidate models with rough pricing and quality figures.
  private models: ModelCapability[] = [
    {
      model: 'gpt-3.5-turbo',
      cost_per_1k_input: 0.0005,
      capability_score: 5,
      typical_latency_ms: 800
    },
    {
      model: 'gpt-4',
      cost_per_1k_input: 0.03,
      capability_score: 9,
      typical_latency_ms: 2000
    },
    {
      model: 'gemini-pro',
      cost_per_1k_input: 0.005,
      capability_score: 7,
      typical_latency_ms: 1000
    },
    {
      model: 'mistral-medium',
      cost_per_1k_input: 0.0027,
      capability_score: 6,
      typical_latency_ms: 600
    }
  ];

  /** Bucket a query into low (1), medium (5), or high (10) complexity. */
  classifyQueryComplexity(query: string): 1 | 5 | 10 {
    // Simple heuristic: word count and punctuation
    const wordCount = query.split(/\s+/).length;
    const looksMathy = /[\+\-\*\/]/.test(query);
    const looksLikeCode = /```|`/g.test(query);

    if (looksLikeCode || looksMathy || wordCount > 200) return 10;
    return wordCount > 100 ? 5 : 1;
  }

  /** Pick the model best suited to the query's complexity bucket. */
  selectModel(query: string): ModelCapability {
    switch (this.classifyQueryComplexity(query)) {
      case 1:
        // Simple query: use cheapest model
        return this.pickBy((m) => -m.cost_per_1k_input);
      case 10:
        // Complex query: use most capable model
        return this.pickBy((m) => m.capability_score);
      default: {
        // Medium: balance cost and capability (lower penalty wins)
        const penalty = (m: ModelCapability) =>
          (10 - m.capability_score) * 0.3 + m.cost_per_1k_input * 1000;
        return this.pickBy((m) => -penalty(m));
      }
    }
  }

  /** First model maximizing `key`; ties keep the earlier entry. */
  private pickBy(key: (m: ModelCapability) => number): ModelCapability {
    let best = this.models[0];
    for (const candidate of this.models.slice(1)) {
      if (key(candidate) > key(best)) best = candidate;
    }
    return best;
  }

  /**
   * Estimated dollars saved vs. always using `defaultModel`, on a flat
   * per-query 1K-input-token basis (token counts are not considered).
   */
  estimateSavings(queries: string[], defaultModel: ModelCapability): number {
    const flatCost = queries.length * defaultModel.cost_per_1k_input;
    let routedCost = 0;
    for (const q of queries) {
      routedCost += this.selectModel(q).cost_per_1k_input;
    }
    return flatCost - routedCost;
  }
}

Model Capability Matrix

Document which models are best for which tasks.

// Per-model scores for common task families.
interface ModelMatrix {
  model: string;
  reasoning: number; // 1-10
  coding: number;
  creative: number;
  multimodal: boolean;
  long_context: boolean;
}

/**
 * Static lookup table of model strengths, with helpers to pick the best
 * model for a task family or list multimodal-capable models.
 */
class ModelCapabilityMatrix {
  private matrix: Record<string, ModelMatrix> = {
    'gpt-4o': {
      model: 'gpt-4o',
      reasoning: 9,
      coding: 9,
      creative: 8,
      multimodal: true,
      long_context: true
    },
    'gpt-3.5-turbo': {
      model: 'gpt-3.5-turbo',
      reasoning: 6,
      coding: 7,
      creative: 6,
      multimodal: false,
      long_context: false
    },
    'claude-3-opus': {
      model: 'claude-3-opus',
      reasoning: 10,
      coding: 9,
      creative: 7,
      multimodal: true,
      long_context: true
    },
    'gemini-2.0-pro': {
      model: 'gemini-2.0-pro',
      reasoning: 8,
      coding: 8,
      creative: 7,
      multimodal: true,
      long_context: true
    }
  };

  /** Name of the model with the highest score for `task` (ties keep insertion order). */
  selectForTask(task: 'reasoning' | 'coding' | 'creative'): string {
    let bestName = '';
    let bestScore = -Infinity;
    for (const [name, caps] of Object.entries(this.matrix)) {
      if (caps[task] > bestScore) {
        bestScore = caps[task];
        bestName = name;
      }
    }
    return bestName;
  }

  /** Names of all models that accept non-text input. */
  needsMultimodal(): string[] {
    const names: string[] = [];
    for (const [name, caps] of Object.entries(this.matrix)) {
      if (caps.multimodal) names.push(name);
    }
    return names;
  }
}

Graceful Degradation

Serve best-effort results when all providers fail.

/**
 * Answers a query with decreasing fidelity: the primary provider chain,
 * then a local model, then a canned "unavailable" message.
 */
class GracefulDegradationHandler {
  /**
   * @returns a full LLMResponse from a provider or the local fallback,
   *          or a plain string message when everything is down.
   */
  async attemptWithDegradation(
    query: string,
    primaryChain: string[],
    providers: Map<string, LLMProvider>
  ): Promise<LLMResponse | string> {
    // Try primary chain
    for (const name of primaryChain) {
      const candidate = providers.get(name);
      if (!candidate) continue;

      try {
        return await candidate.call({
          model: name,
          messages: [{ role: 'user', content: query }]
        });
      } catch {
        // Continue to next
      }
    }

    // Try local LLM as fallback
    try {
      return {
        content: await this.callLocalLLM(query),
        tokens_used: { input: 0, output: 0 },
        provider: 'local-fallback',
        model: 'ollama-local'
      };
    } catch {
      // Return cached or generic response
      return this.serveCachedOrGenericResponse(query);
    }
  }

  // Would call local Ollama instance
  private async callLocalLLM(query: string): Promise<string> {
    return 'Local LLM response';
  }

  // Last resort: a fixed apology quoting the first 100 chars of the query.
  private serveCachedOrGenericResponse(query: string): string {
    return `Service temporarily unavailable. Your query was: "${query.substring(0, 100)}...". Please try again shortly.`;
  }
}

Health Check Endpoint

Expose provider health status for monitoring.

// Snapshot of one provider's most recent health probe.
interface ProviderHealthStatus {
  provider: string; // provider name (also the map key)
  healthy: boolean;
  last_check: Date; // when the probe ran
  consecutive_failures: number; // failed probes in a row; reset to 0 on success
  response_time_ms?: number; // probe round-trip time
}

/**
 * Polls each provider's health() and caches the results, tracking
 * consecutive failures across checks.
 *
 * Fixes vs. the original:
 * - a health() that resolves `false` (without throwing) now increments the
 *   provider's consecutive_failures count instead of resetting it to 1,
 *   matching the throwing path
 * - independent probes run concurrently instead of sequentially
 */
class HealthCheckMonitor {
  private statuses: Map<string, ProviderHealthStatus> = new Map();

  /**
   * Probe every provider once and replace the cached status map.
   * @returns fresh name -> status map (also stored on the instance)
   */
  async checkHealth(
    providers: Map<string, LLMProvider>
  ): Promise<Map<string, ProviderHealthStatus>> {
    const results = new Map<string, ProviderHealthStatus>();

    // Probes are independent of each other — run them in parallel.
    await Promise.all(
      Array.from(providers, async ([name, provider]) => {
        const startTime = Date.now();
        const previousFailures =
          this.statuses.get(name)?.consecutive_failures ?? 0;

        let healthy = false;
        try {
          healthy = await provider.health();
        } catch {
          healthy = false; // a throwing probe counts as unhealthy
        }

        results.set(name, {
          provider: name,
          healthy,
          last_check: new Date(),
          consecutive_failures: healthy ? 0 : previousFailures + 1,
          response_time_ms: Date.now() - startTime
        });
      })
    );

    this.statuses = results;
    return results;
  }

  /** Register GET /health/llm: 200 when every provider is healthy, else 503. */
  exposeHealthEndpoint(app: any): void {
    app.get('/health/llm', (req: any, res: any) => {
      const statuses = Array.from(this.statuses.values());
      const allHealthy = statuses.every(s => s.healthy);

      res.status(allHealthy ? 200 : 503).json({
        status: allHealthy ? 'healthy' : 'degraded',
        providers: statuses
      });
    });
  }
}

Checklist

  • Register at least 3 different LLM providers (OpenAI, Anthropic, Gemini)
  • Implement circuit breaker pattern for each provider
  • Use LiteLLM to reduce boilerplate provider-specific code
  • Define cost/capability tiers for intelligent routing
  • Route simple queries to cheap models, complex ones to capable models
  • Monitor provider health every 30-60 seconds
  • Expose /health/llm endpoint for operational visibility
  • Implement local fallback (Ollama) for complete outages
  • Cache responses to serve on total failure
  • Document model capabilities matrix for your team

Conclusion

Single points of failure in LLM infrastructure are unacceptable in production. Multi-provider architectures with circuit breakers, cost-based routing, and graceful degradation make your service resilient to the inevitable provider outages.