Published on

LLM Fallback Strategies — What Happens When OpenAI Is Down

Authors

Introduction

Dependence on a single LLM provider is a business risk. When OpenAI goes down, your app goes down. This guide covers fallback architectures, circuit breakers, and cost-based routing for production reliability.

Multi-Provider Abstraction Layer

Abstract away provider-specific APIs behind a unified interface.

// Provider-agnostic chat request (OpenAI-style message list).
interface LLMRequest {
  model: string; // target model id; providers substitute their own default when falsy
  messages: Array<{ role: string; content: string }>;
  max_tokens?: number; // optional cap on generated tokens
  temperature?: number; // optional sampling temperature (not all providers forward it)
}

// Normalized result returned by every provider implementation.
interface LLMResponse {
  content: string; // assistant message text
  tokens_used: { input: number; output: number }; // prompt / completion token counts
  provider: string; // name of the provider that served the call
  model: string; // model that produced the response
}

// Contract each backend must implement to participate in failover.
interface LLMProvider {
  name: string; // unique provider key, e.g. 'openai'
  call(request: LLMRequest): Promise<LLMResponse>; // perform one completion
  health(): Promise<boolean>; // lightweight availability probe; false (not throw) when down
  cost(tokens: { input: number; output: number }): number; // dollar cost for a call
}

/**
 * LLMProvider backed by the OpenAI chat-completions API.
 * Falls back to the 'gpt-4o' model when the request does not name one.
 */
class OpenAIProvider implements LLMProvider {
  name = 'openai';
  private client: any; // OpenAI client

  /** Send one chat request and normalize the result into an LLMResponse. */
  async call(request: LLMRequest): Promise<LLMResponse> {
    const model = request.model || 'gpt-4o';
    const raw = await this.client.chat.completions.create({
      model,
      messages: request.messages,
      max_tokens: request.max_tokens,
      temperature: request.temperature
    });

    return {
      content: raw.choices[0].message.content,
      tokens_used: {
        input: raw.usage.prompt_tokens,
        output: raw.usage.completion_tokens
      },
      provider: this.name,
      model
    };
  }

  /** Probe availability by fetching model metadata; any error means unhealthy. */
  async health(): Promise<boolean> {
    // Simple health check via API
    return this.client.models.retrieve('gpt-4o').then(
      () => true,
      () => false
    );
  }

  /** Dollar cost of a call, priced per 1K input/output tokens. */
  cost(tokens: { input: number; output: number }): number {
    const inputCost = (tokens.input / 1000) * 0.015;
    const outputCost = (tokens.output / 1000) * 0.06;
    return inputCost + outputCost;
  }
}

/**
 * LLMProvider backed by the Anthropic Messages API.
 * Falls back to 'claude-3-opus-20240229' when the request does not name a model.
 */
class AnthropicProvider implements LLMProvider {
  name = 'anthropic';
  private client: any; // Anthropic client

  /** Send one message request and normalize the result into an LLMResponse. */
  async call(request: LLMRequest): Promise<LLMResponse> {
    const model = request.model || 'claude-3-opus-20240229';
    const raw = await this.client.messages.create({
      model,
      messages: request.messages,
      max_tokens: request.max_tokens || 1024
    });

    return {
      content: raw.content[0].text,
      tokens_used: {
        input: raw.usage.input_tokens,
        output: raw.usage.output_tokens
      },
      provider: this.name,
      model
    };
  }

  /** Probe availability with a tiny throwaway completion; any error means unhealthy. */
  async health(): Promise<boolean> {
    return this.client.messages
      .create({
        model: 'claude-3-opus-20240229',
        messages: [{ role: 'user', content: 'ping' }],
        max_tokens: 10
      })
      .then(
        () => true,
        () => false
      );
  }

  /** Dollar cost of a call, priced per 1K input/output tokens. */
  cost(tokens: { input: number; output: number }): number {
    const inputCost = (tokens.input / 1000) * 0.015;
    const outputCost = (tokens.output / 1000) * 0.075;
    return inputCost + outputCost;
  }
}

/**
 * Routes requests to the most recently working provider and fails over
 * to the remaining ones when it errors.
 *
 * Fixes vs. the original:
 * - the failover pass no longer immediately re-tries the provider that
 *   just failed for this request
 * - calling with no registered providers throws a clear Error instead of
 *   a TypeError on an undefined array element
 */
class ProviderAbstractionLayer {
  private providers: LLMProvider[] = [];
  private activeIndex = 0; // index of the last provider known to work

  /** Add a provider to the pool; registration order defines failover priority. */
  register(provider: LLMProvider): void {
    this.providers.push(provider);
  }

  /**
   * Call the active provider first; on failure, try the others.
   * @throws Error when no provider is registered or all providers fail.
   */
  async call(request: LLMRequest): Promise<LLMResponse> {
    if (this.providers.length === 0) {
      throw new Error('No LLM providers registered');
    }

    // Try active provider first
    try {
      return await this.providers[this.activeIndex].call(request);
    } catch (error) {
      console.warn(`Provider ${this.providers[this.activeIndex].name} failed: ${error}`);
      return this.callWithFailover(request, this.activeIndex);
    }
  }

  /**
   * Try every provider except `failedIndex` in registration order,
   * promoting the first one that succeeds to active.
   */
  private async callWithFailover(
    request: LLMRequest,
    failedIndex = -1
  ): Promise<LLMResponse> {
    for (let i = 0; i < this.providers.length; i++) {
      if (i === failedIndex) continue; // already failed this request — skip

      try {
        const response = await this.providers[i].call(request);
        this.activeIndex = i; // Switch to working provider
        return response;
      } catch (error) {
        console.warn(`Provider ${this.providers[i].name} failed: ${error}`);
      }
    }

    throw new Error('All LLM providers exhausted');
  }
}

Failover Chain

Define a cascade of providers to try in order.

// A named, ordered cascade of providers to try during failover.
interface ProviderChainConfig {
  name: string; // chain identifier (also the lookup key)
  providers: string[]; // Provider names in order
  // NOTE(review): `strategy` is declared but never read by executeChain — confirm intent.
  strategy: 'fastest' | 'cost' | 'capability';
}

/**
 * Walks a named provider cascade, skipping unhealthy or missing providers,
 * and returns the first successful response.
 */
class FailoverChainExecutor {
  // Built-in cascades, keyed by chain name.
  private chains: Record<string, ProviderChainConfig> = {
    'balanced': {
      name: 'balanced',
      providers: ['openai', 'anthropic', 'gemini', 'ollama-local'],
      strategy: 'fastest'
    },
    'cost-conscious': {
      name: 'cost-conscious',
      providers: ['gemini', 'llama2', 'mistral', 'gpt-4'],
      strategy: 'cost'
    },
    'quality-first': {
      name: 'quality-first',
      providers: ['gpt-4', 'claude-opus', 'gemini-pro', 'llama2'],
      strategy: 'capability'
    }
  };

  /**
   * Try each provider in the chain until one succeeds.
   * Unknown names are skipped; unhealthy providers are skipped after a probe.
   * @throws Error listing per-provider failures when the chain is exhausted.
   */
  async executeChain(
    chain: ProviderChainConfig,
    request: LLMRequest,
    providers: Map<string, LLMProvider>
  ): Promise<LLMResponse> {
    const errors: Array<{ provider: string; error: string }> = [];

    for (const providerName of chain.providers) {
      const candidate = providers.get(providerName);
      if (!candidate) continue; // not registered — skip silently

      try {
        if (!(await candidate.health())) {
          console.warn(`Provider ${providerName} is unhealthy`);
          continue;
        }

        console.log(`Trying provider: ${providerName}`);
        return await candidate.call(request);
      } catch (error) {
        errors.push({
          provider: providerName,
          error: (error as Error).message
        });
        console.warn(`Provider ${providerName} failed: ${error}`);
      }
    }

    throw new Error(
      `Failover chain exhausted. Errors: ${JSON.stringify(errors)}`
    );
  }

  /** Look up a chain by name; throws on unknown names. */
  getChain(chainName: string): ProviderChainConfig {
    const found = this.chains[chainName];
    if (!found) throw new Error(`Unknown chain: ${chainName}`);
    return found;
  }
}

Circuit Breaker for LLM Providers

Prevent cascading failures by stopping requests to failing providers temporarily.

// Observable snapshot of a circuit breaker.
interface CircuitBreakerState {
  status: 'closed' | 'open' | 'half-open'; // closed = normal, open = rejecting calls
  failures: number; // rolling failure count
  last_failure: Date | null; // timestamp of the most recent failure
  success_count: number; // successes accumulated while half-open
}

/**
 * Circuit breaker: after repeated failures it "opens" and rejects calls
 * immediately, then lets a few trial calls through (half-open) once a
 * cool-down has elapsed; enough trial successes close it again.
 */
class CircuitBreaker {
  private state: CircuitBreakerState = {
    status: 'closed',
    failures: 0,
    last_failure: null,
    success_count: 0
  };

  private readonly failure_threshold = 5;
  private readonly reset_timeout_ms = 60000; // 1 minute
  private readonly half_open_threshold = 3; // Successes needed to close

  /**
   * Run `fn` under the breaker.
   * @throws Error('Circuit breaker is OPEN') when calls are being rejected;
   *         otherwise rethrows whatever `fn` throws.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state.status === 'open') {
      if (!this.shouldHalfOpen()) {
        throw new Error('Circuit breaker is OPEN');
      }
      // Cool-down elapsed: allow trial traffic.
      this.state.status = 'half-open';
      this.state.success_count = 0;
    }

    try {
      const result = await fn();
      this.recordSuccess();
      return result;
    } catch (error) {
      this.recordFailure();
      throw error;
    }
  }

  /** Bookkeeping after a successful call. */
  private recordSuccess(): void {
    if (this.state.status === 'half-open') {
      this.state.success_count += 1;
      if (this.state.success_count >= this.half_open_threshold) {
        this.state.status = 'closed';
        this.state.failures = 0;
      }
    } else {
      // Healthy traffic slowly forgives past failures.
      this.state.failures = Math.max(0, this.state.failures - 1);
    }
  }

  /** Bookkeeping after a failed call; trips the breaker at the threshold. */
  private recordFailure(): void {
    this.state.failures += 1;
    this.state.last_failure = new Date();
    if (this.state.failures >= this.failure_threshold) {
      this.state.status = 'open';
    }
  }

  /** True when the open breaker's cool-down has elapsed. */
  private shouldHalfOpen(): boolean {
    const last = this.state.last_failure;
    if (last === null) return true;
    return Date.now() - last.getTime() > this.reset_timeout_ms;
  }

  /** Shallow copy of the current state, for monitoring. */
  getStatus(): CircuitBreakerState {
    return { ...this.state };
  }
}

LiteLLM as Unified Gateway

Use the LiteLLM library to unify provider APIs and simplify fallback logic.

import { completion, embedding } from 'litellm';

/**
 * Thin fallback wrapper over LiteLLM's unified completion/embedding calls:
 * tries each model in order and returns the first success.
 */
class LiteLLMGateway {
  /**
   * Try each chat model until one returns a completion.
   * @throws Error when every model in the list fails.
   */
  async callWithFallback(
    models: string[], // e.g., ['gpt-4', 'claude-3-opus', 'gemini-pro']
    messages: Array<{ role: string; content: string }>,
    options?: any
  ): Promise<string> {
    for (const model of models) {
      try {
        // Caller options override the defaults.
        const callOptions = { temperature: 0.7, max_tokens: 2000, ...options };
        const result = await completion(model, messages, callOptions);
        return result.choices[0].message.content;
      } catch (error) {
        console.warn(`Model ${model} failed: ${error}`);
        // Continue to next model
      }
    }

    throw new Error(`All models exhausted: ${models.join(', ')}`);
  }

  /**
   * Try each embedding model until one returns a vector for `text`.
   * @throws Error when every embedding model fails.
   */
  async embedWithFallback(
    text: string,
    models: string[] = ['text-embedding-3-small', 'voyage-large-2']
  ): Promise<number[]> {
    for (const model of models) {
      try {
        const result = await embedding(model, [text]);
        return result.data[0].embedding;
      } catch (error) {
        console.warn(`Embedding model ${model} failed: ${error}`);
      }
    }

    throw new Error(`All embedding models exhausted`);
  }
}

Cost-Based Routing

Route queries to cheaper models for simple tasks, expensive models for complex ones.

// Static per-model metadata used to trade off price, quality, and speed.
interface ModelCapability {
  model: string;
  cost_per_1k_input: number;
  capability_score: number; // 1-10
  typical_latency_ms: number;
}

/**
 * Routes queries by a cheap text heuristic: simple queries go to the
 * cheapest model, complex ones to the most capable, and everything in
 * between to the best cost/capability balance.
 */
class CostBasedRouter {
  // Candidate models with rough pricing and quality figures.
  private models: ModelCapability[] = [
    {
      model: 'gpt-3.5-turbo',
      cost_per_1k_input: 0.0005,
      capability_score: 5,
      typical_latency_ms: 800
    },
    {
      model: 'gpt-4',
      cost_per_1k_input: 0.03,
      capability_score: 9,
      typical_latency_ms: 2000
    },
    {
      model: 'gemini-pro',
      cost_per_1k_input: 0.005,
      capability_score: 7,
      typical_latency_ms: 1000
    },
    {
      model: 'mistral-medium',
      cost_per_1k_input: 0.0027,
      capability_score: 6,
      typical_latency_ms: 600
    }
  ];

  /** Bucket a query into low (1), medium (5), or high (10) complexity. */
  classifyQueryComplexity(query: string): 1 | 5 | 10 {
    // Simple heuristic: word count and punctuation
    const wordCount = query.split(/\s+/).length;
    const looksMathy = /[\+\-\*\/]/.test(query);
    const looksLikeCode = /```|`/g.test(query);

    if (looksLikeCode || looksMathy || wordCount > 200) return 10;
    return wordCount > 100 ? 5 : 1;
  }

  /** Pick the model best suited to the query's complexity bucket. */
  selectModel(query: string): ModelCapability {
    switch (this.classifyQueryComplexity(query)) {
      case 1:
        // Simple query: use cheapest model
        return this.pickBy((m) => -m.cost_per_1k_input);
      case 10:
        // Complex query: use most capable model
        return this.pickBy((m) => m.capability_score);
      default: {
        // Medium: balance cost and capability (lower penalty wins)
        const penalty = (m: ModelCapability) =>
          (10 - m.capability_score) * 0.3 + m.cost_per_1k_input * 1000;
        return this.pickBy((m) => -penalty(m));
      }
    }
  }

  /** First model maximizing `key`; ties keep the earlier entry. */
  private pickBy(key: (m: ModelCapability) => number): ModelCapability {
    let best = this.models[0];
    for (const candidate of this.models.slice(1)) {
      if (key(candidate) > key(best)) best = candidate;
    }
    return best;
  }

  /**
   * Estimated dollars saved vs. always using `defaultModel`, on a flat
   * per-query 1K-input-token basis (token counts are not considered).
   */
  estimateSavings(queries: string[], defaultModel: ModelCapability): number {
    const flatCost = queries.length * defaultModel.cost_per_1k_input;
    let routedCost = 0;
    for (const q of queries) {
      routedCost += this.selectModel(q).cost_per_1k_input;
    }
    return flatCost - routedCost;
  }
}

Model Capability Matrix

Document which models are best for which tasks.

// Per-model scores for common task families.
interface ModelMatrix {
  model: string;
  reasoning: number; // 1-10
  coding: number;
  creative: number;
  multimodal: boolean;
  long_context: boolean;
}

/**
 * Static lookup table of model strengths, with helpers to pick the best
 * model for a task family or list multimodal-capable models.
 */
class ModelCapabilityMatrix {
  private matrix: Record<string, ModelMatrix> = {
    'gpt-4o': {
      model: 'gpt-4o',
      reasoning: 9,
      coding: 9,
      creative: 8,
      multimodal: true,
      long_context: true
    },
    'gpt-3.5-turbo': {
      model: 'gpt-3.5-turbo',
      reasoning: 6,
      coding: 7,
      creative: 6,
      multimodal: false,
      long_context: false
    },
    'claude-3-opus': {
      model: 'claude-3-opus',
      reasoning: 10,
      coding: 9,
      creative: 7,
      multimodal: true,
      long_context: true
    },
    'gemini-2.0-pro': {
      model: 'gemini-2.0-pro',
      reasoning: 8,
      coding: 8,
      creative: 7,
      multimodal: true,
      long_context: true
    }
  };

  /** Name of the model with the highest score for `task` (ties keep insertion order). */
  selectForTask(task: 'reasoning' | 'coding' | 'creative'): string {
    let bestName = '';
    let bestScore = -Infinity;
    for (const [name, caps] of Object.entries(this.matrix)) {
      if (caps[task] > bestScore) {
        bestScore = caps[task];
        bestName = name;
      }
    }
    return bestName;
  }

  /** Names of all models that accept non-text input. */
  needsMultimodal(): string[] {
    const names: string[] = [];
    for (const [name, caps] of Object.entries(this.matrix)) {
      if (caps.multimodal) names.push(name);
    }
    return names;
  }
}

Graceful Degradation

Serve best-effort results when all providers fail.

/**
 * Answers a query with decreasing fidelity: the primary provider chain,
 * then a local model, then a canned "unavailable" message.
 */
class GracefulDegradationHandler {
  /**
   * @returns a full LLMResponse from a provider or the local fallback,
   *          or a plain string message when everything is down.
   */
  async attemptWithDegradation(
    query: string,
    primaryChain: string[],
    providers: Map<string, LLMProvider>
  ): Promise<LLMResponse | string> {
    // Try primary chain
    for (const name of primaryChain) {
      const candidate = providers.get(name);
      if (!candidate) continue;

      try {
        return await candidate.call({
          model: name,
          messages: [{ role: 'user', content: query }]
        });
      } catch {
        // Continue to next
      }
    }

    // Try local LLM as fallback
    try {
      return {
        content: await this.callLocalLLM(query),
        tokens_used: { input: 0, output: 0 },
        provider: 'local-fallback',
        model: 'ollama-local'
      };
    } catch {
      // Return cached or generic response
      return this.serveCachedOrGenericResponse(query);
    }
  }

  // Would call local Ollama instance
  private async callLocalLLM(query: string): Promise<string> {
    return 'Local LLM response';
  }

  // Last resort: a fixed apology quoting the first 100 chars of the query.
  private serveCachedOrGenericResponse(query: string): string {
    return `Service temporarily unavailable. Your query was: "${query.substring(0, 100)}...". Please try again shortly.`;
  }
}

Health Check Endpoint

Expose provider health status for monitoring.

// Snapshot of one provider's most recent health probe.
interface ProviderHealthStatus {
  provider: string; // provider name (also the map key)
  healthy: boolean;
  last_check: Date; // when the probe ran
  consecutive_failures: number; // failed probes in a row; reset to 0 on success
  response_time_ms?: number; // probe round-trip time
}

/**
 * Polls each provider's health() and caches the results, tracking
 * consecutive failures across checks.
 *
 * Fixes vs. the original:
 * - a health() that resolves `false` (without throwing) now increments the
 *   provider's consecutive_failures count instead of resetting it to 1,
 *   matching the throwing path
 * - independent probes run concurrently instead of sequentially
 */
class HealthCheckMonitor {
  private statuses: Map<string, ProviderHealthStatus> = new Map();

  /**
   * Probe every provider once and replace the cached status map.
   * @returns fresh name -> status map (also stored on the instance)
   */
  async checkHealth(
    providers: Map<string, LLMProvider>
  ): Promise<Map<string, ProviderHealthStatus>> {
    const results = new Map<string, ProviderHealthStatus>();

    // Probes are independent of each other — run them in parallel.
    await Promise.all(
      Array.from(providers, async ([name, provider]) => {
        const startTime = Date.now();
        const previousFailures =
          this.statuses.get(name)?.consecutive_failures ?? 0;

        let healthy = false;
        try {
          healthy = await provider.health();
        } catch {
          healthy = false; // a throwing probe counts as unhealthy
        }

        results.set(name, {
          provider: name,
          healthy,
          last_check: new Date(),
          consecutive_failures: healthy ? 0 : previousFailures + 1,
          response_time_ms: Date.now() - startTime
        });
      })
    );

    this.statuses = results;
    return results;
  }

  /** Register GET /health/llm: 200 when every provider is healthy, else 503. */
  exposeHealthEndpoint(app: any): void {
    app.get('/health/llm', (req: any, res: any) => {
      const statuses = Array.from(this.statuses.values());
      const allHealthy = statuses.every(s => s.healthy);

      res.status(allHealthy ? 200 : 503).json({
        status: allHealthy ? 'healthy' : 'degraded',
        providers: statuses
      });
    });
  }
}

Checklist

  • Register at least 3 different LLM providers (OpenAI, Anthropic, Gemini)
  • Implement circuit breaker pattern for each provider
  • Use LiteLLM to reduce boilerplate provider-specific code
  • Define cost/capability tiers for intelligent routing
  • Route simple queries to cheap models, complex ones to capable models
  • Monitor provider health every 30-60 seconds
  • Expose /health/llm endpoint for operational visibility
  • Implement local fallback (Ollama) for complete outages
  • Cache responses to serve on total failure
  • Document model capabilities matrix for your team

Conclusion

Single points of failure in LLM infrastructure are unacceptable in production. Multi-provider architectures with circuit breakers, cost-based routing, and graceful degradation make your service resilient to the inevitable provider outages.