Published on

LLM Output Caching — Semantic Caching to Cut Costs by 60 Percent

Authors

Introduction

Not every question needs a fresh LLM call. In practice, exact-match caching can serve roughly 30% of queries, and semantic caching can extend coverage to around 50%. This guide covers both strategies, with production patterns for TTL selection and cache invalidation.

Exact-Match Caching

Hash the prompt and store responses for identical queries.

import crypto from 'crypto';
import { Redis } from 'ioredis';

// One cached LLM response plus the token counts needed for cost reporting.
interface CacheEntry {
  response: string;      // Cached completion text
  tokens_input: number;  // Prompt tokens spent on the original call
  tokens_output: number; // Completion tokens spent on the original call
  created_at: Date;      // When the entry was written
  expires_at: Date;      // Mirrors the Redis TTL, kept for observability
}

/**
 * Exact-match LLM response cache: identical prompts (by SHA-256 hash) map to
 * the same Redis key. Fastest cache tier; zero false positives.
 */
class ExactMatchCache {
  private redis: Redis;
  // Default TTL applied when set() is called without an explicit ttl.
  // (Previously this field was initialized to 86400 AND reassigned in the
  // constructor; the redundant initializer is gone.)
  private ttl_seconds: number;

  constructor(redis: Redis, defaultTtlSeconds: number = 86400) {
    this.redis = redis;
    this.ttl_seconds = defaultTtlSeconds;
  }

  /** SHA-256 hex digest of the full prompt text. */
  private hashPrompt(prompt: string): string {
    return crypto
      .createHash('sha256')
      .update(prompt)
      .digest('hex');
  }

  /**
   * Look up a cached response for a byte-identical prompt.
   * Returns null on a miss or when the stored JSON is corrupt.
   */
  async get(prompt: string): Promise<CacheEntry | null> {
    const key = `llm:exact:${this.hashPrompt(prompt)}`;

    const cached = await this.redis.get(key);
    if (!cached) return null;

    try {
      return JSON.parse(cached);
    } catch {
      // Corrupt entry: treat as a miss rather than propagating a parse error.
      return null;
    }
  }

  /**
   * Cache a response under the prompt's hash.
   * @param tokens Token accounting from the original call, stored for
   *   cost-savings reporting.
   * @param ttl Seconds to live; defaults to the instance-wide TTL.
   */
  async set(
    prompt: string,
    response: string,
    tokens: { input: number; output: number },
    ttl: number = this.ttl_seconds
  ): Promise<void> {
    const key = `llm:exact:${this.hashPrompt(prompt)}`;

    const entry: CacheEntry = {
      response,
      tokens_input: tokens.input,
      tokens_output: tokens.output,
      created_at: new Date(),
      expires_at: new Date(Date.now() + ttl * 1000)
    };

    await this.redis.setex(key, ttl, JSON.stringify(entry));
  }

  /** Aggregate hit/miss counters; rate is a percentage in [0, 100]. */
  async hitRate(): Promise<{ hits: number; misses: number; rate: number }> {
    // Fetch both counters in parallel instead of sequentially.
    const [hitsRaw, missesRaw] = await Promise.all([
      this.redis.get('cache:hits'),
      this.redis.get('cache:misses')
    ]);
    const hits = parseInt(hitsRaw || '0', 10);
    const misses = parseInt(missesRaw || '0', 10);
    const total = hits + misses;

    return {
      hits,
      misses,
      rate: total > 0 ? (hits / total) * 100 : 0
    };
  }

  /**
   * Fire-and-forget counter bump. Previously the returned promise was
   * silently dropped (floating promise); now the intent is explicit and a
   * rejection cannot surface as an unhandled rejection.
   */
  recordHit(): void {
    void this.redis.incr('cache:hits').catch(() => {});
  }

  recordMiss(): void {
    void this.redis.incr('cache:misses').catch(() => {});
  }
}

Semantic Caching

Embed prompts and find similar cached responses using vector similarity.

// A semantically-indexed cache record: the stored embedding is compared
// against incoming prompt embeddings via cosine similarity.
interface SemanticCacheEntry {
  embedding: number[];  // Embedding vector of the original prompt
  response: string;     // Cached completion text
  prompt: string;       // Original prompt, kept for debugging/inspection
  created_at: Date;     // When the entry was written
  access_count: number; // Incremented on each semantic hit (usage signal)
}

/**
 * Semantic cache tier: prompts are embedded, and lookups return the closest
 * stored entry above a cosine-similarity threshold.
 */
class SemanticCache {
  private redis: Redis;
  // Embedding client; expected to expose embed(text) -> { embedding: number[] }.
  private embedder: any;
  private similarity_threshold = 0.85;
  private ttl_seconds = 604800; // 7 days

  constructor(redis: Redis, embedder: any) {
    this.redis = redis;
    this.embedder = embedder;
  }

  /** Embed a prompt via the injected embedder (OpenAI or local model). */
  async embedPrompt(prompt: string): Promise<number[]> {
    const response = await this.embedder.embed(prompt);
    return response.embedding;
  }

  /** Cosine similarity of two equal-length vectors; 0 when either is zero. */
  cosineSimilarity(a: number[], b: number[]): number {
    const dotProduct = a.reduce((sum, av, i) => sum + av * b[i], 0);
    const normA = Math.sqrt(a.reduce((sum, v) => sum + v * v, 0));
    const normB = Math.sqrt(b.reduce((sum, v) => sum + v * v, 0));

    if (normA === 0 || normB === 0) return 0;
    return dotProduct / (normA * normB);
  }

  /**
   * Return the closest cached entry strictly above the similarity threshold,
   * or null when nothing is close enough. A hit refreshes the entry's TTL
   * and bumps its access counter.
   *
   * NOTE(review): KEYS is O(N) and blocks Redis; for large keyspaces this
   * linear scan should move to SCAN or a real vector index.
   */
  async find(prompt: string): Promise<SemanticCacheEntry | null> {
    const embedding = await this.embedPrompt(prompt);
    const allKeys = await this.redis.keys('llm:semantic:*');

    let bestMatch: SemanticCacheEntry | null = null;
    let bestKey: string | null = null;
    let bestScore = this.similarity_threshold;

    for (const key of allKeys) {
      const cached = await this.redis.get(key);
      if (!cached) continue;

      try {
        const entry: SemanticCacheEntry = JSON.parse(cached);
        const similarity = this.cosineSimilarity(embedding, entry.embedding);

        if (similarity > bestScore) {
          bestScore = similarity;
          bestMatch = entry;
          bestKey = key;
        }
      } catch {
        continue; // Skip corrupt entries rather than failing the lookup
      }
    }

    if (bestMatch && bestKey) {
      bestMatch.access_count++;
      // Bug fix: refresh the entry under its EXISTING key. The previous code
      // wrote the updated entry under a fresh crypto.randomUUID() key, which
      // duplicated the entry on every hit and left the stale copy behind.
      await this.redis.setex(
        bestKey,
        this.ttl_seconds,
        JSON.stringify(bestMatch)
      );
    }

    return bestMatch;
  }

  /**
   * Store a new prompt/response pair under a random key.
   * @param ttl Seconds to live; defaults to 7 days.
   */
  async store(
    prompt: string,
    response: string,
    ttl: number = this.ttl_seconds
  ): Promise<void> {
    const embedding = await this.embedPrompt(prompt);

    const entry: SemanticCacheEntry = {
      embedding,
      response,
      prompt,
      created_at: new Date(),
      access_count: 0
    };

    const key = `llm:semantic:${crypto.randomUUID()}`;
    await this.redis.setex(key, ttl, JSON.stringify(entry));
  }

  /** Entry count and average access count across all semantic entries. */
  async stats(): Promise<{ cached_embeddings: number; avg_access_count: number }> {
    const keys = await this.redis.keys('llm:semantic:*');
    const entries: SemanticCacheEntry[] = [];

    for (const key of keys) {
      const cached = await this.redis.get(key);
      if (!cached) continue;
      try {
        entries.push(JSON.parse(cached));
      } catch {
        // Bug fix: one corrupt entry previously made stats() throw.
        continue;
      }
    }

    const avgAccess = entries.length > 0
      ? entries.reduce((sum, e) => sum + e.access_count, 0) / entries.length
      : 0;

    return {
      cached_embeddings: entries.length,
      avg_access_count: avgAccess
    };
  }
}

GPTCache Architecture

Build a production-grade caching layer inspired by GPTCache.

// Tunables for the two-tier (exact + semantic) cache architecture.
interface CacheConfig {
  exact_match_ttl: number;      // Seconds; TTL for exact-hash entries
  semantic_ttl: number;         // Seconds; TTL for embedding entries
  embedding_model: string;      // Name of the embedding model in use
  similarity_threshold: number; // Minimum cosine similarity for a semantic hit
}

/**
 * Two-tier cache front (inspired by GPTCache): exact hash match first
 * (cheapest), then semantic similarity, then the real generator.
 */
class GPTCacheArchitecture {
  private exactCache: ExactMatchCache;
  private semanticCache: SemanticCache;
  private config: CacheConfig;
  private redis: Redis;

  constructor(redis: Redis, embedder: any, config: Partial<CacheConfig> = {}) {
    this.redis = redis;
    // Caller-supplied values override the defaults field-by-field.
    this.config = {
      exact_match_ttl: 86400,
      semantic_ttl: 604800,
      embedding_model: 'text-embedding-3-small',
      similarity_threshold: 0.85,
      ...config
    };

    this.exactCache = new ExactMatchCache(redis, this.config.exact_match_ttl);
    this.semanticCache = new SemanticCache(redis, embedder);
  }

  /**
   * Serve from the exact tier, then the semantic tier, then generate and
   * populate both tiers. `source` reports where the response came from.
   */
  async getOrGenerate(
    prompt: string,
    generator: () => Promise<string>
  ): Promise<{ response: string; source: 'exact' | 'semantic' | 'generated' }> {
    // Try exact match first (fastest)
    const exactMatch = await this.exactCache.get(prompt);
    if (exactMatch) {
      this.exactCache.recordHit();
      return { response: exactMatch.response, source: 'exact' };
    }

    // Try semantic match (slower but broader)
    const semanticMatch = await this.semanticCache.find(prompt);
    if (semanticMatch) {
      // Bug fix: semantic hits previously bypassed the hit counter entirely,
      // so hitRate() under-reported overall cache effectiveness.
      this.exactCache.recordHit();
      return { response: semanticMatch.response, source: 'semantic' };
    }

    // Generate new response
    this.exactCache.recordMiss();
    const response = await generator();

    // Cache for future use. Token counts are unknown at this layer and are
    // recorded as zero. NOTE(review): thread real usage numbers through
    // when the generator can report them.
    await Promise.all([
      this.exactCache.set(prompt, response, { input: 0, output: 0 }),
      this.semanticCache.store(prompt, response)
    ]);

    return { response, source: 'generated' };
  }

  /**
   * Report exact-cache misses for a list of common queries. Generation is
   * intentionally left to the caller (see inline note).
   */
  async warmupWithQueries(queries: string[]): Promise<void> {
    for (const query of queries) {
      const existing = await this.exactCache.get(query);
      if (!existing) {
        console.log(`Cache miss for common query: ${query}`);
        // Would generate and cache here
      }
    }
  }
}

TTL Strategies

Different query types warrant different TTL values.

// Maps a query category to its cache TTL; `pattern` decides membership.
interface TTLStrategy {
  query_type: string;  // Human-readable category label
  ttl_seconds: number; // 0 means "do not cache"
  pattern: RegExp;     // Regex that classifies a query into this category
}

/**
 * Classify queries into TTL categories by regex. The list is ordered:
 * the FIRST matching strategy wins, so more specific categories must
 * precede broader ones.
 */
class TTLStrategyManager {
  private strategies: TTLStrategy[] = [
    {
      query_type: 'factual',
      ttl_seconds: 604800, // 7 days
      pattern: /^(what|when|where|who) is/i
    },
    {
      query_type: 'opinion',
      ttl_seconds: 86400, // 1 day
      pattern: /^(why|how|what do you think)/i
    },
    {
      query_type: 'time_sensitive',
      ttl_seconds: 3600, // 1 hour
      pattern: /(today|tomorrow|current|latest|recent)/i
    },
    {
      query_type: 'personalized',
      ttl_seconds: 0, // No cache
      // Bug fix: \b boundaries prevent false positives on words that merely
      // contain "my" (e.g. "army", "myth"), which previously disabled
      // caching for unrelated queries.
      pattern: /\b(my|your user|my account)\b/i
    },
    {
      query_type: 'code_generation',
      ttl_seconds: 259200, // 3 days
      pattern: /(write|code|implement|generate).*(function|class|snippet)/i
    }
  ];

  /** First strategy whose pattern matches, or undefined. */
  private match(query: string): TTLStrategy | undefined {
    return this.strategies.find(s => s.pattern.test(query));
  }

  /** TTL in seconds for a query; 86400 (1 day) when no category matches. */
  determineTTL(query: string): number {
    // ?? (not ||) so the personalized category's legitimate 0 survives.
    return this.match(query)?.ttl_seconds ?? 86400;
  }

  /** Category label for a query; 'general' when no category matches. */
  classifyQuery(query: string): string {
    return this.match(query)?.query_type || 'general';
  }
}

Cache Invalidation on Model Version Change

When you upgrade your LLM model, invalidate old cache entries.

/**
 * Tracks the active LLM model version in Redis and invalidates cached
 * responses when an incompatible (major-version) upgrade happens.
 */
class CacheInvalidationManager {
  private redis: Redis;
  private currentModelVersion: string;
  // Redis key holding the active model version string.
  private modelVersionKey = 'llm:model:version';

  constructor(redis: Redis, initialVersion: string = '1.0.0') {
    this.redis = redis;
    this.currentModelVersion = initialVersion;
  }

  /**
   * Record a model upgrade; wipes the cache only when the major version
   * changed (minor/patch bumps keep existing entries).
   */
  async upgradeModel(newVersion: string): Promise<void> {
    const oldVersion = await this.redis.get(this.modelVersionKey);

    // Update version
    await this.redis.set(this.modelVersionKey, newVersion);
    this.currentModelVersion = newVersion;

    if (oldVersion && oldVersion !== newVersion) {
      console.log(`Model upgraded from ${oldVersion} to ${newVersion}`);

      // Optionally invalidate old cache
      const shouldInvalidate = this.shouldInvalidateOnUpgrade(oldVersion, newVersion);
      if (shouldInvalidate) {
        await this.invalidateCache();
      }
    }
  }

  /** Major-version change (semver-style "X.Y.Z") forces invalidation. */
  private shouldInvalidateOnUpgrade(oldVersion: string, newVersion: string): boolean {
    const oldMajor = oldVersion.split('.')[0];
    const newMajor = newVersion.split('.')[0];
    return oldMajor !== newMajor;
  }

  /** Delete every llm:* cache entry EXCEPT the version marker itself. */
  private async invalidateCache(): Promise<void> {
    // Bug fix: the previous implementation deleted llm:* wholesale, which
    // also wiped llm:model:version — the key upgradeModel() had just
    // written — so the recorded version was lost on every invalidation.
    const keysToDelete = (await this.redis.keys('llm:*'))
      .filter(key => key !== this.modelVersionKey);

    if (keysToDelete.length > 0) {
      await this.redis.del(...keysToDelete);
      console.log(`Invalidated ${keysToDelete.length} cache entries`);
    }
  }

  /** Delete all keys matching a raw Redis glob pattern; returns the count. */
  async invalidatePattern(pattern: string): Promise<number> {
    const keys = await this.redis.keys(pattern);
    if (keys.length === 0) return 0;

    await this.redis.del(...keys);
    return keys.length;
  }

  /** Delete all keys in the llm:<tag>:* namespace; returns the count. */
  async invalidateByTag(tag: string): Promise<number> {
    const keys = await this.redis.keys(`llm:${tag}:*`);
    if (keys.length === 0) return 0;

    await this.redis.del(...keys);
    return keys.length;
  }
}

Redis for Cache Storage

Implement cache with Redis for distributed systems.

/**
 * Redis-level cache administration: eviction policy, memory usage, and
 * per-namespace key counts.
 */
class RedisCacheStore {
  private redis: Redis;
  // Memory ceiling handed to Redis' maxmemory directive, in bytes.
  private maxMemory: number;

  // Generalized: the limit was previously hard-coded; the extra parameter
  // defaults to the old 1GB value so existing callers are unaffected.
  constructor(redis: Redis, maxMemoryBytes: number = 1024 * 1024 * 1024) {
    this.redis = redis;
    this.maxMemory = maxMemoryBytes;
  }

  /** Configure Redis to evict least-recently-used keys at the memory cap. */
  async configureEviction(): Promise<void> {
    await this.redis.configSet('maxmemory', this.maxMemory.toString());
    await this.redis.configSet('maxmemory-policy', 'allkeys-lru');
  }

  /** Parse current memory usage out of INFO MEMORY. */
  async getMemoryUsage(): Promise<{
    used_memory: number;
    used_memory_human: string;
    memory_limit: number;
  }> {
    const info = await this.redis.info('memory');
    const lines = info.split('\r\n');
    const usedMemory = parseInt(
      lines.find(l => l.startsWith('used_memory:'))?.split(':')[1] || '0',
      10
    );
    const humanReadable = lines.find(l => l.startsWith('used_memory_human:'))?.split(':')[1] || '0B';

    return {
      used_memory: usedMemory,
      used_memory_human: humanReadable,
      memory_limit: this.maxMemory
    };
  }

  /**
   * Count keys per known cache namespace.
   * NOTE(review): KEYS is O(N) and blocks Redis; prefer SCAN in production.
   * (A redundant extra KEYS llm:* scan whose result was discarded has been
   * removed.)
   */
  async monitorKeyspaceStats(): Promise<Record<string, number>> {
    const stats: Record<string, number> = {};

    for (const pattern of ['llm:exact:*', 'llm:semantic:*', 'llm:user:*']) {
      stats[pattern] = (await this.redis.keys(pattern)).length;
    }

    return stats;
  }
}

Cache Hit Rate Tracking

Monitor cache effectiveness to guide optimization.

// Daily cache-effectiveness snapshot.
interface CacheMetrics {
  hits: number;                 // Cache hits recorded for the day
  misses: number;               // Cache misses recorded for the day
  hit_rate: number;             // Percentage in [0, 100]
  avg_response_time_ms: number; // Currently a fixed estimate — see collector
  cost_savings: number;         // Accumulated savedCost values (caller's unit)
}

/**
 * Per-day cache hit/miss/cost metrics stored in Redis hashes keyed by UTC
 * date, retained for 30 days.
 */
class CacheMetricsCollector {
  private redis: Redis;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /** Daily metrics key, e.g. cache:metrics:2024-01-31 (UTC date). */
  private dailyKey(date: Date = new Date()): string {
    return `cache:metrics:${date.toISOString().split('T')[0]}`;
  }

  /**
   * Record a cache hit and the API cost it avoided.
   * @param responseLengthTokens Currently unused; kept for interface
   *   stability and future latency/size accounting.
   * @param savedCost Cost avoided by serving from cache.
   */
  async recordCacheHit(responseLengthTokens: number, savedCost: number): Promise<void> {
    const key = this.dailyKey();
    await this.redis.hincrby(key, 'hits', 1);
    await this.redis.hincrbyfloat(key, 'total_cost_saved', savedCost);
    await this.redis.expire(key, 86400 * 30); // Keep 30 days
  }

  async recordCacheMiss(): Promise<void> {
    const key = this.dailyKey();
    await this.redis.hincrby(key, 'misses', 1);
    // Bug fix: miss-only days previously never got a TTL, so their metric
    // hashes lingered in Redis forever.
    await this.redis.expire(key, 86400 * 30);
  }

  /** Metrics for one day; zeros when no data was recorded. */
  async getDailyMetrics(date: Date = new Date()): Promise<CacheMetrics> {
    const data = await this.redis.hgetall(this.dailyKey(date));

    const hits = parseInt(data.hits || '0', 10);
    const misses = parseInt(data.misses || '0', 10);
    const total = hits + misses;
    const costSavings = parseFloat(data.total_cost_saved || '0');

    return {
      hits,
      misses,
      hit_rate: total > 0 ? (hits / total) * 100 : 0,
      avg_response_time_ms: 50, // Fixed estimate: cached responses are ~50ms
      cost_savings: costSavings
    };
  }

  /** Last 30 days of metrics, oldest first. */
  async getMonthlyTrend(): Promise<CacheMetrics[]> {
    const days = Array.from({ length: 30 }, (_, i) => {
      const d = new Date();
      d.setDate(d.getDate() - i);
      return d;
    });
    // Fetch in parallel; Promise.all preserves input order (newest first),
    // so reverse to keep the original oldest-first contract.
    const trend = await Promise.all(days.map(d => this.getDailyMetrics(d)));
    return trend.reverse();
  }
}

When NOT to Cache

Identify queries that should never be cached.

/**
 * Decides whether a prompt is safe and worthwhile to cache, and explains
 * rejections for observability.
 */
class CacheEligibilityChecker {
  // Patterns that mark a prompt as unsafe or pointless to cache:
  // personalization markers, time-sensitive words, secret-like keywords,
  // PAN-like 16-digit runs, and bearer tokens.
  private uncacheablePatterns = [
    /\b(my|personalized|custom|unique)\b/i,
    /\b(today|now|current|latest|recent)\b/i,
    /\b(secret|password|private|confidential|api_key)\b/i,
    /[0-9]{16}/, // Credit card-like
    /Bearer [A-Za-z0-9-._~+/]+=*/i // Auth tokens
  ];

  /**
   * True when the prompt may be cached: not user-personalized, matches no
   * uncacheable pattern, and is long enough to be worth the overhead.
   */
  canCache(prompt: string, isPersonalizedUser: boolean = false): boolean {
    // Never cache personalized queries
    if (isPersonalizedUser) return false;

    if (this.uncacheablePatterns.some(p => p.test(prompt))) return false;

    // Cache overhead is not worth it for tiny prompts.
    return prompt.length >= 20;
  }

  /**
   * Human-readable reasons the prompt is not cacheable; empty when it is.
   * Reports at most the first matching uncacheable pattern.
   */
  whyCantCache(prompt: string): string[] {
    const reasons: string[] = [];

    if (prompt.length < 20) reasons.push('Prompt too short');

    const offending = this.uncacheablePatterns.find(p => p.test(prompt));
    if (offending) {
      reasons.push(`Matches uncacheable pattern: ${offending}`);
    }

    return reasons;
  }
}

Checklist

  • Implement exact-match cache with Redis (SHA256 hash of prompt)
  • Add semantic caching to extend hit coverage to 50% or more of queries
  • Use TTL strategies based on query type (7 days factual, 1 hour time-sensitive)
  • Invalidate cache on model version upgrades
  • Monitor cache hit rate daily and aim for >40%
  • Configure Redis with maxmemory-policy: allkeys-lru for automatic eviction
  • Never cache personalized, time-sensitive, or auth-related queries
  • Track cost savings from cache hits to justify infrastructure cost
  • Embed prompts once and reuse embeddings for similarity search
  • Set up alerts when hit rate drops below 20%

Conclusion

Semantic caching is one of the easiest ways to cut LLM costs by up to 60% without changing your core logic. Pair exact-match caching for speed with semantic caching for coverage, and you've got a production-grade system that learns from every query.