Published on

LLM Response Caching — Semantic Caching to Cut Costs and Latency by 60%

Authors

Introduction

LLM API calls are expensive ($0.001–$0.10+ per query). Caching identical requests is easy; semantic caching handles similar-but-not-identical queries by comparing embeddings. This post covers exact match caching, semantic caching with embeddings, similarity thresholds, Redis implementation, cost savings calculation, and TTL strategies for different query types.

Exact Match Caching

Start simple: cache identical queries and return cached responses.

// One cached LLM response plus the bookkeeping used for TTL expiry,
// LRU eviction, and hit-rate / cost reporting.
interface CacheEntry {
  query: string; // exact query text used as the cache key
  response: string; // LLM response served on a hit
  tokens: number; // tokens consumed by the original LLM call
  cost: number; // dollar cost of the original LLM call
  createdAt: number; // epoch ms; entry expires ttlMs after this
  hits: number; // number of cache hits served so far
  lastAccessedAt: number; // epoch ms of the most recent access (LRU ordering)
}

class ExactMatchCache {
  /**
   * Two-tier exact-match cache: an in-memory Map used as an LRU,
   * backed by an optional Redis client for cross-process persistence.
   *
   * LRU relies on Map insertion order: entries are re-inserted on every
   * hit, so the Map's first key is always the least recently used.
   */
  private cache: Map<string, CacheEntry> = new Map();
  private redisClient: any; // Redis client for persistent cache (optional)
  private readonly ttlMs = 86400000; // 24 hours default
  private readonly maxEntries = 100000; // in-memory capacity before eviction

  /**
   * Look up an identical query. Returns the entry with hit counters
   * updated, or null on miss/expiry. Falls back to Redis when the
   * in-memory tier misses.
   */
  async get(query: string): Promise<CacheEntry | null> {
    const cached = this.cache.get(query);

    if (cached) {
      if (this.isExpired(cached)) {
        // Bug fix: expired entries used to linger in memory forever;
        // drop them so they stop counting toward size and stats.
        this.cache.delete(query);
      } else {
        cached.hits++;
        cached.lastAccessedAt = Date.now();
        // Re-insert so Map iteration order tracks recency (O(1) LRU).
        this.cache.delete(query);
        this.cache.set(query, cached);
        return cached;
      }
    }

    // Second tier: Redis (shared across instances).
    if (this.redisClient) {
      try {
        const redisEntry = await this.redisClient.get(`cache:${query}`);

        if (redisEntry) {
          const entry = JSON.parse(redisEntry) as CacheEntry;

          // Redis TTL normally expires entries for us, but guard against
          // clock skew or a differing TTL configuration.
          if (!this.isExpired(entry)) {
            // Bug fix: restored entries now get the same hit accounting
            // as in-memory hits.
            entry.hits++;
            entry.lastAccessedAt = Date.now();
            this.cache.set(query, entry); // Promote into the memory tier
            return entry;
          }
        }
      } catch (error) {
        console.warn('Redis cache miss:', error);
      }
    }

    return null;
  }

  /**
   * Store a response for an exact query string, in memory and (best
   * effort) in Redis with a TTL. Evicts LRU entries past capacity.
   */
  async set(query: string, response: string, tokens: number, cost: number): Promise<void> {
    const entry: CacheEntry = {
      query,
      response,
      tokens,
      cost,
      createdAt: Date.now(),
      hits: 0,
      lastAccessedAt: Date.now(),
    };

    // Delete-then-set so an overwrite also moves to the LRU tail.
    this.cache.delete(query);
    this.cache.set(query, entry);

    // Best-effort write-through to Redis with TTL (seconds).
    if (this.redisClient) {
      try {
        await this.redisClient.setex(
          `cache:${query}`,
          Math.floor(this.ttlMs / 1000),
          JSON.stringify(entry)
        );
      } catch (error) {
        console.warn('Redis cache write failed:', error);
      }
    }

    // Evict least recently used entries past capacity. Because hits
    // re-insert entries, the Map's first key is the LRU — O(1) per
    // eviction instead of the original full sort on every insert.
    while (this.cache.size > this.maxEntries) {
      const lruKey = this.cache.keys().next().value;
      if (lruKey === undefined) break;
      this.cache.delete(lruKey);
    }
  }

  /** True once the entry's age exceeds the cache TTL. */
  private isExpired(entry: CacheEntry): boolean {
    return Date.now() - entry.createdAt > this.ttlMs;
  }

  /**
   * In-memory tier stats. The hit rate counts each entry's initial set
   * as one miss (the "+ 1"), so a never-hit cache reports 0%.
   */
  getStats(): { hitRate: string; entries: number; memorySizeBytes: number } {
    const entries = Array.from(this.cache.values());
    const totalHits = entries.reduce((sum, e) => sum + e.hits, 0);
    const totalAccesses = entries.reduce((sum, e) => sum + e.hits + 1, 0);

    // Rough estimate: key + value string lengths plus ~200 bytes overhead.
    const estimatedMemory = entries.reduce(
      (sum, e) => sum + e.query.length + e.response.length + 200,
      0
    );

    return {
      hitRate: totalAccesses === 0 ? '0%' : `${((totalHits / totalAccesses) * 100).toFixed(1)}%`,
      entries: this.cache.size,
      memorySizeBytes: estimatedMemory,
    };
  }
}

Semantic Caching with Embeddings

Similar queries should return cached results if embeddings are close enough.

import { cosineSimilarity } from 'vector-utils';

// A cached response keyed by its query embedding rather than the exact
// query text, so near-duplicate questions can reuse it.
interface SemanticCacheEntry {
  query: string; // original query text (useful for debugging / Redis keys)
  embedding: number[]; // embedding vector compared via cosine similarity
  response: string; // LLM response served on a semantic hit
  tokens: number; // tokens consumed by the original LLM call
  cost: number; // dollar cost of the original LLM call
  createdAt: number; // epoch ms creation time (Redis TTL handles expiry)
  hits: number; // number of semantic hits served so far
  similarity: number; // How similar to the current query (updated on each hit)
}

class SemanticCache {
  /**
   * Semantic cache: a lookup hits when a cached query's embedding is
   * cosine-similar enough (>= similarityThreshold) to the incoming
   * query's embedding, even if the text differs.
   */
  private cache: SemanticCacheEntry[] = [];
  private redisClient: any; // optional distributed second tier
  private embeddingService: any; // reserved for computing embeddings internally
  private readonly similarityThreshold = 0.95; // 95% similarity
  private readonly maxCacheSize = 10000;

  /**
   * Find the most similar cached entry at or above the threshold.
   * Scans the in-memory list first; optionally falls back to Redis
   * while the local cache is still sparsely populated.
   */
  async get(
    query: string,
    embedding: number[]
  ): Promise<{ entry: SemanticCacheEntry; similarity: number } | null> {
    let bestMatch: SemanticCacheEntry | null = null;
    let bestSimilarity = 0;

    // Linear scan; fine for ~10k entries, use a vector index beyond that.
    for (const cached of this.cache) {
      const similarity = cosineSimilarity(embedding, cached.embedding);

      if (similarity > bestSimilarity && similarity >= this.similarityThreshold) {
        bestMatch = cached;
        bestSimilarity = similarity;
      }
    }

    if (bestMatch) {
      bestMatch.hits++;
      bestMatch.similarity = bestSimilarity;
      return { entry: bestMatch, similarity: bestSimilarity };
    }

    // Redis fallback — only while the local tier is small, since this
    // fetches and compares every remote entry (slow but comprehensive).
    if (this.redisClient && this.cache.length < this.maxCacheSize / 10) {
      try {
        // NOTE(review): KEYS is O(N) and blocks Redis; prefer SCAN (or a
        // real vector index such as RediSearch) in production.
        const keys = await this.redisClient.keys('semantic:*');

        for (const key of keys) {
          try {
            const cached = await this.redisClient.get(key);
            if (!cached) continue; // expired between KEYS and GET

            const entry = JSON.parse(cached) as SemanticCacheEntry;
            const similarity = cosineSimilarity(embedding, entry.embedding);

            if (similarity > bestSimilarity && similarity >= this.similarityThreshold) {
              bestMatch = entry;
              bestSimilarity = similarity;
            }
          } catch {
            // Bug fix: one malformed/expired entry used to abort the
            // whole search; skip it and keep scanning instead.
          }
        }

        if (bestMatch) {
          // Bug fix: Redis hits now update hit accounting and are
          // promoted into the in-memory tier, like local hits.
          bestMatch.hits++;
          bestMatch.similarity = bestSimilarity;
          this.cache.push(bestMatch);
          return { entry: bestMatch, similarity: bestSimilarity };
        }
      } catch (error) {
        console.warn('Redis semantic cache search failed:', error);
      }
    }

    return null;
  }

  /**
   * Cache a response under the query's embedding; prunes the local
   * tier to its most-used half past capacity and persists to Redis.
   */
  async set(
    query: string,
    embedding: number[],
    response: string,
    tokens: number,
    cost: number
  ): Promise<void> {
    const entry: SemanticCacheEntry = {
      query,
      embedding,
      response,
      tokens,
      cost,
      createdAt: Date.now(),
      hits: 0,
      similarity: 1.0, // identical to itself until matched again
    };

    this.cache.push(entry);

    // Keep cache pruned: drop the least frequently used half.
    if (this.cache.length > this.maxCacheSize) {
      this.cache.sort((a, b) => a.hits - b.hits);
      this.cache = this.cache.slice(this.maxCacheSize / 2);
    }

    // Best-effort persistence; the timestamp suffix keeps keys unique.
    if (this.redisClient) {
      try {
        await this.redisClient.setex(
          `semantic:${query}:${Date.now()}`,
          86400, // 24 hour TTL
          JSON.stringify(entry)
        );
      } catch (error) {
        console.warn('Redis semantic cache write failed:', error);
      }
    }
  }

  /**
   * Reports the average similarity of served hits and the overall
   * semantic hit rate (each entry's initial set counts as one miss).
   */
  getSimilarityStats(): {
    avgSimilarity: number;
    semanticHitRate: string;
    cachedQueries: number;
  } {
    const withHits = this.cache.filter(e => e.hits > 0);
    const avgSimilarity =
      withHits.length === 0
        ? 0
        : withHits.reduce((sum, e) => sum + e.similarity, 0) / withHits.length;

    const totalHits = this.cache.reduce((sum, e) => sum + e.hits, 0);
    const totalRequests = totalHits + this.cache.length;

    return {
      avgSimilarity,
      semanticHitRate:
        totalRequests === 0 ? '0%' : `${((totalHits / totalRequests) * 100).toFixed(1)}%`,
      cachedQueries: this.cache.length,
    };
  }
}

Similarity Threshold Tuning

Choose threshold based on your accuracy tolerance.

// Outcome of replaying the evaluation set at one similarity threshold.
interface ThresholdAnalysis {
  threshold: number; // similarity cutoff being evaluated
  hitRate: number; // fraction of test queries that found a match
  falsePositiveRate: number; // fraction of hits whose rating diverged from optimal
  costSavings: number; // estimated dollars saved at this threshold
  userSatisfaction: number; // 0-1
}

class ThresholdTuner {
  /** Labeled evaluation queries used to replay cache lookups offline. */
  private testQueries: Array<{
    query: string;
    embedding: number[];
    optimalResponse: string;
    userRating: number; // 0-5
  }> = [];

  /**
   * Replays every test query against the rest of the set (leave-one-out)
   * at the given threshold, scoring hits by whether the matched query's
   * user rating is close to the test query's own rating.
   */
  analyzeThreshold(threshold: number): ThresholdAnalysis {
    // Guard empty: the original divided 0/0 and reported NaN rates.
    if (this.testQueries.length === 0) {
      return {
        threshold,
        hitRate: 0,
        falsePositiveRate: 0,
        costSavings: 0,
        userSatisfaction: 0,
      };
    }

    let hitCount = 0;
    let falsePositives = 0;
    let costSaved = 0;
    let satisfactionScore = 0;

    for (const test of this.testQueries) {
      // Find candidate matches above the threshold, best-first.
      const matches = this.testQueries
        .filter(t => t !== test)
        .map(t => ({
          query: t.query,
          similarity: cosineSimilarity(test.embedding, t.embedding),
          // Bug fix: the field is 'optimalResponse'; 't.response' does
          // not exist and failed to compile under strict mode.
          response: t.optimalResponse,
        }))
        .filter(m => m.similarity >= threshold)
        // Bug fix: matches[0] was in arbitrary order; sort so the
        // highest-similarity candidate really is the top match.
        .sort((a, b) => b.similarity - a.similarity);

      if (matches.length > 0) {
        hitCount++;

        const topMatch = matches[0];
        const matchedUserRating =
          this.testQueries.find(q => q.query === topMatch.query)?.userRating || 0;

        // If the match's rating is close to optimal, it's a good hit.
        if (Math.abs(matchedUserRating - test.userRating) <= 0.5) {
          costSaved += 0.05; // Assume $0.05 saved per hit
          satisfactionScore += test.userRating / 5;
        } else {
          falsePositives++;
          satisfactionScore -= 0.2; // Penalty for wrong answer
        }
      }
    }

    return {
      threshold,
      hitRate: hitCount / this.testQueries.length,
      falsePositiveRate: falsePositives / Math.max(1, hitCount),
      costSavings: costSaved,
      userSatisfaction: Math.max(0, satisfactionScore / this.testQueries.length),
    };
  }

  /**
   * Scores candidate thresholds (hit rate 30%, precision 30%,
   * satisfaction 40%) and returns the best one.
   */
  findOptimalThreshold(): number {
    const score = (a: { hitRate: number; falsePositiveRate: number; userSatisfaction: number }): number =>
      a.hitRate * 0.3 + (1 - a.falsePositiveRate) * 0.3 + a.userSatisfaction * 0.4;

    const analyses = [0.85, 0.90, 0.92, 0.95, 0.97, 0.99].map(t => this.analyzeThreshold(t));

    let best = analyses[0];

    for (const analysis of analyses) {
      if (score(analysis) > score(best)) {
        best = analysis;
      }
    }

    return best.threshold;
  }
}

/**
 * Cosine similarity between two equal-length vectors, in [-1, 1].
 *
 * Bug fix: the original divided by zero (returning NaN) for
 * zero-magnitude vectors and silently mixed mismatched lengths;
 * both cases now return 0 ("no measurable similarity").
 */
function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length || a.length === 0) {
    return 0;
  }

  let dot = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  if (normA === 0 || normB === 0) {
    return 0; // a zero vector has no direction
  }

  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

Cache Invalidation for Prompt Changes

When system prompt changes, invalidate related cache entries.

class PromptVersionedCache {
  /**
   * Exact-match cache keyed by (query, system-prompt hash): a response
   * generated under one system prompt is never served for another, and
   * changing the prompt naturally misses until the cache repopulates.
   * (The unused systemPromptVersion counter was removed.)
   */
  private cache: Map<string, Array<{ systemPromptHash: string; entry: CacheEntry }>> = new Map();

  /** SHA-256 hex digest of a system prompt, used as the version key. */
  private hashSystemPrompt(prompt: string): string {
    // Local require keeps the snippet self-contained; node caches the module.
    const crypto = require('crypto');
    return crypto.createHash('sha256').update(prompt).digest('hex');
  }

  /**
   * Return the cached entry for this query under this exact system
   * prompt, or null. Updates hit accounting on a hit (the original
   * skipped this, unlike ExactMatchCache).
   */
  async get(
    query: string,
    systemPrompt: string
  ): Promise<CacheEntry | null> {
    const promptHash = this.hashSystemPrompt(systemPrompt);
    const versions = this.cache.get(query);

    if (!versions) {
      return null;
    }

    // Find the entry recorded under the matching prompt version.
    const match = versions.find(v => v.systemPromptHash === promptHash);

    if (!match) {
      return null;
    }

    match.entry.hits++;
    match.entry.lastAccessedAt = Date.now();

    return match.entry;
  }

  /**
   * Store a response under (query, prompt hash). Re-setting the same
   * pair overwrites the response but preserves the hit count.
   */
  async set(
    query: string,
    systemPrompt: string,
    response: string,
    tokens: number,
    cost: number
  ): Promise<void> {
    const promptHash = this.hashSystemPrompt(systemPrompt);

    if (!this.cache.has(query)) {
      this.cache.set(query, []);
    }

    const entries = this.cache.get(query)!;

    // Don't store duplicate prompt versions — update in place instead.
    const existing = entries.find(e => e.systemPromptHash === promptHash);
    if (existing) {
      existing.entry = {
        query,
        response,
        tokens,
        cost,
        createdAt: Date.now(),
        hits: existing.entry.hits, // keep accumulated hit count
        lastAccessedAt: Date.now(),
      };
    } else {
      entries.push({
        systemPromptHash: promptHash,
        entry: {
          query,
          response,
          tokens,
          cost,
          createdAt: Date.now(),
          hits: 0,
          lastAccessedAt: Date.now(),
        },
      });
    }
  }

  /** Drop every entry recorded under the given (old) system prompt. */
  invalidateAllForPrompt(oldPrompt: string): void {
    const oldHash = this.hashSystemPrompt(oldPrompt);

    for (const [query, entries] of this.cache) {
      const filtered = entries.filter(e => e.systemPromptHash !== oldHash);

      if (filtered.length === 0) {
        this.cache.delete(query); // no versions left for this query
      } else {
        this.cache.set(query, filtered);
      }
    }
  }
}

Redis Semantic Cache Implementation

Distributed semantic caching with Redis for multi-instance systems.

import Redis from 'ioredis';

class RedisSemanticCache {
  /**
   * Distributed semantic cache backed by Redis so multiple app
   * instances share hits. Entries are JSON strings stored under
   * `semantic_cache:embedding:*` and expired via Redis TTL.
   *
   * NOTE(review): lookup is a full KEYS + GET scan with one cosine
   * comparison per entry — O(N) per query, and KEYS blocks Redis.
   * For real deployments use SCAN or a vector index (e.g. RediSearch).
   */
  private redis: Redis;
  private readonly keyPrefix = 'semantic_cache';

  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl, {
      // Back off 50ms per attempt, capped at 2s between reconnects.
      retryStrategy: (times: number) => {
        return Math.min(times * 50, 2000);
      },
    });
  }

  /**
   * Return the cached response of the first entry whose embedding is at
   * least `threshold` cosine-similar to `queryEmbedding`, else null.
   */
  async get(queryEmbedding: number[], threshold: number = 0.95): Promise<string | null> {
    try {
      const keys = await this.redis.keys(`${this.keyPrefix}:embedding:*`);

      for (const key of keys) {
        const cached = await this.redis.get(key);

        if (!cached) continue; // expired between KEYS and GET

        const data = JSON.parse(cached);
        const similarity = cosineSimilarity(queryEmbedding, data.embedding);

        if (similarity >= threshold) {
          // Bug fix: entries are plain JSON strings (written with SETEX),
          // so HINCRBY threw WRONGTYPE at runtime. Rewrite the JSON with
          // the bumped hit count instead, preserving the remaining TTL.
          try {
            const remainingTtl = await this.redis.ttl(key);
            data.hits = (data.hits ?? 0) + 1;
            if (remainingTtl > 0) {
              await this.redis.setex(key, remainingTtl, JSON.stringify(data));
            } else {
              await this.redis.set(key, JSON.stringify(data));
            }
          } catch (statError) {
            // Hit accounting is best-effort; never block serving the hit.
            console.warn('Redis hit-count update failed:', statError);
          }

          return data.response;
        }
      }
    } catch (error) {
      console.warn('Redis semantic cache error:', error);
    }

    return null;
  }

  /**
   * Store a response under a unique key (query prefix + timestamp) as a
   * JSON string with a Redis-managed TTL.
   */
  async set(
    query: string,
    embedding: number[],
    response: string,
    tokens: number,
    cost: number,
    ttlSeconds: number = 86400
  ): Promise<void> {
    const key = `${this.keyPrefix}:embedding:${query.slice(0, 50)}:${Date.now()}`;

    await this.redis.setex(
      key,
      ttlSeconds,
      JSON.stringify({
        query,
        embedding,
        response,
        tokens,
        cost,
        createdAt: Date.now(),
        hits: 0,
      })
    );
  }

  /**
   * Cache inventory stats. Entry ages are derived from stored createdAt
   * timestamps (one GET per key — acceptable for occasional stats calls;
   * the original returned bogus Date.now() for both bounds).
   */
  async getStats(): Promise<{
    totalCachedItems: number;
    estimatedMemoryUsage: string;
    oldestEntry: number;
    newestEntry: number;
  }> {
    const keys = await this.redis.keys(`${this.keyPrefix}:*`);
    const info = await this.redis.info('memory');

    // Bug fix: extract the value from "used_memory_human:1.23M" rather
    // than returning the whole raw INFO line.
    const memoryLine = info.split('\r\n').find(l => l.startsWith('used_memory_human'));
    const estimatedMemoryUsage = memoryLine ? memoryLine.split(':')[1] : 'unknown';

    let oldestEntry = Date.now();
    let newestEntry = 0;

    for (const key of keys) {
      const raw = await this.redis.get(key);
      if (!raw) continue;
      try {
        const createdAt = (JSON.parse(raw) as { createdAt?: number }).createdAt;
        if (typeof createdAt === 'number') {
          oldestEntry = Math.min(oldestEntry, createdAt);
          newestEntry = Math.max(newestEntry, createdAt);
        }
      } catch {
        // Ignore non-JSON values that happen to share the prefix.
      }
    }

    return {
      totalCachedItems: keys.length,
      estimatedMemoryUsage,
      oldestEntry,
      newestEntry: newestEntry || Date.now(), // fall back when cache is empty
    };
  }

  /** Delete every key under this cache's prefix. */
  async clear(): Promise<void> {
    const keys = await this.redis.keys(`${this.keyPrefix}:*`);
    if (keys.length > 0) {
      await this.redis.del(...keys);
    }
  }
}

Cost Savings Calculation

Quantify cache benefits in dollars.

// Aggregate view of how much the cache has saved so far.
interface CachingMetrics {
  totalQueries: number; // every query observed, hit or miss
  cachedHits: number; // exact-match hits
  semanticHits: number; // similarity-based hits
  totalTokensSaved: number; // tokens never sent to the model
  estimatedCostSaved: number; // dollars saved by serving cached responses
  cachePayoff: string; // Payoff period
}

class CostAnalyzer {
  /** Running tallies, updated once per recorded query. */
  private metrics = {
    exactHits: 0,
    semanticHits: 0,
    totalQueries: 0,
    tokensSaved: 0,
  };

  /** Per-token prices (USD) used to turn saved tokens into dollars. */
  private costPerToken = {
    input: 0.0000015,
    output: 0.000006,
  };

  /**
   * Record the outcome of one query. For cache hits, pass the number
   * of tokens the cached response avoided sending to the model.
   */
  recordQuery(type: 'miss' | 'exact_hit' | 'semantic_hit', tokensAvoidedByCache: number = 0): void {
    this.metrics.totalQueries++;

    switch (type) {
      case 'exact_hit':
        this.metrics.exactHits++;
        this.metrics.tokensSaved += tokensAvoidedByCache;
        break;
      case 'semantic_hit':
        this.metrics.semanticHits++;
        this.metrics.tokensSaved += tokensAvoidedByCache;
        break;
      default:
        break; // misses only count toward totalQueries
    }
  }

  /** Snapshot of savings so far, priced at the blended token rate. */
  getMetrics(): CachingMetrics {
    const { exactHits, semanticHits, totalQueries, tokensSaved } = this.metrics;

    const cacheHitRate = (exactHits + semanticHits) / Math.max(1, totalQueries);
    const blendedRate = (this.costPerToken.input + this.costPerToken.output) / 2;
    const estimatedCostSaved = tokensSaved * blendedRate;

    // Assume $0.10/month per 1GB of cache storage
    const estimatedCacheStorageCost = 10; // per month

    return {
      totalQueries,
      cachedHits: exactHits,
      semanticHits,
      totalTokensSaved: tokensSaved,
      estimatedCostSaved,
      cachePayoff: estimatedCostSaved > estimatedCacheStorageCost ? 'Profitable' : 'Not yet profitable',
    };
  }
}

TTL Strategies for Different Query Types

Different query types have different staleness tolerances.

// Staleness classes for cached LLM responses; each maps to a TTL.
type QueryType = 'factual' | 'analytical' | 'creative' | 'time_sensitive' | 'user_specific';

interface TTLStrategy {
  queryType: QueryType;
  ttlSeconds: number; // how long cached responses of this type stay valid
  reason: string; // human-readable justification for the TTL
}

class TTLManager {
  /** TTL policy table, consulted by getTTL via detectQueryType. */
  private strategies: TTLStrategy[] = [
    {
      queryType: 'factual',
      ttlSeconds: 30 * 86400, // 30 days - factual info doesn't change often
      reason: 'Factual queries are stable',
    },
    {
      queryType: 'analytical',
      ttlSeconds: 7 * 86400, // 7 days - analytics refresh weekly
      reason: 'Analytical results should refresh periodically',
    },
    {
      queryType: 'creative',
      ttlSeconds: 1 * 86400, // 1 day - creative content varies
      reason: 'Creative content should refresh daily',
    },
    {
      queryType: 'time_sensitive',
      ttlSeconds: 60 * 60, // 1 hour - news, weather, etc
      reason: 'Time-sensitive data expires hourly',
    },
    {
      queryType: 'user_specific',
      ttlSeconds: 60 * 60, // 1 hour - user data can change
      reason: 'User-specific responses expire hourly',
    },
  ];

  /**
   * Classify a query with keyword heuristics.
   *
   * Bug fixes vs the original:
   * - The most perishable categories (time-sensitive, user-specific)
   *   are checked FIRST; previously 'factual' was tested first, so
   *   "what is the weather today" was cached for 30 days, not 1 hour.
   * - The two shortest keywords are anchored with word boundaries so
   *   "know"/"snow" no longer match 'now' and "economy" no longer
   *   matches 'my'.
   */
  detectQueryType(query: string): QueryType {
    if (/\bnow\b|today|current|latest|breaking|weather|stock/i.test(query)) {
      return 'time_sensitive';
    }
    if (/\bmy\b|user|profile|account|personalized/i.test(query)) {
      return 'user_specific';
    }
    if (/what is|define|explain|who was|when was/i.test(query)) {
      return 'factual';
    }
    if (/analyze|trend|growth|performance|statistics/i.test(query)) {
      return 'analytical';
    }
    if (/write|create|compose|generate|story|poem/i.test(query)) {
      return 'creative';
    }

    return 'analytical'; // Default when no pattern matches
  }

  /** TTL in seconds for a query, falling back to 24h for unknown types. */
  getTTL(query: string): number {
    const queryType = this.detectQueryType(query);
    const strategy = this.strategies.find(s => s.queryType === queryType);

    return strategy?.ttlSeconds || 86400;
  }

  /** Expose the full policy table (e.g. for dashboards or config review). */
  getAllStrategies(): TTLStrategy[] {
    return this.strategies;
  }
}

LLM Caching Strategy Checklist

  • Implement exact match caching for identical queries
  • Add semantic caching with embeddings for similar queries
  • Choose similarity threshold (0.95 recommended)
  • Persist cache to Redis for distributed systems
  • Version cache by system prompt hash
  • Invalidate cache when prompts change
  • Analyze threshold impact on hit rate and false positives
  • Calculate cost savings vs cache storage cost
  • Implement TTL based on query type
  • Monitor hit rates per endpoint and query type
  • Track semantic vs exact hit distribution

Conclusion

LLM caching cuts costs and latency dramatically. Exact match caching is straightforward; semantic caching with 0.95 similarity threshold catches 30-60% more hits without sacrificing quality. Combined with smart TTL strategies and cost tracking, caching can reduce your LLM bill by 50-70% while speeding up responses by 100x.