- Published on
LLM Output Caching — Semantic Caching to Cut Costs by 60 Percent
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
Not every question needs a fresh LLM call. Exact-match caching handles 30% of queries. Semantic caching handles 50%. This guide covers both strategies with production patterns for TTL and invalidation.
- Exact-Match Caching
- Semantic Caching
- GPTCache Architecture
- TTL Strategies
- Cache Invalidation on Model Version Change
- Redis for Cache Storage
- Cache Hit Rate Tracking
- When NOT to Cache
- Checklist
- Conclusion
Exact-Match Caching
Hash the prompt and store responses for identical queries.
import crypto from 'crypto';
import { Redis } from 'ioredis';
// Payload stored per exact-match cache key (JSON-serialized into Redis,
// so Date fields round-trip as ISO strings on read).
interface CacheEntry {
response: string; // the cached LLM completion text
tokens_input: number; // prompt tokens of the original call (for cost accounting)
tokens_output: number; // completion tokens of the original call
created_at: Date; // when the entry was written
expires_at: Date; // informational copy of the Redis TTL deadline
}
/**
 * Exact-match LLM response cache backed by Redis.
 *
 * Keys are `llm:exact:<sha256(prompt)>`, so only byte-identical prompts hit.
 * Global hit/miss counters live in the `cache:hits` / `cache:misses` keys.
 */
class ExactMatchCache {
  private redis: Redis;
  // Single source of truth for the default TTL; set by the constructor
  // (the old code duplicated 86400 in both the initializer and the default).
  private ttl_seconds: number;

  constructor(redis: Redis, defaultTtlSeconds: number = 86400) { // 24 hours
    this.redis = redis;
    this.ttl_seconds = defaultTtlSeconds;
  }

  /** Stable cache-key component: SHA-256 hex digest of the raw prompt. */
  private hashPrompt(prompt: string): string {
    return crypto.createHash('sha256').update(prompt).digest('hex');
  }

  /**
   * Look up a previously cached response for this exact prompt.
   * Returns null on a miss or when the stored JSON is corrupt.
   */
  async get(prompt: string): Promise<CacheEntry | null> {
    const key = `llm:exact:${this.hashPrompt(prompt)}`;
    const cached = await this.redis.get(key);
    if (!cached) return null;
    try {
      return JSON.parse(cached) as CacheEntry;
    } catch {
      // Corrupt payload — treat as a miss rather than crashing the caller.
      return null;
    }
  }

  /**
   * Cache a response under the prompt's hash with a TTL in seconds.
   * Token counts are stored so savings can be computed on later hits.
   */
  async set(
    prompt: string,
    response: string,
    tokens: { input: number; output: number },
    ttl: number = this.ttl_seconds
  ): Promise<void> {
    const key = `llm:exact:${this.hashPrompt(prompt)}`;
    const entry: CacheEntry = {
      response,
      tokens_input: tokens.input,
      tokens_output: tokens.output,
      created_at: new Date(),
      expires_at: new Date(Date.now() + ttl * 1000)
    };
    await this.redis.setex(key, ttl, JSON.stringify(entry));
  }

  /** Global hit/miss counters and hit rate as a percentage (0-100). */
  async hitRate(): Promise<{ hits: number; misses: number; rate: number }> {
    // Fetch both counters in parallel; missing keys count as zero.
    const [rawHits, rawMisses] = await Promise.all([
      this.redis.get('cache:hits'),
      this.redis.get('cache:misses')
    ]);
    const hits = parseInt(rawHits || '0', 10);
    const misses = parseInt(rawMisses || '0', 10);
    const total = hits + misses;
    return {
      hits,
      misses,
      rate: total > 0 ? (hits / total) * 100 : 0
    };
  }

  /** Fire-and-forget hit increment; a Redis hiccup must not break callers. */
  recordHit(): void {
    void this.redis.incr('cache:hits').catch(() => {});
  }

  /** Fire-and-forget miss increment. */
  recordMiss(): void {
    void this.redis.incr('cache:misses').catch(() => {});
  }
}
Semantic Caching
Embed prompts and find similar cached responses using vector similarity.
// Payload stored per semantic cache key (JSON-serialized into Redis).
interface SemanticCacheEntry {
embedding: number[]; // embedding vector of the original prompt
response: string; // cached LLM completion text
prompt: string; // original prompt, kept for debugging/inspection
created_at: Date; // when the entry was written
access_count: number; // times this entry was served as a semantic hit
}
/**
 * Semantic LLM cache: stores prompt embeddings alongside responses and
 * serves a cached answer when a new prompt is close enough by cosine
 * similarity (strictly above `similarity_threshold`).
 */
class SemanticCache {
  private redis: Redis;
  private embedder: any; // embedding client; assumed to expose embed(text) -> { embedding: number[] } — TODO confirm
  private similarity_threshold = 0.85;
  private ttl_seconds = 604800; // 7 days

  constructor(redis: Redis, embedder: any) {
    this.redis = redis;
    this.embedder = embedder;
  }

  /** Embed a prompt via the injected model (OpenAI or local). */
  async embedPrompt(prompt: string): Promise<number[]> {
    const response = await this.embedder.embed(prompt);
    return response.embedding;
  }

  /** Cosine similarity of two equal-length vectors; 0 when either is zero. */
  cosineSimilarity(a: number[], b: number[]): number {
    const dotProduct = a.reduce((sum, av, i) => sum + av * b[i], 0);
    const normA = Math.sqrt(a.reduce((sum, v) => sum + v * v, 0));
    const normB = Math.sqrt(b.reduce((sum, v) => sum + v * v, 0));
    if (normA === 0 || normB === 0) return 0;
    return dotProduct / (normA * normB);
  }

  /**
   * Linear scan over all cached embeddings for the best match above the
   * threshold. NOTE: KEYS plus a full scan is O(N) and blocks Redis;
   * acceptable for small caches — move to SCAN / a vector index at scale.
   */
  async find(prompt: string): Promise<SemanticCacheEntry | null> {
    const embedding = await this.embedPrompt(prompt);
    const allKeys = await this.redis.keys('llm:semantic:*');
    let bestMatch: SemanticCacheEntry | null = null;
    let bestKey: string | null = null;
    let bestScore = this.similarity_threshold; // candidates must strictly beat this
    for (const key of allKeys) {
      const cached = await this.redis.get(key);
      if (!cached) continue;
      try {
        const entry: SemanticCacheEntry = JSON.parse(cached);
        const similarity = this.cosineSimilarity(embedding, entry.embedding);
        if (similarity > bestScore) {
          bestScore = similarity;
          bestMatch = entry;
          bestKey = key;
        }
      } catch {
        continue; // skip corrupt entries
      }
    }
    if (bestMatch && bestKey) {
      // Bump the access count and refresh the TTL IN PLACE. (The previous
      // version wrote the entry under a brand-new UUID key on every hit,
      // duplicating entries and inflating stats().)
      bestMatch.access_count++;
      await this.redis.setex(bestKey, this.ttl_seconds, JSON.stringify(bestMatch));
    }
    return bestMatch;
  }

  /** Embed and store a new prompt/response pair under a random UUID key. */
  async store(
    prompt: string,
    response: string,
    ttl: number = this.ttl_seconds
  ): Promise<void> {
    const embedding = await this.embedPrompt(prompt);
    const entry: SemanticCacheEntry = {
      embedding,
      response,
      prompt,
      created_at: new Date(),
      access_count: 0
    };
    const key = `llm:semantic:${crypto.randomUUID()}`;
    await this.redis.setex(key, ttl, JSON.stringify(entry));
  }

  /** Entry count and mean access count (same O(N) KEYS caveat as find()). */
  async stats(): Promise<{ cached_embeddings: number; avg_access_count: number }> {
    const keys = await this.redis.keys('llm:semantic:*');
    const entries: SemanticCacheEntry[] = [];
    for (const key of keys) {
      const cached = await this.redis.get(key);
      if (cached) entries.push(JSON.parse(cached));
    }
    const avgAccess = entries.length > 0
      ? entries.reduce((sum, e) => sum + e.access_count, 0) / entries.length
      : 0;
    return {
      cached_embeddings: entries.length,
      avg_access_count: avgAccess
    };
  }
}
GPTCache Architecture
Build a production-grade caching layer inspired by GPTCache.
// Tunables for the two-tier cache; all fields have defaults in the constructor.
interface CacheConfig {
exact_match_ttl: number; // seconds for exact-match entries (default 86400)
semantic_ttl: number; // seconds for semantic entries (default 604800)
embedding_model: string; // embedding model identifier
similarity_threshold: number; // minimum cosine similarity for a semantic hit
}
/**
 * Two-tier cache inspired by GPTCache: exact-match first (fast hash lookup),
 * then semantic (embedding similarity), then generation as a last resort.
 */
class GPTCacheArchitecture {
  private exactCache: ExactMatchCache;
  private semanticCache: SemanticCache;
  private config: CacheConfig;
  private redis: Redis;

  constructor(redis: Redis, embedder: any, config: Partial<CacheConfig> = {}) {
    this.redis = redis;
    // Caller overrides are spread over sensible defaults.
    this.config = {
      exact_match_ttl: 86400,
      semantic_ttl: 604800,
      embedding_model: 'text-embedding-3-small',
      similarity_threshold: 0.85,
      ...config
    };
    this.exactCache = new ExactMatchCache(redis, this.config.exact_match_ttl);
    this.semanticCache = new SemanticCache(redis, embedder);
  }

  /**
   * Return a cached response when possible, otherwise call `generator`,
   * populate both tiers, and return the fresh result. `source` tells the
   * caller which tier answered.
   */
  async getOrGenerate(
    prompt: string,
    generator: () => Promise<string>
  ): Promise<{ response: string; source: 'exact' | 'semantic' | 'generated' }> {
    // Tier 1: exact match (fastest).
    const exactMatch = await this.exactCache.get(prompt);
    if (exactMatch) {
      this.exactCache.recordHit();
      return { response: exactMatch.response, source: 'exact' };
    }
    // Tier 2: semantic match (slower but broader).
    const semanticMatch = await this.semanticCache.find(prompt);
    if (semanticMatch) {
      // Count semantic answers as hits too — previously this path recorded
      // neither a hit nor a miss, deflating the measured hit rate.
      this.exactCache.recordHit();
      return { response: semanticMatch.response, source: 'semantic' };
    }
    // Both tiers missed: generate and cache for next time.
    this.exactCache.recordMiss();
    const response = await generator();
    // Token counts are unknown at this layer; 0/0 is a placeholder —
    // TODO: have `generator` report usage so savings can be tracked.
    await Promise.all([
      this.exactCache.set(prompt, response, { input: 0, output: 0 }),
      this.semanticCache.store(prompt, response)
    ]);
    return { response, source: 'generated' };
  }

  /** Log which common queries are still cold; generation is left to the caller. */
  async warmupWithQueries(queries: string[]): Promise<void> {
    for (const query of queries) {
      const existing = await this.exactCache.get(query);
      if (!existing) {
        console.log(`Cache miss for common query: ${query}`);
        // Would generate and cache here
      }
    }
  }
}
TTL Strategies
Different query types warrant different TTL values.
// One TTL rule: queries matching `pattern` are cached for `ttl_seconds`.
interface TTLStrategy {
query_type: string; // label used by classifyQuery (e.g. 'factual')
ttl_seconds: number; // cache lifetime; 0 means "do not cache"
pattern: RegExp; // matcher applied to the raw query text
}
/**
 * Maps a query to a cache TTL based on its type. Strategies are evaluated
 * in array order, so the most restrictive classes (personalized: never
 * cache; time-sensitive: 1 hour) are listed FIRST. Previously the broad
 * factual pattern came first, so "what is my account balance" was cached
 * for 7 days despite being personalized.
 */
class TTLStrategyManager {
  private strategies: TTLStrategy[] = [
    {
      query_type: 'personalized',
      ttl_seconds: 0, // never cache
      pattern: /(my|your user|my account)/i
    },
    {
      query_type: 'time_sensitive',
      ttl_seconds: 3600, // 1 hour
      pattern: /(today|tomorrow|current|latest|recent)/i
    },
    {
      query_type: 'factual',
      ttl_seconds: 604800, // 7 days
      pattern: /^(what|when|where|who) is/i
    },
    {
      query_type: 'opinion',
      ttl_seconds: 86400, // 1 day
      pattern: /^(why|how|what do you think)/i
    },
    {
      query_type: 'code_generation',
      ttl_seconds: 259200, // 3 days
      pattern: /(write|code|implement|generate).*(function|class|snippet)/i
    }
  ];

  /** First matching strategy wins; unmatched queries default to 1 day. */
  determineTTL(query: string): number {
    for (const strategy of this.strategies) {
      if (strategy.pattern.test(query)) {
        return strategy.ttl_seconds;
      }
    }
    return 86400; // default TTL: 1 day
  }

  /** Label for metrics/debugging; 'general' when nothing matches. */
  classifyQuery(query: string): string {
    const strategy = this.strategies.find(s => s.pattern.test(query));
    return strategy?.query_type || 'general';
  }
}
Cache Invalidation on Model Version Change
When you upgrade your LLM model, invalidate old cache entries.
/**
 * Tracks the deployed model version in Redis and invalidates cached
 * responses when an upgrade makes them stale.
 */
class CacheInvalidationManager {
  private redis: Redis;
  private currentModelVersion: string;
  private modelVersionKey = 'llm:model:version';

  constructor(redis: Redis, initialVersion: string = '1.0.0') {
    this.redis = redis;
    this.currentModelVersion = initialVersion;
  }

  /** Record the new version; flush the cache if the major version changed. */
  async upgradeModel(newVersion: string): Promise<void> {
    const oldVersion = await this.redis.get(this.modelVersionKey);
    await this.redis.set(this.modelVersionKey, newVersion);
    this.currentModelVersion = newVersion;
    if (oldVersion && oldVersion !== newVersion) {
      console.log(`Model upgraded from ${oldVersion} to ${newVersion}`);
      if (this.shouldInvalidateOnUpgrade(oldVersion, newVersion)) {
        await this.invalidateCache();
      }
    }
  }

  /**
   * Invalidate only on a major-version bump. Assumes "X.Y.Z"-style strings;
   * NOTE(review): model names like "gpt-4o" have no dot-separated major —
   * confirm callers always pass semver here.
   */
  private shouldInvalidateOnUpgrade(oldVersion: string, newVersion: string): boolean {
    const oldMajor = oldVersion.split('.')[0];
    const newMajor = newVersion.split('.')[0];
    return oldMajor !== newMajor;
  }

  /** Delete every llm:* cache entry EXCEPT the version marker itself. */
  private async invalidateCache(): Promise<void> {
    // Bug fix: the previous `llm:*` glob also deleted `llm:model:version`,
    // wiping the version that upgradeModel() had just written.
    const keysToDelete = (await this.redis.keys('llm:*'))
      .filter(key => key !== this.modelVersionKey);
    if (keysToDelete.length > 0) {
      await this.redis.del(...keysToDelete);
      console.log(`Invalidated ${keysToDelete.length} cache entries`);
    }
  }

  /** Delete all keys matching an arbitrary glob; returns the count removed. */
  async invalidatePattern(pattern: string): Promise<number> {
    const keys = await this.redis.keys(pattern);
    if (keys.length === 0) return 0;
    await this.redis.del(...keys);
    return keys.length;
  }

  /** Delete all keys under the llm:<tag>: namespace; returns the count removed. */
  async invalidateByTag(tag: string): Promise<number> {
    const keys = await this.redis.keys(`llm:${tag}:*`);
    if (keys.length === 0) return 0;
    await this.redis.del(...keys);
    return keys.length;
  }
}
Redis for Cache Storage
Implement cache with Redis for distributed systems.
/**
 * Redis configuration and memory-monitoring helpers for the LLM cache.
 */
class RedisCacheStore {
  private redis: Redis;
  private maxMemory = 1024 * 1024 * 1024; // 1GB default

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /** Cap memory and evict least-recently-used keys when the cap is hit. */
  async configureEviction(): Promise<void> {
    await this.redis.configSet('maxmemory', this.maxMemory.toString());
    await this.redis.configSet('maxmemory-policy', 'allkeys-lru');
  }

  /** Parse `INFO memory` into used bytes, the human-readable string, and our cap. */
  async getMemoryUsage(): Promise<{
    used_memory: number;
    used_memory_human: string;
    memory_limit: number;
  }> {
    const info = await this.redis.info('memory');
    // INFO lines look like "used_memory:123456" separated by CRLF.
    const lines = info.split('\r\n');
    const usedMemory = parseInt(
      lines.find(l => l.startsWith('used_memory:'))?.split(':')[1] || '0',
      10
    );
    const humanReadable =
      lines.find(l => l.startsWith('used_memory_human:'))?.split(':')[1] || '0B';
    return {
      used_memory: usedMemory,
      used_memory_human: humanReadable,
      memory_limit: this.maxMemory
    };
  }

  /**
   * Key counts per cache namespace. (Dropped the previous fetch of every
   * llm:* key into an unused variable — a wasted O(N) round trip.)
   * KEYS still blocks Redis; prefer SCAN for large keyspaces.
   */
  async monitorKeyspaceStats(): Promise<Record<string, number>> {
    const stats: Record<string, number> = {};
    for (const pattern of ['llm:exact:*', 'llm:semantic:*', 'llm:user:*']) {
      stats[pattern] = (await this.redis.keys(pattern)).length;
    }
    return stats;
  }
}
Cache Hit Rate Tracking
Monitor cache effectiveness to guide optimization.
// One day's worth of aggregated cache effectiveness numbers.
interface CacheMetrics {
hits: number; // cache hits recorded for the day
misses: number; // cache misses recorded for the day
hit_rate: number; // hits / (hits + misses) as a percentage (0-100)
avg_response_time_ms: number; // static ~50ms estimate for cached responses
cost_savings: number; // accumulated dollar savings from hits
}
/**
 * Daily cache metrics in Redis hashes keyed by UTC date
 * (`cache:metrics:YYYY-MM-DD`), retained for 30 days.
 */
class CacheMetricsCollector {
  private redis: Redis;

  constructor(redis: Redis) {
    this.redis = redis;
  }

  /** UTC day bucket for a given moment (toISOString is always UTC). */
  private dailyKey(date: Date = new Date()): string {
    return `cache:metrics:${date.toISOString().split('T')[0]}`;
  }

  /**
   * Record a hit plus the dollar cost it avoided.
   * `responseLengthTokens` is currently unused but kept for interface
   * compatibility — TODO: persist it to track token savings too.
   */
  async recordCacheHit(responseLengthTokens: number, savedCost: number): Promise<void> {
    const key = this.dailyKey();
    await this.redis.hincrby(key, 'hits', 1);
    await this.redis.hincrbyfloat(key, 'total_cost_saved', savedCost);
    await this.redis.expire(key, 86400 * 30); // keep 30 days of history
  }

  /** Record a cache miss in today's bucket. */
  async recordCacheMiss(): Promise<void> {
    await this.redis.hincrby(this.dailyKey(), 'misses', 1);
  }

  /** Aggregate hit/miss/savings for one UTC day (zeros when absent). */
  async getDailyMetrics(date: Date = new Date()): Promise<CacheMetrics> {
    const data = await this.redis.hgetall(this.dailyKey(date));
    const hits = parseInt(data.hits || '0', 10);
    const misses = parseInt(data.misses || '0', 10);
    const total = hits + misses;
    return {
      hits,
      misses,
      hit_rate: total > 0 ? (hits / total) * 100 : 0,
      avg_response_time_ms: 50, // cached responses are ~50ms (static estimate)
      cost_savings: parseFloat(data.total_cost_saved || '0')
    };
  }

  /**
   * Last 30 days of metrics, oldest first. Days are now fetched in
   * parallel instead of 30 sequential Redis round trips.
   */
  async getMonthlyTrend(): Promise<CacheMetrics[]> {
    const dates = Array.from({ length: 30 }, (_, i) => {
      const d = new Date();
      d.setDate(d.getDate() - i);
      return d;
    });
    const trend = await Promise.all(dates.map(d => this.getDailyMetrics(d)));
    return trend.reverse();
  }
}
When NOT to Cache
Identify queries that should never be cached.
/**
 * Decides whether a prompt is safe and worthwhile to cache. Blocks
 * personalized, time-sensitive, and credential-bearing prompts, plus
 * prompts too short for the cache overhead to pay off.
 */
class CacheEligibilityChecker {
  private uncacheablePatterns = [
    /\b(my|personalized|custom|unique)\b/i,                // personalized
    /\b(today|now|current|latest|recent)\b/i,              // time-sensitive
    /\b(secret|password|private|confidential|api_key)\b/i, // sensitive terms
    /[0-9]{16}/,                                           // credit-card-like digit runs
    /Bearer [A-Za-z0-9-._~+/]+=*/i                         // auth tokens
  ];

  /** True when the prompt is eligible for caching. */
  canCache(prompt: string, isPersonalizedUser: boolean = false): boolean {
    // Never cache for a personalized user session.
    if (isPersonalizedUser) return false;
    for (const pattern of this.uncacheablePatterns) {
      if (pattern.test(prompt)) return false;
    }
    // Cache overhead is not worth it for tiny prompts.
    if (prompt.length < 20) return false;
    return true;
  }

  /**
   * Human-readable reasons a prompt is uncacheable; empty when cacheable.
   * Now reports ALL matching patterns (the previous version stopped at the
   * first) and honors the personalized-user flag, matching canCache().
   */
  whyCantCache(prompt: string, isPersonalizedUser: boolean = false): string[] {
    const reasons: string[] = [];
    if (isPersonalizedUser) reasons.push('Personalized user session');
    if (prompt.length < 20) reasons.push('Prompt too short');
    for (const pattern of this.uncacheablePatterns) {
      if (pattern.test(prompt)) {
        reasons.push(`Matches uncacheable pattern: ${pattern}`);
      }
    }
    return reasons;
  }
}
Checklist
- Implement exact-match cache with Redis (SHA256 hash of prompt)
- Add semantic caching for 50%+ broader hit coverage
- Use TTL strategies based on query type (7 days factual, 1 hour time-sensitive)
- Invalidate cache on model version upgrades
- Monitor cache hit rate daily and aim for >40%
- Configure Redis with `maxmemory-policy: allkeys-lru` for automatic eviction
- Never cache personalized, time-sensitive, or auth-related queries
- Track cost savings from cache hits to justify infrastructure cost
- Embed prompts once and reuse embeddings for similarity search
- Set up alerts when hit rate drops below 20%
Conclusion
Semantic caching is the easiest way to cut LLM costs by 60% without changing your core logic. Pair exact-match caching for speed with semantic caching for coverage, and you've got a production-grade system that learns from every query.