Published on

Monitoring RAG in Production — What to Track When Your Chatbot Goes Live

Authors

Introduction

Your RAG system works great in tests. Then it ships to production and users report hallucinations, slow responses, and irrelevant answers. You have no data to diagnose why.

Production monitoring is not optional. This post covers what to track, how to detect drift, and how to act on those signals.

Query Analytics

Track popular queries, search patterns, and failures:

// One logged search query. Emitted per user query so analytics can be
// aggregated over sliding time windows.
interface QueryEvent {
  timestamp: number; // Unix epoch ms (compared against Date.now())
  queryId: string; // Unique id correlating this query across monitors
  userId: string;
  query: string; // Raw query text as entered by the user
  queryLength: number; // Length of the query — assumes characters or tokens; TODO confirm with producer
  category?: string; // Auto-classified
  noResults: boolean; // Retrieved <1 relevant chunk
  userSatisfaction?: number; // 1-5 rating if provided
}

// Aggregated view over the QueryEvents that fall inside one time window.
interface QueryAnalytics {
  totalQueries: number; // All queries in the window
  uniqueQueries: number; // Distinct queries (grouped case-insensitively)
  avgQueryLength: number;
  noResultRate: number; // % of queries with no results
  topQueries: Array<{ query: string; count: number }>; // Most frequent queries (top 10)
  failedQueries: Array<{ query: string; reason: string }>; // Queries with no relevant results (up to 20)
  satisfactionDistribution: Record<number, number>; // 1-5 counts
}

/**
 * Collects QueryEvents in memory and aggregates them into QueryAnalytics
 * over a sliding time window.
 *
 * NOTE(review): events accumulate without bound; long-running processes
 * should periodically prune or rotate `queries`.
 */
class QueryAnalyticsCollector {
  private queries: QueryEvent[] = [];

  /** Record a single query event. */
  recordQuery(query: QueryEvent): void {
    this.queries.push(query);
  }

  /**
   * Aggregate all events recorded within the last `timeWindowMs` milliseconds.
   * @param timeWindowMs Window size in ms (default: 1 hour).
   * @returns Zeroed analytics when the window contains no events.
   */
  computeAnalytics(timeWindowMs: number = 60 * 60 * 1000): QueryAnalytics {
    const now = Date.now();
    const recentQueries = this.queries.filter(q => now - q.timestamp < timeWindowMs);

    if (recentQueries.length === 0) {
      return {
        totalQueries: 0,
        uniqueQueries: 0,
        avgQueryLength: 0,
        noResultRate: 0,
        topQueries: [],
        failedQueries: [],
        satisfactionDistribution: {},
      };
    }

    // Group events by normalized (lowercased) query text.
    const queryGroups = new Map<string, QueryEvent[]>();
    for (const q of recentQueries) {
      const key = q.query.toLowerCase();
      const group = queryGroups.get(key);
      if (group) {
        group.push(q); // push in place instead of concat — avoids O(n^2) array copying
      } else {
        queryGroups.set(key, [q]);
      }
    }

    // Top 10 most frequent queries.
    const topQueries = Array.from(queryGroups.entries())
      .sort((a, b) => b[1].length - a[1].length)
      .slice(0, 10)
      .map(([query, events]) => ({ query, count: events.length }));

    // Up to 20 queries that failed to retrieve relevant results at least once.
    const failedQueries = Array.from(queryGroups.entries())
      .filter(([, events]) => events.some(e => e.noResults))
      .map(([query]) => ({ query, reason: 'no_relevant_results' }))
      .slice(0, 20);

    // Satisfaction histogram. Only count valid integer ratings 1-5, so an
    // out-of-range value can't create a NaN bucket (undefined++ === NaN).
    const satisfactionDistribution: Record<number, number> = { 1: 0, 2: 0, 3: 0, 4: 0, 5: 0 };
    for (const q of recentQueries) {
      const rating = q.userSatisfaction;
      if (rating !== undefined && Number.isInteger(rating) && rating >= 1 && rating <= 5) {
        satisfactionDistribution[rating]++;
      }
    }

    return {
      totalQueries: recentQueries.length,
      uniqueQueries: queryGroups.size,
      avgQueryLength: recentQueries.reduce((sum, q) => sum + q.queryLength, 0) / recentQueries.length,
      noResultRate: recentQueries.filter(q => q.noResults).length / recentQueries.length,
      topQueries,
      failedQueries,
      satisfactionDistribution,
    };
  }
}

Retrieval Quality Metrics

Monitor what your retriever is finding:

// Outcome of one retrieval operation: which chunks came back and how they
// scored. `timestamp` is Unix epoch ms.
interface RetrievalEvent {
  timestamp: number;
  queryId: string; // Correlates with QueryEvent.queryId
  query: string;
  retrievedChunkIds: string[]; // IDs of returned chunks
  relevanceScores: number[]; // One score per retrieved chunk — presumably parallel to retrievedChunkIds; confirm with producer
  avgRelevanceScore: number;
  maxRelevanceScore: number;
  minRelevanceScore: number;
  scoreDistribution: { low: number; medium: number; high: number }; // Counts
}

// Aggregated retrieval quality and latency statistics for one time window.
interface RetrievalMetrics {
  avgRelevanceScore: number; // Mean over all retrieved-chunk scores in the window
  medianScore: number;
  p95Score: number; // 95th percentile
  lowQualityRetrievalRate: number; // Fraction of chunk scores below the 0.4 "low" threshold
  scoreDistribution: Record<string, number>; // Counts bucketed into low / medium / high
  latencyMs: {
    p50: number;
    p95: number;
    p99: number;
  };
}

class RetrievalMonitor {
  private events: RetrievalEvent[] = [];
  private latencies: number[] = [];
  private scoreThresholds = {
    low: 0.4,
    medium: 0.7,
    high: 1.0,
  };

  recordRetrieval(event: RetrievalEvent, latencyMs: number): void {
    this.events.push(event);
    this.latencies.push(latencyMs);
  }

  computeMetrics(timeWindowMs: number = 60 * 60 * 1000): RetrievalMetrics {
    const now = Date.now();
    const recentEvents = this.events.filter(e => now - e.timestamp < timeWindowMs);
    const recentLatencies = this.latencies.slice(-Math.floor((recentEvents.length / this.events.length) * this.latencies.length));

    if (recentEvents.length === 0) {
      return {
        avgRelevanceScore: 0,
        medianScore: 0,
        p95Score: 0,
        lowQualityRetrievalRate: 0,
        scoreDistribution: {},
        latencyMs: { p50: 0, p95: 0, p99: 0 },
      };
    }

    // Collect all scores
    const allScores: number[] = [];
    let lowQualityCount = 0;

    for (const event of recentEvents) {
      allScores.push(...event.relevanceScores);

      for (const score of event.relevanceScores) {
        if (score &lt; this.scoreThresholds.low) {
          lowQualityCount++;
        }
      }
    }

    // Sort for percentile calculations
    allScores.sort((a, b) => a - b);

    // Calculate percentiles
    const p50 = allScores[Math.floor(allScores.length * 0.5)];
    const p95 = allScores[Math.floor(allScores.length * 0.95)];
    const p99 = allScores[Math.floor(allScores.length * 0.99)];

    // Score distribution
    const distribution: Record<string, number> = { low: 0, medium: 0, high: 0 };
    for (const score of allScores) {
      if (score &lt; this.scoreThresholds.low) distribution.low++;
      else if (score &lt; this.scoreThresholds.medium) distribution.medium++;
      else distribution.high++;
    }

    // Calculate latency percentiles
    recentLatencies.sort((a, b) => a - b);
    const latencyP50 = recentLatencies[Math.floor(recentLatencies.length * 0.5)];
    const latencyP95 = recentLatencies[Math.floor(recentLatencies.length * 0.95)];
    const latencyP99 = recentLatencies[Math.floor(recentLatencies.length * 0.99)];

    return {
      avgRelevanceScore: allScores.reduce((a, b) => a + b, 0) / allScores.length,
      medianScore: p50,
      p95Score: p95,
      lowQualityRetrievalRate: lowQualityCount / allScores.length,
      scoreDistribution: distribution,
      latencyMs: {
        p50: latencyP50,
        p95: latencyP95,
        p99: latencyP99,
      },
    };
  }
}

Generation Quality Sampling

Sample generated responses for quality assessment:

// One generated answer plus optional judge-evaluation results. The judge
// fields are only present on the sampled subset of responses.
interface GenerationEvent {
  timestamp: number; // Unix epoch ms
  queryId: string;
  answer: string;
  answerLength: number;
  generationLatencyMs: number;
  tokensGenerated: number;
  stopReason: 'max_tokens' | 'eos' | 'stop_sequence'; // Why generation terminated
  temperature: number; // Sampling temperature used for this response
  faithfulness?: number; // From LLM-as-judge sampling
  hallucinations?: string[]; // Detected hallucinations
}

/** Aggregated generation-quality metrics over one time window. */
interface GenerationMetrics {
  sampledResponses: number; // How many responses were actually evaluated
  avgFaithfulness: number; // Mean judge faithfulness over evaluated responses
  // NOTE(review): "halluccination" is a typo, but the name is kept required
  // for backward compatibility with existing consumers.
  halluccinationRate: number;
  /** Correctly spelled alias for `halluccinationRate`; optional for compatibility. */
  hallucinationRate?: number;
  avgLatencyMs: number;
  avgTokensPerResponse: number;
  stopReasonDistribution: Record<string, number>; // Count per stop reason
  commonHallucinations: Array<{ text: string; frequency: number }>; // Top 10 by frequency
}

class GenerationMonitor {
  private events: GenerationEvent[] = [];
  private sampleRate: number = 0.05; // Evaluate 5% to control costs

  shouldSample(): boolean {
    return Math.random() &lt; this.sampleRate;
  }

  recordGeneration(
    event: GenerationEvent,
    shouldEvaluate: boolean = true
  ): void {
    this.events.push(event);
  }

  computeMetrics(timeWindowMs: number = 60 * 60 * 1000): GenerationMetrics {
    const now = Date.now();
    const recentEvents = this.events.filter(e => now - e.timestamp &lt; timeWindowMs);
    const evaluatedEvents = recentEvents.filter(e => e.faithfulness !== undefined);

    if (recentEvents.length === 0) {
      return {
        sampledResponses: 0,
        avgFaithfulness: 0,
        halluccinationRate: 0,
        avgLatencyMs: 0,
        avgTokensPerResponse: 0,
        stopReasonDistribution: {},
        commonHallucinations: [],
      };
    }

    // Stop reason distribution
    const stopReasons: Record<string, number> = {};
    for (const event of recentEvents) {
      stopReasons[event.stopReason] = (stopReasons[event.stopReason] || 0) + 1;
    }

    // Hallucination aggregation
    const hallucinations = new Map&lt;string, number&gt;();
    for (const event of evaluatedEvents) {
      if (event.hallucinations) {
        event.hallucinations.forEach(h => {
          hallucinations.set(h, (hallucinations.get(h) || 0) + 1);
        });
      }
    }

    const commonHallucinations = Array.from(hallucinations.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10)
      .map(([text, frequency]) => ({ text, frequency }));

    return {
      sampledResponses: evaluatedEvents.length,
      avgFaithfulness:
        evaluatedEvents.reduce((sum, e) => sum + (e.faithfulness || 0), 0) /
        Math.max(1, evaluatedEvents.length),
      halluccinationRate: evaluatedEvents.filter(e => (e.hallucinations || []).length &gt; 0).length /
        Math.max(1, evaluatedEvents.length),
      avgLatencyMs:
        recentEvents.reduce((sum, e) => sum + e.generationLatencyMs, 0) /
        recentEvents.length,
      avgTokensPerResponse:
        recentEvents.reduce((sum, e) => sum + e.tokensGenerated, 0) /
        recentEvents.length,
      stopReasonDistribution: stopReasons,
      commonHallucinations,
    };
  }
}

Latency Breakdown

Understand where time is spent:

// Per-component latency breakdown for one query through the RAG pipeline.
// All values are milliseconds; `total` is recorded directly, not derived.
interface PipelineLatencies {
  queryEmbedding: number; // ms
  vectorSearch: number; // ms
  reranking: number; // ms
  generation: number; // ms
  total: number; // ms — end-to-end; presumably >= sum of components, confirm with producer
}

interface LatencyMetrics {
  components: Record&lt;string, { p50: number; p95: number; p99: number }&gt;;
  bottleneck: string; // Which component takes most time
  totalLatencyTarget: number; // SLA in ms
  slaViolationRate: number; // % of queries exceeding target
}

class LatencyMonitor {
  private latencies: PipelineLatencies[] = [];
  private latencyTarget: number = 2000; // 2 second SLA

  recordLatencies(latencies: PipelineLatencies): void {
    this.latencies.push(latencies);
  }

  computeMetrics(): LatencyMetrics {
    if (this.latencies.length === 0) {
      return {
        components: {},
        bottleneck: 'unknown',
        totalLatencyTarget: this.latencyTarget,
        slaViolationRate: 0,
      };
    }

    // Compute percentiles for each component
    const componentLatencies: Record&lt;string, number[]&gt; = {
      queryEmbedding: [],
      vectorSearch: [],
      reranking: [],
      generation: [],
      total: [],
    };

    for (const latency of this.latencies) {
      componentLatencies.queryEmbedding.push(latency.queryEmbedding);
      componentLatencies.vectorSearch.push(latency.vectorSearch);
      componentLatencies.reranking.push(latency.reranking);
      componentLatencies.generation.push(latency.generation);
      componentLatencies.total.push(latency.total);
    }

    // Compute percentiles
    const percentiles: Record&lt;string, { p50: number; p95: number; p99: number }&gt; = {};
    for (const [component, values] of Object.entries(componentLatencies)) {
      values.sort((a, b) => a - b);
      percentiles[component] = {
        p50: values[Math.floor(values.length * 0.5)],
        p95: values[Math.floor(values.length * 0.95)],
        p99: values[Math.floor(values.length * 0.99)],
      };
    }

    // Find bottleneck (highest p95)
    let bottleneck = 'total';
    let maxP95 = percentiles.total.p95;

    for (const [component, pct] of Object.entries(percentiles)) {
      if (component !== 'total' && pct.p95 &gt; maxP95) {
        bottleneck = component;
        maxP95 = pct.p95;
      }
    }

    // SLA violation rate
    const totalLatencies = componentLatencies.total;
    const violations = totalLatencies.filter(l => l &gt; this.latencyTarget).length;

    return {
      components: percentiles,
      bottleneck,
      totalLatencyTarget: this.latencyTarget,
      slaViolationRate: violations / totalLatencies.length,
    };
  }
}

User Feedback Collection

Capture feedback signals:

// One explicit feedback submission tied to a query.
interface UserFeedback {
  timestamp: number; // Unix epoch ms
  queryId: string; // Correlates with QueryEvent.queryId
  userId: string;
  rating: number; // 1-5 stars
  feedback: string; // Optional comment — presumably empty string when absent; confirm with producer
  feedbackType: 'thumbs_up' | 'thumbs_down' | 'detailed_rating';
  issues?: string[]; // e.g., ["irrelevant", "incomplete", "hallucination"]
}

class UserFeedbackCollector {
  private feedback: UserFeedback[] = [];

  recordFeedback(feedback: UserFeedback): void {
    this.feedback.push(feedback);
  }

  computeMetrics(timeWindowMs: number = 24 * 60 * 60 * 1000): {
    averageRating: number;
    feedbackRate: number; // % of queries with feedback
    issueDistribution: Record&lt;string, number&gt;;
    negativeComments: string[];
  } {
    const now = Date.now();
    const recentFeedback = this.feedback.filter(f => now - f.timestamp &lt; timeWindowMs);

    if (recentFeedback.length === 0) {
      return {
        averageRating: 0,
        feedbackRate: 0,
        issueDistribution: {},
        negativeComments: [],
      };
    }

    // Average rating
    const avgRating = recentFeedback
      .filter(f => f.rating)
      .reduce((sum, f) => sum + f.rating, 0) / Math.max(1, recentFeedback.filter(f => f.rating).length);

    // Issues distribution
    const issues: Record&lt;string, number&gt; = {};
    for (const f of recentFeedback) {
      if (f.issues) {
        f.issues.forEach(issue => {
          issues[issue] = (issues[issue] || 0) + 1;
        });
      }
    }

    // Negative comments
    const negativeComments = recentFeedback
      .filter(f => f.rating &lt;= 2 && f.feedback)
      .map(f => f.feedback)
      .slice(0, 20);

    return {
      averageRating: avgRating,
      feedbackRate: recentFeedback.length / 1000, // Estimate based on sample
      issueDistribution: issues,
      negativeComments,
    };
  }
}

Drift Detection

Detect quality degradation:

// One periodic snapshot of the key quality metrics; accumulated snapshots
// form the history that drift detection compares against.
interface QualityBaseline {
  timestamp: number;
  retrievalScore: number; // e.g. avg relevance from RetrievalMonitor — confirm with producer
  faithfulness: number;
  userRating: number;
  latencyMs: number;
}

// Emitted when a metric's latest value deviates significantly from its
// historical baseline.
interface DriftSignal {
  metric: string; // History key: 'retrieval' | 'faithfulness' | 'rating' | 'latency'
  baseline: number; // Historical mean
  current: number; // Most recent observation
  change: number; // percentage change
  zscore: number; // Standard deviations from mean
  isSignificant: boolean; // > 2 sigma
}

class DriftDetector {
  private baselines: QualityBaseline[] = [];
  private history: Record&lt;string, number[]&gt; = {};
  private zscoreThreshold: number = 2.0; // 2 sigma

  recordMetrics(metrics: QualityBaseline): void {
    this.baselines.push(metrics);

    // Update running statistics
    if (!this.history['retrieval']) {
      this.history['retrieval'] = [];
      this.history['faithfulness'] = [];
      this.history['rating'] = [];
      this.history['latency'] = [];
    }

    this.history['retrieval'].push(metrics.retrievalScore);
    this.history['faithfulness'].push(metrics.faithfulness);
    this.history['rating'].push(metrics.userRating);
    this.history['latency'].push(metrics.latencyMs);
  }

  detectDrift(): DriftSignal[] {
    const signals: DriftSignal[] = [];

    for (const [metric, values] of Object.entries(this.history)) {
      if (values.length &lt; 10) continue; // Need minimum history

      // Calculate mean and stdev
      const mean = values.reduce((a, b) => a + b, 0) / values.length;
      const stdev = Math.sqrt(
        values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length
      );

      // Recent value (last recorded)
      const recent = values[values.length - 1];
      const zscore = Math.abs((recent - mean) / (stdev || 1));

      const isSignificant = zscore &gt; this.zscoreThreshold;

      if (isSignificant) {
        signals.push({
          metric,
          baseline: mean,
          current: recent,
          change: ((recent - mean) / mean) * 100,
          zscore,
          isSignificant,
        });
      }
    }

    return signals;
  }
}

Cost Per Query

Track LLM expenses:

// Per-query token usage and spend.
interface CostEvent {
  timestamp: number; // Unix epoch ms
  queryId: string;
  embeddingTokens: number;
  completionTokens: number;
  rerankingCalls: number;
  modelUsed: 'gpt-4' | 'gpt-3.5' | 'claude' | string; // literal hints only; any model id is allowed
  costUsd: number; // Total recorded cost of this query in USD
}

interface CostMetrics {
  totalCost: number;
  avgCostPerQuery: number;
  costByModel: Record&lt;string, number&gt;;
  projectedMonthlyCost: number;
  costDistribution: {
    embedding: number;
    generation: number;
    reranking: number;
  };
}

class CostMonitor {
  private events: CostEvent[] = [];
  private costs = {
    embeddingPerMKtoken: 0.02,
    completionPerKtoken: 0.06,
    promptPerKtoken: 0.03,
    rerankPerCall: 0.001,
  };

  recordCost(event: CostEvent): void {
    this.events.push(event);
  }

  computeMetrics(timeWindowMs: number = 24 * 60 * 60 * 1000): CostMetrics {
    const now = Date.now();
    const recentEvents = this.events.filter(e => now - e.timestamp &lt; timeWindowMs);

    if (recentEvents.length === 0) {
      return {
        totalCost: 0,
        avgCostPerQuery: 0,
        costByModel: {},
        projectedMonthlyCost: 0,
        costDistribution: { embedding: 0, generation: 0, reranking: 0 },
      };
    }

    let totalCost = 0;
    const costByModel: Record&lt;string, number&gt; = {};

    for (const event of recentEvents) {
      totalCost += event.costUsd;
      costByModel[event.modelUsed] = (costByModel[event.modelUsed] || 0) + event.costUsd;
    }

    // Cost breakdown
    const embeddingCost = recentEvents.reduce(
      (sum, e) => sum + (e.embeddingTokens * this.costs.embeddingPerMKtoken) / 1000,
      0
    );

    const generationCost = recentEvents.reduce(
      (sum, e) => sum + (e.completionTokens * this.costs.completionPerKtoken) / 1000,
      0
    );

    const rerankingCost = recentEvents.reduce(
      (sum, e) => sum + e.rerankingCalls * this.costs.rerankPerCall,
      0
    );

    // Project to monthly
    const hoursInWindow = timeWindowMs / (1000 * 60 * 60);
    const projectedMonthly = (totalCost / hoursInWindow) * 24 * 30;

    return {
      totalCost,
      avgCostPerQuery: totalCost / recentEvents.length,
      costByModel,
      projectedMonthlyCost: projectedMonthly,
      costDistribution: {
        embedding: embeddingCost,
        generation: generationCost,
        reranking: rerankingCost,
      },
    };
  }
}

Alerting Rules

Define thresholds and alerts:

/**
 * Declarative alert rule: fire `action` when `metric` crosses `threshold`
 * (per `comparison`) within the trailing window of `windowMs`.
 * (Repairs HTML-entity garbling: the comparison literals were '&gt;'/'&lt;'.)
 */
interface AlertRule {
  name: string;
  metric: string; // Key into the relevant *Metrics object
  threshold: number;
  comparison: '>' | '<' | '=='; // How the current value is compared against threshold
  windowMs: number; // Evaluation window in ms
  severity: 'warning' | 'critical';
  // NOTE(review): `any` kept for compatibility with existing actions; the
  // context shape depends on the alert evaluator (e.g. { current: number }).
  action: (context: any) => void;
}

/**
 * Default production alert rules. `action` receives an evaluator-supplied
 * context (expected to include at least the current metric value).
 * (Repairs HTML-entity garbling: comparison values were '&gt;'/'&lt;'.)
 */
const defaultAlerts: AlertRule[] = [
  {
    name: 'High No-Result Rate',
    metric: 'noResultRate',
    threshold: 0.1, // >10% of queries return no results
    comparison: '>',
    windowMs: 60 * 60 * 1000, // 1 hour
    severity: 'critical',
    action: (_context) => {
      console.error('🚨 High no-result rate detected. Check retrieval quality or index.');
    },
  },
  {
    name: 'Low Faithfulness',
    metric: 'avgFaithfulness',
    threshold: 0.75,
    comparison: '<',
    windowMs: 60 * 60 * 1000, // 1 hour
    severity: 'critical',
    action: (_context) => {
      console.error('🚨 Faithfulness dropped. Model may be hallucinating.');
    },
  },
  {
    name: 'High Latency',
    metric: 'latencyP95',
    threshold: 3000, // >3s p95
    comparison: '>',
    windowMs: 5 * 60 * 1000, // 5 minutes
    severity: 'warning',
    action: (_context) => {
      console.warn('⚠️ p95 latency is high. Check reranker or LLM performance.');
    },
  },
  {
    name: 'Cost Anomaly',
    metric: 'avgCostPerQuery',
    threshold: 0.05, // $0.05 per query
    comparison: '>',
    windowMs: 24 * 60 * 60 * 1000, // 24 hours
    severity: 'warning',
    action: (context) => {
      console.warn(`⚠️ Cost per query is ${context.current}. Review token usage.`);
    },
  },
];

Checklist

  • Track top queries and no-result queries
  • Monitor retrieval relevance scores (p50, p95)
  • Sample generation for faithfulness (5-10%)
  • Break down latency by component (embed, search, rerank, generate)
  • Collect user feedback (thumbs up/down)
  • Implement drift detection on key metrics
  • Calculate cost per query and project monthly spend
  • Set up alerts for quality regressions
  • Monitor embedding distribution for drift
  • Create dashboard with key metrics visible

Conclusion

Production monitoring transforms reactive firefighting into proactive quality management. You can't optimize what you don't measure. Start with basic metrics (query volume, retrieval score, latency), add user feedback, then layer in drift detection. The signal-to-noise ratio improves with each addition. Within weeks, you'll have data-driven insights about where to focus next: Is retrieval the bottleneck? Is the LLM hallucinating? Are you burning too much money? The data answers everything.