Published on

LLM Observability — Tracing Prompts, Tokens, Latency, and Cost in Production

Authors

Introduction

You can't manage what you can't measure. LLM applications need observability: which prompts were used, how many tokens consumed, end-to-end latency, cost per feature, hallucination rates, and model degradation detection. This post covers production observability patterns with LangSmith, LangFuse, custom middleware, cost tracking, and alerting.

LangSmith Integration for Tracing

LangSmith records every LLM call, tool execution, and agent action for debugging and analysis.

import { Client } from 'langsmith';

// Shared LangSmith client used by every observer instance below.
// NOTE(review): assumes LANGSMITH_API_KEY is set in the environment — confirm.
const client = new Client({
  apiUrl: 'https://api.smith.langchain.com',
  apiKey: process.env.LANGSMITH_API_KEY,
});

// Per-observer settings applied to every run that gets created.
interface TraceConfig {
  projectName: string; // LangSmith project the runs are grouped under
  tags: string[]; // tags attached to every run
  metadata: Record<string, any>; // free-form metadata for runs
}

/**
 * Thin wrapper around the LangSmith client that records LLM calls and
 * agent loops as runs scoped to a single project.
 */
class LangSmithObserver {
  private client: Client;
  private config: TraceConfig;

  constructor(config: TraceConfig) {
    // Reuses the module-level client; per-instance config only scopes
    // the project name and tags applied to each run.
    this.client = client;
    this.config = config;
  }

  /**
   * Records a single prompt/response pair as an 'llm' run.
   *
   * @param userId   caller identity, stored in the run's extra metadata
   * @param prompt   prompt text sent to the model
   * @param model    model identifier
   * @param response model output to attach to the run
   * @param metadata optional key/value pairs merged into the run's extras
   * @returns the response, unchanged, so the call can be inlined
   *
   * NOTE(review): assumes `createRun` resolves to a run id accepted by
   * `updateRun` — confirm against the installed langsmith SDK version.
   */
  async tracePromptCall(
    userId: string,
    prompt: string,
    model: string,
    response: string,
    metadata?: Record<string, any>
  ): Promise<string> {
    const runId = await this.client.createRun({
      name: 'llm_call',
      run_type: 'llm',
      project_name: this.config.projectName,
      inputs: {
        prompt,
        model,
      },
      tags: this.config.tags,
      extra: {
        user_id: userId,
        ...metadata,
      },
    });

    try {
      // Attach the model output and close out the run.
      await this.client.updateRun(runId, {
        outputs: { response },
        end_time: new Date(),
      });

      return response;
    } catch (error) {
      // Record the failure on the run before propagating it to the caller.
      await this.client.updateRun(runId, {
        error: error instanceof Error ? error.message : String(error),
        end_time: new Date(),
      });
      throw error;
    }
  }

  /**
   * Records an agent loop as a 'chain' run with one child 'tool' run per
   * tool call.
   *
   * @param agentId   identifier of the agent, stored in the run inputs
   * @param goal      the agent's goal, stored in the run inputs
   * @param iterations number of loop iterations, stored in the run extras
   * @param toolCalls completed tool calls to log as child runs
   * @returns the parent run id
   */
  async traceAgentLoop(
    agentId: string,
    goal: string,
    iterations: number,
    toolCalls: any[]
  ): Promise<string> {
    const runId = await this.client.createRun({
      name: 'agent_loop',
      run_type: 'chain',
      project_name: this.config.projectName,
      inputs: { goal, agent_id: agentId },
      tags: ['agent', ...this.config.tags],
      extra: {
        iterations,
        tool_count: toolCalls.length,
      },
    });

    try {
      // Child runs are independent of one another, so log them in parallel
      // instead of awaiting each createRun sequentially.
      await Promise.all(
        toolCalls.map(toolCall =>
          this.client.createRun({
            name: `tool_${toolCall.name}`,
            run_type: 'tool',
            project_name: this.config.projectName,
            parent_run_id: runId,
            inputs: toolCall.arguments,
            outputs: toolCall.result,
            extra: {
              status: toolCall.status,
              // NOTE(review): this is "time elapsed since the tool started"
              // measured at logging time, not the tool's actual duration —
              // confirm against how `executedAt` is populated.
              execution_time_ms: toolCall.executedAt ? Date.now() - toolCall.executedAt.getTime() : 0,
            },
          })
        )
      );

      await this.client.updateRun(runId, {
        outputs: { success: true },
        end_time: new Date(),
      });

      return runId;
    } catch (error) {
      // Close the parent run with the error before rethrowing.
      await this.client.updateRun(runId, {
        error: error instanceof Error ? error.message : String(error),
        end_time: new Date(),
      });
      throw error;
    }
  }
}

Custom Trace Middleware

Capture observability data at the API layer without modifying business logic.

import { Request, Response, NextFunction } from 'express';

// Accumulated observability data for one HTTP request, built up by the
// middleware below and flushed to metrics when the response finishes.
interface RequestTrace {
  requestId: string; // from the x-request-id header, or generated
  userId: string; // authenticated user id, or 'anonymous'
  endpoint: string; // "METHOD /path"
  startTime: number; // epoch ms when the request entered the middleware
  llmCallCount: number; // number of LLM calls made while serving the request
  totalInputTokens: number; // summed across all LLM calls
  totalOutputTokens: number; // summed across all LLM calls
  totalCost: number; // estimated USD cost across all LLM calls
  latencies: {
    llmTime: number; // ms spent in LLM calls
    databaseTime: number; // ms spent in DB calls
    totalTime: number; // end-to-end ms, set when the response finishes
  };
  models: Set<string>; // distinct model names used during the request
  errors: string[]; // error messages collected during the request
}

/**
 * Express middleware that opens a per-request trace, lets downstream
 * handlers report their LLM calls into it, and emits metrics when the
 * response finishes.
 */
class ObservabilityMiddleware {
  // In-flight traces keyed by request id; entries are removed on finish.
  private traces: Map<string, RequestTrace> = new Map();
  // Destination for emitted metrics. Optional so the middleware can run
  // without one (the original left this forever undefined and crashed
  // in recordMetrics).
  private metricsCollector: any;

  constructor(metricsCollector?: any) {
    this.metricsCollector = metricsCollector;
  }

  /** Returns the Express middleware function. */
  middleware() {
    return (req: Request, res: Response, next: NextFunction) => {
      const requestId = req.headers['x-request-id'] as string || `req_${Date.now()}`;
      const userId = (req.user as any)?.id || 'anonymous';

      const trace: RequestTrace = {
        requestId,
        userId,
        endpoint: `${req.method} ${req.path}`,
        startTime: Date.now(),
        llmCallCount: 0,
        totalInputTokens: 0,
        totalOutputTokens: 0,
        totalCost: 0,
        latencies: {
          llmTime: 0,
          databaseTime: 0,
          totalTime: 0,
        },
        models: new Set(),
        errors: [],
      };

      this.traces.set(requestId, trace);

      // Expose the tracker so downstream handlers can report LLM calls.
      // (The original assigned it to an unused local, so nothing was
      // ever tracked.)
      (req as any).trackLLMCall = this.wrapOpenAICall(requestId, trace);

      // Pass trace context to downstream handlers.
      (req as any).trace = trace;

      // On response, finalize the total latency and flush metrics.
      res.on('finish', () => {
        trace.latencies.totalTime = Date.now() - trace.startTime;
        this.recordMetrics(trace);
        this.traces.delete(requestId);
      });

      next();
    };
  }

  /**
   * Builds a callback that folds one completed LLM call into the trace.
   *
   * @param durationMs wall-clock duration of the call, measured by the
   *   caller. Bug fix: the original timed `Date.now() - startTime` around
   *   purely synchronous bookkeeping, so llmTime was always ~0ms — this
   *   function only sees the finished response and cannot time it itself.
   */
  private wrapOpenAICall(requestId: string, trace: RequestTrace) {
    return async (model: string, prompt: string, response: any, durationMs: number = 0) => {
      trace.models.add(model);

      // Rough token estimate: ~4 characters per token.
      const inputTokens = Math.ceil(prompt.length / 4);
      const outputTokens = Math.ceil(response.length / 4);

      trace.llmCallCount++;
      trace.totalInputTokens += inputTokens;
      trace.totalOutputTokens += outputTokens;
      trace.latencies.llmTime += durationMs;

      // Estimate cost from per-token rates for the model used.
      const costPerInputToken = this.getCostPerToken(model, 'input');
      const costPerOutputToken = this.getCostPerToken(model, 'output');
      const callCost = inputTokens * costPerInputToken + outputTokens * costPerOutputToken;

      trace.totalCost += callCost;
    };
  }

  /** USD rate per token for the model; unknown models cost 0. */
  private getCostPerToken(model: string, type: 'input' | 'output'): number {
    const costs: Record<string, Record<'input' | 'output', number>> = {
      'gpt-4-turbo-preview': { input: 0.00001, output: 0.00003 },
      'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
      'claude-3-sonnet': { input: 0.000003, output: 0.000015 },
    };
    return costs[model]?.[type] || 0;
  }

  /** Emits latency, token, cost, and error metrics for a finished trace. */
  private recordMetrics(trace: RequestTrace): void {
    // No collector configured — nothing to emit.
    if (!this.metricsCollector) return;

    this.metricsCollector.recordMetric('llm.request.latency', trace.latencies.totalTime, {
      endpoint: trace.endpoint,
      user_id: trace.userId,
      model: Array.from(trace.models).join(','),
    });

    // Bug fix: the original emitted input+output combined under the label
    // 'input_tokens'. Emit the two directions separately instead.
    this.metricsCollector.recordMetric('llm.request.token_usage', trace.totalInputTokens, {
      endpoint: trace.endpoint,
      type: 'input_tokens',
    });
    this.metricsCollector.recordMetric('llm.request.token_usage', trace.totalOutputTokens, {
      endpoint: trace.endpoint,
      type: 'output_tokens',
    });

    this.metricsCollector.recordMetric('llm.request.cost', trace.totalCost, {
      endpoint: trace.endpoint,
      user_id: trace.userId,
    });

    if (trace.errors.length > 0) {
      this.metricsCollector.recordMetric('llm.request.errors', trace.errors.length, {
        endpoint: trace.endpoint,
        errors: trace.errors.join(','),
      });
    }
  }
}

Token Usage Tracking Per Endpoint

Know which endpoints consume the most tokens.

// Aggregated token statistics for one endpoint.
interface TokenMetrics {
  endpoint: string;
  totalInputTokens: number;
  totalOutputTokens: number;
  requestCount: number;
  avgInputTokensPerRequest: number;
  avgOutputTokensPerRequest: number;
  p95InputTokens: number;
  p95OutputTokens: number;
}

/**
 * Tracks per-endpoint token consumption over a rolling window of the
 * most recent 1000 requests per endpoint.
 */
class TokenUsageTracker {
  // Rolling samples per endpoint: parallel arrays of input/output counts.
  private endpoints: Map<string, { tokens: number[]; outputTokens: number[] }> = new Map();

  /** Append one request's token counts, evicting the oldest sample past 1000. */
  recordTokenUsage(endpoint: string, inputTokens: number, outputTokens: number): void {
    const existing = this.endpoints.get(endpoint);
    const samples = existing ?? { tokens: [], outputTokens: [] };
    if (!existing) {
      this.endpoints.set(endpoint, samples);
    }

    samples.tokens.push(inputTokens);
    samples.outputTokens.push(outputTokens);

    // Bound memory: retain only the newest 1000 samples per endpoint.
    while (samples.tokens.length > 1000) {
      samples.tokens.shift();
      samples.outputTokens.shift();
    }
  }

  /** Totals, averages, and p95 for one endpoint, or null if never seen. */
  getMetricsForEndpoint(endpoint: string): TokenMetrics | null {
    const samples = this.endpoints.get(endpoint);
    if (!samples) return null;

    const sum = (xs: number[]) => xs.reduce((acc, x) => acc + x, 0);
    const ascending = (xs: number[]) => [...xs].sort((a, b) => a - b);

    const totalInput = sum(samples.tokens);
    const totalOutput = sum(samples.outputTokens);
    const count = samples.tokens.length;
    const p95Index = Math.floor(count * 0.95);

    return {
      endpoint,
      totalInputTokens: totalInput,
      totalOutputTokens: totalOutput,
      requestCount: count,
      avgInputTokensPerRequest: totalInput / count,
      avgOutputTokensPerRequest: totalOutput / samples.outputTokens.length,
      p95InputTokens: ascending(samples.tokens)[p95Index],
      p95OutputTokens: ascending(samples.outputTokens)[p95Index],
    };
  }

  /** All endpoints' metrics, most input-token-hungry first. */
  getDailyReport(): TokenMetrics[] {
    const rows: TokenMetrics[] = [];
    for (const endpoint of this.endpoints.keys()) {
      const row = this.getMetricsForEndpoint(endpoint);
      if (row !== null) rows.push(row);
    }
    return rows.sort((a, b) => b.totalInputTokens - a.totalInputTokens);
  }
}

P95 Latency Tracking

Monitor end-to-end latency distribution, not just averages.

/**
 * Tracks per-endpoint latency distributions over a rolling window of the
 * most recent 10000 samples and flags samples above the current p95.
 */
class LatencyMonitor {
  // Rolling latency samples (ms) per endpoint.
  private latencies: Map<string, number[]> = new Map();
  // Anomaly records appended whenever a sample lands above the current p95.
  private alerts: any[] = [];

  /** Record one latency sample and flag it if it exceeds the current p95. */
  recordLatency(endpoint: string, latencyMs: number): void {
    const samples = this.latencies.get(endpoint) ?? [];
    if (!this.latencies.has(endpoint)) {
      this.latencies.set(endpoint, samples);
    }

    samples.push(latencyMs);

    // Bound memory: keep only the newest 10000 samples.
    while (samples.length > 10000) {
      samples.shift();
    }

    // Anomaly check against the distribution including this sample.
    const stats = this.calculateStats(samples);
    if (latencyMs > stats.p95) {
      this.alerts.push({
        endpoint,
        latencyMs,
        p95: stats.p95,
        timestamp: new Date(),
        severity: latencyMs > stats.p99 ? 'critical' : 'warning',
      });
    }
  }

  /** Distribution summary (min/max/mean/median/p50/p95/p99) of the samples. */
  private calculateStats(measurements: number[]) {
    const sorted = [...measurements].sort((a, b) => a - b);
    const n = sorted.length;
    const pct = (q: number) => sorted[Math.floor(n * q)];
    const total = measurements.reduce((acc, x) => acc + x, 0);

    return {
      min: sorted[0],
      max: sorted[n - 1],
      mean: total / n,
      median: sorted[Math.floor(n / 2)],
      p50: pct(0.5),
      p95: pct(0.95),
      p99: pct(0.99),
    };
  }

  /** Stats for one endpoint, or null when no samples exist. */
  getStats(endpoint: string) {
    const samples = this.latencies.get(endpoint);
    return samples && samples.length > 0 ? this.calculateStats(samples) : null;
  }

  /** Stats for every endpoint that has at least one sample. */
  getDailyReport(): Array<{ endpoint: string; stats: any }> {
    return [...this.latencies.entries()]
      .filter(([, samples]) => samples.length > 0)
      .map(([endpoint, samples]) => ({ endpoint, stats: this.calculateStats(samples) }));
  }
}

Cost Attribution Per User/Feature

Track which users and features cost the most to run.

// One priced LLM call, attributed to an optional user and/or feature.
interface CostAttribution {
  userId?: string;
  featureId?: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  cost: number;
  timestamp: Date;
}

/**
 * Attributes LLM spend to users and features from an append-only log of
 * priced calls, with windowed queries and leaderboards.
 */
class CostAttributor {
  // Append-only log of every priced call.
  private attributions: CostAttribution[] = [];
  // Per-token USD rates by model; unknown models are priced at 0.
  private costPerToken = {
    'gpt-4-turbo-preview': { input: 0.00001, output: 0.00003 },
    'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
    'claude-3-sonnet': { input: 0.000003, output: 0.000015 },
  };

  /** Price one call at the model's per-token rates and append it to the log. */
  recordCost(
    userId: string | undefined,
    featureId: string | undefined,
    model: string,
    inputTokens: number,
    outputTokens: number
  ): void {
    const rates = this.costPerToken[model as keyof typeof this.costPerToken] ?? { input: 0, output: 0 };
    this.attributions.push({
      userId,
      featureId,
      model,
      inputTokens,
      outputTokens,
      cost: rates.input * inputTokens + rates.output * outputTokens,
      timestamp: new Date(),
    });
  }

  /** Total spend for one user over the trailing window (default 30 days). */
  getCostByUser(userId: string, days: number = 30): number {
    return this.sumWhere(a => a.userId === userId, days);
  }

  /** Total spend for one feature over the trailing window (default 30 days). */
  getCostByFeature(featureId: string, days: number = 30): number {
    return this.sumWhere(a => a.featureId === featureId, days);
  }

  // Sums cost over attributions that match the predicate within the window.
  private sumWhere(pred: (a: CostAttribution) => boolean, days: number): number {
    const cutoff = new Date(Date.now() - days * 86400000);
    let total = 0;
    for (const a of this.attributions) {
      if (pred(a) && a.timestamp > cutoff) {
        total += a.cost;
      }
    }
    return total;
  }

  /** Highest-spend users in the window, descending by cost. */
  getTopCostlyUsers(limit: number = 10, days: number = 30): Array<{ userId: string; cost: number }> {
    return this.rank(a => a.userId, days, limit).map(([userId, cost]) => ({ userId, cost }));
  }

  /** Highest-spend features in the window, descending by cost. */
  getTopCostlyFeatures(limit: number = 10, days: number = 30): Array<{ featureId: string; cost: number }> {
    return this.rank(a => a.featureId, days, limit).map(([featureId, cost]) => ({ featureId, cost }));
  }

  // Groups windowed spend by the given key and returns the top `limit` pairs.
  private rank(
    key: (a: CostAttribution) => string | undefined,
    days: number,
    limit: number
  ): Array<[string, number]> {
    const cutoff = new Date(Date.now() - days * 86400000);
    const totals = new Map<string, number>();

    for (const a of this.attributions) {
      const k = key(a);
      if (k && a.timestamp > cutoff) {
        totals.set(k, (totals.get(k) ?? 0) + a.cost);
      }
    }

    return [...totals.entries()].sort((x, y) => y[1] - x[1]).slice(0, limit);
  }
}

Quality Scoring and Hallucination Detection

Score LLM output quality automatically.

// Per-response quality assessment produced by QualityScoringEngine.
interface QualityScore {
  responseId: string;
  hallucination_score: number; // 0-1, lower is better
  relevance_score: number; // 0-1, higher is better
  citation_accuracy: number; // 0-1, higher is better
  overall_quality: number; // 0-1, higher is better
  timestamp: Date;
}

/**
 * Scores an LLM response on hallucination, relevance, and citation
 * accuracy using cheap lexical heuristics, then combines them into a
 * weighted overall score.
 */
class QualityScoringEngine {
  /**
   * Scores one query/response pair against its retrieved sources.
   *
   * @param query    the user's question
   * @param response the model's answer (may contain [citation: N] markers)
   * @param sources  retrieved source passages the citations refer to
   * @returns component scores plus a weighted overall score
   */
  async scoreResponse(
    query: string,
    response: string,
    sources: string[]
  ): Promise<QualityScore> {
    // Random suffix avoids id collisions when two responses are scored
    // within the same millisecond (the original used Date.now() alone).
    const responseId = `resp_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;

    // 1. Check for hallucinations (claims not backed by citations)
    const hallucScore = await this.detectHallucinations(response, sources);

    // 2. Check relevance to the query
    const relevanceScore = await this.scoreRelevance(query, response);

    // 3. Check citation accuracy against the provided sources
    const citationScore = await this.validateCitations(response, sources);

    // 4. Weighted combination: relevance 40%, hallucination 30%, citations 30%
    const overallQuality = 0.3 * (1 - hallucScore) + 0.4 * relevanceScore + 0.3 * citationScore;

    return {
      responseId,
      hallucination_score: hallucScore,
      relevance_score: relevanceScore,
      citation_accuracy: citationScore,
      overall_quality: overallQuality,
      timestamp: new Date(),
    };
  }

  /**
   * Heuristic hallucination score: the fraction of sentences (counted by
   * terminal punctuation) that lack a [citation: N] marker.
   * Returns 0 (fully cited) .. 1 (no citations at all).
   * NOTE(review): `sources` is currently unused by this heuristic.
   */
  private async detectHallucinations(response: string, sources: string[]): Promise<number> {
    const citedClaims = (response.match(/\[citation:\s*\d+\]/g) || []).length;
    const totalClaims = (response.match(/[.!?]/g) || []).length;

    // No sentences detected — nothing to judge.
    if (totalClaims === 0) return 0;

    const hallucScore = Math.max(0, (totalClaims - citedClaims) / totalClaims);
    return Math.min(1, hallucScore);
  }

  /**
   * Lexical Jaccard similarity between query and response tokens as a
   * cheap relevance proxy. (A production system would use embeddings.)
   */
  private async scoreRelevance(query: string, response: string): Promise<number> {
    // Bug fix: the original tokenized '' to [''], so two empty strings
    // scored a perfect 1.0. Drop empty tokens and score 0 on no content.
    const tokenize = (text: string) =>
      new Set(text.toLowerCase().split(/\s+/).filter(Boolean));

    const queryWords = tokenize(query);
    const responseWords = tokenize(response);

    const union = new Set([...queryWords, ...responseWords]);
    if (union.size === 0) return 0;

    const overlap = [...queryWords].filter(w => responseWords.has(w)).length;
    return overlap / union.size;
  }

  /**
   * Fraction of [citation: N] markers whose N indexes an existing source.
   * NOTE(review): markers are treated as 0-based indexes into `sources`;
   * confirm this matches the prompt template that produces them.
   */
  private async validateCitations(response: string, sources: string[]): Promise<number> {
    const citationRegex = /\[citation:\s*(\d+)\]/g;
    let match: RegExpExecArray | null;
    let validCitations = 0;
    let totalCitations = 0;

    while ((match = citationRegex.exec(response)) !== null) {
      totalCitations++;
      if (parseInt(match[1], 10) < sources.length) {
        validCitations++;
      }
    }

    if (totalCitations === 0) return 1.0; // No citations = nothing invalid
    return validCitations / totalCitations;
  }
}

Alerting on LLM Degradation

Trigger alerts when model quality or latency degrades.

// One detected degradation event, with the metrics that triggered it.
interface DegradationAlert {
  type: 'latency' | 'error_rate' | 'quality' | 'cost_anomaly';
  severity: 'warning' | 'critical';
  message: string;
  timestamp: Date;
  metrics: Record<string, any>;
}

/**
 * Compares current metrics against fixed baselines and raises alerts on
 * latency, error-rate, quality, and cost degradation.
 */
class DegradationAlertSystem {
  // Reference values; deviations beyond per-metric thresholds raise alerts.
  private baselineMetrics = {
    p95Latency: 2000, // ms
    errorRate: 0.01, // 1%
    qualityScore: 0.85,
    costPerRequest: 0.05, // $
  };

  // Every alert ever raised; see getRecentAlerts for windowed access.
  private alerts: DegradationAlert[] = [];
  // Optional notification sink with a send() method. Bug fix: the original
  // declared this but never initialized it, so any detected alert crashed
  // on `.send` of undefined. It is now injectable and optional — alerts
  // are recorded even when no sink is configured.
  private notificationService: any;

  constructor(notificationService?: any) {
    this.notificationService = notificationService;
  }

  /**
   * Checks current metrics against the baselines, records any detected
   * alerts, forwards them to the notification sink (if configured), and
   * returns them.
   */
  checkDegradation(
    currentMetrics: {
      p95Latency: number;
      errorRate: number;
      qualityScore: number;
      costPerRequest: number;
    }
  ): DegradationAlert[] {
    const detectedAlerts: DegradationAlert[] = [];

    // Latency degradation: >20% slower than baseline (critical at >50%)
    if (currentMetrics.p95Latency > this.baselineMetrics.p95Latency * 1.2) {
      detectedAlerts.push({
        type: 'latency',
        severity: currentMetrics.p95Latency > this.baselineMetrics.p95Latency * 1.5 ? 'critical' : 'warning',
        message: `P95 latency degraded: ${currentMetrics.p95Latency}ms (baseline: ${this.baselineMetrics.p95Latency}ms)`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.p95Latency, baseline: this.baselineMetrics.p95Latency },
      });
    }

    // Error rate spike: >50% increase over baseline
    if (currentMetrics.errorRate > this.baselineMetrics.errorRate * 1.5) {
      detectedAlerts.push({
        type: 'error_rate',
        severity: 'critical',
        message: `Error rate spike: ${(currentMetrics.errorRate * 100).toFixed(2)}% (baseline: ${(this.baselineMetrics.errorRate * 100).toFixed(2)}%)`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.errorRate, baseline: this.baselineMetrics.errorRate },
      });
    }

    // Quality degradation: >10% drop below baseline
    if (currentMetrics.qualityScore < this.baselineMetrics.qualityScore * 0.9) {
      detectedAlerts.push({
        type: 'quality',
        severity: 'critical',
        message: `Quality score degraded: ${currentMetrics.qualityScore.toFixed(2)} (baseline: ${this.baselineMetrics.qualityScore})`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.qualityScore, baseline: this.baselineMetrics.qualityScore },
      });
    }

    // Cost anomaly: >30% above baseline
    if (currentMetrics.costPerRequest > this.baselineMetrics.costPerRequest * 1.3) {
      detectedAlerts.push({
        type: 'cost_anomaly',
        severity: 'warning',
        message: `Cost per request spike: $${currentMetrics.costPerRequest.toFixed(4)} (baseline: $${this.baselineMetrics.costPerRequest.toFixed(4)})`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.costPerRequest, baseline: this.baselineMetrics.costPerRequest },
      });
    }

    // Route critical alerts to on-call; everything else to the general channel.
    for (const alert of detectedAlerts) {
      this.notificationService?.send({
        type: 'alert',
        severity: alert.severity,
        message: alert.message,
        channel: alert.severity === 'critical' ? 'slack-oncall' : 'slack-general',
      });
    }

    this.alerts.push(...detectedAlerts);
    return detectedAlerts;
  }

  /** Alerts raised within the trailing window (default: last hour). */
  getRecentAlerts(hours: number = 1): DegradationAlert[] {
    const cutoff = new Date(Date.now() - hours * 3600000);
    return this.alerts.filter(a => a.timestamp > cutoff);
  }
}

LLM Observability Checklist

  • Integrate LangSmith or LangFuse for trace logging
  • Implement custom middleware to capture observability
  • Track token usage per endpoint and user
  • Monitor p95/p99 latency distribution
  • Implement cost attribution per user and feature
  • Score response quality (hallucinations, relevance, citations)
  • Set up baselines for latency, quality, and cost
  • Alert on >20% latency degradation
  • Alert on error rate spikes
  • Create dashboards for daily cost and token usage

Conclusion

LLM observability is essential for production systems. Trace every call with LangSmith, track tokens and cost per endpoint, monitor latency percentiles, score quality automatically, and alert on degradation. Without visibility, you're flying blind.