- Published on
LLM Observability — Tracing Prompts, Tokens, Latency, and Cost in Production
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
You can't manage what you can't measure. LLM applications need observability: which prompts were used, how many tokens consumed, end-to-end latency, cost per feature, hallucination rates, and model degradation detection. This post covers production observability patterns with LangSmith, LangFuse, custom middleware, cost tracking, and alerting.
- LangSmith Integration for Tracing
- Custom Trace Middleware
- Token Usage Tracking Per Endpoint
- P95 Latency Tracking
- Cost Attribution Per User/Feature
- Quality Scoring and Hallucination Detection
- Alerting on LLM Degradation
- LLM Observability Checklist
- Conclusion
LangSmith Integration for Tracing
LangSmith records every LLM call, tool execution, and agent action for debugging and analysis.
import { Client } from 'langsmith';
// Shared LangSmith client used by every observer instance in this module.
// NOTE(review): apiKey is undefined when LANGSMITH_API_KEY is unset —
// confirm the deployment environment always provides it.
const client = new Client({
apiUrl: 'https://api.smith.langchain.com',
apiKey: process.env.LANGSMITH_API_KEY,
});
// Static configuration applied to every run an observer creates.
interface TraceConfig {
projectName: string; // LangSmith project the runs are grouped under
tags: string[]; // tags attached to each created run
metadata: Record<string, any>; // NOTE(review): not read by LangSmithObserver below — confirm intended use
}
/**
 * Records LLM calls and agent loops as LangSmith runs for debugging and
 * analysis. Reuses the module-level `client` singleton; the projectName and
 * tags from TraceConfig are attached to every run.
 */
class LangSmithObserver {
  private client: Client;
  private config: TraceConfig;

  constructor(config: TraceConfig) {
    // Reuse the shared client rather than constructing one per observer.
    this.client = client;
    this.config = config;
  }

  /**
   * Traces a single prompt/completion pair as an 'llm' run.
   *
   * @param userId - attributed user, stored in the run's extras
   * @param metadata - extra key/values merged into the run's extras
   * @returns the model response unchanged, so callers can chain on it
   * @throws rethrows any client error after marking the run as failed
   */
  async tracePromptCall(
    userId: string,
    prompt: string,
    model: string,
    response: string,
    metadata?: Record<string, any>
  ): Promise<string> {
    const runId = await this.client.createRun({
      name: 'llm_call',
      run_type: 'llm',
      project_name: this.config.projectName,
      inputs: { prompt, model },
      tags: this.config.tags,
      extra: { user_id: userId, ...metadata },
    });
    try {
      // Record the completed output and close the run.
      await this.client.updateRun(runId, {
        outputs: { response },
        end_time: new Date(),
      });
      return response;
    } catch (error) {
      // Mark the run failed before surfacing the error to the caller.
      await this.client.updateRun(runId, {
        error: error instanceof Error ? error.message : String(error),
        end_time: new Date(),
      });
      throw error;
    }
  }

  /**
   * Traces one agent loop as a 'chain' run, attaching every tool call as a
   * child 'tool' run.
   *
   * @returns the parent run id
   * @throws rethrows any client error after marking the parent run failed
   */
  async traceAgentLoop(
    agentId: string,
    goal: string,
    iterations: number,
    toolCalls: any[]
  ): Promise<string> {
    const runId = await this.client.createRun({
      name: 'agent_loop',
      run_type: 'chain',
      project_name: this.config.projectName,
      inputs: { goal, agent_id: agentId },
      tags: ['agent', ...this.config.tags],
      extra: {
        iterations,
        tool_count: toolCalls.length,
      },
    });
    try {
      // Child runs are independent of each other — create them in parallel
      // instead of serially awaiting each one (the original looped).
      await Promise.all(
        toolCalls.map(toolCall =>
          this.client.createRun({
            name: `tool_${toolCall.name}`,
            run_type: 'tool',
            project_name: this.config.projectName,
            parent_run_id: runId,
            inputs: toolCall.arguments,
            outputs: toolCall.result,
            extra: {
              status: toolCall.status,
              // NOTE(review): this measures time elapsed since the tool
              // started, not its actual execution duration — confirm the
              // semantics of `executedAt`.
              execution_time_ms: toolCall.executedAt
                ? Date.now() - toolCall.executedAt.getTime()
                : 0,
            },
          })
        )
      );
      await this.client.updateRun(runId, {
        outputs: { success: true },
        end_time: new Date(),
      });
      return runId;
    } catch (error) {
      await this.client.updateRun(runId, {
        error: error instanceof Error ? error.message : String(error),
        end_time: new Date(),
      });
      throw error;
    }
  }
}
Custom Trace Middleware
Capture observability data at the API layer without modifying business logic.
import { Request, Response, NextFunction } from 'express';
// Per-request accumulator built by ObservabilityMiddleware and flushed to
// the metrics collector when the HTTP response finishes.
interface RequestTrace {
requestId: string; // from the x-request-id header, or generated
userId: string; // 'anonymous' when no authenticated user is attached
endpoint: string; // "METHOD /path"
startTime: number; // epoch ms when the request entered the middleware
llmCallCount: number; // number of LLM calls recorded for this request
totalInputTokens: number;
totalOutputTokens: number;
totalCost: number; // USD, summed across all LLM calls
latencies: {
llmTime: number; // ms spent in LLM calls
databaseTime: number; // ms spent in DB — NOTE(review): never written by the visible middleware
totalTime: number; // ms, set on response 'finish'
};
models: Set<string>; // distinct model names used during the request
errors: string[]; // error descriptions collected during the request
}
/**
 * Express middleware that builds a per-request trace (token counts, cost,
 * latency, models used) and flushes it to a metrics collector when the
 * response finishes — without touching business logic.
 */
class ObservabilityMiddleware {
  private traces: Map<string, RequestTrace> = new Map();
  private metricsCollector: any;

  // The collector is injected; the original declared the field but never
  // assigned it, so recordMetrics crashed on `undefined.recordMetric`.
  constructor(metricsCollector?: any) {
    this.metricsCollector = metricsCollector;
  }

  /** Returns the Express middleware function. */
  middleware() {
    return (req: Request, res: Response, next: NextFunction) => {
      const requestId = (req.headers['x-request-id'] as string) || `req_${Date.now()}`;
      const userId = (req as any).user?.id || 'anonymous';
      const trace: RequestTrace = {
        requestId,
        userId,
        endpoint: `${req.method} ${req.path}`,
        startTime: Date.now(),
        llmCallCount: 0,
        totalInputTokens: 0,
        totalOutputTokens: 0,
        totalCost: 0,
        latencies: {
          llmTime: 0,
          databaseTime: 0,
          totalTime: 0,
        },
        models: new Set(),
        errors: [],
      };
      this.traces.set(requestId, trace);
      // Expose the recorder so route handlers can report each LLM call.
      // (The original built this wrapper, assigned it to an unused local,
      // and discarded it.)
      (req as any).recordLLMCall = this.wrapOpenAICall(requestId, trace);
      // Pass trace context to downstream handlers.
      (req as any).trace = trace;
      // On response completion, record metrics and release the trace so
      // the map does not grow without bound.
      res.on('finish', () => {
        trace.latencies.totalTime = Date.now() - trace.startTime;
        this.recordMetrics(trace);
        this.traces.delete(requestId);
      });
      next();
    };
  }

  /**
   * Builds a per-request recorder. Handlers call it after each LLM call
   * with the model, prompt, raw response, and the measured call duration.
   * `durationMs` is new and optional, so existing 3-argument callers keep
   * working (they simply record 0ms, matching the original's behavior —
   * it diffed Date.now() against itself and always added ~0).
   */
  private wrapOpenAICall(requestId: string, trace: RequestTrace) {
    return async (model: string, prompt: string, response: any, durationMs = 0) => {
      trace.models.add(model);
      // Rough token estimate: ~4 characters per token.
      const inputTokens = Math.ceil(prompt.length / 4);
      const outputTokens = Math.ceil(response.length / 4);
      trace.llmCallCount++;
      trace.totalInputTokens += inputTokens;
      trace.totalOutputTokens += outputTokens;
      trace.latencies.llmTime += durationMs;
      // Attribute cost using per-model per-token rates.
      const costPerInputToken = this.getCostPerToken(model, 'input');
      const costPerOutputToken = this.getCostPerToken(model, 'output');
      const callCost = inputTokens * costPerInputToken + outputTokens * costPerOutputToken;
      trace.totalCost += callCost;
    };
  }

  /** USD per token for known models; unknown models cost 0. */
  private getCostPerToken(model: string, type: 'input' | 'output'): number {
    const costs: Record<string, Record<'input' | 'output', number>> = {
      'gpt-4-turbo-preview': { input: 0.00001, output: 0.00003 },
      'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
      'claude-3-sonnet': { input: 0.000003, output: 0.000015 },
    };
    return costs[model]?.[type] || 0;
  }

  /** Flushes one finished trace to the metrics collector (no-op if unset). */
  private recordMetrics(trace: RequestTrace): void {
    const collector = this.metricsCollector;
    if (!collector) return;
    collector.recordMetric('llm.request.latency', trace.latencies.totalTime, {
      endpoint: trace.endpoint,
      user_id: trace.userId,
      model: Array.from(trace.models).join(','),
    });
    // Record input and output usage separately — the original summed both
    // but tagged the total as 'input_tokens'.
    collector.recordMetric('llm.request.token_usage', trace.totalInputTokens, {
      endpoint: trace.endpoint,
      type: 'input_tokens',
    });
    collector.recordMetric('llm.request.token_usage', trace.totalOutputTokens, {
      endpoint: trace.endpoint,
      type: 'output_tokens',
    });
    collector.recordMetric('llm.request.cost', trace.totalCost, {
      endpoint: trace.endpoint,
      user_id: trace.userId,
    });
    if (trace.errors.length > 0) {
      collector.recordMetric('llm.request.errors', trace.errors.length, {
        endpoint: trace.endpoint,
        errors: trace.errors.join(','),
      });
    }
  }
}
Token Usage Tracking Per Endpoint
Know which endpoints consume the most tokens.
// Aggregate token statistics for one endpoint, computed over the rolling
// window kept by TokenUsageTracker (most recent ~1000 requests).
interface TokenMetrics {
endpoint: string;
totalInputTokens: number; // sum over the window
totalOutputTokens: number; // sum over the window
requestCount: number; // samples in the window
avgInputTokensPerRequest: number;
avgOutputTokensPerRequest: number;
p95InputTokens: number; // 95th percentile of per-request input tokens
p95OutputTokens: number; // 95th percentile of per-request output tokens
}
/**
 * Tracks per-endpoint token consumption. Keeps a rolling window of the most
 * recent 1000 samples per endpoint and reports totals, averages, and p95s.
 */
class TokenUsageTracker {
  private endpoints: Map<string, { tokens: number[]; outputTokens: number[] }> = new Map();

  /** Records one request's input/output token counts for an endpoint. */
  recordTokenUsage(endpoint: string, inputTokens: number, outputTokens: number): void {
    const existing = this.endpoints.get(endpoint);
    const samples = existing ?? { tokens: [], outputTokens: [] };
    if (!existing) {
      this.endpoints.set(endpoint, samples);
    }
    samples.tokens.push(inputTokens);
    samples.outputTokens.push(outputTokens);
    // Rolling window: discard the oldest pair once we exceed 1000 samples.
    if (samples.tokens.length > 1000) {
      samples.tokens.shift();
      samples.outputTokens.shift();
    }
  }

  /** Aggregate metrics for one endpoint, or null if it was never recorded. */
  getMetricsForEndpoint(endpoint: string): TokenMetrics | null {
    const samples = this.endpoints.get(endpoint);
    if (!samples) return null;
    const sum = (xs: number[]) => xs.reduce((acc, x) => acc + x, 0);
    const ascending = (xs: number[]) => [...xs].sort((a, b) => a - b);
    const totalInput = sum(samples.tokens);
    const totalOutput = sum(samples.outputTokens);
    const sortedInput = ascending(samples.tokens);
    const sortedOutput = ascending(samples.outputTokens);
    // Index of the 95th-percentile sample in the sorted window.
    const p95Index = Math.floor(sortedInput.length * 0.95);
    const count = samples.tokens.length;
    return {
      endpoint,
      totalInputTokens: totalInput,
      totalOutputTokens: totalOutput,
      requestCount: count,
      avgInputTokensPerRequest: totalInput / count,
      avgOutputTokensPerRequest: totalOutput / samples.outputTokens.length,
      p95InputTokens: sortedInput[p95Index],
      p95OutputTokens: sortedOutput[p95Index],
    };
  }

  /** Metrics for every known endpoint, heaviest input consumers first. */
  getDailyReport(): TokenMetrics[] {
    const report: TokenMetrics[] = [];
    for (const endpoint of this.endpoints.keys()) {
      const metrics = this.getMetricsForEndpoint(endpoint);
      if (metrics !== null) {
        report.push(metrics);
      }
    }
    report.sort((a, b) => b.totalInputTokens - a.totalInputTokens);
    return report;
  }
}
P95 Latency Tracking
Monitor end-to-end latency distribution, not just averages.
/**
 * Tracks per-endpoint latency distributions (rolling window of 10000
 * samples) and flags any measurement above the window's current p95.
 */
class LatencyMonitor {
  private latencies: Map<string, number[]> = new Map();
  private alerts: any[] = [];
  // Cap so a consistently-spiky endpoint cannot grow the buffer forever
  // (the original pushed alerts without bound).
  private static readonly MAX_ALERTS = 1000;

  /** Records one latency sample; raises an alert if it exceeds the p95. */
  recordLatency(endpoint: string, latencyMs: number): void {
    let measurements = this.latencies.get(endpoint);
    if (!measurements) {
      measurements = [];
      this.latencies.set(endpoint, measurements);
    }
    measurements.push(latencyMs);
    // Rolling window of the most recent 10000 measurements.
    if (measurements.length > 10000) {
      measurements.shift();
    }
    // Anomaly check. The p95 is computed over a window that includes this
    // sample, so nothing fires until the window holds more than 20 points.
    const stats = this.calculateStats(measurements);
    if (latencyMs > stats.p95) {
      this.alerts.push({
        endpoint,
        latencyMs,
        p95: stats.p95,
        timestamp: new Date(),
        severity: latencyMs > stats.p99 ? 'critical' : 'warning',
      });
      if (this.alerts.length > LatencyMonitor.MAX_ALERTS) {
        this.alerts.shift();
      }
    }
  }

  /** Distribution summary; sorts a copy so insertion order is preserved. */
  private calculateStats(measurements: number[]) {
    const sorted = [...measurements].sort((a, b) => a - b);
    const len = sorted.length;
    return {
      min: sorted[0],
      max: sorted[len - 1],
      mean: measurements.reduce((a, b) => a + b, 0) / len,
      median: sorted[Math.floor(len / 2)],
      p50: sorted[Math.floor(len * 0.5)],
      p95: sorted[Math.floor(len * 0.95)],
      p99: sorted[Math.floor(len * 0.99)],
    };
  }

  /** Stats for one endpoint, or null when no samples were recorded. */
  getStats(endpoint: string) {
    const measurements = this.latencies.get(endpoint);
    if (!measurements || measurements.length === 0) return null;
    return this.calculateStats(measurements);
  }

  /** Stats for every endpoint that has at least one sample. */
  getDailyReport(): Array<{ endpoint: string; stats: any }> {
    const report: Array<{ endpoint: string; stats: any }> = [];
    for (const [endpoint, measurements] of this.latencies) {
      if (measurements.length > 0) {
        report.push({
          endpoint,
          stats: this.calculateStats(measurements),
        });
      }
    }
    return report;
  }
}
Cost Attribution Per User/Feature
Track which users and features cost the most to run.
// One LLM call's cost record, optionally attributed to a user and feature.
interface CostAttribution {
userId?: string; // absent for unattributed (e.g. system) calls
featureId?: string; // absent when no feature context is available
model: string; // model name used for rate lookup
inputTokens: number;
outputTokens: number;
cost: number; // USD, computed from per-model token rates
timestamp: Date; // when the cost was recorded
}
/**
 * Attributes LLM spend to users and features from per-call token counts.
 * Rates are USD per token; unknown models are costed at zero. Records are
 * kept in memory — NOTE(review): the list grows unbounded; confirm a
 * retention policy exists upstream.
 */
class CostAttributor {
  private attributions: CostAttribution[] = [];
  // USD per token, by model.
  private costPerToken = {
    'gpt-4-turbo-preview': { input: 0.00001, output: 0.00003 },
    'gpt-3.5-turbo': { input: 0.0000005, output: 0.0000015 },
    'claude-3-sonnet': { input: 0.000003, output: 0.000015 },
  };

  /** Records one call's cost, attributed to an optional user and feature. */
  recordCost(
    userId: string | undefined,
    featureId: string | undefined,
    model: string,
    inputTokens: number,
    outputTokens: number
  ): void {
    // Unknown models fall back to zero-cost rates rather than throwing.
    const rates = this.costPerToken[model as keyof typeof this.costPerToken] || { input: 0, output: 0 };
    const cost = inputTokens * rates.input + outputTokens * rates.output;
    this.attributions.push({
      userId,
      featureId,
      model,
      inputTokens,
      outputTokens,
      cost,
      timestamp: new Date(),
    });
  }

  /** Total spend for one user over the trailing window (default 30 days). */
  getCostByUser(userId: string, days: number = 30): number {
    const cutoff = this.windowStart(days);
    return this.attributions
      .filter(a => a.userId === userId && a.timestamp > cutoff)
      .reduce((sum, a) => sum + a.cost, 0);
  }

  /** Total spend for one feature over the trailing window. */
  getCostByFeature(featureId: string, days: number = 30): number {
    const cutoff = this.windowStart(days);
    return this.attributions
      .filter(a => a.featureId === featureId && a.timestamp > cutoff)
      .reduce((sum, a) => sum + a.cost, 0);
  }

  /** Highest-spending users, descending by cost. */
  getTopCostlyUsers(limit: number = 10, days: number = 30): Array<{ userId: string; cost: number }> {
    return this.topByKey(a => a.userId, days, limit).map(([userId, cost]) => ({ userId, cost }));
  }

  /** Highest-spending features, descending by cost. */
  getTopCostlyFeatures(limit: number = 10, days: number = 30): Array<{ featureId: string; cost: number }> {
    return this.topByKey(a => a.featureId, days, limit).map(([featureId, cost]) => ({ featureId, cost }));
  }

  // Start of the trailing window, `days` ago.
  private windowStart(days: number): Date {
    return new Date(Date.now() - days * 86400000);
  }

  // Shared aggregation: sums cost per key within the window, sorted
  // descending and truncated. The original duplicated this loop verbatim
  // in both getTopCostly* methods.
  private topByKey(
    key: (a: CostAttribution) => string | undefined,
    days: number,
    limit: number
  ): Array<[string, number]> {
    const cutoff = this.windowStart(days);
    const totals = new Map<string, number>();
    for (const attr of this.attributions) {
      const k = key(attr);
      if (k && attr.timestamp > cutoff) {
        totals.set(k, (totals.get(k) || 0) + attr.cost);
      }
    }
    return Array.from(totals)
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit);
  }
}
Quality Scoring and Hallucination Detection
Score LLM output quality automatically.
// Heuristic quality scores for a single LLM response, produced by
// QualityScoringEngine. All scores are in [0, 1].
interface QualityScore {
responseId: string; // generated per scoring run
hallucination_score: number; // 0-1, lower is better
relevance_score: number; // 0-1, higher is better
citation_accuracy: number; // 0-1, higher is better
overall_quality: number; // 0-1, higher is better
timestamp: Date;
}
/**
 * Heuristic quality scoring for LLM responses: hallucination rate (sentences
 * without citations), relevance to the query (word overlap), and citation
 * validity. These are cheap lexical approximations, not model-based checks.
 */
class QualityScoringEngine {
  /**
   * Scores one response against its query and retrieved sources.
   * overall = 0.3*(1 - hallucination) + 0.4*relevance + 0.3*citations.
   */
  async scoreResponse(
    query: string,
    response: string,
    sources: string[]
  ): Promise<QualityScore> {
    const responseId = `resp_${Date.now()}`;
    // 1. Claims not backed by citations.
    const hallucScore = await this.detectHallucinations(response, sources);
    // 2. Relevance of the response to the query.
    const relevanceScore = await this.scoreRelevance(query, response);
    // 3. Citation markers pointing at real sources.
    const citationScore = await this.validateCitations(response, sources);
    // 4. Weighted overall score.
    const overallQuality = 0.3 * (1 - hallucScore) + 0.4 * relevanceScore + 0.3 * citationScore;
    return {
      responseId,
      hallucination_score: hallucScore,
      relevance_score: relevanceScore,
      citation_accuracy: citationScore,
      overall_quality: overallQuality,
      timestamp: new Date(),
    };
  }

  /**
   * Fraction of sentences lacking a [citation: N] marker: 0 means every
   * sentence is cited, 1 means none are. Purely lexical — `sources` is
   * currently unused by this heuristic.
   */
  private async detectHallucinations(response: string, sources: string[]): Promise<number> {
    const citedClaims = (response.match(/\[citation:\s*\d+\]/g) || []).length;
    // Sentence count approximated by terminal punctuation.
    const totalClaims = (response.match(/[.!?]/g) || []).length;
    if (totalClaims === 0) return 0;
    const hallucScore = Math.max(0, (totalClaims - citedClaims) / totalClaims);
    return Math.min(1, hallucScore);
  }

  /**
   * Jaccard word overlap between query and response (0-1) as a stand-in
   * for embedding similarity.
   */
  private async scoreRelevance(query: string, response: string): Promise<number> {
    const queryWords = new Set(query.toLowerCase().split(/\s+/));
    const responseWords = new Set(response.toLowerCase().split(/\s+/));
    const intersection = new Set([...queryWords].filter(w => responseWords.has(w)));
    const union = new Set([...queryWords, ...responseWords]);
    // Defensive: avoid 0/0 → NaN if both word sets are somehow empty.
    if (union.size === 0) return 0;
    return intersection.size / union.size;
  }

  /**
   * Fraction of [citation: N] markers whose index points at a real source.
   * Returns 1.0 when there are no citations at all (nothing to invalidate).
   */
  private async validateCitations(response: string, sources: string[]): Promise<number> {
    const citationRegex = /\[citation:\s*(\d+)\]/g;
    let match;
    let validCitations = 0;
    let totalCitations = 0;
    while ((match = citationRegex.exec(response)) !== null) {
      totalCitations++;
      const sourceIndex = parseInt(match[1], 10); // explicit radix
      // NOTE(review): indices are treated as 0-based here — confirm the
      // prompt actually emits 0-based citation numbers.
      if (sourceIndex < sources.length) {
        validCitations++;
      }
    }
    if (totalCitations === 0) return 1.0;
    return validCitations / totalCitations;
  }
}
Alerting on LLM Degradation
Trigger alerts when model quality or latency degrades.
// One detected degradation event, produced by DegradationAlertSystem.
interface DegradationAlert {
type: 'latency' | 'error_rate' | 'quality' | 'cost_anomaly';
severity: 'warning' | 'critical';
message: string; // human-readable summary including current vs baseline
timestamp: Date;
metrics: Record<string, any>; // { current, baseline } values for the metric
}
/**
 * Compares live metrics against fixed baselines and raises alerts when
 * latency, error rate, quality, or cost degrade past thresholds.
 */
class DegradationAlertSystem {
  // Expected healthy values; the thresholds below are relative to these.
  private baselineMetrics = {
    p95Latency: 2000, // ms
    errorRate: 0.01, // 1%
    qualityScore: 0.85,
    costPerRequest: 0.05, // $
  };
  private alerts: DegradationAlert[] = [];
  private notificationService: any;

  // Notification service is injected. When absent, alerts are still
  // recorded but not dispatched — the original declared the field without
  // ever assigning it, so send() crashed on undefined.
  constructor(notificationService?: any) {
    this.notificationService = notificationService;
  }

  /**
   * Evaluates current metrics, recording and dispatching any alerts found.
   * Thresholds: latency +20% (critical at +50%), error rate +50%,
   * quality -10%, cost +30%.
   *
   * @returns the alerts detected in this check (possibly empty)
   */
  checkDegradation(
    currentMetrics: {
      p95Latency: number;
      errorRate: number;
      qualityScore: number;
      costPerRequest: number;
    }
  ): DegradationAlert[] {
    const detectedAlerts: DegradationAlert[] = [];
    // Latency degradation: >20% slower than baseline.
    if (currentMetrics.p95Latency > this.baselineMetrics.p95Latency * 1.2) {
      detectedAlerts.push({
        type: 'latency',
        severity: currentMetrics.p95Latency > this.baselineMetrics.p95Latency * 1.5 ? 'critical' : 'warning',
        message: `P95 latency degraded: ${currentMetrics.p95Latency}ms (baseline: ${this.baselineMetrics.p95Latency}ms)`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.p95Latency, baseline: this.baselineMetrics.p95Latency },
      });
    }
    // Error rate spike: >50% increase.
    if (currentMetrics.errorRate > this.baselineMetrics.errorRate * 1.5) {
      detectedAlerts.push({
        type: 'error_rate',
        severity: 'critical',
        message: `Error rate spike: ${(currentMetrics.errorRate * 100).toFixed(2)}% (baseline: ${(this.baselineMetrics.errorRate * 100).toFixed(2)}%)`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.errorRate, baseline: this.baselineMetrics.errorRate },
      });
    }
    // Quality degradation: >10% drop.
    if (currentMetrics.qualityScore < this.baselineMetrics.qualityScore * 0.9) {
      detectedAlerts.push({
        type: 'quality',
        severity: 'critical',
        message: `Quality score degraded: ${currentMetrics.qualityScore.toFixed(2)} (baseline: ${this.baselineMetrics.qualityScore})`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.qualityScore, baseline: this.baselineMetrics.qualityScore },
      });
    }
    // Cost anomaly: >30% above baseline.
    if (currentMetrics.costPerRequest > this.baselineMetrics.costPerRequest * 1.3) {
      detectedAlerts.push({
        type: 'cost_anomaly',
        severity: 'warning',
        message: `Cost per request spike: $${currentMetrics.costPerRequest.toFixed(4)} (baseline: $${this.baselineMetrics.costPerRequest.toFixed(4)})`,
        timestamp: new Date(),
        metrics: { current: currentMetrics.costPerRequest, baseline: this.baselineMetrics.costPerRequest },
      });
    }
    // Dispatch: critical alerts page on-call; warnings go to the shared channel.
    for (const alert of detectedAlerts) {
      this.notificationService?.send({
        type: 'alert',
        severity: alert.severity,
        message: alert.message,
        channel: alert.severity === 'critical' ? 'slack-oncall' : 'slack-general',
      });
    }
    this.alerts.push(...detectedAlerts);
    return detectedAlerts;
  }

  /** Alerts raised within the last `hours` (default 1). */
  getRecentAlerts(hours: number = 1): DegradationAlert[] {
    const cutoff = new Date(Date.now() - hours * 3600000);
    return this.alerts.filter(a => a.timestamp > cutoff);
  }
}
LLM Observability Checklist
- Integrate LangSmith or LangFuse for trace logging
- Implement custom middleware to capture observability data at the API layer
- Track token usage per endpoint and user
- Monitor p95/p99 latency distribution
- Implement cost attribution per user and feature
- Score response quality (hallucinations, relevance, citations)
- Set up baselines for latency, quality, and cost
- Alert on >20% latency degradation
- Alert on error rate spikes
- Create dashboards for daily cost and token usage
Conclusion
LLM observability is essential for production systems. Trace every call with LangSmith, track tokens and cost per endpoint, monitor latency percentiles, score quality automatically, and alert on degradation. Without visibility, you're flying blind.