- Published on
LLM Observability in Production — Tracing, Evaluating, and Debugging AI Features
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
LLM outputs are probabilistic. Without observability, you cannot detect quality regressions, identify failure patterns, or optimize performance. This guide covers production observability strategies used by scaled AI companies.
- LangSmith and Langfuse Instrumentation
- Span Instrumentation for Chains
- LLM-as-Judge Evaluation
- Cosine Similarity and Exact Match Metrics
- Golden Dataset Maintenance
- Latency Percentiles for AI Endpoints
- Token Usage Dashboards
- Regression Detection and Model Migration Testing
- Checklist
- Conclusion
LangSmith and Langfuse Instrumentation
Set up centralized tracing for all LLM calls with detailed span information.
import { LangSmith } from 'langsmith';
import { Langfuse } from 'langfuse';
/**
 * LLM client that records every completion as a Langfuse trace with a nested
 * span carrying token usage and latency metadata.
 *
 * NOTE(review): the LangSmith client is constructed but never used by any
 * method below — either wire it into traceCompletion or drop it. Kept as-is
 * so the constructor signature stays compatible.
 */
class ObservableLLMClient {
  private langsmith: LangSmith;
  private langfuse: Langfuse;

  /**
   * @param langsmithKey      LangSmith API key.
   * @param langfuseKey       Langfuse secret key.
   * @param langfusePublicKey Langfuse public key.
   */
  constructor(
    langsmithKey: string,
    langfuseKey: string,
    langfusePublicKey: string
  ) {
    this.langsmith = new LangSmith({ apiKey: langsmithKey });
    this.langfuse = new Langfuse({ secretKey: langfuseKey, publicKey: langfusePublicKey });
  }

  /**
   * Call the OpenAI chat-completions API and record the request as a Langfuse
   * trace plus an 'api_call' span. Returns the first choice's message content.
   *
   * @throws Error on a non-2xx HTTP response or when the response carries no
   *         choices; the failure is recorded on both the span and the trace.
   */
  async traceCompletion(
    model: string,
    messages: Array<{ role: string; content: string }>,
    userId: string,
    feature: string
  ): Promise<string> {
    const trace = this.langfuse.trace({
      name: `llm_completion_${feature}`,
      input: { model, messageCount: messages.length },
      userId,
    });
    const generation = trace.span({
      name: 'api_call',
      input: { messages },
      metadata: { model, feature, userId },
    });
    try {
      const startTime = Date.now();
      const response = await fetch('https://api.openai.com/v1/chat/completions', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
        },
        body: JSON.stringify({
          model,
          messages,
          temperature: 0.7,
        }),
      });
      // Fail fast on HTTP errors — the original parsed the body unconditionally,
      // surfacing API failures as an opaque "cannot read choices[0]" crash.
      if (!response.ok) {
        throw new Error(`OpenAI API request failed: ${response.status} ${response.statusText}`);
      }
      const data = (await response.json()) as {
        choices: Array<{ message: { content: string } }>;
        usage: { prompt_tokens: number; completion_tokens: number };
      };
      const latencyMs = Date.now() - startTime;
      const content = data.choices?.[0]?.message?.content;
      if (content === undefined) {
        throw new Error('OpenAI API response contained no choices');
      }
      generation.end({
        output: content,
        metadata: {
          promptTokens: data.usage.prompt_tokens,
          completionTokens: data.usage.completion_tokens,
          latencyMs,
        },
      });
      trace.update({
        output: { content, success: true },
        metadata: { totalLatencyMs: latencyMs },
      });
      return content;
    } catch (error) {
      const statusMessage = error instanceof Error ? error.message : String(error);
      generation.end({
        output: null,
        level: 'ERROR',
        statusMessage,
      });
      // Previously only the span recorded the failure, leaving the trace
      // looking perpetually in-flight; mark the trace as failed too.
      trace.update({ output: { success: false, error: statusMessage } });
      throw error;
    }
  }
}
// Example wiring. The non-null assertions assume deployment config guarantees
// these env vars are set — TODO confirm, otherwise the constructors receive undefined.
const client = new ObservableLLMClient(
process.env.LANGSMITH_API_KEY!,
process.env.LANGFUSE_SECRET!,
process.env.LANGFUSE_PUBLIC!
);
// Top-level await: this snippet must run in an ES-module context.
const result = await client.traceCompletion(
'gpt-4-turbo',
[{ role: 'user', content: 'Summarize this document...' }],
'user123',
'document_summarization'
);
Span Instrumentation for Chains
Break down multi-step AI workflows into measurable spans.
class ChainTracer {
private spans: Map<string, { startTime: number; data: Record<string, unknown> }> = new Map();
startSpan(spanId: string, name: string, metadata: Record<string, unknown> = {}): void {
this.spans.set(spanId, {
startTime: Date.now(),
data: { name, metadata, events: [] },
});
}
addEvent(spanId: string, eventName: string, data: Record<string, unknown> = {}): void {
const span = this.spans.get(spanId);
if (!span) return;
(span.data.events as Array<{ name: string; data: Record<string, unknown>; timestamp: number }> = span.data.events || []).push({
name: eventName,
data,
timestamp: Date.now(),
});
}
endSpan(spanId: string, status: 'success' | 'error' = 'success', output: unknown = null): Record<string, unknown> {
const span = this.spans.get(spanId);
if (!span) return {};
const duration = Date.now() - span.startTime;
const record = {
...span.data,
duration,
status,
output,
timestamp: new Date().toISOString(),
};
this.spans.delete(spanId);
console.log(`[SPAN] ${record.name} completed in ${duration}ms`);
return record;
}
}
/**
 * Demo chain: preprocess -> embed -> retrieve -> generate, with each stage
 * wrapped in its own tracer span so per-step latency is observable.
 */
async function tracedAIChain(input: string): Promise<void> {
  const spanTracer = new ChainTracer();
  const runId = `chain_${Date.now()}`;

  // Stage 1: normalize the raw input.
  const preprocessSpan = `${runId}_preprocess`;
  spanTracer.startSpan(preprocessSpan, 'Preprocessing', { inputLength: input.length });
  const normalized = input.toLowerCase().trim();
  spanTracer.addEvent(preprocessSpan, 'cleaned_input', { length: normalized.length });
  spanTracer.endSpan(preprocessSpan, 'success', normalized);

  // Stage 2: embedding (simulated with a 100ms delay and a random 384-dim vector).
  const embedSpan = `${runId}_embed`;
  spanTracer.startSpan(embedSpan, 'Embedding', { text: normalized });
  await new Promise((resolve) => setTimeout(resolve, 100));
  const vector = Array(384).fill(0).map(() => Math.random());
  spanTracer.endSpan(embedSpan, 'success', { dimensions: vector.length });

  // Stage 3: vector search (stubbed match list).
  const retrievalSpan = `${runId}_retrieval`;
  spanTracer.startSpan(retrievalSpan, 'Vector Search', { embeddingDims: vector.length });
  const retrieved = [{ id: 'doc1', score: 0.95 }];
  spanTracer.endSpan(retrievalSpan, 'success', { matchCount: retrieved.length });

  // Stage 4: LLM generation (stubbed response text).
  const llmSpan = `${runId}_llm`;
  spanTracer.startSpan(llmSpan, 'LLM Generation', { contextDocs: retrieved.length });
  const generated = 'Generated response based on retrieved context...';
  spanTracer.endSpan(llmSpan, 'success', generated);
}
// Demo run: executes the traced chain once (requires an ES-module context for top-level await).
await tracedAIChain('Sample input for processing');
LLM-as-Judge Evaluation
Use an LLM to evaluate the quality of another LLM's outputs against criteria.
// Result of one LLM-as-judge scoring pass (see LLMJudge.evaluate).
interface EvaluationResult {
score: number; // judge-assigned quality score on a 0-10 scale
reasoning: string; // judge's free-text justification for the score
passed: boolean; // true when score >= 7 (threshold applied in LLMJudge.evaluate)
}
/**
 * LLM-as-judge: asks GPT-4 to score another model's output (0-10) against
 * free-text criteria, optionally conditioned on few-shot scored examples.
 */
class LLMJudge {
  /**
   * Score a single output. Uses temperature 0 so judging is as reproducible
   * as the API allows; `passed` is true when the judge's score is >= 7.
   *
   * @throws Error on HTTP failure, or when the judge's reply contains no
   *         parseable JSON object.
   */
  async evaluate(
    output: string,
    criteria: string,
    examples?: Array<{ output: string; score: number }>
  ): Promise<EvaluationResult> {
    const examplePrompt = examples
      ? `Examples:\n${examples.map((ex) => `Output: ${ex.output}\nScore: ${ex.score}/10`).join('\n\n')}\n\n`
      : '';
    const prompt = `${examplePrompt}Evaluate this output based on the criteria: ${criteria}\n\nOutput: "${output}"\n\nRespond with JSON: { "score": <0-10>, "reasoning": "<explanation>" }`;
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: 'gpt-4-turbo',
        messages: [{ role: 'user', content: prompt }],
        temperature: 0,
      }),
    });
    // Surface HTTP errors explicitly instead of crashing on a missing body shape.
    if (!response.ok) {
      throw new Error(`Judge API request failed: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json()) as { choices: Array<{ message: { content: string } }> };
    const raw = data.choices?.[0]?.message?.content ?? '';
    // Models frequently wrap JSON in markdown fences or surrounding prose;
    // extract the first object literal rather than parsing the reply verbatim.
    const jsonText = raw.match(/\{[\s\S]*\}/)?.[0];
    if (!jsonText) {
      throw new Error(`Judge returned no JSON object: ${raw}`);
    }
    let result: { score: number; reasoning: string };
    try {
      result = JSON.parse(jsonText) as { score: number; reasoning: string };
    } catch {
      throw new Error(`Judge returned malformed JSON: ${jsonText}`);
    }
    return {
      score: result.score,
      reasoning: result.reasoning,
      passed: result.score >= 7,
    };
  }

  /** Evaluate many outputs in parallel against the same criteria. */
  async evaluateBatch(
    outputs: string[],
    criteria: string
  ): Promise<Array<{ output: string; evaluation: EvaluationResult }>> {
    const results = await Promise.all(outputs.map((out) => this.evaluate(out, criteria)));
    return outputs.map((output, i) => ({ output, evaluation: results[i] }));
  }
}
// Example: judge a single factual answer; `passed` is true when score >= 7.
// NOTE(review): `evaluation` is also declared in the golden-dataset example
// below — if both snippets live in one module, one must be renamed (TS2451).
const judge = new LLMJudge();
const evaluation = await judge.evaluate(
'The capital of France is Paris, located on the Seine River.',
'Is the response factually accurate and relevant?'
);
console.log(`Score: ${evaluation.score}/10 - ${evaluation.reasoning}`);
Cosine Similarity and Exact Match Metrics
Evaluate semantic similarity and exact correctness.
/**
 * Deterministic evaluation metrics: vector similarity, string matching,
 * and ranking quality (recall@k, mean reciprocal rank).
 */
class EvaluationMetrics {
  /**
   * Cosine similarity of two equal-length vectors, in [-1, 1].
   * Returns 0 when either vector has zero magnitude — the original divided
   * by zero there and produced NaN.
   * @throws Error when the vectors differ in length.
   */
  cosineSimilarity(vecA: number[], vecB: number[]): number {
    if (vecA.length !== vecB.length) {
      throw new Error('Vectors must have same dimensions');
    }
    const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0);
    const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
    const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));
    if (magnitudeA === 0 || magnitudeB === 0) return 0;
    return dotProduct / (magnitudeA * magnitudeB);
  }

  /** Case-insensitive, whitespace-trimmed exact equality. */
  exactMatch(predicted: string, expected: string): boolean {
    return predicted.trim().toLowerCase() === expected.trim().toLowerCase();
  }

  /** Bag-of-words overlap: fraction of predicted words present in the expected text. */
  partialMatch(predicted: string, expected: string): number {
    const predWords = predicted.toLowerCase().split(/\s+/);
    const expectedWords = expected.toLowerCase().split(/\s+/);
    const matches = predWords.filter((word) => expectedWords.includes(word)).length;
    // split() never yields an empty array (it returns [''] for ''), so total >= 1.
    const total = Math.max(predWords.length, expectedWords.length);
    return matches / total;
  }

  /**
   * Recall@k: gold items found in the top-k predictions, normalized by
   * min(k, |gold|). Returns 0 for an empty gold set (was NaN via 0/0).
   */
  recallAtK(predictions: string[], goldStandard: string[], k: number = 5): number {
    if (goldStandard.length === 0) return 0;
    const topK = predictions.slice(0, k);
    const matches = topK.filter((pred) => goldStandard.includes(pred)).length;
    return matches / Math.min(k, goldStandard.length);
  }

  /** Reciprocal rank of the first prediction present in the gold set; 0 if none. */
  meanReciprocalRank(predictions: string[], goldStandard: string[]): number {
    for (let i = 0; i < predictions.length; i++) {
      if (goldStandard.includes(predictions[i])) {
        return 1 / (i + 1);
      }
    }
    return 0;
  }
}
// Smoke-test the metrics with hand-checkable inputs.
const metrics = new EvaluationMetrics();
const vec1 = [1, 0, 1, 0];
const vec2 = [1, 0, 0, 1];
console.log(`Cosine similarity: ${metrics.cosineSimilarity(vec1, vec2)}`); // ~0.5: dot=1, |v1|=|v2|=sqrt(2)
console.log(`Exact match: ${metrics.exactMatch('Paris', 'paris')}`); // true — comparison is case-insensitive
console.log(`Partial match: ${metrics.partialMatch('The capital is Paris', 'Paris city')}`); // 1 of 4 predicted words matches
console.log(`Recall@5: ${metrics.recallAtK(['a', 'b', 'c', 'd', 'e'], ['c', 'd', 'f'], 5)}`); // 2 of 3 gold items found
console.log(`MRR: ${metrics.meanReciprocalRank(['a', 'b', 'c'], ['c', 'd'])}`); // first hit at rank 3 -> 1/3
Golden Dataset Maintenance
Build and maintain a golden dataset for continuous evaluation.
// One human-reviewed input/expected-output pair in the golden dataset.
interface GoldenExample {
id: string; // generated as `golden_<timestamp>_<random>` by GoldenDataset.addExample
input: string; // prompt/question fed to the model under test
expectedOutput: string; // reviewer-approved answer (compared case-insensitively, trimmed)
category: string; // free-form grouping key used by getByCategory
createdAt: Date;
reviewedBy: string; // identifier of the human reviewer who approved this example
}
/**
 * In-memory store of human-reviewed input/output pairs ("golden dataset")
 * used as a regression baseline when evaluating a model.
 */
class GoldenDataset {
  // Reviewed examples, kept in insertion order.
  private examples: GoldenExample[] = [];
  // Intended backup target; not read by any method in this class.
  private filename = 'golden_dataset.json';

  /** Register a reviewed example; ids combine a timestamp with a random suffix. */
  addExample(input: string, expectedOutput: string, category: string, reviewer: string): void {
    const example: GoldenExample = {
      id: `golden_${Date.now()}_${Math.random()}`,
      input,
      expectedOutput,
      category,
      createdAt: new Date(),
      reviewedBy: reviewer,
    };
    this.examples.push(example);
  }

  /** All examples tagged with the given category. */
  getByCategory(category: string): GoldenExample[] {
    return this.examples.filter((example) => example.category === category);
  }

  /**
   * Run the model over every example and score each answer by
   * case-insensitive, whitespace-trimmed exact match.
   */
  async evaluateAgainstGolden(
    model: (input: string) => Promise<string>
  ): Promise<{ passed: number; failed: number; results: Array<{ example: GoldenExample; actual: string; match: boolean }> }> {
    const results: Array<{ example: GoldenExample; actual: string; match: boolean }> = [];
    let passed = 0;
    let failed = 0;
    // Deliberately sequential: one in-flight model call at a time.
    for (const example of this.examples) {
      const actual = await model(example.input);
      const match = actual.trim().toLowerCase() === example.expectedOutput.trim().toLowerCase();
      results.push({ example, actual, match });
      if (match) {
        passed++;
      } else {
        failed++;
      }
    }
    return { passed, failed, results };
  }

  /** Pretty-printed JSON snapshot of all examples. */
  exportForBackup(): string {
    return JSON.stringify(this.examples, null, 2);
  }
}
// Seed the golden set with two reviewed examples.
const goldenSet = new GoldenDataset();
goldenSet.addExample('What is 2+2?', '4', 'math', 'reviewer@example.com');
goldenSet.addExample('What is the capital of France?', 'Paris', 'geography', 'reviewer@example.com');
// Stand-in model for the demo: answers only the two seeded questions.
const mockModel = async (input: string): Promise<string> => {
  if (input.includes('2+2')) return '4';
  if (input.includes('capital')) return 'Paris';
  return 'Unknown';
};
// Renamed from `evaluation`: the LLM-as-judge example above already declares a
// module-scope `const evaluation`, so reusing the name is a TS2451 redeclaration error.
const goldenEvaluation = await goldenSet.evaluateAgainstGolden(mockModel);
console.log(`Passed: ${goldenEvaluation.passed}, Failed: ${goldenEvaluation.failed}`);
Latency Percentiles for AI Endpoints
Monitor response time distribution to catch performance degradation.
/**
 * Rolling latency collector with nearest-rank percentile reporting.
 * Not thread-safe concerns apply only in worker contexts; plain single-threaded use here.
 */
class LatencyMonitor {
  // Raw samples in arrival order; sorted copies are made on demand.
  private latencies: number[] = [];

  /** Record one observed latency in milliseconds. */
  recordLatency(ms: number): void {
    this.latencies.push(ms);
  }

  /** Nearest-rank percentile (p in [0, 100]); 0 when no samples are recorded. */
  getPercentile(p: number): number {
    const sorted = [...this.latencies].sort((a, b) => a - b);
    return this.percentileOf(sorted, p);
  }

  // Nearest-rank lookup over an already-sorted sample array.
  private percentileOf(sorted: number[], p: number): number {
    if (sorted.length === 0) return 0;
    const index = Math.ceil((p / 100) * sorted.length) - 1;
    return sorted[Math.max(0, index)];
  }

  /** Summary statistics: p50/p95/p99, mean, and population standard deviation. */
  getStats(): {
    p50: number;
    p95: number;
    p99: number;
    mean: number;
    stdDev: number;
  } {
    if (this.latencies.length === 0) {
      return { p50: 0, p95: 0, p99: 0, mean: 0, stdDev: 0 };
    }
    const mean = this.latencies.reduce((a, b) => a + b, 0) / this.latencies.length;
    const variance = this.latencies.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / this.latencies.length;
    const stdDev = Math.sqrt(variance);
    // Sort once and reuse for all three percentiles; the original called
    // getPercentile three times, re-sorting the sample array on each call.
    const sorted = [...this.latencies].sort((a, b) => a - b);
    return {
      p50: this.percentileOf(sorted, 50),
      p95: this.percentileOf(sorted, 95),
      p99: this.percentileOf(sorted, 99),
      mean,
      stdDev,
    };
  }

  /** Discard all recorded samples. */
  reset(): void {
    this.latencies = [];
  }
}
// Feed a fixed latency sample and report the percentile spread.
const monitor = new LatencyMonitor();
[120, 135, 145, 150, 180, 200, 220, 250, 300, 500].forEach((latency) => {
monitor.recordLatency(latency);
});
const stats = monitor.getStats();
console.log(`P50: ${stats.p50}ms, P95: ${stats.p95}ms, P99: ${stats.p99}ms`);
Token Usage Dashboards
Track token consumption by feature, model, and user.
// One recorded LLM call's token consumption, tagged for aggregation.
interface TokenUsageRecord {
feature: string; // product feature that issued the call (getFeatureUsage group key)
model: string; // model identifier (getModelComparison group key)
userId: string; // caller identity (getTopUsers group key)
inputTokens: number; // prompt-side token count
outputTokens: number; // completion-side token count
timestamp: Date; // set at record time; used for timeframe filtering
}
/**
 * In-memory token-usage aggregator: slices recorded LLM calls by feature,
 * model, and user for dashboarding.
 */
class TokenDashboard {
  // Usage records in arrival order.
  private records: TokenUsageRecord[] = [];

  /** Append one usage record, stamped with the current time. */
  recordUsage(
    feature: string,
    model: string,
    userId: string,
    inputTokens: number,
    outputTokens: number
  ): void {
    const entry: TokenUsageRecord = {
      feature,
      model,
      userId,
      inputTokens,
      outputTokens,
      timestamp: new Date(),
    };
    this.records.push(entry);
  }

  /** Input/output token totals per feature within the timeframe (default: last 24h). */
  getFeatureUsage(timeframeMs: number = 86400000): Record<string, { input: number; output: number }> {
    const cutoff = new Date(Date.now() - timeframeMs);
    const totals: Record<string, { input: number; output: number }> = {};
    for (const rec of this.records.filter((r) => r.timestamp >= cutoff)) {
      const bucket = totals[rec.feature] ?? (totals[rec.feature] = { input: 0, output: 0 });
      bucket.input += rec.inputTokens;
      bucket.output += rec.outputTokens;
    }
    return totals;
  }

  /** Total tokens and call count per model, over all recorded usage. */
  getModelComparison(): Record<string, { totalTokens: number; usageCount: number }> {
    return this.records.reduce<Record<string, { totalTokens: number; usageCount: number }>>((acc, rec) => {
      const entry = acc[rec.model] ?? (acc[rec.model] = { totalTokens: 0, usageCount: 0 });
      entry.totalTokens += rec.inputTokens + rec.outputTokens;
      entry.usageCount += 1;
      return acc;
    }, {});
  }

  /** Heaviest consumers by combined input+output tokens, descending. */
  getTopUsers(limit: number = 10): Array<{ userId: string; totalTokens: number }> {
    const totalsByUser = new Map<string, number>();
    for (const rec of this.records) {
      const prior = totalsByUser.get(rec.userId) ?? 0;
      totalsByUser.set(rec.userId, prior + rec.inputTokens + rec.outputTokens);
    }
    return [...totalsByUser.entries()]
      .map(([userId, totalTokens]) => ({ userId, totalTokens }))
      .sort((a, b) => b.totalTokens - a.totalTokens)
      .slice(0, limit);
  }
}
// Record two sample calls and print the aggregations.
const dashboard = new TokenDashboard();
dashboard.recordUsage('summarize', 'gpt-4', 'user1', 150, 200);
dashboard.recordUsage('classify', 'gpt-3.5-turbo', 'user2', 50, 75);
console.log('Feature usage:', dashboard.getFeatureUsage()); // defaults to the last-24h window
console.log('Top users:', dashboard.getTopUsers());
Regression Detection and Model Migration Testing
Detect quality drops when swapping models.
/**
 * Compares current model quality metrics to stored baselines and judges
 * whether a model swap is safe to ship.
 */
class RegressionDetector {
  // Per-model baseline success rate captured via setBaseline().
  private baselineMetrics: Record<string, number> = {};

  /** Capture the reference success rate for a model (missing rate counts as 0). */
  setBaseline(model: string, metrics: Record<string, number>): void {
    this.baselineMetrics[model] = metrics.successRate || 0;
  }

  /**
   * Compare the current success rate to the stored baseline. `change` is the
   * relative drop in percent; a regression fires when the relative drop
   * exceeds `threshold` (default 5%).
   */
  detectRegression(model: string, currentMetrics: Record<string, number>, threshold: number = 0.05): {
    regressed: boolean;
    change: number;
    message: string;
  } {
    // A falsy baseline (unknown model, or a stored 0) falls back to 1.0,
    // which also keeps the division below well-defined.
    const baseline = this.baselineMetrics[model] || 1.0;
    const current = currentMetrics.successRate || 0;
    const relativeDrop = (baseline - current) / baseline;
    const regressed = relativeDrop > threshold;
    return {
      regressed,
      change: relativeDrop * 100,
      message: regressed ? `REGRESSION: ${(relativeDrop * 100).toFixed(2)}% drop` : 'No regression detected',
    };
  }

  /**
   * Score both models over the test cases (outputs are simulated here) and
   * call the migration safe when the new model retains >= 95% of the old
   * model's pass rate.
   */
  testModelMigration(
    oldModel: string,
    newModel: string,
    testCases: Array<{ input: string; expected: string }>,
    evaluator: (output: string, expected: string) => boolean
  ): { oldScore: number; newScore: number; safe: boolean } {
    let oldPassCount = 0;
    let newPassCount = 0;
    for (const testCase of testCases) {
      // Simulated model outputs stand in for real completions.
      if (evaluator(`old_${testCase.input}`, testCase.expected)) oldPassCount++;
      if (evaluator(`new_${testCase.input}`, testCase.expected)) newPassCount++;
    }
    const oldRate = oldPassCount / testCases.length;
    const newRate = newPassCount / testCases.length;
    return { oldScore: oldRate, newScore: newRate, safe: newRate >= oldRate * 0.95 };
  }
}
// Baseline 95% success; current 89% is a ~6.3% relative drop — above the 5% default threshold.
const detector = new RegressionDetector();
detector.setBaseline('gpt-4', { successRate: 0.95 });
const regression = detector.detectRegression('gpt-4', { successRate: 0.89 });
console.log(regression.message);
Checklist
- Instrument all LLM calls with LangSmith or Langfuse
- Break down chains into measurable spans with events
- Use LLM-as-judge for subjective quality evaluation
- Maintain golden datasets with reviewed examples
- Monitor p50, p95, p99 latencies weekly
- Create feature-level token usage dashboards
- Set up regression detection alerts for model changes
- Test model migrations against golden datasets before deployment
- Build cost vs. quality tradeoff dashboards
- Automate eval runs on every model or prompt change
Conclusion
LLM observability is the foundation of reliable AI systems. Start with basic tracing in LangSmith, add LLM-as-judge evaluations, and maintain golden datasets. As you scale, add latency monitoring, token dashboards, and automated regression detection. This layered approach catches quality regressions early and provides data-driven confidence for model migrations.