Published on

Testing AI Systems — Unit Tests, Integration Tests, and Non-Determinism

Authors

Introduction

Testing AI systems is harder than traditional code: outputs are non-deterministic, harder to verify, and prone to subtle regressions. This guide covers practical testing strategies from unit tests to production monitoring.

Mocking LLM Responses in Unit Tests

Avoid expensive API calls in tests by mocking responses:

// Configuration for the mock LLM client used in unit tests.
interface MockLLMConfig {
  defaultResponse: string; // returned when no mapped response matches
  responseMap: Map<string, string>; // prompt prefix (first 50 chars) -> canned reply
  latency: number; // simulated round-trip delay, in ms
  shouldFail: boolean; // when true, sendMessage rejects with a mock error
}

/**
 * Test double for an LLM client: serves canned responses with no
 * network traffic. Supports per-prompt responses (keyed on the first
 * 50 characters of the latest message), simulated latency, and forced
 * failures for exercising error paths.
 */
class MockLLMClient {
  private config: MockLLMConfig;

  constructor(config: Partial<MockLLMConfig> = {}) {
    const defaults: MockLLMConfig = {
      defaultResponse: 'Mocked response',
      responseMap: new Map(),
      latency: 50,
      shouldFail: false
    };
    // Caller-supplied fields override the defaults.
    this.config = { ...defaults, ...config };
  }

  /**
   * Mimics a chat-completion call. Resolves with the canned response
   * registered for the last message, or the default response.
   * Rejects when `shouldFail` is set.
   */
  async sendMessage(
    messages: Array<{ role: string; content: string }>
  ): Promise<string> {
    // Emulate network round-trip time.
    await new Promise(done => setTimeout(done, this.config.latency));

    if (this.config.shouldFail) {
      throw new Error('Mock LLM error');
    }

    // Look up a canned response keyed on the last message's prefix.
    const lastMessage = messages[messages.length - 1];
    const lookupKey = (lastMessage?.content || '').substring(0, 50);
    const canned = this.config.responseMap.get(lookupKey);

    return canned !== undefined ? canned : this.config.defaultResponse;
  }

  /** Registers a canned response, keyed on the prompt's first 50 chars. */
  setResponse(prompt: string, response: string): void {
    this.config.responseMap.set(prompt.substring(0, 50), response);
  }
}

// Usage in tests
/**
 * Example unit test: the classifier should return the mocked label
 * without making any real API call.
 */
async function testUserClassification() {
  const cannedResponses = new Map([['Classify sentiment', 'positive']]);
  const mockLLM = new MockLLMClient({ responseMap: cannedResponses });

  const label = await classifyUserMessage('I love this product!', mockLLM);
  expect(label).toBe('positive');
}

/**
 * Verifies the error path: a failing mock must surface its error to
 * the caller rather than being swallowed.
 *
 * Fixes two defects in the original:
 * 1. `fail('Should have thrown')` threw inside the `try`, so its own
 *    error was caught by the `catch` and the wrong assertion ran.
 * 2. `e.message` was accessed on an un-narrowed catch variable, which
 *    does not type-check under strict mode (`useUnknownInCatchVariables`).
 */
async function testErrorHandling() {
  const mockLLM = new MockLLMClient({ shouldFail: true });

  let thrown: unknown = null;
  try {
    await classifyUserMessage('test', mockLLM);
  } catch (e: unknown) {
    thrown = e;
  }

  if (thrown === null) {
    fail('Should have thrown');
  }

  // Narrow before touching .message.
  const message = thrown instanceof Error ? thrown.message : String(thrown);
  expect(message).toContain('Mock LLM error');
}

// Benefits:
// - No API calls, instant tests
// - Deterministic outputs
// - Test error paths
// - Cost-free testing at scale

Snapshot Testing for Prompts

Detect unintended prompt changes:

import * as fs from 'fs';
import * as crypto from 'crypto';

// On-disk record of a prompt snapshot.
interface PromptSnapshot {
  name: string; // snapshot identifier (also the JSON file name)
  prompt: string; // full prompt text at snapshot time
  hash: string; // sha256 hex digest of `prompt`
  timestamp: number; // epoch ms when the snapshot was saved
}

class PromptSnapshotManager {
  private snapshotDir = './snapshots';

  generatePrompt(
    instruction: string,
    context: string,
    examples: string[]
  ): string {
    let prompt = instruction + '\n\n';

    if (context) {
      prompt += `Context: ${context}\n\n`;
    }

    if (examples.length &gt; 0) {
      prompt += 'Examples:\n';
      examples.forEach((ex, i) => {
        prompt += `${i + 1}. ${ex}\n`;
      });
      prompt += '\n';
    }

    return prompt;
  }

  hashPrompt(prompt: string): string {
    return crypto.createHash('sha256').update(prompt).digest('hex');
  }

  saveSnapshot(name: string, prompt: string): void {
    const snapshot: PromptSnapshot = {
      name,
      prompt,
      hash: this.hashPrompt(prompt),
      timestamp: Date.now()
    };

    const path = `${this.snapshotDir}/${name}.json`;
    fs.mkdirSync(this.snapshotDir, { recursive: true });
    fs.writeFileSync(path, JSON.stringify(snapshot, null, 2));
  }

  loadSnapshot(name: string): PromptSnapshot | null {
    const path = `${this.snapshotDir}/${name}.json`;

    if (!fs.existsSync(path)) {
      return null;
    }

    return JSON.parse(fs.readFileSync(path, 'utf-8'));
  }

  verifyPrompt(name: string, prompt: string): {
    match: boolean;
    message: string;
    oldPrompt?: string;
  } {
    const saved = this.loadSnapshot(name);

    if (!saved) {
      console.warn(`No snapshot for ${name}. Creating...`);
      this.saveSnapshot(name, prompt);
      return { match: true, message: 'Snapshot created' };
    }

    const newHash = this.hashPrompt(prompt);

    if (newHash === saved.hash) {
      return { match: true, message: 'Prompt unchanged' };
    }

    return {
      match: false,
      message: `Prompt changed! Old hash: ${saved.hash}, new: ${newHash}`,
      oldPrompt: saved.prompt
    };
  }
}

// Usage
/**
 * Snapshot test for the sentiment-classification prompt: fails when
 * the generated prompt drifts from the stored snapshot.
 */
function testClassificationPrompt() {
  const manager = new PromptSnapshotManager();

  const examples = [
    'Love this product! → positive',
    'Terrible quality → negative'
  ];

  const prompt = manager.generatePrompt(
    'Classify customer feedback as positive, negative, or neutral.',
    'Focus on sentiment, not grammar.',
    examples
  );

  const verdict = manager.verifyPrompt('sentiment_classifier', prompt);
  expect(verdict.match).toBe(true);
}

// Workflow:
// 1. First run: Creates snapshot
// 2. Subsequent runs: Compares hash
// 3. If changed: Shows diff, fails test
// 4. Developer reviews change, updates snapshot

Property-Based Testing for AI Outputs

Test invariants that should always hold:

// An invariant that every generated output must satisfy.
interface TextProperty {
  name: string; // label used in failure reports
  validate: (output: string) => boolean; // true when the invariant holds
}

async function testWithProperties(
  generateOutput: () => Promise<string>,
  properties: TextProperty[],
  iterations: number = 100
): Promise<{ passed: number; failed: number; failures: string[] }> {
  let passed = 0;
  let failed = 0;
  const failures: string[] = [];

  for (let i = 0; i &lt; iterations; i++) {
    const output = await generateOutput();

    for (const prop of properties) {
      if (!prop.validate(output)) {
        failed++;
        failures.push(
          `Iteration ${i}, property "${prop.name}": ${output.substring(0, 50)}...`
        );
      } else {
        passed++;
      }
    }
  }

  return { passed, failed, failures };
}

// Define properties for JSON response
const jsonProperties: TextProperty[] = [
  {
    name: 'valid_json',
    validate: (output: string) => {
      try {
        JSON.parse(output);
        return true;
      } catch {
        return false;
      }
    }
  },
  {
    name: 'has_required_fields',
    validate: (output: string) => {
      try {
        const obj = JSON.parse(output);
        return obj.hasOwnProperty('class') && obj.hasOwnProperty('confidence');
      } catch {
        return false;
      }
    }
  },
  {
    name: 'confidence_in_range',
    validate: (output: string) => {
      try {
        const obj = JSON.parse(output);
        return obj.confidence &gt;= 0 && obj.confidence &lt;= 1;
      } catch {
        return false;
      }
    }
  },
  {
    name: 'non_empty_class',
    validate: (output: string) => {
      try {
        const obj = JSON.parse(output);
        return typeof obj.class === 'string' && obj.class.length &gt; 0;
      } catch {
        return false;
      }
    }
  }
];

// Properties for text classification
const textProperties: TextProperty[] = [
  {
    name: 'non_empty_output',
    validate: (output: string) => output.trim().length &gt; 0
  },
  {
    name: 'not_too_long',
    validate: (output: string) => output.length &lt; 2000
  },
  {
    name: 'valid_utf8',
    validate: (output: string) => {
      try {
        Buffer.from(output, 'utf-8');
        return true;
      } catch {
        return false;
      }
    }
  },
  {
    name: 'no_repeated_substrings',
    validate: (output: string) => {
      const words = output.split(/\s+/);
      const seen = new Set<string>();
      for (const word of words) {
        if (seen.has(word)) {
          // Allow some repetition (pronouns, articles)
          const repetitions = words.filter(w => w === word).length;
          if (repetitions &gt; 10) {
            return false;
          }
        }
        seen.add(word);
      }
      return true;
    }
  }
];

Integration Tests with Real LLM

When you need to test actual model behavior:

// Settings for the opt-in real-LLM integration test run.
interface IntegrationTestConfig {
  runRealTests: boolean; // Only in CI after manual approval
  apiKey: string; // API key sent as a Bearer token
  testSampleSize: number; // intended sample size; not consumed by the test below — TODO confirm
}

async function integrationTestWithRealLLM(
  config: IntegrationTestConfig
): Promise<{
  success: boolean;
  metrics: { accuracy: number; avgLatency: number; costPerTest: number };
}> {
  if (!config.runRealTests) {
    console.log('Skipping real LLM tests (set runRealTests=true)');
    return { success: true, metrics: { accuracy: 0, avgLatency: 0, costPerTest: 0 } };
  }

  const testCases = [
    {
      input: 'I love this product!',
      expectedClass: 'positive'
    },
    {
      input: 'Terrible quality, waste of money',
      expectedClass: 'negative'
    },
    {
      input: 'It works as expected',
      expectedClass: 'neutral'
    }
  ];

  let correct = 0;
  let totalLatency = 0;
  let totalCost = 0;

  for (const testCase of testCases) {
    const startTime = Date.now();

    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${config.apiKey}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        model: 'gpt-3.5-turbo',
        messages: [
          {
            role: 'user',
            content: `Classify sentiment: ${testCase.input}`
          }
        ],
        temperature: 0,
        max_tokens: 10
      })
    });

    const data = await response.json();
    const latency = Date.now() - startTime;
    totalLatency += latency;

    // Check if output matches expected
    const output = data.choices[0].message.content.toLowerCase();
    if (output.includes(testCase.expectedClass)) {
      correct++;
    }

    // Estimate cost
    totalCost += 0.001; // gpt-3.5-turbo ~$0.001 per API call
  }

  const accuracy = correct / testCases.length;
  const avgLatency = totalLatency / testCases.length;

  return {
    success: accuracy &gt;= 0.8, // 80% threshold
    metrics: {
      accuracy,
      avgLatency,
      costPerTest: totalCost / testCases.length
    }
  };
}

// Run only:
// - Before major releases
// - After model updates
// - In staging/CI (not on every commit)
// - With approval to avoid costs

Non-Determinism Control with Temperature=0

// Sampling parameters pinned for reproducible LLM output.
interface DeterministicTestConfig {
  temperature: number; // 0 => always pick the highest-probability token
  seed?: number; // fixed seed, where the provider supports it
  topP: number; // 1 => use the full probability distribution
  frequencyPenalty: number; // 0 => no repetition penalty
  presencePenalty: number; // 0 => no topic penalty
}

/**
 * Sends one chat-completion request with the given reproducibility
 * parameters and returns the model's reply text.
 */
async function makeDeterministicRequest(
  prompt: string,
  config: DeterministicTestConfig
): Promise<string> {
  // Build the request body up front so the fetch call stays readable.
  const requestBody = {
    model: 'gpt-3.5-turbo',
    messages: [{ role: 'user', content: prompt }],
    temperature: config.temperature, // 0 for deterministic
    top_p: config.topP,
    frequency_penalty: config.frequencyPenalty,
    presence_penalty: config.presencePenalty,
    seed: config.seed // Available in newer OpenAI models
  };

  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify(requestBody)
  });

  const payload = await response.json();
  return payload.choices[0].message.content;
}

/**
 * Sanity-checks determinism: two identical requests with temperature 0
 * and fixed sampling parameters should return identical text.
 *
 * Fix: the prompt string was HTML-escaped in the source
 * (`&quot;` instead of `"`); the quotes are restored below.
 */
async function testDeterminism() {
  const config: DeterministicTestConfig = {
    temperature: 0,
    topP: 1,
    frequencyPenalty: 0,
    presencePenalty: 0
  };

  const prompt = 'Extract the number from: "The answer is 42"';

  // Run twice with same parameters
  const result1 = await makeDeterministicRequest(prompt, config);
  const result2 = await makeDeterministicRequest(prompt, config);

  // Should be identical
  expect(result1).toBe(result2);
  expect(result1).toContain('42');
}

// Settings for deterministic behavior:
// - temperature: 0 (always pick highest probability token)
// - top_p: 1 (use full probability distribution)
// - frequency_penalty: 0 (don't penalize repetition)
// - presence_penalty: 0 (don't penalize topics)
// - seed: Consistent across runs (if supported)

Contract Tests for LLM API

Ensure LLM responses follow expected structure:

// JSON-schema-like contract describing the LLM request/response shapes.
interface APIContract {
  inputSchema: {
    type: string; // top-level JSON type, e.g. 'object'
    properties: { [key: string]: any }; // field name -> { type: ... } spec
    required: string[]; // fields that must be present
  };
  outputSchema: {
    type: string;
    properties: { [key: string]: any };
    required: string[];
  };
}

/**
 * Checks a response object against an output schema: every required
 * field must be present, and every present field must match the
 * declared primitive `typeof` type.
 *
 * @returns valid flag plus human-readable violation messages
 */
function validateContract(
  data: any,
  schema: APIContract['outputSchema']
): { valid: boolean; errors: string[] } {
  const problems: string[] = [];

  // Required fields must exist on the object.
  schema.required
    .filter(field => !data.hasOwnProperty(field))
    .forEach(field => problems.push(`Missing required field: ${field}`));

  // Fields that are present must carry the declared typeof.
  Object.entries(schema.properties).forEach(([field, spec]) => {
    if (!data.hasOwnProperty(field)) {
      return;
    }
    const actualType = typeof data[field];
    const declaredType = (spec as any).type;
    if (actualType !== declaredType) {
      problems.push(`${field}: expected ${declaredType}, got ${actualType}`);
    }
  });

  return { valid: problems.length === 0, errors: problems };
}

// Define contract
const classificationContract: APIContract = {
  inputSchema: {
    type: 'object',
    properties: {
      text: { type: 'string' }
    },
    required: ['text']
  },
  outputSchema: {
    type: 'object',
    properties: {
      class: { type: 'string' },
      confidence: { type: 'number' }
    },
    required: ['class', 'confidence']
  }
};

/**
 * End-to-end contract test: asks the live model for a JSON response
 * and validates its structure against the classification contract.
 *
 * Fix: the original logged contract violations AFTER the `expect`
 * call, so if the assertion threw, the diagnostic `console.log`
 * never ran. Logging now happens before the assertion.
 */
async function testAPIContract() {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: [
        {
          role: 'user',
          content: 'Classify this. Response as JSON: {"class": string, "confidence": number}'
        }
      ]
    })
  });

  const data = await response.json();
  // NOTE(review): assumes the model actually returned parseable JSON;
  // a malformed reply throws here rather than failing the contract check.
  const output = JSON.parse(data.choices[0].message.content);

  const validation = validateContract(
    output,
    classificationContract.outputSchema
  );

  // Log diagnostics first so they are visible even when the assertion throws.
  if (!validation.valid) {
    console.log('Contract violations:', validation.errors);
  }
  expect(validation.valid).toBe(true);
}

// Ensures API client expectations match actual LLM output

Testing RAG Retrieval in Isolation

// Identifies a RAG stage and its measured test pass rate.
interface RAGComponent {
  name: 'retriever' | 'ranker' | 'generator' | 'full_rag'; // pipeline stage
  passRate: number; // fraction of component tests passing (presumably 0..1) — confirm
}

/**
 * Recall check for the retrieval stage: every expected document must
 * appear somewhere in the retriever's results.
 *
 * @returns pass flag, the raw retrieval results, and any expected
 *          documents that were not retrieved
 */
async function testRetrieverComponent(
  query: string,
  expectedDocuments: string[],
  retriever: (q: string) => Promise<string[]>
): Promise<{ pass: boolean; retrieved: string[]; missed: string[] }> {
  const results = await retriever(query);
  const resultSet = new Set(results);

  // Expected documents absent from the result set.
  const missed = expectedDocuments.filter(doc => !resultSet.has(doc));

  return { pass: missed.length === 0, retrieved: results, missed };
}

async function testRAGRanking(
  query: string,
  documents: string[],
  ranker: (q: string, docs: string[]) => Promise<string[]>
): Promise<{ pass: boolean; ranking: string[]; topDocScore: number }> {
  const ranked = await ranker(query, documents);

  // Check if top result is relevant
  const topDoc = ranked[0];
  const relevance = calculateRelevance(query, topDoc);

  return {
    pass: relevance &gt; 0.7,
    ranking: ranked,
    topDocScore: relevance
  };
}

/**
 * Crude lexical relevance score in [0, 1]: the fraction of distinct
 * (lowercased) query words that also occur in the document.
 */
function calculateRelevance(query: string, document: string): number {
  const queryWords = new Set(query.toLowerCase().split(/\s+/));
  const docWords = new Set(document.toLowerCase().split(/\s+/));

  // Count query words that the document also contains.
  const overlap = [...queryWords].filter(w => docWords.has(w)).length;

  return overlap / Math.max(queryWords.size, 1);
}

// Test components independently:
// - Retriever: Does it find relevant documents?
// - Ranker: Does it order them well?
// - Generator: Does it use context correctly?
// - Integration: Does full RAG work end-to-end?

Regression Suite with Golden Examples

// A regression-suite reference case with a known-good expected output.
interface GoldenExample {
  name: string; // identifier reported on regression
  input: string; // text fed to the model endpoint
  expectedOutput: string; // the known-good output
  tolerance: number; // For approximate matches: minimum similarity to pass
}

/**
 * Returns the golden example set used by the regression suite.
 * In production this would load from a database or file.
 */
async function loadGoldenExamples(): Promise<GoldenExample[]> {
  // Local factory keeps the example list compact.
  const golden = (
    name: string,
    input: string,
    expectedOutput: string,
    tolerance: number
  ): GoldenExample => ({ name, input, expectedOutput, tolerance });

  return [
    // Tolerances allow minor output variation per example.
    golden('positive_review', 'Love this product!', 'positive', 0.9),
    golden('negative_review', 'Terrible, broke after 2 days', 'negative', 0.9),
    golden('neutral_feedback', 'It works', 'neutral', 0.8)
  ];
}

async function runRegressionSuite(
  modelEndpoint: (input: string) => Promise<string>
): Promise<{
  passed: number;
  failed: number;
  regressions: string[];
}> {
  const examples = await loadGoldenExamples();
  const regressions: string[] = [];
  let passed = 0;
  let failed = 0;

  for (const example of examples) {
    const output = await modelEndpoint(example.input);

    // Check if output is close to expected
    const similarity = stringSimilarity(output, example.expectedOutput);

    if (similarity &gt;= example.tolerance) {
      passed++;
    } else {
      failed++;
      regressions.push(
        `${example.name}: expected &quot;${example.expectedOutput}&quot;, got &quot;${output}&quot; (similarity: ${similarity.toFixed(2)})`
      );
    }
  }

  return { passed, failed, regressions };
}

/**
 * Similarity score in [0, 1]. Case-insensitive substring containment
 * counts as a perfect match; otherwise the score is a normalized
 * Levenshtein-distance ratio, clamped at 0.
 */
function stringSimilarity(str1: string, str2: string): number {
  // Containment (case-insensitive) is treated as an exact match.
  if (str1.toLowerCase().includes(str2.toLowerCase())) {
    return 1.0;
  }

  const longest = Math.max(str1.length, str2.length);
  if (longest === 0) {
    return 1.0; // two empty strings are identical
  }

  const score = 1.0 - levenshteinDistance(str1, str2) / longest;
  return Math.max(0, score);
}

function levenshteinDistance(s1: string, s2: string): number {
  const arr: number[][] = [];

  for (let i = 0; i &lt;= s1.length; i++) {
    arr[i] = [i];
  }

  for (let j = 0; j &lt;= s2.length; j++) {
    arr[0][j] = j;
  }

  for (let i = 1; i &lt;= s1.length; i++) {
    for (let j = 1; j &lt;= s2.length; j++) {
      arr[i][j] =
        s1[i - 1] === s2[j - 1]
          ? arr[i - 1][j - 1]
          : Math.min(
              arr[i - 1][j - 1],
              arr[i][j - 1],
              arr[i - 1][j]
            ) + 1;
    }
  }

  return arr[s1.length][s2.length];
}

CI Pipeline for AI Systems

// One stage of the AI CI pipeline with its expected footprint.
interface CIPipeline {
  stage: string; // stage name, e.g. 'unit_tests'
  duration: number; // expected duration in seconds (see usage below)
  cost: number; // estimated cost per run — presumably USD, confirm
  success: boolean; // whether the stage is expected/observed to pass
}

/**
 * Describes the tiered CI pipeline for an AI system: five stages,
 * ordered cheapest/fastest first, each with expected duration
 * (seconds), cost, and status.
 */
async function buildAICIPipeline(): Promise<CIPipeline[]> {
  // Local factory: every stage starts out marked successful.
  const stage = (name: string, duration: number, cost: number): CIPipeline => ({
    stage: name,
    duration,
    cost,
    success: true
  });

  return [
    stage('unit_tests', 30, 0), // seconds; free, runs on every commit
    stage('snapshot_tests', 10, 0), // catches prompt drift
    stage('property_based_tests', 60, 0.05), // mock calls
    stage('integration_tests', 120, 0.50), // real API calls, on-demand
    stage('regression_suite', 45, 0.10) // golden examples
  ];
}

// Pipeline structure:
// 1. Unit tests (mock): 30s, free, on every commit
// 2. Snapshot tests: 10s, free, catch prompt drift
// 3. Property tests (mock): 60s, cheap, invariants
// 4. Integration tests: 120s, paid, real API, on-demand
// 5. Regression suite: 45s, cheap, golden examples
// Total: ~265 seconds = 4.5 minutes, $0.65 per run

Checklist

  • Implemented mock LLM client for unit tests
  • Created prompt snapshot manager to catch drift
  • Defined properties for JSON/text outputs (property-based tests)
  • Set up integration test suite with real LLM (on-demand)
  • Configured temperature=0 for deterministic testing
  • Built API contract validator for output structure
  • Created RAG component tests (retriever, ranker, generator)
  • Loaded golden examples for regression testing
  • Set up CI pipeline with tiered testing strategy
  • Added cost tracking per test stage
  • Configured alerts for regressions >5%

Conclusion

Test AI systems with layered approach: mock unit tests (free, fast), property-based tests for invariants, snapshot tests for prompts, and selective real API tests in CI. Target <5 minutes for full pipeline with <$1 cost. Use golden examples for regression detection and run nightly full integration tests.