Prompt Versioning and Testing — Treating Prompts Like Code

Sanjeev Sharma
12 min read

Advertisement

Introduction

Prompts are code. Changes should go through version control, testing, and staged rollouts. Yet most teams treat prompts as black-box strings changed ad-hoc. This guide covers production-grade prompt management and testing.

Prompt as Code in Version Control

Store prompts with templates, variables, and metadata in Git.

// A single immutable snapshot of a prompt plus the model settings it was
// authored against. `version` is assigned sequentially by PromptVersionManager.
interface PromptVersion {
  id: string; // stable identifier, e.g. 'summary_v1'
  name: string; // logical prompt name shared across versions
  template: string; // text with {{ variable }} placeholders
  variables: Array<{ name: string; type: string; description: string }>; // declared template inputs
  model: string; // target model identifier
  temperature: number;
  maxTokens: number;
  createdAt: Date;
  createdBy: string; // author, e.g. an email address
  version: number; // 1-based; overwritten at registration time
  metadata?: Record<string, unknown>; // free-form extra context
}

/**
 * In-memory registry of prompt versions keyed by prompt name.
 * Versions are numbered sequentially (1-based) in registration order.
 */
class PromptVersionManager {
  private prompts = new Map<string, PromptVersion[]>();

  /** Appends `prompt` as the next version of `name` (mutates `prompt.version`). */
  registerPrompt(name: string, prompt: PromptVersion): void {
    if (!this.prompts.has(name)) {
      this.prompts.set(name, []);
    }

    const versions = this.prompts.get(name)!;
    prompt.version = versions.length + 1;
    versions.push(prompt);
  }

  /** Most recently registered version, or undefined for an unknown name. */
  getLatest(name: string): PromptVersion | undefined {
    const versions = this.prompts.get(name);
    return versions?.[versions.length - 1];
  }

  /** Specific 1-based version, or undefined if it does not exist. */
  getVersion(name: string, version: number): PromptVersion | undefined {
    return this.prompts.get(name)?.[version - 1];
  }

  /**
   * Renders the template of `name` (latest version unless `version` is given),
   * substituting every `{{ key }}` placeholder with `String(value)`.
   * Placeholders with no matching variable are left untouched.
   * @throws Error when the prompt (or requested version) does not exist.
   */
  render(name: string, variables: Record<string, unknown>, version?: number): string {
    const prompt = version
      ? this.getVersion(name, version)
      : this.getLatest(name);

    if (!prompt) {
      throw new Error(`Prompt ${name} not found`);
    }

    let rendered = prompt.template;

    for (const [key, value] of Object.entries(variables)) {
      // Escape the key so variable names containing regex metacharacters
      // cannot corrupt the placeholder pattern, and use a replacer *function*
      // so '$'-sequences in the value (e.g. '$&') are inserted literally
      // instead of being interpreted as replacement patterns.
      const placeholder = new RegExp(`{{\\s*${this.escapeRegExp(key)}\\s*}}`, 'g');
      rendered = rendered.replace(placeholder, () => String(value));
    }

    return rendered;
  }

  /** All versions in registration order (empty array for unknown names). */
  getHistory(name: string): PromptVersion[] {
    return this.prompts.get(name) || [];
  }

  /**
   * Returns the raw templates of two versions for diffing.
   * @throws Error when either version is missing.
   */
  compare(name: string, version1: number, version2: number): { before: string; after: string } {
    const v1 = this.getVersion(name, version1);
    const v2 = this.getVersion(name, version2);

    if (!v1 || !v2) {
      throw new Error('Version not found');
    }

    return { before: v1.template, after: v2.template };
  }

  /** Serializes the full history of `name` as human-readable Markdown. */
  exportForGit(name: string): string {
    const versions = this.prompts.get(name) || [];

    return versions
      .map((v) => `# Version ${v.version}\nCreated: ${v.createdAt}\nBy: ${v.createdBy}\n\n${v.template}`)
      .join('\n\n---\n\n');
  }

  /** Escapes regex metacharacters so a string can be embedded in a RegExp. */
  private escapeRegExp(s: string): string {
    return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}

// Demo: register one prompt version and render it with concrete variables.
const manager = new PromptVersionManager();

manager.registerPrompt('summarize', {
  id: 'summary_v1',
  name: 'summarize',
  template: 'Summarize the following text in {{ language }}:\n\n{{ text }}',
  variables: [
    { name: 'text', type: 'string', description: 'Text to summarize' },
    { name: 'language', type: 'string', description: 'Output language' },
  ],
  model: 'gpt-4-turbo',
  temperature: 0.3,
  maxTokens: 500,
  createdAt: new Date(),
  createdBy: 'alice@example.com',
  // `version` is required by PromptVersion (the original literal omitted it
  // and failed type-checking); registerPrompt overwrites it regardless.
  version: 0,
});

const rendered = manager.render('summarize', {
  text: 'Long document...',
  language: 'English',
});

console.log(rendered);

Promptfoo for Regression Testing

Set up automated regression testing of prompts with eval metrics — the same pattern that tools like Promptfoo provide off the shelf, sketched here as a minimal custom runner.

// One evaluation case for a prompt. Provide an exact `expected` output for
// evaluators to score against, per-metric minimums in `expectedScores`, or both.
interface PromptEvalCase {
  input: Record<string, unknown>; // variables fed to the prompt function
  expected?: string; // reference output used by evaluators
  expectedScores?: Record<string, number>; // metric name -> minimum acceptable score
}

/**
 * Runs a prompt function against a suite of eval cases and aggregates
 * evaluator scores.
 *
 * Reported `scores` are averaged over the cases that define `expected`
 * (cases without a reference output cannot be scored). Per-case
 * `expectedScores` thresholds are checked against that case's own evaluator
 * scores — the original compared them against the running aggregate, which
 * made pass/fail depend on case ordering.
 */
class PromptTestRunner {
  async runTests(
    promptName: string,
    promptFn: (input: Record<string, unknown>) => Promise<string>,
    testCases: PromptEvalCase[],
    evaluators: Array<{ name: string; fn: (output: string, expected: string) => number }>
  ): Promise<{ passed: number; failed: number; scores: Record<string, number> }> {
    let passed = 0;
    let failed = 0;

    // Running sums per evaluator; converted to averages at the end.
    const totals: Record<string, number> = {};
    for (const evaluator of evaluators) {
      totals[evaluator.name] = 0;
    }
    let scoredCases = 0;

    for (const testCase of testCases) {
      try {
        const output = await promptFn(testCase.input);
        const caseScores: Record<string, number> = {};

        if (testCase.expected !== undefined) {
          scoredCases++;
          for (const evaluator of evaluators) {
            const score = evaluator.fn(output, testCase.expected);
            caseScores[evaluator.name] = score;
            totals[evaluator.name] += score;
          }
        }

        if (testCase.expectedScores) {
          // A case passes only if each required metric meets its threshold
          // for THIS case. Missing metrics count as 0.
          const meetsExpectations = Object.entries(testCase.expectedScores).every(
            ([metric, threshold]) => (caseScores[metric] ?? 0) >= threshold
          );

          if (meetsExpectations) {
            passed++;
          } else {
            failed++;
          }
        } else {
          // No thresholds declared: completing without throwing counts as a pass.
          passed++;
        }
      } catch {
        failed++;
      }
    }

    // Average only over cases that contributed scores (original divided by
    // testCases.length, diluting averages when some cases lacked `expected`).
    const scores: Record<string, number> = {};
    for (const evaluator of evaluators) {
      scores[evaluator.name] = scoredCases > 0 ? totals[evaluator.name] / scoredCases : 0;
    }

    return { passed, failed, scores };
  }
}

// Demo: score a mocked prompt function against two golden cases using two
// simple evaluators (exact match and length similarity).
const evaluators = [
  {
    name: 'exactMatch',
    fn: (output: string, expected: string): number =>
      output.trim() === expected.trim() ? 1 : 0,
  },
  {
    name: 'length',
    fn: (output: string, expected: string): number => {
      const diff = Math.abs(output.length - expected.length);
      const longest = Math.max(output.length, expected.length);
      return Math.max(0, 1 - diff / longest);
    },
  },
];

const testRunner = new PromptTestRunner();

const testCases: PromptEvalCase[] = [
  { input: { text: 'Short text' }, expected: 'Summary' },
  { input: { text: 'Long document...' }, expected: 'Expected summary' },
];

const mockPromptFn = async (input: Record<string, unknown>) => 'Generated summary';

const results = await testRunner.runTests('summarize', mockPromptFn, testCases, evaluators);

console.log(`Tests: ${results.passed} passed, ${results.failed} failed`);
console.log('Scores:', results.scores);

Eval Dataset Management

Maintain golden datasets for continuous testing.

// A named golden dataset of input/expected pairs used for regression evals.
interface EvalDataset {
  name: string;
  category: string; // coarse grouping, e.g. 'text'
  cases: Array<{
    id: string; // generated unique case id
    input: Record<string, unknown>;
    expected: string; // golden reference output
    metadata?: Record<string, unknown>;
  }>;
  createdAt: Date;
  updatedAt: Date; // bumped whenever a case is added
  version: number;
}

/**
 * Manages named golden datasets for prompt evals: creation, case insertion,
 * train/test splitting, and CSV export.
 */
class EvalDatasetManager {
  private datasets = new Map<string, EvalDataset>();

  /** Creates (or silently replaces) an empty dataset under `name`. */
  createDataset(name: string, category: string): EvalDataset {
    const dataset: EvalDataset = {
      name,
      category,
      cases: [],
      createdAt: new Date(),
      updatedAt: new Date(),
      version: 1,
    };

    this.datasets.set(name, dataset);
    return dataset;
  }

  /**
   * Appends a case with a generated id and bumps `updatedAt`.
   * @throws Error when the dataset does not exist.
   */
  addCase(
    datasetName: string,
    input: Record<string, unknown>,
    expected: string,
    metadata?: Record<string, unknown>
  ): void {
    const dataset = this.datasets.get(datasetName);
    if (!dataset) {
      throw new Error(`Dataset ${datasetName} not found`);
    }

    dataset.cases.push({
      // Timestamp + random suffix; collision-resistant enough for in-memory use.
      id: `case_${Date.now()}_${Math.random()}`,
      input,
      expected,
      metadata,
    });

    dataset.updatedAt = new Date();
  }

  getDataset(name: string): EvalDataset | undefined {
    return this.datasets.get(name);
  }

  /**
   * Splits the dataset in insertion order: the first `ratio` fraction of
   * cases becomes the train set, the remainder the test set. Note the split
   * is NOT shuffled, and the returned datasets share case objects with the
   * original (shallow copies).
   * @throws Error when the dataset does not exist.
   */
  splitTrainTest(datasetName: string, ratio: number = 0.8): {
    train: EvalDataset;
    test: EvalDataset;
  } {
    const dataset = this.datasets.get(datasetName);
    if (!dataset) {
      throw new Error(`Dataset ${datasetName} not found`);
    }

    const splitIdx = Math.floor(dataset.cases.length * ratio);
    const trainCases = dataset.cases.slice(0, splitIdx);
    const testCases = dataset.cases.slice(splitIdx);

    return {
      train: { ...dataset, name: `${datasetName}_train`, cases: trainCases },
      test: { ...dataset, name: `${datasetName}_test`, cases: testCases },
    };
  }

  /**
   * Exports the dataset as RFC 4180-style CSV: every cell is quoted, with
   * embedded double quotes doubled so values containing quotes, commas, or
   * newlines survive a round trip. (The original did not escape quotes, so
   * every JSON-stringified cell produced malformed CSV.)
   * @throws Error when the dataset does not exist.
   */
  exportCsv(datasetName: string): string {
    const dataset = this.datasets.get(datasetName);
    if (!dataset) {
      throw new Error(`Dataset ${datasetName} not found`);
    }

    const headers = ['id', 'input', 'expected', 'metadata'];
    const rows = dataset.cases.map((c) => [
      c.id,
      JSON.stringify(c.input),
      c.expected,
      JSON.stringify(c.metadata || {}),
    ]);

    const quote = (cell: string): string => `"${cell.replace(/"/g, '""')}"`;

    return [headers, ...rows].map((row) => row.map(quote).join(',')).join('\n');
  }
}

// Demo: build a tiny golden dataset and split it 80/20.
const datasetManager = new EvalDatasetManager();
const dataset = datasetManager.createDataset('summarization', 'text');

for (const [text, summary] of [
  ['Article content', 'Expected summary'],
  ['Another article', 'Another summary'],
] as const) {
  datasetManager.addCase(dataset.name, { text }, summary);
}

const { train, test } = datasetManager.splitTrainTest(dataset.name, 0.8);
console.log(`Train: ${train.cases.length}, Test: ${test.cases.length}`);

A/B Testing Prompts in Production

Deploy multiple prompts and measure performance differences.

// Configuration for a two-variant prompt experiment. Only promptA.weight is
// consulted during selection; variant B receives the remaining traffic.
interface ABTestConfig {
  name: string;
  promptA: { id: string; weight: number }; // weight = fraction of traffic (0..1)
  promptB: { id: string; weight: number }; // weight currently unused by selection
  metrics: string[]; // first entry is the primary (winner-deciding) metric
  startAt: Date;
  endAt: Date;
}

/**
 * Simple two-variant A/B test harness: deterministic user-to-variant
 * assignment via hashing, additive metric accumulation, and a winner derived
 * from the first configured metric.
 */
class ABTester {
  private activeTests = new Map<string, ABTestConfig>();
  private results = new Map<string, { promptA: Record<string, number>; promptB: Record<string, number> }>();

  /** Registers the test and zero-initializes per-variant metric sums. */
  startTest(config: ABTestConfig): void {
    this.activeTests.set(config.name, config);
    this.results.set(config.name, {
      promptA: Object.fromEntries(config.metrics.map((m) => [m, 0])),
      promptB: Object.fromEntries(config.metrics.map((m) => [m, 0])),
    });
  }

  /**
   * Deterministically assigns `userId` to a variant and returns that
   * variant's prompt id. Only promptA.weight (a 0..1 fraction) is consulted;
   * variant B receives the remaining traffic.
   * @throws Error when the test is unknown.
   */
  selectPrompt(testName: string, userId: string): string {
    const test = this.activeTests.get(testName);
    if (!test) {
      throw new Error(`Test ${testName} not found`);
    }

    // Consistent hashing based on userId — the same user always sees the
    // same variant for the lifetime of the test.
    const hash = this.hashUserId(userId);
    const variant = hash % 100 < test.promptA.weight * 100 ? 'A' : 'B';

    return variant === 'A' ? test.promptA.id : test.promptB.id;
  }

  /** Adds `value` to the named metric sum of whichever variant owns `promptId`. */
  recordMetric(testName: string, promptId: string, metric: string, value: number): void {
    const results = this.results.get(testName);
    if (!results) return;

    const test = this.activeTests.get(testName);
    if (!test) return;

    if (promptId === test.promptA.id) {
      results.promptA[metric] = (results.promptA[metric] || 0) + value;
    } else if (promptId === test.promptB.id) {
      results.promptB[metric] = (results.promptB[metric] || 0) + value;
    }
  }

  /**
   * Returns accumulated metric sums and, when the primary metric (the first
   * configured one) distinguishes the variants, the winner. On a tie, with no
   * data yet, or with no configured metrics, `winner` is left undefined —
   * the original arbitrarily declared 'B' in all of those cases.
   * NOTE(review): sums assume higher-is-better; a latency-style metric would
   * need inversion before comparison — confirm with metric owners.
   * @throws Error when the test is unknown.
   */
  getResults(testName: string): {
    promptA: Record<string, number>;
    promptB: Record<string, number>;
    winner?: string;
  } {
    const results = this.results.get(testName);
    if (!results) {
      throw new Error(`Test ${testName} not found`);
    }

    const primaryMetric = this.activeTests.get(testName)?.metrics[0];
    if (primaryMetric === undefined) {
      return { ...results };
    }

    const scoreA = results.promptA[primaryMetric] ?? 0;
    const scoreB = results.promptB[primaryMetric] ?? 0;
    if (scoreA === scoreB) {
      return { ...results };
    }

    return { ...results, winner: scoreA > scoreB ? 'A' : 'B' };
  }

  /** Java-style 31x string hash ((h << 5) - h === h * 31), mapped to a non-negative 32-bit int. */
  private hashUserId(userId: string): number {
    let hash = 0;
    for (let i = 0; i < userId.length; i++) {
      hash = (hash << 5) - hash + userId.charCodeAt(i);
      hash |= 0; // Convert to 32-bit integer
    }
    return Math.abs(hash);
  }
}

// Demo: run a 50/50 split test for one week and record a quality sample.
const abTester = new ABTester();

const weekMs = 86400000 * 7;
abTester.startTest({
  name: 'prompt_test_v1',
  promptA: { id: 'prompt_v1', weight: 0.5 },
  promptB: { id: 'prompt_v2', weight: 0.5 },
  metrics: ['quality_score', 'latency_ms'],
  startAt: new Date(),
  endAt: new Date(Date.now() + weekMs),
});

const selected = abTester.selectPrompt('prompt_test_v1', 'user123');
console.log(`Selected prompt: ${selected}`);

abTester.recordMetric('prompt_test_v1', selected, 'quality_score', 0.85);
const results = abTester.getResults('prompt_test_v1');
console.log(`Results:`, results);

Canary Prompt Deployments

Gradually roll out prompt changes to verify safety.

// Rollout settings for a canary prompt. Rollback triggers when the observed
// error rate exceeds errorThreshold or average quality drops below qualityThreshold.
interface CanaryConfig {
  promptId: string;
  baseline: string; // currently-trusted prompt id
  canary: string; // candidate prompt id
  trafficPercentage: number; // 0-100 share of requests routed to the canary
  errorThreshold: number; // max tolerated error rate (fraction)
  qualityThreshold: number; // min tolerated average quality score
}

/**
 * Gradual prompt rollout: routes a percentage of traffic to a canary prompt
 * and tracks error rate / quality so a bad canary can be rolled back.
 * NOTE(review): recordResult pools canary and baseline traffic into a single
 * bucket — rollback decisions reflect the blend, not the canary alone;
 * confirm whether callers tag results by variant upstream.
 */
class CanaryDeployer {
  private deployments = new Map<string, CanaryConfig>();
  private metrics = new Map<string, { errors: number; quality: number; requests: number }>();

  /** Registers the canary config and zeroes its metrics. */
  initializeCanary(config: CanaryConfig): void {
    this.deployments.set(config.promptId, config);
    this.metrics.set(config.promptId, { errors: 0, quality: 0, requests: 0 });
  }

  /**
   * Routes a request: 'canary' with trafficPercentage odds, else 'baseline'.
   * Unknown prompt ids always get 'baseline'.
   */
  selectPrompt(promptId: string): string {
    const config = this.deployments.get(promptId);
    if (!config) {
      return 'baseline';
    }

    const rand = Math.random() * 100;
    return rand < config.trafficPercentage ? 'canary' : 'baseline';
  }

  /** Accumulates one request's outcome into the deployment's metrics. */
  recordResult(promptId: string, success: boolean, qualityScore: number): void {
    const metrics = this.metrics.get(promptId);
    if (!metrics) return;

    metrics.requests++;
    if (!success) metrics.errors++;
    metrics.quality += qualityScore;
  }

  /**
   * True when error rate or average quality breaches the configured
   * thresholds. With no recorded traffic there is no evidence either way,
   * so this returns false.
   */
  shouldRollback(promptId: string): boolean {
    const config = this.deployments.get(promptId);
    const metrics = this.metrics.get(promptId);

    if (!config || !metrics || metrics.requests === 0) {
      return false;
    }

    const errorRate = metrics.errors / metrics.requests;
    const avgQuality = metrics.quality / metrics.requests;

    return errorRate > config.errorThreshold || avgQuality < config.qualityThreshold;
  }

  /**
   * Promotes the canary to baseline (and opens it to 100% traffic) unless
   * the rollback condition currently holds. Returns whether promotion happened.
   */
  promoteCanary(promptId: string): boolean {
    const config = this.deployments.get(promptId);
    if (!config) return false;

    if (this.shouldRollback(promptId)) {
      console.log(`Canary rollback triggered for ${promptId}`);
      return false;
    }

    console.log(`Promoting canary to baseline for ${promptId}`);
    config.baseline = config.canary;
    config.trafficPercentage = 100;

    return true;
  }

  /**
   * Snapshot of request count, error rate, and average quality.
   * Rates are reported as 0 (not NaN) before any traffic is recorded —
   * the original divided by zero here.
   */
  getMetrics(promptId: string): Record<string, number> | undefined {
    const metrics = this.metrics.get(promptId);
    if (!metrics) return undefined;

    const { requests } = metrics;
    return {
      requests,
      errorRate: requests > 0 ? metrics.errors / requests : 0,
      avgQuality: requests > 0 ? metrics.quality / requests : 0,
    };
  }
}

// Demo: canary 10% of traffic and inspect the rolling metrics.
const deployer = new CanaryDeployer();

const canaryConfig = {
  promptId: 'summarize',
  baseline: 'original_prompt',
  canary: 'new_prompt_v2',
  trafficPercentage: 10,
  errorThreshold: 0.05,
  qualityThreshold: 0.75,
};
deployer.initializeCanary(canaryConfig);

const selected = deployer.selectPrompt('summarize');
deployer.recordResult('summarize', true, 0.88);

console.log('Metrics:', deployer.getMetrics('summarize'));

Prompt Template Libraries With Variable Injection

Build reusable prompt components with parameterized variables.

/**
 * Registry of reusable prompt templates and named components.
 * Templates reference components as `[component_key]` slots and variables as
 * `{{ variable }}` placeholders.
 */
class PromptLibrary {
  private templates = new Map<string, string>();
  private components = new Map<string, string>();

  /** Registers (or replaces) a top-level template. */
  registerTemplate(name: string, template: string): void {
    this.templates.set(name, template);
  }

  /** Registers (or replaces) a reusable component snippet. */
  registerComponent(name: string, template: string): void {
    this.components.set(name, template);
  }

  /**
   * Renders `templateName` with variable substitution only (component slots
   * are left as-is).
   * @throws Error when the template is not registered.
   */
  compose(templateName: string, variables: Record<string, unknown>): string {
    const template = this.templates.get(templateName);
    if (!template) {
      throw new Error(`Template ${templateName} not found`);
    }

    return this.inject(template, variables);
  }

  /**
   * Renders `templateName`, first expanding `[key]` component slots, then
   * injecting `{{ variable }}` values. A string value is looked up in the
   * component registry; unregistered names leave the slot untouched.
   * @throws Error when the template is not registered.
   */
  composeWithComponents(
    templateName: string,
    components: Record<string, unknown>,
    variables: Record<string, unknown>
  ): string {
    const template = this.templates.get(templateName);
    if (!template) {
      throw new Error(`Template ${templateName} not found`);
    }

    // First expand components
    let result = template;

    for (const [key, value] of Object.entries(components)) {
      const component = typeof value === 'string' ? this.components.get(value) : String(value);
      if (component) {
        // Escaped key + replacer function: keys with regex metacharacters stay
        // literal, and '$'-sequences in component text are inserted verbatim
        // instead of being treated as replacement patterns.
        result = result.replace(new RegExp(`\\[${this.escapeRegExp(key)}\\]`, 'g'), () => component);
      }
    }

    // Then inject variables
    return this.inject(result, variables);
  }

  /** Replaces every `{{ key }}` placeholder with `String(value)`. */
  private inject(template: string, variables: Record<string, unknown>): string {
    let result = template;

    for (const [key, value] of Object.entries(variables)) {
      result = result.replace(new RegExp(`{{\\s*${this.escapeRegExp(key)}\\s*}}`, 'g'), () => String(value));
    }

    return result;
  }

  /** Escapes regex metacharacters so a string can be embedded in a RegExp. */
  private escapeRegExp(s: string): string {
    return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}

// Demo: compose a prompt from two reusable components plus one variable.
const library = new PromptLibrary();

library.registerComponent('system_message', 'You are a helpful assistant.');
library.registerComponent('constraint', 'Keep responses concise and factual.');

library.registerTemplate(
  'default',
  '[system_message]\n[constraint]\n\nUser input: {{ input }}\n\nResponse:'
);

const prompt = library.composeWithComponents(
  'default',
  { system_message: 'system_message', constraint: 'constraint' },
  { input: 'What is AI?' }
);

console.log(prompt);

Model Migration Testing

Test prompt compatibility across different models before switching.

/**
 * Compares a prompt's eval quality on two models to decide whether a
 * migration is safe. Quality scoring here is simulated; testModel's body is
 * where real model calls would go.
 */
class ModelMigrationTester {
  /**
   * Runs the test cases on both models and reports their metrics.
   * Migration is considered safe when the target retains at least 95% of the
   * source model's average quality. With an empty test set (or zero source
   * quality) there is no baseline signal, so the migration is NOT declared
   * safe — the original computed NaN here.
   */
  async testMigration(
    prompt: string,
    testCases: Array<{ input: string; expectedQuality: number }>,
    sourceModel: string,
    targetModel: string
  ): Promise<{
    sourceMetrics: Record<string, number>;
    targetMetrics: Record<string, number>;
    safeToMigrate: boolean;
  }> {
    const sourceResults = await this.testModel(prompt, testCases, sourceModel);
    const targetResults = await this.testModel(prompt, testCases, targetModel);

    const safeToMigrate =
      sourceResults.avgQuality > 0 &&
      targetResults.avgQuality / sourceResults.avgQuality >= 0.95; // ≥95% quality preserved

    return {
      sourceMetrics: sourceResults,
      targetMetrics: targetResults,
      safeToMigrate,
    };
  }

  /**
   * Scores `prompt` on `model` across the test cases.
   * NOTE: this is a simulation — quality is drawn uniformly from [0.8, 1.0)
   * instead of calling a model API. (The original formula, random * 0.9 + 0.8,
   * could produce values up to 1.7 despite its "0.8-1.0" comment.)
   */
  private async testModel(
    prompt: string,
    testCases: Array<{ input: string; expectedQuality: number }>,
    model: string
  ): Promise<Record<string, number>> {
    if (testCases.length === 0) {
      return { avgQuality: 0, successRate: 0 };
    }

    let totalQuality = 0;
    let successCount = 0;

    for (const testCase of testCases) {
      try {
        // Simulate API call
        const quality = Math.random() * 0.2 + 0.8; // mock quality in [0.8, 1.0)
        totalQuality += quality;
        if (quality >= testCase.expectedQuality) {
          successCount++;
        }
      } catch (error) {
        // Count as failure: errored cases add neither quality nor success.
      }
    }

    return {
      avgQuality: totalQuality / testCases.length,
      successRate: successCount / testCases.length,
    };
  }
}

// Demo: evaluate whether 'claude-3' can safely replace 'gpt-4' for a prompt.
const migrationTester = new ModelMigrationTester();

const migrationCases = [
  { input: 'Test 1', expectedQuality: 0.85 },
  { input: 'Test 2', expectedQuality: 0.8 },
];

const result = await migrationTester.testMigration('Prompt text', migrationCases, 'gpt-4', 'claude-3');

console.log('Safe to migrate:', result.safeToMigrate);
console.log('Source metrics:', result.sourceMetrics);
console.log('Target metrics:', result.targetMetrics);

CI Pipeline for Prompt Regressions

Automate prompt testing on every commit.

// Thresholds a prompt must clear for the CI gate to pass.
interface CIPipelineConfig {
  promptName: string;
  datasetPath: string; // path to the eval dataset file
  minPassRate: number; // fraction of cases that must pass (0..1)
  minQualityScore: number; // minimum average quality score
}

/**
 * CI gate for prompt changes: loads an eval dataset, scores the prompt, and
 * passes only when both the pass rate and the average quality clear the
 * configured thresholds.
 */
class PromptCIPipeline {
  /** Runs the eval suite for `config.promptName` and reports pass/fail plus details. */
  async runTests(config: CIPipelineConfig): Promise<{ passed: boolean; details: Record<string, unknown> }> {
    console.log(`Running CI tests for ${config.promptName}...`);

    // 1. Load eval dataset
    const dataset = this.loadDataset(config.datasetPath);

    // 2. Run tests
    const results = await this.runEvals(config.promptName, dataset);

    // 3. Check thresholds. Guard against an empty dataset so the ratios are
    // 0 rather than NaN — an empty suite fails the gate instead of producing
    // garbage comparisons.
    const passRate = results.total > 0 ? results.passed / results.total : 0;
    const avgQuality = results.total > 0 ? results.totalQuality / results.total : 0;

    const passed = passRate >= config.minPassRate && avgQuality >= config.minQualityScore;

    console.log(`Pass rate: ${(passRate * 100).toFixed(2)}% (min: ${config.minPassRate * 100}%)`);
    console.log(`Avg quality: ${avgQuality.toFixed(2)} (min: ${config.minQualityScore})`);

    return {
      passed,
      details: {
        passRate,
        avgQuality,
        testCount: results.total,
        failedTests: results.total - results.passed,
      },
    };
  }

  /** Loads the golden dataset. Stubbed: a real implementation would read `path`. */
  private loadDataset(path: string): Array<{ input: string; expected: string }> {
    // In real implementation, load from file
    return [
      { input: 'Test input 1', expected: 'Expected output 1' },
      { input: 'Test input 2', expected: 'Expected output 2' },
    ];
  }

  /** Simulated eval loop: scores each case and counts those above 0.75 quality. */
  private async runEvals(
    promptName: string,
    dataset: Array<{ input: string; expected: string }>
  ): Promise<{ passed: number; total: number; totalQuality: number }> {
    let passed = 0;
    let totalQuality = 0;

    for (const testCase of dataset) {
      // Simulate prompt execution
      const output = `Response to: ${testCase.input}`;

      // Score output (mock heuristic: echoing the input prefix counts as good)
      const quality = output.includes(testCase.input.slice(0, 5)) ? 0.9 : 0.7;
      totalQuality += quality;

      if (quality > 0.75) {
        passed++;
      }
    }

    return { passed, total: dataset.length, totalQuality };
  }
}

// Demo: gate the 'summarize' prompt on its eval thresholds.
const pipeline = new PromptCIPipeline();

const ciResult = await pipeline.runTests({
  promptName: 'summarize',
  datasetPath: './datasets/summarization_eval.json',
  minPassRate: 0.9,
  minQualityScore: 0.8,
});

console.log('CI Test Result:', ciResult.passed ? 'PASSED' : 'FAILED');

Checklist

  • Store all prompts in version control with metadata
  • Create eval datasets for continuous testing
  • Implement automated regression tests for every prompt change
  • Use A/B testing to measure prompt improvements in production
  • Canary deploy new prompts to 5-10% traffic first
  • Test prompts on new models before switching
  • Maintain a prompt template library with reusable components
  • Gate prompt deployments on eval quality thresholds
  • Monitor prompt performance metrics continuously
  • Log all prompt changes with rationale and author
  • Run CI tests on every prompt commit
  • Document expected quality metrics per prompt

Conclusion

Treating prompts as code is essential for reliable AI applications. Version all prompts, test against golden datasets before deployment, use A/B testing for data-driven improvements, and canary new prompts to catch quality regressions early. This approach scales prompt management from manual tweaking to disciplined engineering.

Advertisement

Sanjeev Sharma

Written by

Sanjeev Sharma

Full Stack Engineer · E-mopro