Published on

Prompt Versioning and Testing — Treating Prompts Like Code

Authors

Introduction

Prompts are code: changes should go through version control, automated testing, and staged rollouts. Yet most teams treat prompts as black-box strings that are changed ad hoc. This guide covers production-grade prompt management and testing.

Prompt as Code in Version Control

Store prompts with templates, variables, and metadata in Git.

/**
 * A single immutable version of a prompt, stored together with the
 * model/sampling configuration so one record fully describes a run setup.
 */
interface PromptVersion {
  id: string; // Stable identifier for this version record
  name: string; // Logical prompt name shared by all versions
  template: string; // Template text containing {{ variable }} placeholders
  variables: Array<{ name: string; type: string; description: string }>; // Declared template variables
  model: string; // Target model identifier (e.g. "gpt-4-turbo")
  temperature: number; // Sampling temperature used with this prompt
  maxTokens: number; // Maximum completion tokens for this prompt
  createdAt: Date; // When this version was registered
  createdBy: string; // Author of this version
  version: number; // 1-based sequence number (assigned by PromptVersionManager)
  metadata?: Record<string, unknown>; // Free-form extra data
}

/**
 * In-memory, append-only registry of prompt versions keyed by prompt name.
 * Version numbers are assigned sequentially starting at 1.
 */
class PromptVersionManager {
  private prompts = new Map<string, PromptVersion[]>();

  /**
   * Registers a new version of `name`. The prompt's `version` field is
   * overwritten with the next sequence number for that name.
   */
  registerPrompt(name: string, prompt: PromptVersion): void {
    if (!this.prompts.has(name)) {
      this.prompts.set(name, []);
    }

    const versions = this.prompts.get(name)!;
    prompt.version = versions.length + 1;
    versions.push(prompt);
  }

  /** Most recently registered version of `name`, or undefined if unknown. */
  getLatest(name: string): PromptVersion | undefined {
    const versions = this.prompts.get(name);
    return versions?.[versions.length - 1];
  }

  /** Specific 1-based version of `name`, or undefined if missing. */
  getVersion(name: string, version: number): PromptVersion | undefined {
    return this.prompts.get(name)?.[version - 1];
  }

  /**
   * Renders a template by substituting every `{{ key }}` placeholder with the
   * corresponding stringified value. Uses the latest version unless `version`
   * is given.
   * @throws Error when the prompt (or the requested version) does not exist.
   */
  render(name: string, variables: Record<string, unknown>, version?: number): string {
    const prompt = version
      ? this.getVersion(name, version)
      : this.getLatest(name);

    if (!prompt) {
      throw new Error(`Prompt ${name} not found`);
    }

    let rendered = prompt.template;

    for (const [key, value] of Object.entries(variables)) {
      // Escape regex metacharacters in the key so names like "a.b" match
      // literally, and use a replacer function so '$' sequences in the value
      // ('$&', '$1', ...) are inserted verbatim instead of being interpreted
      // as special replacement patterns by String.prototype.replace.
      const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      rendered = rendered.replace(
        new RegExp(`{{\\s*${escapedKey}\\s*}}`, 'g'),
        () => String(value)
      );
    }

    return rendered;
  }

  /** All versions of `name` in registration order (empty array if unknown). */
  getHistory(name: string): PromptVersion[] {
    return this.prompts.get(name) || [];
  }

  /**
   * Returns the raw templates of two versions for diffing.
   * @throws Error when either version is missing.
   */
  compare(name: string, version1: number, version2: number): { before: string; after: string } {
    const v1 = this.getVersion(name, version1);
    const v2 = this.getVersion(name, version2);

    if (!v1 || !v2) {
      throw new Error('Version not found');
    }

    return { before: v1.template, after: v2.template };
  }

  /** Serializes all versions of `name` as a Markdown changelog for Git. */
  exportForGit(name: string): string {
    const versions = this.prompts.get(name) || [];

    const markdown = versions
      .map((v) => `# Version ${v.version}\nCreated: ${v.createdAt}\nBy: ${v.createdBy}\n\n${v.template}`)
      .join('\n\n---\n\n');

    return markdown;
  }
}

// Demo: register the first version of the "summarize" prompt, then render it.
const manager = new PromptVersionManager();

const summarizeV1 = {
  id: 'summary_v1',
  name: 'summarize',
  template: 'Summarize the following text in {{ language }}:\n\n{{ text }}',
  variables: [
    { name: 'text', type: 'string', description: 'Text to summarize' },
    { name: 'language', type: 'string', description: 'Output language' },
  ],
  model: 'gpt-4-turbo',
  temperature: 0.3,
  maxTokens: 500,
  createdAt: new Date(),
  createdBy: 'alice@example.com',
};

manager.registerPrompt('summarize', summarizeV1);

// Render the latest version with concrete variable values.
const renderVariables = { text: 'Long document...', language: 'English' };
const rendered = manager.render('summarize', renderVariables);

console.log(rendered);

Promptfoo for Regression Testing

Set up automated regression testing of prompts with eval metrics. Tools such as Promptfoo provide this out of the box; the runner below illustrates the core mechanics.

/**
 * One eval case: the input variables for the prompt, plus an optional
 * reference output and/or per-metric score thresholds the case must meet.
 */
interface PromptEvalCase {
  input: Record<string, unknown>; // Variables passed to the prompt function
  expected?: string; // Reference output used by evaluators for scoring
  expectedScores?: Record<string, number>; // Minimum required score per evaluator name
}

/**
 * Runs a prompt function over a suite of eval cases, scoring outputs with a
 * set of named evaluators and checking per-case score thresholds.
 */
class PromptTestRunner {
  /**
   * Executes every test case, accumulating average scores per evaluator.
   * A case passes when it has no `expectedScores`, or when every listed
   * metric meets its threshold for THAT case. A thrown prompt call counts
   * as a failure.
   *
   * @param promptName - Name of the prompt under test (informational).
   * @param promptFn - Function that produces the model output for an input.
   * @param testCases - Eval cases to run.
   * @param evaluators - Named scoring functions (output, expected) -> score.
   * @returns Pass/fail counts and the average score per evaluator.
   */
  async runTests(
    promptName: string,
    promptFn: (input: Record<string, unknown>) => Promise<string>,
    testCases: PromptEvalCase[],
    evaluators: Array<{ name: string; fn: (output: string, expected: string) => number }>
  ): Promise<{ passed: number; failed: number; scores: Record<string, number> }> {
    let passed = 0;
    let failed = 0;
    const scores: Record<string, number> = {};

    for (const evaluator of evaluators) {
      scores[evaluator.name] = 0;
    }

    for (const testCase of testCases) {
      try {
        const output = await promptFn(testCase.input);

        // Score this case in isolation. Thresholds must be checked against
        // the CURRENT case's scores — checking the running average mid-loop
        // (as a naive implementation does) makes early cases fail spuriously
        // because the average is still divided by the full suite size.
        const caseScores: Record<string, number> = {};

        if (testCase.expected) {
          for (const evaluator of evaluators) {
            const score = evaluator.fn(output, testCase.expected);
            caseScores[evaluator.name] = score;
            scores[evaluator.name] += score / testCases.length;
          }
        }

        if (testCase.expectedScores) {
          const meetsExpectations = Object.entries(testCase.expectedScores).every(
            ([metric, threshold]) => (caseScores[metric] ?? 0) >= threshold
          );

          if (meetsExpectations) {
            passed++;
          } else {
            failed++;
          }
        } else {
          passed++;
        }
      } catch {
        failed++;
      }
    }

    return { passed, failed, scores };
  }
}

// Two simple evaluators: exact string equality and length similarity.
const evaluators = [
  {
    name: 'exactMatch',
    // 1 when trimmed outputs are identical, else 0.
    fn: (output: string, expected: string): number =>
      output.trim() === expected.trim() ? 1 : 0,
  },
  {
    name: 'length',
    // Closeness of the output length to the expected length, in [0, 1].
    fn: (output: string, expected: string): number => {
      const diff = Math.abs(output.length - expected.length);
      const larger = Math.max(output.length, expected.length);
      return Math.max(0, 1 - diff / larger);
    },
  },
];

const testCases: PromptEvalCase[] = [
  { input: { text: 'Short text' }, expected: 'Summary' },
  { input: { text: 'Long document...' }, expected: 'Expected summary' },
];

// Stand-in for a real model call; returns a fixed string.
const mockPromptFn = async (input: Record<string, unknown>) => 'Generated summary';

const testRunner = new PromptTestRunner();
const results = await testRunner.runTests('summarize', mockPromptFn, testCases, evaluators);

console.log(`Tests: ${results.passed} passed, ${results.failed} failed`);
console.log('Scores:', results.scores);

Eval Dataset Management

Maintain golden datasets for continuous testing.

/**
 * A named golden dataset of input/expected pairs used for regression evals.
 */
interface EvalDataset {
  name: string; // Unique dataset name (also the lookup key in the manager)
  category: string; // Free-form grouping label (e.g. "text")
  cases: Array<{
    id: string; // Unique case id (generated on insert)
    input: Record<string, unknown>; // Prompt input variables
    expected: string; // Gold/reference output
    metadata?: Record<string, unknown>; // Optional per-case annotations
  }>;
  createdAt: Date; // When the dataset was created
  updatedAt: Date; // Bumped whenever a case is added
  version: number; // Dataset version (starts at 1)
}

/**
 * Manages named golden datasets for prompt evals: creation, case insertion,
 * deterministic train/test splitting, and CSV export.
 */
class EvalDatasetManager {
  private datasets = new Map<string, EvalDataset>();

  /** Creates and registers an empty dataset (version 1). */
  createDataset(name: string, category: string): EvalDataset {
    const dataset: EvalDataset = {
      name,
      category,
      cases: [],
      createdAt: new Date(),
      updatedAt: new Date(),
      version: 1,
    };

    this.datasets.set(name, dataset);
    return dataset;
  }

  /**
   * Appends a case with a generated id and bumps `updatedAt`.
   * @throws Error when the dataset does not exist.
   */
  addCase(
    datasetName: string,
    input: Record<string, unknown>,
    expected: string,
    metadata?: Record<string, unknown>
  ): void {
    const dataset = this.datasets.get(datasetName);
    if (!dataset) {
      throw new Error(`Dataset ${datasetName} not found`);
    }

    dataset.cases.push({
      // Timestamp + random suffix; collisions are unlikely but possible —
      // prefer a UUID in production.
      id: `case_${Date.now()}_${Math.random()}`,
      input,
      expected,
      metadata,
    });

    dataset.updatedAt = new Date();
  }

  /** Looks up a dataset by name. */
  getDataset(name: string): EvalDataset | undefined {
    return this.datasets.get(name);
  }

  /**
   * Splits cases into train/test views in insertion order (no shuffling).
   * `ratio` is the train fraction; the returned views share top-level fields
   * with the source dataset but carry suffixed names.
   * @throws Error when the dataset does not exist.
   */
  splitTrainTest(datasetName: string, ratio: number = 0.8): {
    train: EvalDataset;
    test: EvalDataset;
  } {
    const dataset = this.datasets.get(datasetName);
    if (!dataset) {
      throw new Error(`Dataset ${datasetName} not found`);
    }

    const splitIdx = Math.floor(dataset.cases.length * ratio);
    const trainCases = dataset.cases.slice(0, splitIdx);
    const testCases = dataset.cases.slice(splitIdx);

    return {
      train: { ...dataset, name: `${datasetName}_train`, cases: trainCases },
      test: { ...dataset, name: `${datasetName}_test`, cases: testCases },
    };
  }

  /**
   * Exports the dataset as RFC 4180-style CSV: every cell is quoted and
   * embedded double quotes are escaped by doubling, so JSON-encoded inputs
   * (which always contain quotes) survive a round trip.
   * @throws Error when the dataset does not exist.
   */
  exportCsv(datasetName: string): string {
    const dataset = this.datasets.get(datasetName);
    if (!dataset) {
      throw new Error(`Dataset ${datasetName} not found`);
    }

    const headers = ['id', 'input', 'expected', 'metadata'];
    const rows = dataset.cases.map((c) => [
      c.id,
      JSON.stringify(c.input),
      c.expected,
      JSON.stringify(c.metadata || {}),
    ]);

    // Fix: naive `"${cell}"` wrapping produced malformed CSV for any cell
    // containing a double quote; RFC 4180 requires doubling embedded quotes.
    const quote = (cell: string) => `"${cell.replace(/"/g, '""')}"`;
    const csv = [headers, ...rows].map((row) => row.map(quote).join(',')).join('\n');

    return csv;
  }
}

// Build a small golden dataset, then carve out a train/test split.
const datasetManager = new EvalDatasetManager();
const dataset = datasetManager.createDataset('summarization', 'text');

const seedCases: Array<[Record<string, unknown>, string]> = [
  [{ text: 'Article content' }, 'Expected summary'],
  [{ text: 'Another article' }, 'Another summary'],
];
for (const [input, expected] of seedCases) {
  datasetManager.addCase(dataset.name, input, expected);
}

const { train, test } = datasetManager.splitTrainTest(dataset.name, 0.8);
console.log(`Train: ${train.cases.length}, Test: ${test.cases.length}`);

A/B Testing Prompts in Production

Deploy multiple prompts and measure performance differences.

/**
 * Configuration for an A/B test between two prompt variants.
 * `promptA.weight` is a fraction in [0, 1] controlling the traffic split;
 * variant B receives the remainder (promptB.weight is not read by selection
 * — NOTE(review): confirm the two weights are intended to sum to 1).
 */
interface ABTestConfig {
  name: string; // Unique test name
  promptA: { id: string; weight: number }; // Variant A id and traffic fraction
  promptB: { id: string; weight: number }; // Variant B id (weight unused by selectPrompt)
  metrics: string[]; // Tracked metric names; index 0 is the primary metric
  startAt: Date; // Scheduled start (informational in this implementation)
  endAt: Date; // Scheduled end (informational in this implementation)
}

/**
 * Lightweight A/B testing harness for prompts. Users are assigned to a
 * variant via deterministic hashing, so the same user always sees the same
 * prompt for the lifetime of a test.
 */
class ABTester {
  private activeTests = new Map<string, ABTestConfig>();
  private results = new Map<string, { promptA: Record<string, number>; promptB: Record<string, number> }>();

  /** Registers a test and zero-initializes its per-variant metric sums. */
  startTest(config: ABTestConfig): void {
    this.activeTests.set(config.name, config);
    this.results.set(config.name, {
      promptA: Object.fromEntries(config.metrics.map((m) => [m, 0])),
      promptB: Object.fromEntries(config.metrics.map((m) => [m, 0])),
    });
  }

  /**
   * Picks a prompt id for `userId` via consistent hashing, so assignment is
   * stable across calls for the same user.
   * @throws Error when the test is unknown.
   */
  selectPrompt(testName: string, userId: string): string {
    const test = this.activeTests.get(testName);
    if (!test) {
      throw new Error(`Test ${testName} not found`);
    }

    // Consistent hashing based on userId
    const hash = this.hashUserId(userId);
    const variant = hash % 100 < test.promptA.weight * 100 ? 'A' : 'B';

    return variant === 'A' ? test.promptA.id : test.promptB.id;
  }

  /** Adds `value` to the running sum of `metric` for the variant owning `promptId`. */
  recordMetric(testName: string, promptId: string, metric: string, value: number): void {
    const results = this.results.get(testName);
    if (!results) return;

    const test = this.activeTests.get(testName);
    if (!test) return;

    if (promptId === test.promptA.id) {
      results.promptA[metric] = (results.promptA[metric] || 0) + value;
    } else if (promptId === test.promptB.id) {
      results.promptB[metric] = (results.promptB[metric] || 0) + value;
    }
  }

  /**
   * Returns accumulated metric sums per variant plus a provisional winner on
   * the primary (first) metric. `winner` is undefined when the test has no
   * metrics configured or when the variants are tied, matching the optional
   * return field — a winner is never declared without an actual difference.
   * NOTE: the comparison is on raw sums with no significance testing; treat
   * the winner as indicative only.
   * @throws Error when the test is unknown.
   */
  getResults(testName: string): {
    promptA: Record<string, number>;
    promptB: Record<string, number>;
    winner?: string;
  } {
    const results = this.results.get(testName);
    if (!results) {
      throw new Error(`Test ${testName} not found`);
    }

    const primaryMetric = this.activeTests.get(testName)?.metrics[0];
    let winner: string | undefined;

    if (primaryMetric !== undefined) {
      const a = results.promptA[primaryMetric] ?? 0;
      const b = results.promptB[primaryMetric] ?? 0;
      if (a !== b) {
        winner = a > b ? 'A' : 'B';
      }
    }

    return { ...results, winner };
  }

  /** Deterministic 31x rolling string hash, folded to a non-negative 32-bit int. */
  private hashUserId(userId: string): number {
    let hash = 0;
    for (let i = 0; i < userId.length; i++) {
      hash = (hash << 5) - hash + userId.charCodeAt(i);
      hash |= 0; // Convert to 32-bit integer
    }
    return Math.abs(hash);
  }
}

// Run a 50/50 experiment between two prompt versions for one week.
const abTester = new ABTester();

const oneWeekMs = 86400000 * 7;
abTester.startTest({
  name: 'prompt_test_v1',
  promptA: { id: 'prompt_v1', weight: 0.5 },
  promptB: { id: 'prompt_v2', weight: 0.5 },
  metrics: ['quality_score', 'latency_ms'],
  startAt: new Date(),
  endAt: new Date(Date.now() + oneWeekMs),
});

// Route one user, then report a quality observation for the chosen variant.
const selected = abTester.selectPrompt('prompt_test_v1', 'user123');
console.log(`Selected prompt: ${selected}`);

abTester.recordMetric('prompt_test_v1', selected, 'quality_score', 0.85);
const results = abTester.getResults('prompt_test_v1');
console.log(`Results:`, results);

Canary Prompt Deployments

Gradually roll out prompt changes to verify safety.

/**
 * Settings for a canary rollout of a new prompt against a baseline.
 */
interface CanaryConfig {
  promptId: string; // Key identifying the deployment
  baseline: string; // Current production prompt (id or text)
  canary: string; // Candidate prompt under evaluation
  trafficPercentage: number; // Percent of requests (0-100) routed to the canary
  errorThreshold: number; // Max tolerated error rate (0-1) before rollback
  qualityThreshold: number; // Min tolerated average quality before rollback
}

/**
 * Canary rollout controller: routes a configurable slice of traffic to a
 * candidate prompt, tracks error/quality metrics, and decides whether to
 * roll back or promote.
 */
class CanaryDeployer {
  private deployments = new Map<string, CanaryConfig>();
  private metrics = new Map<string, { errors: number; quality: number; requests: number }>();

  /** Registers a canary deployment and zeroes its metrics. */
  initializeCanary(config: CanaryConfig): void {
    this.deployments.set(config.promptId, config);
    this.metrics.set(config.promptId, { errors: 0, quality: 0, requests: 0 });
  }

  /**
   * Randomly routes a request to 'canary' or 'baseline' according to the
   * configured traffic percentage. Unknown deployments always get 'baseline'.
   */
  selectPrompt(promptId: string): string {
    const config = this.deployments.get(promptId);
    if (!config) {
      return 'baseline';
    }

    const rand = Math.random() * 100;
    return rand < config.trafficPercentage ? 'canary' : 'baseline';
  }

  /** Records one request outcome (success flag + quality score) for `promptId`. */
  recordResult(promptId: string, success: boolean, qualityScore: number): void {
    const metrics = this.metrics.get(promptId);
    if (!metrics) return;

    metrics.requests++;
    if (!success) metrics.errors++;
    metrics.quality += qualityScore;
  }

  /**
   * True when the observed error rate exceeds the threshold or the average
   * quality falls below it. Returns false when there is no data yet.
   */
  shouldRollback(promptId: string): boolean {
    const config = this.deployments.get(promptId);
    const metrics = this.metrics.get(promptId);

    if (!config || !metrics || metrics.requests === 0) {
      return false;
    }

    const errorRate = metrics.errors / metrics.requests;
    const avgQuality = metrics.quality / metrics.requests;

    return errorRate > config.errorThreshold || avgQuality < config.qualityThreshold;
  }

  /**
   * Promotes the canary to baseline when rollback conditions are not met;
   * returns false (and logs) when the canary should be rolled back instead.
   */
  promoteCanary(promptId: string): boolean {
    const config = this.deployments.get(promptId);
    if (!config) return false;

    if (this.shouldRollback(promptId)) {
      console.log(`Canary rollback triggered for ${promptId}`);
      return false;
    }

    console.log(`Promoting canary to baseline for ${promptId}`);
    config.baseline = config.canary;
    config.trafficPercentage = 100;

    return true;
  }

  /**
   * Snapshot of request count, error rate, and average quality.
   * Rates are reported as 0 before any traffic arrives — the unguarded
   * divisions would otherwise yield NaN when `requests` is 0.
   */
  getMetrics(promptId: string): Record<string, number> | undefined {
    const metrics = this.metrics.get(promptId);
    if (!metrics) return undefined;

    const { requests, errors, quality } = metrics;
    return {
      requests,
      errorRate: requests > 0 ? errors / requests : 0,
      avgQuality: requests > 0 ? quality / requests : 0,
    };
  }
}

// Roll the new summarize prompt out to 10% of traffic with safety thresholds.
const canaryConfig = {
  promptId: 'summarize',
  baseline: 'original_prompt',
  canary: 'new_prompt_v2',
  trafficPercentage: 10,
  errorThreshold: 0.05,
  qualityThreshold: 0.75,
};

const deployer = new CanaryDeployer();
deployer.initializeCanary(canaryConfig);

// Route one request and record a successful, high-quality result.
const selected = deployer.selectPrompt('summarize');
deployer.recordResult('summarize', true, 0.88);

console.log('Metrics:', deployer.getMetrics('summarize'));

Prompt Template Libraries With Variable Injection

Build reusable prompt components with parameterized variables.

/**
 * Registry of reusable prompt templates and components.
 * Templates use `{{ var }}` placeholders for variables and `[name]` slots
 * for components.
 */
class PromptLibrary {
  private templates = new Map<string, string>();
  private components = new Map<string, string>();

  /** Registers (or replaces) a top-level template. */
  registerTemplate(name: string, template: string): void {
    this.templates.set(name, template);
  }

  /** Registers (or replaces) a reusable component snippet. */
  registerComponent(name: string, template: string): void {
    this.components.set(name, template);
  }

  /**
   * Renders `templateName` with variable substitution only.
   * @throws Error when the template is unknown.
   */
  compose(templateName: string, variables: Record<string, unknown>): string {
    const template = this.templates.get(templateName);
    if (!template) {
      throw new Error(`Template ${templateName} not found`);
    }

    return this.inject(template, variables);
  }

  /**
   * Renders a template in two passes: first expands `[key]` component slots
   * (string values are looked up in the component registry; other values are
   * stringified inline), then injects `{{ var }}` variables.
   * @throws Error when the template is unknown.
   */
  composeWithComponents(
    templateName: string,
    components: Record<string, unknown>,
    variables: Record<string, unknown>
  ): string {
    const template = this.templates.get(templateName);
    if (!template) {
      throw new Error(`Template ${templateName} not found`);
    }

    // First expand components
    let result = template;

    for (const [key, value] of Object.entries(components)) {
      const component = typeof value === 'string' ? this.components.get(value) : String(value);
      if (component) {
        // Escape the key (it is interpolated into a regex) and use a replacer
        // function so '$' sequences in component text are inserted literally.
        result = result.replace(
          new RegExp(`\\[${this.escapeRegExp(key)}\\]`, 'g'),
          () => component
        );
      }
    }

    // Then inject variables
    return this.inject(result, variables);
  }

  /** Substitutes every `{{ key }}` placeholder with its stringified value. */
  private inject(template: string, variables: Record<string, unknown>): string {
    let result = template;

    for (const [key, value] of Object.entries(variables)) {
      // Replacer function prevents '$&'/'$1' in values from being expanded
      // as special replacement patterns by String.prototype.replace.
      result = result.replace(
        new RegExp(`{{\\s*${this.escapeRegExp(key)}\\s*}}`, 'g'),
        () => String(value)
      );
    }

    return result;
  }

  /** Escapes regex metacharacters so a key matches itself literally. */
  private escapeRegExp(s: string): string {
    return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}

// Assemble a prompt from reusable components plus per-request variables.
const library = new PromptLibrary();

library.registerComponent('system_message', 'You are a helpful assistant.');
library.registerComponent('constraint', 'Keep responses concise and factual.');

library.registerTemplate(
  'default',
  `[system_message]
[constraint]

User input: {{ input }}

Response:`
);

// Map each [slot] in the template to a registered component name.
const componentRefs = { system_message: 'system_message', constraint: 'constraint' };
const prompt = library.composeWithComponents('default', componentRefs, { input: 'What is AI?' });

console.log(prompt);

Model Migration Testing

Test prompt compatibility across different models before switching.

/**
 * Compares a prompt's eval quality on a source and a target model to decide
 * whether a migration is safe. Model calls are mocked in this example.
 */
class ModelMigrationTester {
  /**
   * Runs the same test suite against both models and requires the target to
   * retain at least 95% of the source's average quality.
   * An empty suite yields a 0/0 = NaN ratio, which compares false — an empty
   * run is therefore never reported as safe.
   */
  async testMigration(
    prompt: string,
    testCases: Array<{ input: string; expectedQuality: number }>,
    sourceModel: string,
    targetModel: string
  ): Promise<{
    sourceMetrics: Record<string, number>;
    targetMetrics: Record<string, number>;
    safeToMigrate: boolean;
  }> {
    const sourceResults = await this.testModel(prompt, testCases, sourceModel);
    const targetResults = await this.testModel(prompt, testCases, targetModel);

    const compatibilityRatio = targetResults.avgQuality / sourceResults.avgQuality;
    const safeToMigrate = compatibilityRatio >= 0.95; // At least 95% quality preserved

    return {
      sourceMetrics: sourceResults,
      targetMetrics: targetResults,
      safeToMigrate,
    };
  }

  /** Scores `prompt` on `model` over the suite; returns avg quality and pass rate. */
  private async testModel(
    prompt: string,
    testCases: Array<{ input: string; expectedQuality: number }>,
    model: string
  ): Promise<Record<string, number>> {
    // Guard: an empty suite would otherwise produce 0/0 = NaN metrics below.
    if (testCases.length === 0) {
      return { avgQuality: 0, successRate: 0 };
    }

    let totalQuality = 0;
    let successCount = 0;

    for (const testCase of testCases) {
      try {
        // Simulate API call; mock quality uniform in [0.8, 1.0).
        // (Multiplier is 0.2, not 0.9 — the latter would yield values up
        // to 1.7, outside the documented quality range.)
        const quality = Math.random() * 0.2 + 0.8;
        totalQuality += quality;
        if (quality >= testCase.expectedQuality) {
          successCount++;
        }
      } catch {
        // Count as failure — a real model call may throw.
      }
    }

    return {
      avgQuality: totalQuality / testCases.length,
      successRate: successCount / testCases.length,
    };
  }
}

// Gate a model switch on retaining eval quality across a small smoke suite.
const migrationTester = new ModelMigrationTester();

// Each case carries the minimum acceptable quality for that input.
const testCases = [
  { input: 'Test 1', expectedQuality: 0.85 },
  { input: 'Test 2', expectedQuality: 0.8 },
];

const result = await migrationTester.testMigration(
  'Prompt text',
  testCases,
  'gpt-4',
  'claude-3'
);

console.log('Safe to migrate:', result.safeToMigrate);
console.log('Source metrics:', result.sourceMetrics);
console.log('Target metrics:', result.targetMetrics);

CI Pipeline for Prompt Regressions

Automate prompt testing on every commit.

/**
 * Thresholds and inputs for the prompt CI quality gate.
 */
interface CIPipelineConfig {
  promptName: string; // Prompt under test (used for logging)
  datasetPath: string; // Path to the eval dataset file
  minPassRate: number; // Minimum fraction of passing cases (0-1)
  minQualityScore: number; // Minimum average quality score required
}

/**
 * Minimal CI gate for prompt changes: runs the eval dataset and fails the
 * build when the pass rate or average quality drops below configured floors.
 */
class PromptCIPipeline {
  /**
   * Executes the eval suite for `config.promptName` and checks thresholds.
   * Returns overall pass/fail plus the underlying numbers for reporting.
   */
  async runTests(config: CIPipelineConfig): Promise<{ passed: boolean; details: Record<string, unknown> }> {
    console.log(`Running CI tests for ${config.promptName}...`);

    // 1. Load eval dataset
    const dataset = this.loadDataset(config.datasetPath);

    // 2. Run tests
    const evalResults = await this.runEvals(config.promptName, dataset);

    // 3. Check thresholds
    const passRate = evalResults.passed / evalResults.total;
    const avgQuality = evalResults.totalQuality / evalResults.total;
    const passed = passRate >= config.minPassRate && avgQuality >= config.minQualityScore;

    console.log(`Pass rate: ${(passRate * 100).toFixed(2)}% (min: ${config.minPassRate * 100}%)`);
    console.log(`Avg quality: ${avgQuality.toFixed(2)} (min: ${config.minQualityScore})`);

    return {
      passed,
      details: {
        passRate,
        avgQuality,
        testCount: evalResults.total,
        failedTests: evalResults.total - evalResults.passed,
      },
    };
  }

  /** Stub loader — a real pipeline would read and parse the file at `path`. */
  private loadDataset(path: string): Array<{ input: string; expected: string }> {
    return [
      { input: 'Test input 1', expected: 'Expected output 1' },
      { input: 'Test input 2', expected: 'Expected output 2' },
    ];
  }

  /** Runs the (mocked) prompt over each case and scores the outputs. */
  private async runEvals(
    promptName: string,
    dataset: Array<{ input: string; expected: string }>
  ): Promise<{ passed: number; total: number; totalQuality: number }> {
    let passCount = 0;
    let qualitySum = 0;

    for (const { input } of dataset) {
      // Simulated prompt execution followed by a crude containment score.
      const output = `Response to: ${input}`;
      const quality = output.includes(input.slice(0, 5)) ? 0.9 : 0.7;

      qualitySum += quality;
      if (quality > 0.75) {
        passCount += 1;
      }
    }

    return { passed: passCount, total: dataset.length, totalQuality: qualitySum };
  }
}

// Fail the build when the prompt's eval pass rate or quality regresses.
const pipeline = new PromptCIPipeline();

const ciConfig = {
  promptName: 'summarize',
  datasetPath: './datasets/summarization_eval.json',
  minPassRate: 0.9,
  minQualityScore: 0.8,
};

const result = await pipeline.runTests(ciConfig);

console.log('CI Test Result:', result.passed ? 'PASSED' : 'FAILED');

Checklist

  • Store all prompts in version control with metadata
  • Create eval datasets for continuous testing
  • Implement automated regression tests for every prompt change
  • Use A/B testing to measure prompt improvements in production
  • Canary deploy new prompts to 5-10% traffic first
  • Test prompts on new models before switching
  • Maintain a prompt template library with reusable components
  • Gate prompt deployments on eval quality thresholds
  • Monitor prompt performance metrics continuously
  • Log all prompt changes with rationale and author
  • Run CI tests on every prompt commit
  • Document expected quality metrics per prompt

Conclusion

Treating prompts as code is essential for reliable AI applications. Version all prompts, test against golden datasets before deployment, use A/B testing for data-driven improvements, and canary new prompts to catch quality regressions early. This approach scales prompt management from manual tweaking to disciplined engineering.