Published on

LLM Prompt Management — Versioning, Testing, and Deploying Prompts Like Code

Authors

Introduction

Prompts are code. Even a tiny wording change can produce a large, hard-to-predict shift in model output. Without versioning and testing, you ship regressions silently. This guide covers treating prompts with the same rigor as application code.

Prompt as Code

Store prompts in version control with metadata, not hardcoded in applications.

import fs from 'fs';
import path from 'path';

/**
 * A single immutable prompt revision plus the model/config metadata
 * needed to execute it.
 */
interface PromptVersion {
  id: string;
  name: string;        // registry key; one JSON file per name
  version: string;     // semver string, e.g. "1.2.0"
  created_at: Date;
  created_by: string;
  content: string;     // template text with ${var} placeholders
  variables: string[]; // names of placeholders expected in content
  model: string;
  temperature: number;
  max_tokens: number;
  tags: string[];
  description: string;
}

/**
 * File-backed registry of prompt versions. Each prompt name maps to a
 * `<name>.json` file under `promptsDir` holding every stored version.
 */
class PromptRegistry {
  private promptsDir = './prompts';
  private promptsDb: Map<string, PromptVersion[]> = new Map();

  constructor() {
    this.loadPromptsFromDisk();
  }

  /** Populate the in-memory map from any existing JSON files on disk. */
  private loadPromptsFromDisk(): void {
    if (!fs.existsSync(this.promptsDir)) {
      fs.mkdirSync(this.promptsDir, { recursive: true });
      return;
    }

    for (const file of fs.readdirSync(this.promptsDir)) {
      if (!file.endsWith('.json')) continue;

      const filePath = path.join(this.promptsDir, file);
      // NOTE(review): JSON.parse is unvalidated and revives `created_at`
      // as a string, not a Date — loaded versions should not rely on
      // Date methods. Consider schema validation here.
      const data = JSON.parse(fs.readFileSync(filePath, 'utf-8'));

      // basename(file, '.json') strips only the trailing extension.
      // The old `file.replace('.json', '')` removed the FIRST match,
      // mangling names like "a.json.v2.json".
      const name = path.basename(file, '.json');
      this.promptsDb.set(name, data.versions || []);
    }
  }

  /** Append a new version and persist the full version list to disk. */
  async savePrompt(prompt: PromptVersion): Promise<void> {
    const versions = this.promptsDb.get(prompt.name) || [];
    versions.push(prompt);
    this.promptsDb.set(prompt.name, versions);

    const filePath = path.join(this.promptsDir, `${prompt.name}.json`);
    // This method is already async — use the promise API instead of
    // blocking the event loop with writeFileSync.
    await fs.promises.writeFile(
      filePath,
      JSON.stringify(
        {
          name: prompt.name,
          versions,
          latest: prompt.version
        },
        null,
        2
      )
    );
  }

  /**
   * Look up a prompt by name. Returns the latest stored version when
   * `version` is omitted, or null when the name/version is unknown.
   */
  getPrompt(name: string, version?: string): PromptVersion | null {
    const versions = this.promptsDb.get(name);
    if (!versions || versions.length === 0) return null;

    if (!version) {
      return versions[versions.length - 1]; // latest = last appended
    }

    return versions.find(v => v.version === version) || null;
  }

  /** All stored versions for a name (empty array when unknown). */
  getAllVersions(name: string): PromptVersion[] {
    return this.promptsDb.get(name) || [];
  }

  /**
   * Substitute ${key} placeholders in the prompt content.
   * The replacement is passed as a FUNCTION so values containing
   * `$&`, `$'`, etc. are inserted literally instead of being expanded
   * as regex replacement patterns (bug in the original string form).
   */
  renderPrompt(prompt: PromptVersion, variables: Record<string, string>): string {
    let rendered = prompt.content;

    for (const [key, value] of Object.entries(variables)) {
      // Escape the key in case it contains regex metacharacters.
      const escaped = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      rendered = rendered.replace(new RegExp(`\\$\\{${escaped}\\}`, 'g'), () => value);
    }

    return rendered;
  }
}

Prompt Versioning with Semver

Use semantic versioning for prompts to track breaking changes and improvements.

/**
 * Semantic-versioning helpers for prompt revisions, plus heuristics
 * for classifying how big a bump a content change deserves.
 */
class PromptVersionManager {
  /**
   * Parse "MAJOR.MINOR.PATCH" into numeric components.
   * Throws on malformed input — the original silently produced NaN
   * components, which then leaked into formatted version strings.
   */
  parseVersion(version: string): { major: number; minor: number; patch: number } {
    const parts = version.split('.').map(Number);
    if (parts.length !== 3 || parts.some(p => !Number.isInteger(p) || p < 0)) {
      throw new Error(`Invalid semver string: ${version}`);
    }
    const [major, minor, patch] = parts;
    return { major, minor, patch };
  }

  formatVersion(major: number, minor: number, patch: number): string {
    return `${major}.${minor}.${patch}`;
  }

  /** 1.2.3 -> 1.2.4 */
  nextPatch(currentVersion: string): string {
    const parsed = this.parseVersion(currentVersion);
    return this.formatVersion(parsed.major, parsed.minor, parsed.patch + 1);
  }

  /** 1.2.3 -> 1.3.0 */
  nextMinor(currentVersion: string): string {
    const parsed = this.parseVersion(currentVersion);
    return this.formatVersion(parsed.major, parsed.minor + 1, 0);
  }

  /** 1.2.3 -> 2.0.0 */
  nextMajor(currentVersion: string): string {
    const parsed = this.parseVersion(currentVersion);
    return this.formatVersion(parsed.major + 1, 0, 0);
  }

  /**
   * Heuristic bump classification.
   * - major: the set of ${var} placeholders changed (added OR removed),
   *   which breaks callers supplying template variables. The original
   *   only caught the "no variables -> some variables" transition and
   *   never detected removals, despite its own comment.
   * - minor: content grew by more than 20% (new instructions).
   * - patch: everything else (typo fix, small wording improvement).
   */
  getVersionBump(oldPrompt: string, newPrompt: string): 'patch' | 'minor' | 'major' {
    const extractVars = (text: string): Set<string> =>
      new Set(Array.from(text.matchAll(/\$\{(\w+)\}/g), m => m[1]));

    const oldVars = extractVars(oldPrompt);
    const newVars = extractVars(newPrompt);
    const varsUnchanged =
      oldVars.size === newVars.size && [...oldVars].every(v => newVars.has(v));
    if (!varsUnchanged) {
      return 'major';
    }

    // Minor: new optional instruction or output behavior change
    if (newPrompt.length > oldPrompt.length * 1.2) {
      return 'minor';
    }

    // Patch: typo fix, small wording improvement
    return 'patch';
  }

  /**
   * Rough structural diff between two stored versions.
   * breaking_changes: model or max_tokens changed;
   * behavioral_changes: whitespace-token count moved by more than 10%.
   */
  comparePrompts(v1: PromptVersion, v2: PromptVersion): {
    breaking_changes: boolean;
    behavioral_changes: boolean;
    diff_size: number;
  } {
    const oldTokens = v1.content.split(/\s+/).length;
    const newTokens = v2.content.split(/\s+/).length;
    const diffSize = Math.abs(newTokens - oldTokens);

    return {
      breaking_changes: v1.model !== v2.model || v1.max_tokens !== v2.max_tokens,
      behavioral_changes: diffSize > oldTokens * 0.1, // >10% change
      diff_size: diffSize
    };
  }
}

A/B Testing Prompts with Split Traffic

Route traffic to different prompt versions and measure quality.

/**
 * Configuration for one A/B test between two prompt versions.
 */
interface ABTestConfig {
  name: string;
  prompt_a: PromptVersion;   // control variant
  prompt_b: PromptVersion;   // treatment variant
  split_percent: number; // 0-100, percent to version B
  start_date: Date;
  end_date: Date;
  metric: 'latency' | 'cost' | 'quality'; // which metric decides the winner
}

/** One observed outcome for a single request in an A/B test. */
interface ABTestResult {
  request_id: string;
  variant: 'a' | 'b';
  metric_value: number; // value of the configured metric for this request
  timestamp: Date;
}

/**
 * Routes users to prompt variants deterministically and aggregates
 * per-variant metrics for active A/B tests.
 */
class ABTestRunner {
  private activeTests: Map<string, ABTestConfig> = new Map();
  // Results are stored per test so concurrent tests do not pollute each
  // other's analysis. (The original kept one flat list and ignored the
  // testName argument passed to recordResult.)
  private results: Map<string, ABTestResult[]> = new Map();

  startTest(config: ABTestConfig): void {
    this.activeTests.set(config.name, config);
  }

  /**
   * Deterministically assign a user to variant 'a' or 'b'.
   * The original concatenated char codes into a string (max ~1e10) and
   * compared against a threshold scaled to Number.MAX_SAFE_INTEGER
   * (~9e15), so virtually every user landed in variant 'b'. Here we
   * hash into [0, 100) and compare directly against split_percent.
   */
  selectVariant(testName: string, userId: string): 'a' | 'b' {
    const test = this.activeTests.get(testName);
    if (!test) return 'a';

    // FNV-1a-style hash: stable across runs for the same userId.
    let hash = 2166136261;
    for (const char of userId) {
      hash ^= char.charCodeAt(0);
      hash = Math.imul(hash, 16777619);
    }
    const bucket = (hash >>> 0) % 100; // 0..99

    return bucket < test.split_percent ? 'b' : 'a';
  }

  /** Record one observation for a test. */
  recordResult(testName: string, result: ABTestResult): void {
    const list = this.results.get(testName) || [];
    list.push(result);
    this.results.set(testName, list);
  }

  /** Results for one test, restricted to the test's date window. */
  getResults(testName: string): ABTestResult[] {
    const test = this.activeTests.get(testName);
    if (!test) return [];
    return (this.results.get(testName) || []).filter(
      r => r.timestamp >= test.start_date && r.timestamp <= test.end_date
    );
  }

  /**
   * Compare mean metric values per variant. For 'latency' and 'cost' a
   * LOWER mean wins; for 'quality' a higher mean wins (the original
   * always declared the higher mean the winner). A winner is declared
   * only once both variants have at least `minSamples` observations.
   */
  analyzeTest(testName: string): {
    variant_a: { mean: number; count: number };
    variant_b: { mean: number; count: number };
    winner: 'a' | 'b' | null;
  } {
    const results = this.getResults(testName);
    const test = this.activeTests.get(testName);

    const aResults = results.filter(r => r.variant === 'a');
    const bResults = results.filter(r => r.variant === 'b');

    const mean = (rs: ABTestResult[]): number =>
      rs.length > 0 ? rs.reduce((sum, r) => sum + r.metric_value, 0) / rs.length : 0;

    const aMean = mean(aResults);
    const bMean = mean(bResults);

    // Need minimum sample size before declaring anything.
    const minSamples = 100;
    let winner: 'a' | 'b' | null = null;

    if (test && aResults.length >= minSamples && bResults.length >= minSamples) {
      const lowerIsBetter = test.metric === 'latency' || test.metric === 'cost';
      if (lowerIsBetter) {
        winner = aMean < bMean ? 'a' : 'b';
      } else {
        winner = aMean > bMean ? 'a' : 'b';
      }
    }

    return {
      variant_a: { mean: aMean, count: aResults.length },
      variant_b: { mean: bMean, count: bResults.length },
      winner
    };
  }
}

Regression Testing with Golden Dataset

Test prompts against a golden dataset of expected inputs and outputs.

/**
 * One labelled example in the golden regression dataset: template
 * variables in, reference model output expected.
 */
interface GoldenExample {
  id: string;
  input: Record<string, string>; // variable name -> value fed to the template
  expected_output: string;       // reference output to compare against
  category: string;              // grouping label for reporting
}

/** Aggregate outcome of running one prompt version over the dataset. */
interface RegressionTest {
  prompt_version: string;
  examples: GoldenExample[];
  passed: number;
  failed: number;
  error_rate: number; // failed / total examples
}

/**
 * Runs a prompt version against the golden dataset and reports
 * pass/fail counts based on lexical similarity to expected outputs.
 */
class RegressionTestRunner {
  private goldenDataset: GoldenExample[] = [];
  private similarityThreshold = 0.85; // Jaccard similarity cutoff for "pass"

  /** Load examples from a JSON file (expected: array of GoldenExample). */
  loadGoldenDataset(filePath: string): void {
    const content = fs.readFileSync(filePath, 'utf-8');
    // NOTE(review): parsed JSON is not validated against GoldenExample.
    this.goldenDataset = JSON.parse(content);
  }

  /**
   * Execute every golden example through the model and score outputs.
   * `client` is presumably an OpenAI-compatible chat client — TODO
   * confirm and type it properly instead of `any`.
   */
  async runRegressionTest(
    promptVersion: PromptVersion,
    client: any
  ): Promise<RegressionTest> {
    let passed = 0;
    let failed = 0;
    const registry = new PromptRegistry();

    for (const example of this.goldenDataset) {
      const renderedPrompt = registry.renderPrompt(promptVersion, example.input);

      try {
        const response = await client.chat.completions.create({
          model: promptVersion.model,
          messages: [{ role: 'user', content: renderedPrompt }],
          temperature: promptVersion.temperature,
          max_tokens: promptVersion.max_tokens
        });

        // content can be null/missing on some responses; treat as empty
        // instead of crashing inside calculateSimilarity (original bug).
        const output = response.choices[0]?.message?.content ?? '';
        const similarity = this.calculateSimilarity(output, example.expected_output);

        if (similarity >= this.similarityThreshold) {
          passed++;
        } else {
          failed++;
          console.warn(`Regression test failed for example ${example.id}`);
          console.warn(`Expected: ${example.expected_output}`);
          console.warn(`Got: ${output}`);
        }
      } catch (error) {
        failed++;
        console.error(`Error testing example ${example.id}: ${error}`);
      }
    }

    const total = this.goldenDataset.length;
    return {
      prompt_version: promptVersion.version,
      examples: this.goldenDataset,
      passed,
      failed,
      // Guard against an empty dataset (the original produced NaN).
      error_rate: total > 0 ? failed / total : 0
    };
  }

  /** Jaccard word-overlap similarity in [0, 1].
   *  (In production, use embeddings instead.) */
  private calculateSimilarity(a: string, b: string): number {
    const aWords = new Set(a.toLowerCase().split(/\s+/));
    const bWords = new Set(b.toLowerCase().split(/\s+/));

    const intersection = [...aWords].filter(w => bWords.has(w)).length;
    const union = new Set([...aWords, ...bWords]).size;

    return union > 0 ? intersection / union : 0;
  }

  /** True when no example failed. */
  passedAllTests(test: RegressionTest): boolean {
    return test.failed === 0;
  }

  /** Percentage of examples that passed; 0 when nothing was run
   *  (the original divided by zero and returned NaN). */
  passedPercentage(test: RegressionTest): number {
    const total = test.passed + test.failed;
    return total > 0 ? (test.passed / total) * 100 : 0;
  }
}

Prompt Template Engine

Render prompts with variables using a template engine.

/**
 * Values available to a template: scalars are substituted directly,
 * booleans drive {{#if}}, string arrays drive {{#each}}.
 */
interface TemplateContext {
  [key: string]: string | number | boolean | string[];
}

/**
 * Minimal prompt template engine supporting ${var} substitution,
 * {{#if cond}}...{{/if}} conditionals, and {{#each items}}...{{/each}}
 * loops with {{this}} / {{@index}} available inside the loop body.
 */
class PromptTemplateEngine {
  /**
   * Render a template against a context.
   * Throws when a ${var} placeholder has no value in the context.
   */
  render(template: string, context: TemplateContext): string {
    let result = template;

    // Simple variable substitution: ${varName}
    result = result.replace(/\$\{(\w+)\}/g, (match, varName) => {
      const value = context[varName];
      if (value === undefined) throw new Error(`Missing variable: ${varName}`);
      return String(value);
    });

    // Conditionals: {{#if condition}}...{{/if}} (truthy check)
    result = result.replace(
      /\{\{#if\s+(\w+)\}\}([\s\S]*?)\{\{\/if\}\}/g,
      (match, condition, content) => {
        return context[condition] ? content : '';
      }
    );

    // Loops: {{#each items}}...{{/each}}
    result = result.replace(
      /\{\{#each\s+(\w+)\}\}([\s\S]*?)\{\{\/each\}\}/g,
      (match, varName, content) => {
        const items = context[varName];
        if (!Array.isArray(items)) return ''; // non-arrays render as empty

        return items
          .map((item, index) =>
            content
              // Use replacer FUNCTIONS: passing the value as a string
              // would expand `$&`/`$'` replacement patterns occurring
              // inside item values (bug in the original).
              .replace(/\{\{this\}\}/g, () => String(item))
              .replace(/\{\{@index\}\}/g, () => String(index))
          )
          .join('');
      }
    );

    return result;
  }

  /**
   * Static sanity checks on a template: balanced {{#if}}/{{/if}} and
   * balanced {{#each}}/{{/each}} blocks. (The original never checked
   * each/endEach, and computed an unused variable list.)
   */
  validate(template: string): { valid: boolean; errors: string[] } {
    const errors: string[] = [];

    const ifCount = (template.match(/\{\{#if/g) || []).length;
    const ifEndCount = (template.match(/\{\{\/if\}\}/g) || []).length;
    if (ifCount !== ifEndCount) {
      errors.push(`Mismatched if/endif: ${ifCount} if, ${ifEndCount} endif`);
    }

    const eachCount = (template.match(/\{\{#each/g) || []).length;
    const eachEndCount = (template.match(/\{\{\/each\}\}/g) || []).length;
    if (eachCount !== eachEndCount) {
      errors.push(`Mismatched each/endeach: ${eachCount} each, ${eachEndCount} endeach`);
    }

    return {
      valid: errors.length === 0,
      errors
    };
  }
}

Prompt Injection Prevention

Sanitize user inputs to prevent prompt injection attacks.

class PromptInjectionPreventionManager {
  private dangerousPatterns = [
    /ignore previous instructions/i,
    /forget the system prompt/i,
    /forget all previous/i,
    /jailbreak/i,
    /bypass security/i,
    /admin override/i
  ];

  isSuspicious(text: string): boolean {
    for (const pattern of this.dangerousPatterns) {
      if (pattern.test(text)) {
        return true;
      }
    }

    // Check for unusual quote patterns that might break prompts
    const quoteCount = (text.match(/"|"|''|``/g) || []).length;
    if (quoteCount > 10) {
      return true;
    }

    return false;
  }

  sanitize(text: string): string {
    // Escape special characters
    return text
      .replace(/</g, '<')
      .replace(/>/g, '>')
      .replace(/"/g, '"')
      .replace(/'/g, ''');
  }

  buildSafePrompt(
    template: string,
    userInput: string,
    variableName: string = 'user_input'
  ): string {
    if (this.isSuspicious(userInput)) {
      console.warn(`Suspicious input detected: ${userInput.substring(0, 50)}`);
    }

    const sanitized = this.sanitize(userInput);

    // Wrap in quotes and comment marker for clarity
    const safeInput = `"${sanitized}"`;

    return template.replace(`$\{${variableName}\}`, safeInput);
  }
}

Promoting Prompts Through Environments

Pipeline: Development → Staging → Production with automated promotion.

/** Deployment stages a prompt moves through. */
type Environment = 'development' | 'staging' | 'production';

/** Audit record of one promotion event. */
interface PromptEnvironment {
  environment: Environment;
  version: string;
  promoted_at: Date;
  promoted_by: string;
}

/**
 * Moves prompt versions through development -> staging -> production,
 * keeping the active version per environment in Redis and an
 * in-memory, append-only audit trail of promotions.
 */
class PromptPromotionPipeline {
  // History entries are tagged internally with the prompt name so that
  // rollback/history queries can be scoped per prompt. (The original
  // mixed all prompts together and filtered history on `promoted_by`.)
  private promotionHistory: Array<PromptEnvironment & { prompt_name: string }> = [];
  private redis: any; // Redis client — injected elsewhere; TODO type properly

  /** Mark `version` as the active version of `promptName` in `targetEnv`. */
  async promoteToEnvironment(
    promptName: string,
    version: string,
    targetEnv: Environment,
    promotedBy: string
  ): Promise<void> {
    const key = `prompt:${promptName}:${targetEnv}`;

    await this.redis.set(key, version);
    this.promotionHistory.push({
      prompt_name: promptName,
      environment: targetEnv,
      version,
      promoted_at: new Date(),
      promoted_by: promotedBy
    });
  }

  /** Currently active version for a prompt in an environment, or null. */
  async getActivePrompt(promptName: string, env: Environment): Promise<string | null> {
    const key = `prompt:${promptName}:${env}`;
    return await this.redis.get(key);
  }

  /**
   * Gate promotion: production requires the exact version to be active
   * in staging first. Regression-test gating is still a TODO.
   */
  async validateBeforePromotion(
    promptName: string,
    version: string,
    targetEnv: Environment
  ): Promise<{ valid: boolean; errors: string[] }> {
    const errors: string[] = [];

    // Can't skip environments
    if (targetEnv === 'production') {
      const stagingVersion = await this.getActivePrompt(promptName, 'staging');
      if (stagingVersion !== version) {
        errors.push('Must be validated in staging first');
      }
    }

    // Must pass regression tests
    // (would call RegressionTestRunner here)

    return {
      valid: errors.length === 0,
      errors
    };
  }

  /**
   * Re-activate the previous version of THIS prompt in `env`.
   * The original did not filter history by prompt name, so a rollback
   * could resurrect another prompt's version; it also re-sorted by
   * timestamp, which is ambiguous when two promotions share the same
   * millisecond — the list is append-only, so positional order is used.
   */
  async rollback(promptName: string, env: Environment): Promise<string | null> {
    const history = this.promotionHistory.filter(
      p => p.prompt_name === promptName && p.environment === env
    );

    if (history.length < 2) return null; // nothing older to return to

    const previousVersion = history[history.length - 2].version;
    await this.promoteToEnvironment(promptName, previousVersion, env, 'system');

    return previousVersion;
  }

  /** Audit trail, optionally restricted to one prompt. */
  getPromotionHistory(promptName?: string): PromptEnvironment[] {
    if (!promptName) return this.promotionHistory;

    // Filter on the recorded prompt name (the original incorrectly
    // matched `promoted_by` against the prompt name).
    return this.promotionHistory.filter(p => p.prompt_name === promptName);
  }
}

Rollback on Quality Regression

Automatically rollback when quality metrics degrade.

/**
 * Snapshot of quality/operational metrics for one prompt version,
 * compared against a recorded baseline to detect regressions.
 */
interface QualityMetric {
  regression_test_pass_rate: number; // fraction of golden tests passing
  average_latency_ms: number;
  cost_per_request: number;
  user_satisfaction_score: number;
}

/**
 * Compares live quality metrics for a prompt version against a stored
 * baseline and triggers rollback when degradation exceeds thresholds.
 */
class QualityMonitor {
  private baselineMetrics: Map<string, QualityMetric> = new Map();
  private degradationThreshold = 0.05; // 5% relative drop triggers an alert

  /** Record the reference metrics a version will be judged against. */
  recordBaseline(promptVersion: string, metrics: QualityMetric): void {
    this.baselineMetrics.set(promptVersion, metrics);
  }

  /**
   * Check current metrics against the baseline. Returns human-readable
   * degradation reasons (empty = healthy). A version without a
   * recorded baseline is considered healthy.
   */
  detectDegradation(
    promptVersion: string,
    currentMetrics: QualityMetric
  ): { degraded: boolean; reasons: string[] } {
    const baseline = this.baselineMetrics.get(promptVersion);
    if (!baseline) {
      return { degraded: false, reasons: [] };
    }

    const reasons: string[] = [];

    // Relative drop of a "higher is better" metric; guards against a
    // zero baseline (the original divided by zero, yielding NaN/-Inf).
    const relativeDrop = (current: number, base: number): number =>
      base > 0 ? 1 - current / base : 0;

    // Relative increase of a "lower is better" metric, same guard.
    const relativeIncrease = (current: number, base: number): number =>
      base > 0 ? current / base - 1 : 0;

    const passRateDrop = relativeDrop(
      currentMetrics.regression_test_pass_rate,
      baseline.regression_test_pass_rate
    );
    if (passRateDrop > this.degradationThreshold) {
      reasons.push(`Regression test pass rate dropped ${(passRateDrop * 100).toFixed(1)}%`);
    }

    const satisfactionDrop = relativeDrop(
      currentMetrics.user_satisfaction_score,
      baseline.user_satisfaction_score
    );
    if (satisfactionDrop > this.degradationThreshold) {
      reasons.push(`User satisfaction dropped ${(satisfactionDrop * 100).toFixed(1)}%`);
    }

    const latencyIncrease = relativeIncrease(
      currentMetrics.average_latency_ms,
      baseline.average_latency_ms
    );
    if (latencyIncrease > 0.2) {
      reasons.push(`Latency increased ${(latencyIncrease * 100).toFixed(1)}%`);
    }

    // Cost was tracked in QualityMetric but never checked originally;
    // use the same 20% threshold as latency.
    const costIncrease = relativeIncrease(
      currentMetrics.cost_per_request,
      baseline.cost_per_request
    );
    if (costIncrease > 0.2) {
      reasons.push(`Cost per request increased ${(costIncrease * 100).toFixed(1)}%`);
    }

    return {
      degraded: reasons.length > 0,
      reasons
    };
  }

  /**
   * Roll the environment back to the previous version when the active
   * version shows degradation. Returns true when a rollback happened.
   */
  async triggerRollbackIfNeeded(
    promptName: string,
    env: Environment,
    metrics: QualityMetric,
    pipeline: PromptPromotionPipeline
  ): Promise<boolean> {
    const currentVersion = await pipeline.getActivePrompt(promptName, env);
    if (!currentVersion) return false;

    const degradation = this.detectDegradation(currentVersion, metrics);

    if (degradation.degraded) {
      console.error(`Quality degradation detected: ${degradation.reasons.join('; ')}`);

      const previous = await pipeline.rollback(promptName, env);
      if (previous) {
        console.log(`Rolled back to ${previous}`);
        return true;
      }
    }

    return false;
  }
}

Checklist

  • Store all prompts in version control (Git) alongside application code
  • Use semantic versioning to track breaking changes
  • Maintain a golden dataset of 50+ examples for regression testing
  • Test every prompt version against golden dataset before promotion
  • A/B test new prompts with 10-50% traffic split before full rollout
  • Sanitize user inputs to prevent prompt injection
  • Implement promotion pipeline: development → staging → production
  • Automatically rollback if pass rate drops >5% or latency increases >20%
  • Track quality metrics (pass rate, latency, cost) per version
  • Keep audit trail of all promotions with timestamps and approvers

Conclusion

Prompts are code. Treat them with the same rigor: version control, testing, CI/CD pipelines, and automated rollbacks. The teams that ship the highest quality AI products are those treating prompts as first-class artifacts, not magic strings.