- Published on
LLM Prompt Management — Versioning, Testing, and Deploying Prompts Like Code
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
Prompts are code. Even a 1% change to a prompt's wording can cause a 20% variance in output. Without versioning and testing, you ship regressions silently. This guide covers treating prompts with the same rigor as application code.
- Prompt as Code
- Prompt Versioning with Semver
- A/B Testing Prompts with Split Traffic
- Regression Testing with Golden Dataset
- Prompt Template Engine
- Prompt Injection Prevention
- Promoting Prompts Through Environments
- Rollback on Quality Regression
- Checklist
- Conclusion
Prompt as Code
Store prompts in version control with metadata, not hardcoded in applications.
import fs from 'fs';
import path from 'path';
/**
 * One immutable snapshot of a prompt, stored together with the model
 * and sampling settings it was authored against.
 */
interface PromptVersion {
// Unique identifier for this specific version record.
id: string;
// Prompt name; groups all versions of the same prompt in the registry.
name: string;
// Semver string, e.g. "1.2.0" (see PromptVersionManager).
version: string;
created_at: Date;
created_by: string;
// Raw template text; variables use ${name} placeholders (see renderPrompt).
content: string;
// Names of the ${...} variables the template expects.
variables: string[];
// Target model identifier and the sampling settings to run the prompt with.
model: string;
temperature: number;
max_tokens: number;
tags: string[];
description: string;
}
/**
 * File-backed registry of prompt versions. Prompts live as `<name>.json`
 * files under `promptsDir` and are mirrored into an in-memory index.
 */
class PromptRegistry {
  private promptsDir = './prompts';
  private promptsDb: Map<string, PromptVersion[]> = new Map();

  constructor() {
    this.loadPromptsFromDisk();
  }

  /** Populate the in-memory index from every `<name>.json` under promptsDir. */
  private loadPromptsFromDisk(): void {
    if (!fs.existsSync(this.promptsDir)) {
      fs.mkdirSync(this.promptsDir, { recursive: true });
      return;
    }
    const files = fs.readdirSync(this.promptsDir);
    for (const file of files) {
      if (!file.endsWith('.json')) continue;
      const filePath = path.join(this.promptsDir, file);
      const content = fs.readFileSync(filePath, 'utf-8');
      // NOTE(review): JSON.parse result is trusted as-is; consider validating
      // the shape (schema) before use — a malformed file corrupts the index.
      const data = JSON.parse(content);
      const name = file.replace('.json', '');
      this.promptsDb.set(name, data.versions || []);
    }
  }

  /**
   * Append a new version for `prompt.name` and persist the full version
   * list to disk. (Async signature, but uses synchronous fs calls.)
   */
  async savePrompt(prompt: PromptVersion): Promise<void> {
    const versions = this.promptsDb.get(prompt.name) || [];
    versions.push(prompt);
    this.promptsDb.set(prompt.name, versions);
    const filePath = path.join(this.promptsDir, `${prompt.name}.json`);
    fs.writeFileSync(
      filePath,
      JSON.stringify(
        {
          name: prompt.name,
          versions,
          latest: prompt.version
        },
        null,
        2
      )
    );
  }

  /**
   * Fetch a specific version, or the most recently saved one when
   * `version` is omitted. Returns null when the prompt/version is unknown.
   * NOTE(review): "latest" is insertion order, not highest semver.
   */
  getPrompt(name: string, version?: string): PromptVersion | null {
    const versions = this.promptsDb.get(name);
    if (!versions || versions.length === 0) return null;
    if (!version) {
      // Return latest
      return versions[versions.length - 1];
    }
    return versions.find(v => v.version === version) || null;
  }

  /** All saved versions for `name`, oldest first (empty if unknown). */
  getAllVersions(name: string): PromptVersion[] {
    return this.promptsDb.get(name) || [];
  }

  /**
   * Substitute every `${key}` placeholder in the prompt content.
   *
   * Fixes: the variable key is regex-escaped (keys with metacharacters no
   * longer build a broken pattern), and the value is supplied through a
   * replacer function so `$` sequences in it (e.g. "$&", "$1") are inserted
   * literally instead of being treated as replacement patterns.
   */
  renderPrompt(prompt: PromptVersion, variables: Record<string, string>): string {
    let rendered = prompt.content;
    for (const [key, value] of Object.entries(variables)) {
      const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      rendered = rendered.replace(new RegExp(`\\$\\{${escapedKey}\\}`, 'g'), () => value);
    }
    return rendered;
  }
}
Prompt Versioning with Semver
Use semantic versioning for prompts to track breaking changes and improvements.
/**
 * Semver arithmetic and change-classification helpers for prompt versions.
 * (Removed an unused private PromptRegistry field the original carried.)
 */
class PromptVersionManager {
  /**
   * Parse a "major.minor.patch" string into numeric components.
   * Throws on malformed input instead of silently yielding NaN parts
   * (the original propagated NaN into every subsequent bump).
   */
  parseVersion(version: string): { major: number; minor: number; patch: number } {
    const parts = version.split('.').map(Number);
    if (parts.length !== 3 || parts.some(p => !Number.isInteger(p) || p < 0)) {
      throw new Error(`Invalid semver string: "${version}"`);
    }
    return { major: parts[0], minor: parts[1], patch: parts[2] };
  }

  /** Join components back into "major.minor.patch". */
  formatVersion(major: number, minor: number, patch: number): string {
    return `${major}.${minor}.${patch}`;
  }

  /** Bump the patch component: 1.2.3 -> 1.2.4. */
  nextPatch(currentVersion: string): string {
    const parsed = this.parseVersion(currentVersion);
    return this.formatVersion(parsed.major, parsed.minor, parsed.patch + 1);
  }

  /** Bump the minor component and reset patch: 1.2.3 -> 1.3.0. */
  nextMinor(currentVersion: string): string {
    const parsed = this.parseVersion(currentVersion);
    return this.formatVersion(parsed.major, parsed.minor + 1, 0);
  }

  /** Bump the major component and reset the rest: 1.2.3 -> 2.0.0. */
  nextMajor(currentVersion: string): string {
    const parsed = this.parseVersion(currentVersion);
    return this.formatVersion(parsed.major + 1, 0, 0);
  }

  /**
   * Heuristic classification of a prompt edit.
   *
   * Major: the ${...} variable set changed in ANY direction. The original
   * only detected variables being added (and only when the old prompt had
   * none), even though its own comment said removal is breaking — removing
   * a variable callers supply is clearly a major change, so both directions
   * are now detected.
   * Minor: substantial new text (>20% growth), likely new instructions.
   * Patch: everything else (typo fix, small wording improvement).
   */
  getVersionBump(oldPrompt: string, newPrompt: string): 'patch' | 'minor' | 'major' {
    const extractVars = (s: string): Set<string> =>
      new Set([...s.matchAll(/\$\{(\w+)\}/g)].map(m => m[1]));
    const oldVars = extractVars(oldPrompt);
    const newVars = extractVars(newPrompt);
    const sameVars =
      oldVars.size === newVars.size && [...oldVars].every(v => newVars.has(v));
    if (!sameVars) {
      return 'major';
    }
    if (newPrompt.length > oldPrompt.length * 1.2) {
      return 'minor';
    }
    return 'patch';
  }

  /**
   * Compare two versions' content size and settings.
   * breaking_changes: model or token budget differs.
   * behavioral_changes: whitespace-token diff exceeds 10% of the old size.
   */
  comparePrompts(v1: PromptVersion, v2: PromptVersion): {
    breaking_changes: boolean;
    behavioral_changes: boolean;
    diff_size: number;
  } {
    const oldTokens = v1.content.split(/\s+/).length;
    const newTokens = v2.content.split(/\s+/).length;
    const diffSize = Math.abs(newTokens - oldTokens);
    return {
      breaking_changes: v1.model !== v2.model || v1.max_tokens !== v2.max_tokens,
      behavioral_changes: diffSize > oldTokens * 0.1, // >10% change
      diff_size: diffSize
    };
  }
}
A/B Testing Prompts with Split Traffic
Route traffic to different prompt versions and measure quality.
/**
 * Configuration for one A/B test between two prompt versions.
 */
interface ABTestConfig {
// Test identifier; used as the key in ABTestRunner.activeTests.
name: string;
prompt_a: PromptVersion;
prompt_b: PromptVersion;
split_percent: number; // 0-100, percent to version B
// Results outside [start_date, end_date] are excluded from analysis.
start_date: Date;
end_date: Date;
// Which measurement the test compares.
metric: 'latency' | 'cost' | 'quality';
}
/** One recorded observation for a single request in an A/B test. */
interface ABTestResult {
request_id: string;
// Which variant served this request.
variant: 'a' | 'b';
// Value of the test's configured metric for this request.
metric_value: number;
timestamp: Date;
}
/**
 * In-memory A/B test runner: routes users to prompt variants by a stable
 * hash and aggregates per-variant metric means.
 */
class ABTestRunner {
  private activeTests: Map<string, ABTestConfig> = new Map();
  private results: ABTestResult[] = [];

  /** Register (or replace) a test under config.name. */
  startTest(config: ABTestConfig): void {
    this.activeTests.set(config.name, config);
  }

  /**
   * Deterministically assign a user to variant 'a' or 'b'.
   *
   * Fix: the original concatenated char codes into a string (reduce with a
   * '' accumulator), parsed at most 10 digits, and compared against a
   * threshold scaled by Number.MAX_SAFE_INTEGER — the hash could never
   * reach the threshold for any realistic split, so effectively all traffic
   * went to variant 'b' (and an empty userId produced NaN). We now hash
   * into a 0-99 bucket (FNV-1a) and compare it directly to split_percent.
   */
  selectVariant(testName: string, userId: string): 'a' | 'b' {
    const test = this.activeTests.get(testName);
    if (!test) return 'a';
    let hash = 2166136261;
    for (let i = 0; i < userId.length; i++) {
      hash ^= userId.charCodeAt(i);
      hash = Math.imul(hash, 16777619);
    }
    const bucket = (hash >>> 0) % 100; // 0..99, stable per user
    return bucket < test.split_percent ? 'b' : 'a';
  }

  // NOTE(review): testName is unused — results are only correlated to a test
  // by timestamp in getResults. Consider storing the test name on each result.
  recordResult(testName: string, result: ABTestResult): void {
    this.results.push(result);
  }

  /** Results whose timestamps fall inside the named test's window. */
  getResults(testName: string): ABTestResult[] {
    return this.results.filter(r => {
      const test = this.activeTests.get(testName);
      return test &&
        r.timestamp >= test.start_date &&
        r.timestamp <= test.end_date;
    });
  }

  /**
   * Per-variant metric means, with a winner declared only above a minimum
   * sample size. NOTE(review): "higher mean wins" — for latency/cost a
   * lower value is presumably better; invert the metric before recording,
   * or this picks the loser. Confirm with callers.
   */
  analyzeTest(testName: string): {
    variant_a: { mean: number; count: number };
    variant_b: { mean: number; count: number };
    winner: 'a' | 'b' | null;
  } {
    const results = this.getResults(testName);
    const aResults = results.filter(r => r.variant === 'a');
    const bResults = results.filter(r => r.variant === 'b');
    const aMean = aResults.length > 0
      ? aResults.reduce((sum, r) => sum + r.metric_value, 0) / aResults.length
      : 0;
    const bMean = bResults.length > 0
      ? bResults.reduce((sum, r) => sum + r.metric_value, 0) / bResults.length
      : 0;
    // Need minimum sample size before declaring a winner.
    const minSamples = 100;
    let winner: 'a' | 'b' | null = null;
    if (aResults.length > minSamples && bResults.length > minSamples) {
      winner = aMean > bMean ? 'a' : 'b';
    }
    return {
      variant_a: { mean: aMean, count: aResults.length },
      variant_b: { mean: bMean, count: bResults.length },
      winner
    };
  }
}
Regression Testing with Golden Dataset
Test prompts against a golden dataset of expected inputs and outputs.
/** A single input/expected-output pair in the golden regression dataset. */
interface GoldenExample {
id: string;
// Template variables the prompt is rendered with.
input: Record<string, string>;
expected_output: string;
// Grouping label for the example.
category: string;
}
/** Aggregate outcome of running one prompt version against the golden dataset. */
interface RegressionTest {
prompt_version: string;
// The examples that were evaluated.
examples: GoldenExample[];
passed: number;
failed: number;
// Fraction of examples that failed.
error_rate: number;
}
/**
 * Runs a prompt version against the golden dataset and scores outputs by
 * word-overlap similarity against the expected output.
 */
class RegressionTestRunner {
  private goldenDataset: GoldenExample[] = [];
  // Minimum Jaccard similarity for an example to count as passing.
  private similarityThreshold = 0.85;

  /** Load the golden dataset from a JSON file (array of GoldenExample). */
  loadGoldenDataset(filePath: string): void {
    const content = fs.readFileSync(filePath, 'utf-8');
    this.goldenDataset = JSON.parse(content);
  }

  /**
   * Render each golden example, call the model, and compare outputs.
   * `client` is assumed to be an OpenAI-compatible chat client — confirm.
   * Fix: error_rate no longer divides by zero on an empty dataset.
   */
  async runRegressionTest(
    promptVersion: PromptVersion,
    client: any
  ): Promise<RegressionTest> {
    let passed = 0;
    let failed = 0;
    const registry = new PromptRegistry();
    for (const example of this.goldenDataset) {
      const renderedPrompt = registry.renderPrompt(promptVersion, example.input);
      try {
        const response = await client.chat.completions.create({
          model: promptVersion.model,
          messages: [{ role: 'user', content: renderedPrompt }],
          temperature: promptVersion.temperature,
          max_tokens: promptVersion.max_tokens
        });
        const output = response.choices[0].message.content;
        const similarity = this.calculateSimilarity(output, example.expected_output);
        if (similarity >= this.similarityThreshold) {
          passed++;
        } else {
          failed++;
          console.warn(`Regression test failed for example ${example.id}`);
          console.warn(`Expected: ${example.expected_output}`);
          console.warn(`Got: ${output}`);
        }
      } catch (error) {
        // API/network failures count as failed examples rather than aborting the run.
        failed++;
        console.error(`Error testing example ${example.id}: ${error}`);
      }
    }
    return {
      prompt_version: promptVersion.version,
      examples: this.goldenDataset,
      passed,
      failed,
      // Guard: an empty dataset previously produced NaN.
      error_rate: this.goldenDataset.length > 0 ? failed / this.goldenDataset.length : 0
    };
  }

  /** Jaccard similarity over lowercase word sets (in production, use embeddings). */
  private calculateSimilarity(a: string, b: string): number {
    const aWords = new Set(a.toLowerCase().split(/\s+/));
    const bWords = new Set(b.toLowerCase().split(/\s+/));
    const intersection = [...aWords].filter(w => bWords.has(w)).length;
    const union = new Set([...aWords, ...bWords]).size;
    return union > 0 ? intersection / union : 0;
  }

  /** True when no example failed (vacuously true for an empty run). */
  passedAllTests(test: RegressionTest): boolean {
    return test.failed === 0;
  }

  /**
   * Percentage of examples that passed.
   * Fix: a zero-total run previously returned NaN; it now returns 100,
   * consistent with passedAllTests treating an empty run as passing.
   */
  passedPercentage(test: RegressionTest): number {
    const total = test.passed + test.failed;
    return total === 0 ? 100 : (test.passed / total) * 100;
  }
}
Prompt Template Engine
Render prompts with variables using a template engine.
/** Variable bag for template rendering; array values feed {{#each}} loops. */
interface TemplateContext {
[key: string]: string | number | boolean | string[];
}
/**
 * Minimal template engine: ${var} substitution, {{#if}}...{{/if}}
 * conditionals, and {{#each}}...{{/each}} loops.
 */
class PromptTemplateEngine {
  /**
   * Render a template against a context.
   * Throws on any ${var} without a context entry.
   * NOTE(review): ${var} substitution runs BEFORE conditionals are pruned,
   * so a missing variable inside a false {{#if}} branch still throws —
   * confirm this is the intended contract before reordering.
   */
  render(template: string, context: TemplateContext): string {
    let result = template;
    // Handle simple variable substitution: ${varName}
    result = result.replace(/\$\{(\w+)\}/g, (match, varName) => {
      const value = context[varName];
      if (value === undefined) throw new Error(`Missing variable: ${varName}`);
      return String(value);
    });
    // Handle conditionals: {{#if condition}}...{{/if}} (truthiness check)
    result = result.replace(
      /\{\{#if\s+(\w+)\}\}([\s\S]*?)\{\{\/if\}\}/g,
      (match, condition, content) => {
        return context[condition] ? content : '';
      }
    );
    // Handle loops: {{#each items}}...{{/each}}; non-array values render empty.
    result = result.replace(
      /\{\{#each\s+(\w+)\}\}([\s\S]*?)\{\{\/each\}\}/g,
      (match, varName, content) => {
        const items = context[varName];
        if (!Array.isArray(items)) return '';
        return items
          .map((item, index) =>
            content
              .replace(/\{\{this\}\}/g, String(item))
              .replace(/\{\{@index\}\}/g, String(index))
          )
          .join('');
      }
    );
    return result;
  }

  /**
   * Static structural checks on a template.
   * Fixes: removed dead code that collected variable names but never used
   * them (variables can only be checked against a concrete context at
   * render time), and added the {{#each}} mismatch check that was missing
   * despite the equivalent {{#if}} check existing.
   */
  validate(template: string): { valid: boolean; errors: string[] } {
    const errors: string[] = [];
    // Check for mismatched conditionals
    const ifCount = (template.match(/\{\{#if/g) || []).length;
    const ifEndCount = (template.match(/\{\{\/if\}\}/g) || []).length;
    if (ifCount !== ifEndCount) {
      errors.push(`Mismatched if/endif: ${ifCount} if, ${ifEndCount} endif`);
    }
    // Check for mismatched loops
    const eachCount = (template.match(/\{\{#each/g) || []).length;
    const eachEndCount = (template.match(/\{\{\/each\}\}/g) || []).length;
    if (eachCount !== eachEndCount) {
      errors.push(`Mismatched each/endeach: ${eachCount} each, ${eachEndCount} endeach`);
    }
    return {
      valid: errors.length === 0,
      errors
    };
  }
}
Prompt Injection Prevention
Sanitize user inputs to prevent prompt injection attacks.
class PromptInjectionPreventionManager {
private dangerousPatterns = [
/ignore previous instructions/i,
/forget the system prompt/i,
/forget all previous/i,
/jailbreak/i,
/bypass security/i,
/admin override/i
];
isSuspicious(text: string): boolean {
for (const pattern of this.dangerousPatterns) {
if (pattern.test(text)) {
return true;
}
}
// Check for unusual quote patterns that might break prompts
const quoteCount = (text.match(/"|"|''|``/g) || []).length;
if (quoteCount > 10) {
return true;
}
return false;
}
sanitize(text: string): string {
// Escape special characters
return text
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '"')
.replace(/'/g, ''');
}
buildSafePrompt(
template: string,
userInput: string,
variableName: string = 'user_input'
): string {
if (this.isSuspicious(userInput)) {
console.warn(`Suspicious input detected: ${userInput.substring(0, 50)}`);
}
const sanitized = this.sanitize(userInput);
// Wrap in quotes and comment marker for clarity
const safeInput = `"${sanitized}"`;
return template.replace(`$\{${variableName}\}`, safeInput);
}
}
Promoting Prompts Through Environments
Pipeline: Development → Staging → Production with automated promotion.
/** Deployment stages a prompt moves through, in promotion order. */
type Environment = 'development' | 'staging' | 'production';
/** Audit record of one promotion event. */
interface PromptEnvironment {
environment: Environment;
// The prompt version that was activated.
version: string;
promoted_at: Date;
promoted_by: string;
}
/**
 * Promotes prompt versions through environments. The active version per
 * (prompt, environment) lives in Redis; an in-memory audit trail records
 * every promotion.
 */
class PromptPromotionPipeline {
  // Audit trail. Internally each entry also carries the prompt name so
  // history/rollback can be scoped per prompt (the public PromptEnvironment
  // shape is preserved; the extra field is additive).
  private promotionHistory: Array<PromptEnvironment & { prompt_name: string }> = [];
  // NOTE(review): never assigned in this class — must be injected before use.
  private redis: any; // Redis client

  /** Activate `version` for `promptName` in `targetEnv` and record it. */
  async promoteToEnvironment(
    promptName: string,
    version: string,
    targetEnv: Environment,
    promotedBy: string
  ): Promise<void> {
    const key = `prompt:${promptName}:${targetEnv}`;
    await this.redis.set(key, version);
    this.promotionHistory.push({
      prompt_name: promptName,
      environment: targetEnv,
      version,
      promoted_at: new Date(),
      promoted_by: promotedBy
    });
  }

  /** Currently active version for (promptName, env), or null. */
  async getActivePrompt(promptName: string, env: Environment): Promise<string | null> {
    const key = `prompt:${promptName}:${env}`;
    return await this.redis.get(key);
  }

  /**
   * Gate checks before promotion. Production requires the exact version to
   * be active in staging first.
   */
  async validateBeforePromotion(
    promptName: string,
    version: string,
    targetEnv: Environment
  ): Promise<{ valid: boolean; errors: string[] }> {
    const errors: string[] = [];
    // Can't skip environments
    if (targetEnv === 'production') {
      const stagingVersion = await this.getActivePrompt(promptName, 'staging');
      if (stagingVersion !== version) {
        errors.push('Must be validated in staging first');
      }
    }
    // Must pass regression tests
    // (would call RegressionTestRunner here)
    return {
      valid: errors.length === 0,
      errors
    };
  }

  /**
   * Re-activate the previous version for (promptName, env).
   *
   * Fixes: the original mixed promotions of ALL prompts into the lookup,
   * and sorted by timestamp — a stable sort keeps insertion order on
   * same-millisecond ties, so "previous" could resolve to the oldest entry.
   * History is appended chronologically, so the second-to-last entry for
   * this prompt+env IS the previous version; no sort needed.
   */
  async rollback(promptName: string, env: Environment): Promise<string | null> {
    const history = this.promotionHistory.filter(
      p => p.environment === env && p.prompt_name === promptName
    );
    if (history.length < 2) return null; // Not enough history
    const previousVersion = history[history.length - 2].version;
    await this.promoteToEnvironment(promptName, previousVersion, env, 'system');
    return previousVersion;
  }

  /**
   * Audit trail, optionally scoped to one prompt.
   * Fix: the original filtered on promoted_by (the approver) containing the
   * prompt name, which is meaningless; it now matches the stored prompt name.
   */
  getPromotionHistory(promptName?: string): PromptEnvironment[] {
    if (!promptName) return this.promotionHistory;
    return this.promotionHistory.filter(
      p => p.prompt_name === promptName
    );
  }
}
Rollback on Quality Regression
Automatically rollback when quality metrics degrade.
/** Snapshot of quality metrics for one prompt version. */
interface QualityMetric {
// Ratio of regression tests passing — presumably 0-1; confirm vs percent.
regression_test_pass_rate: number;
average_latency_ms: number;
cost_per_request: number;
// Rating scale units are not defined here — confirm with the producer.
user_satisfaction_score: number;
}
/**
 * Compares live quality metrics against a recorded baseline and triggers
 * rollback when they degrade beyond a threshold.
 */
class QualityMonitor {
  private baselineMetrics: Map<string, QualityMetric> = new Map();
  private degradationThreshold = 0.05; // 5% degradation

  /** Record the reference metrics for a prompt version. */
  recordBaseline(promptVersion: string, metrics: QualityMetric): void {
    this.baselineMetrics.set(promptVersion, metrics);
  }

  /**
   * Relative drop of `current` vs `baseline` (0 = no drop).
   * Guard: a zero/negative baseline previously produced Infinity or NaN,
   * and NaN comparisons are always false — silently masking degradation.
   */
  private relativeDrop(current: number, baseline: number): number {
    if (baseline <= 0) return 0;
    return 1 - current / baseline;
  }

  /**
   * Check current metrics against the stored baseline. A version with no
   * recorded baseline is never reported as degraded.
   * NOTE(review): cost_per_request is collected but never checked here —
   * confirm whether cost regressions should also trigger.
   */
  detectDegradation(
    promptVersion: string,
    currentMetrics: QualityMetric
  ): { degraded: boolean; reasons: string[] } {
    const baseline = this.baselineMetrics.get(promptVersion);
    if (!baseline) {
      return { degraded: false, reasons: [] };
    }
    const reasons: string[] = [];
    // Pass rate: relative drop beyond the threshold.
    const passRateDrop = this.relativeDrop(
      currentMetrics.regression_test_pass_rate,
      baseline.regression_test_pass_rate
    );
    if (passRateDrop > this.degradationThreshold) {
      reasons.push(`Regression test pass rate dropped ${(passRateDrop * 100).toFixed(1)}%`);
    }
    // User satisfaction: relative drop beyond the threshold.
    const satisfactionDrop = this.relativeDrop(
      currentMetrics.user_satisfaction_score,
      baseline.user_satisfaction_score
    );
    if (satisfactionDrop > this.degradationThreshold) {
      reasons.push(`User satisfaction dropped ${(satisfactionDrop * 100).toFixed(1)}%`);
    }
    // Latency: increases use a looser 20% threshold (guarded against baseline 0).
    const latencyIncrease = baseline.average_latency_ms > 0
      ? (currentMetrics.average_latency_ms / baseline.average_latency_ms) - 1
      : 0;
    if (latencyIncrease > 0.2) {
      reasons.push(`Latency increased ${(latencyIncrease * 100).toFixed(1)}%`);
    }
    return {
      degraded: reasons.length > 0,
      reasons
    };
  }

  /**
   * If the currently active version has degraded, roll back via the
   * pipeline. Returns true only when a rollback actually happened.
   */
  async triggerRollbackIfNeeded(
    promptName: string,
    env: Environment,
    metrics: QualityMetric,
    pipeline: PromptPromotionPipeline
  ): Promise<boolean> {
    const currentVersion = await pipeline.getActivePrompt(promptName, env);
    if (!currentVersion) return false;
    const degradation = this.detectDegradation(currentVersion, metrics);
    if (degradation.degraded) {
      console.error(`Quality degradation detected: ${degradation.reasons.join('; ')}`);
      const previous = await pipeline.rollback(promptName, env);
      if (previous) {
        console.log(`Rolled back to ${previous}`);
        return true;
      }
    }
    return false;
  }
}
Checklist
- Store all prompts in version control (Git) alongside application code
- Use semantic versioning to track breaking changes
- Maintain a golden dataset of 50+ examples for regression testing
- Test every prompt version against golden dataset before promotion
- A/B test new prompts with 10-50% traffic split before full rollout
- Sanitize user inputs to prevent prompt injection
- Implement promotion pipeline: development → staging → production
- Automatically rollback if pass rate drops >5% or latency increases >20%
- Track quality metrics (pass rate, latency, cost) per version
- Keep audit trail of all promotions with timestamps and approvers
Conclusion
Prompts are code. Treat them with the same rigor: version control, testing, CI/CD pipelines, and automated rollbacks. The teams that ship the highest quality AI products are those treating prompts as first-class artifacts, not magic strings.