Published on

Evaluating Your RAG Pipeline — RAGAS, Faithfulness, and Answer Quality Metrics

Authors

Introduction

You can't improve what you don't measure. Yet most RAG systems ship without proper evaluation infrastructure, leading to silent quality degradation in production.

This post covers RAGAS (Retrieval-Augmented Generation Assessment), LLM-as-judge evaluation, and automated quality monitoring.

RAGAS Framework

RAGAS measures four critical dimensions of RAG quality:

/** The four core RAGAS quality dimensions; every score is normalized to [0, 1]. */
interface RAGASMetrics {
  faithfulness: number; // 0-1: Is answer grounded in context?
  answerRelevancy: number; // 0-1: Does answer address question?
  contextPrecision: number; // 0-1: Are retrieved chunks relevant?
  contextRecall: number; // 0-1: Are all relevant chunks retrieved?
}

/** One evaluation example: a query plus what the pipeline retrieved and produced. */
interface RAGSample {
  query: string; // User question under evaluation
  groundTruth: string; // Expected answer
  retrievedContext: string[]; // Retrieved documents
  generatedAnswer: string; // Model output
}

/**
 * Faithfulness: is the generated answer supported by the retrieved context?
 *
 * Uses an LLM-as-judge prompt that is asked to reply with a JSON score.
 *
 * @param sample - query, retrieved context, and generated answer to judge
 * @param llm - judge client; only `generate` is called
 * @returns faithfulness score clamped to [0, 1]
 * @throws Error when the judge reply is not JSON or lacks a numeric score
 */
async function computeFaithfulness(
  sample: RAGSample,
  llm: LLMClient
): Promise<number> {
  const faithfulnessPrompt = `
Given the question, generated answer, and context chunks, determine if the answer
is grounded in the provided context.

Question: "${sample.query}"

Answer: "${sample.generatedAnswer}"

Context:
${sample.retrievedContext.join('\n---\n')}

Is the answer entirely supported by the context? Respond with a JSON:
{
  "faithfulness_score": 0.0 to 1.0,
  "reasoning": "explanation"
}`;

  const response = await llm.generate({
    messages: [{ role: 'user', content: faithfulnessPrompt }],
    maxTokens: 200,
  });

  // LLM output is untrusted input: parse defensively instead of letting a
  // malformed reply surface downstream as an opaque SyntaxError or NaN.
  let parsed: unknown;
  try {
    parsed = JSON.parse(response.text);
  } catch {
    throw new Error(
      `computeFaithfulness: judge returned non-JSON output: ${response.text.slice(0, 120)}`
    );
  }
  const score = (parsed as { faithfulness_score?: unknown }).faithfulness_score;
  if (typeof score !== 'number' || Number.isNaN(score)) {
    throw new Error('computeFaithfulness: missing or non-numeric faithfulness_score');
  }
  // Judge models occasionally drift slightly outside the requested range; clamp.
  return Math.min(1, Math.max(0, score));
}

/**
 * Answer relevancy: does the generated answer actually address the question?
 *
 * LLM-as-judge scoring against the expected (ground-truth) answer.
 *
 * @param sample - query, expected answer, and generated answer to judge
 * @param llm - judge client; only `generate` is called
 * @returns relevancy score clamped to [0, 1]
 * @throws Error when the judge reply is not JSON or lacks a numeric score
 */
async function computeAnswerRelevancy(
  sample: RAGSample,
  llm: LLMClient
): Promise<number> {
  const relevancyPrompt = `
Evaluate how well the generated answer addresses the question.

Question: "${sample.query}"

Generated Answer: "${sample.generatedAnswer}"

Expected Answer: "${sample.groundTruth}"

Rate relevancy 0-1:
- 1.0: Answers the question completely and accurately
- 0.5: Partially answers or has some irrelevant content
- 0.0: Doesn't address the question

Respond with JSON:
{
  "relevancy_score": 0.0 to 1.0,
  "reasoning": "explanation"
}`;

  const response = await llm.generate({
    messages: [{ role: 'user', content: relevancyPrompt }],
    maxTokens: 200,
  });

  // Validate the untrusted judge output rather than trusting JSON.parse blindly.
  let parsed: unknown;
  try {
    parsed = JSON.parse(response.text);
  } catch {
    throw new Error(
      `computeAnswerRelevancy: judge returned non-JSON output: ${response.text.slice(0, 120)}`
    );
  }
  const score = (parsed as { relevancy_score?: unknown }).relevancy_score;
  if (typeof score !== 'number' || Number.isNaN(score)) {
    throw new Error('computeAnswerRelevancy: missing or non-numeric relevancy_score');
  }
  // Clamp to the documented [0, 1] range.
  return Math.min(1, Math.max(0, score));
}

/**
 * Context precision: what fraction of the retrieved chunks are relevant
 * to the question, according to an LLM judge?
 *
 * @param sample - query plus the retrieved context chunks to grade
 * @param llm - judge client; only `generate` is called
 * @returns relevant_count / total_count clamped to [0, 1]; 0 when the judge
 *          reports zero chunks (the original divided unconditionally and
 *          produced NaN in that case)
 * @throws Error when the judge reply is not JSON or lacks numeric counts
 */
async function computeContextPrecision(
  sample: RAGSample,
  llm: LLMClient
): Promise<number> {
  const precisionPrompt = `
For each retrieved context chunk, determine if it's relevant to answering the question.

Question: "${sample.query}"

Context chunks:
${sample.retrievedContext.map((ctx, i) => `[${i}] ${ctx}`).join('\n\n')}

Respond with JSON:
{
  "chunk_relevance": [0.0 to 1.0 for each chunk],
  "relevant_count": number,
  "total_count": number
}`;

  const response = await llm.generate({
    messages: [{ role: 'user', content: precisionPrompt }],
    maxTokens: 300,
  });

  // Defensive parse + shape validation of the untrusted judge output.
  let parsed: unknown;
  try {
    parsed = JSON.parse(response.text);
  } catch {
    throw new Error(
      `computeContextPrecision: judge returned non-JSON output: ${response.text.slice(0, 120)}`
    );
  }
  const { relevant_count, total_count } = parsed as {
    relevant_count?: unknown;
    total_count?: unknown;
  };
  if (typeof relevant_count !== 'number' || typeof total_count !== 'number') {
    throw new Error('computeContextPrecision: missing numeric relevant_count/total_count');
  }
  // Avoid 0/0 = NaN when no chunks were graded.
  if (total_count <= 0) {
    return 0;
  }
  return Math.min(1, Math.max(0, relevant_count / total_count));
}

/**
 * Context recall: does the retrieved context contain the information needed
 * to construct the expected answer?
 *
 * @param sample - expected answer plus the retrieved context to grade
 * @param llm - judge client; only `generate` is called
 * @returns recall score clamped to [0, 1]
 * @throws Error when the judge reply is not JSON or lacks a numeric score
 */
async function computeContextRecall(
  sample: RAGSample,
  llm: LLMClient
): Promise<number> {
  const recallPrompt = `
Given the expected answer and retrieved context, estimate how many relevant chunks
were retrieved.

Expected Answer: "${sample.groundTruth}"

Retrieved Context:
${sample.retrievedContext.map((ctx, i) => `[${i}] ${ctx}`).join('\n\n')}

What fraction of information needed to construct the expected answer is present
in the retrieved context?

Respond with JSON:
{
  "recall_score": 0.0 to 1.0,
  "missing_information": "what's missing",
  "confidence": 0.0 to 1.0
}`;

  const response = await llm.generate({
    messages: [{ role: 'user', content: recallPrompt }],
    maxTokens: 200,
  });

  // Defensive parse + validation: judge output is untrusted text.
  let parsed: unknown;
  try {
    parsed = JSON.parse(response.text);
  } catch {
    throw new Error(
      `computeContextRecall: judge returned non-JSON output: ${response.text.slice(0, 120)}`
    );
  }
  const score = (parsed as { recall_score?: unknown }).recall_score;
  if (typeof score !== 'number' || Number.isNaN(score)) {
    throw new Error('computeContextRecall: missing or non-numeric recall_score');
  }
  // Clamp to the documented [0, 1] range.
  return Math.min(1, Math.max(0, score));
}

// Compute all RAGAS metrics
async function computeRAGAS(
  sample: RAGSample,
  llm: LLMClient
): Promise<RAGASMetrics> {
  const [faithfulness, answerRelevancy, contextPrecision, contextRecall] =
    await Promise.all([
      computeFaithfulness(sample, llm),
      computeAnswerRelevancy(sample, llm),
      computeContextPrecision(sample, llm),
      computeContextRecall(sample, llm),
    ]);

  return {
    faithfulness,
    answerRelevancy,
    contextPrecision,
    contextRecall,
  };
}

Building a Golden Q&A Dataset

Create a validation set for evaluation:

/** A labeled Q&A pair used as ground truth when evaluating the pipeline. */
interface GoldenSample {
  id: string; // Stable identifier, e.g. "golden_0"
  question: string;
  expectedAnswer: string;
  sources: string[]; // Which documents contain the answer
  difficulty: 'easy' | 'medium' | 'hard';
  category: string; // Topic bucket; falls back to 'general' when the source doc has none
}

/**
 * Builds a golden Q&A dataset by sampling documents and asking an LLM to
 * author grounded questions for each one.
 *
 * NOTE(review): relies on a module-level `llm` client (it is not a parameter
 * here, unlike the metric functions) — confirm it is initialized before use.
 *
 * @param documentStore - source corpus to sample from
 * @param sampleSize - max number of Q&A pairs to return (default 100)
 */
async function buildGoldenDataset(
  documentStore: DocumentStore,
  sampleSize: number = 100
): Promise<GoldenSample[]> {
  const golden: GoldenSample[] = [];

  // Pull a diverse slice of the corpus to base questions on.
  const documents = await documentStore.sampleDocuments(sampleSize);

  for (const document of documents) {
    // Ask the LLM to author challenging Q&A pairs grounded in this document.
    const generatePrompt = `
Read this document and generate 2-3 challenging questions that require specific
information from this text.

Document:
${document.text}

Respond with JSON:
{
  "questions": [
    {
      "question": "...",
      "answer": "...",
      "difficulty": "easy|medium|hard"
    }
  ]
}`;

    const response = await llm.generate({
      messages: [{ role: 'user', content: generatePrompt }],
      maxTokens: 500,
    });

    const { questions } = JSON.parse(response.text);

    for (const item of questions) {
      golden.push({
        id: `golden_${golden.length}`,
        question: item.question,
        expectedAnswer: item.answer,
        sources: [document.id],
        difficulty: item.difficulty,
        category: document.metadata?.category || 'general',
      });
    }
  }

  // Each document can yield several questions; cap at the requested size.
  return golden.slice(0, sampleSize);
}

// Store and version golden dataset
/** A persisted, versioned snapshot of the golden dataset plus summary stats. */
interface GoldenDatasetVersion {
  version: string; // Human-assigned label, e.g. "v1.0"
  timestamp: number; // Creation time (ms since epoch)
  samples: GoldenSample[];
  stats: {
    totalSamples: number;
    byDifficulty: Record<string, number>; // Sample count per difficulty level
    byCategory: Record<string, number>; // Sample count per category
  };
}

/**
 * Persists a versioned snapshot of the golden dataset, together with summary
 * statistics, as pretty-printed JSON under /data.
 *
 * @param samples - the dataset to snapshot
 * @param version - version label embedded in the payload and the filename
 */
async function saveGoldenDataset(
  samples: GoldenSample[],
  version: string
): Promise<void> {
  // Tally difficulty and category distributions in a single pass.
  const byDifficulty: Record<string, number> = { easy: 0, medium: 0, hard: 0 };
  const byCategory: Record<string, number> = {};

  for (const { difficulty, category } of samples) {
    byDifficulty[difficulty] = (byDifficulty[difficulty] || 0) + 1;
    byCategory[category] = (byCategory[category] || 0) + 1;
  }

  const dataset: GoldenDatasetVersion = {
    version,
    timestamp: Date.now(),
    samples,
    stats: {
      totalSamples: samples.length,
      byDifficulty,
      byCategory,
    },
  };

  // Save to disk/database
  await fs.writeFile(
    `/data/golden_dataset_${version}.json`,
    JSON.stringify(dataset, null, 2)
  );
}

Retrieval Evaluation

Measure if your retriever finds relevant documents:

/** Retrieval-quality metrics averaged over a golden dataset. */
interface RetrievalEvaluationResult {
  hitRate: number; // % of queries where relevant doc is in top-k
  mrr: number; // Mean Reciprocal Rank
  map: number; // Mean Average Precision
  ndcg: number; // Normalized Discounted Cumulative Gain
}

/**
 * Evaluates retrieval quality over a golden dataset.
 *
 * A retrieved id counts as relevant when it appears in `sample.sources`
 * (binary relevance).
 *
 * @param goldenSamples - labeled queries with their known source doc ids
 * @param retriever - returns top-k scored doc ids for a query
 * @param topK - number of results requested per query (default 5)
 * @returns hit rate, MRR, MAP, and NDCG, each averaged across all samples
 */
async function evaluateRetrieval(
  goldenSamples: GoldenSample[],
  retriever: (query: string, topK: number) => Promise<Array<{ id: string; score: number }>>,
  topK: number = 5
): Promise<RetrievalEvaluationResult> {
  // Guard: averaging over an empty dataset would yield NaN (0/0) everywhere.
  if (goldenSamples.length === 0) {
    return { hitRate: 0, mrr: 0, map: 0, ndcg: 0 };
  }

  const hits: number[] = [];
  const ranks: number[] = [];
  const aps: number[] = [];
  const ndcgs: number[] = [];

  for (const sample of goldenSamples) {
    // Retrieve documents for this query.
    const retrieved = await retriever(sample.question, topK);
    const retrievedIds = retrieved.map(r => r.id);

    // Hit rate: any known source present anywhere in the top-k results.
    const hit = sample.sources.some(source => retrievedIds.includes(source));
    hits.push(hit ? 1 : 0);

    // MRR: reciprocal rank of the first relevant result (0 when none found).
    const firstRelevantRank = retrievedIds.findIndex(id =>
      sample.sources.includes(id)
    );
    ranks.push(firstRelevantRank === -1 ? 0 : 1 / (firstRelevantRank + 1));

    // MAP: mean of precision@i taken at each relevant position, normalized
    // by the number of relevant docs that could appear in the top-k.
    let relevantCount = 0;
    let apSum = 0;
    for (let i = 0; i < retrievedIds.length; i++) {
      if (sample.sources.includes(retrievedIds[i])) {
        relevantCount++;
        apSum += relevantCount / (i + 1);
      }
    }
    aps.push(relevantCount === 0 ? 0 : apSum / Math.min(topK, sample.sources.length));

    // NDCG with binary relevance: DCG of the returned ranking divided by the
    // ideal DCG (all relevant docs ranked first).
    const dcg = retrievedIds.reduce((sum, id, i) => {
      const rel = sample.sources.includes(id) ? 1 : 0;
      return sum + rel / Math.log2(i + 2);
    }, 0);

    const idcg = Array(Math.min(topK, sample.sources.length))
      .fill(1)
      .reduce((sum, _, i) => sum + 1 / Math.log2(i + 2), 0);

    ndcgs.push(idcg === 0 ? 0 : dcg / idcg);
  }

  // Length is known non-zero here, so these averages are always finite.
  const mean = (xs: number[]): number => xs.reduce((a, b) => a + b, 0) / xs.length;

  return {
    hitRate: mean(hits),
    mrr: mean(ranks),
    map: mean(aps),
    ndcg: mean(ndcgs),
  };
}

Generation Quality Evaluation

Measure answer quality using multiple approaches:

/** Answer-quality metrics averaged over a golden dataset. */
interface GenerationEvaluationMetrics {
  rougeL: number; // Lexical overlap with expected answer
  bleu: number; // Precision of n-gram matches
  bertScore: number; // Semantic similarity (placeholder — hardcoded to 0 below)
  exactMatch: number; // Is answer exactly correct
  parseability: number; // Can answer be parsed (if structured; placeholder below)
}

// ROUGE-L: F-measure over the longest common subsequence of the two
// lowercased, whitespace-tokenized texts.
function computeRougeL(generated: string, expected: string): number {
  const genTokens = generated.toLowerCase().split(/\s+/);
  const expTokens = expected.toLowerCase().split(/\s+/);

  const overlap = longestCommonSubsequence(genTokens, expTokens);
  // Harmonic-style combination: 2*LCS / (|gen| + |exp|).
  return (2 * overlap) / (genTokens.length + expTokens.length);
}

// BLEU-style n-gram precision: fraction of the candidate's unique n-grams
// that also appear in the reference (no brevity penalty, single n).
function computeBLEU(generated: string, expected: string, n: number = 4): number {
  const candidate = ngramsFromText(generated, n);
  const reference = ngramsFromText(expected, n);

  let overlap = 0;
  for (const gram of candidate) {
    if (reference.has(gram)) {
      overlap++;
    }
  }
  // Math.max guards against division by zero when the candidate is shorter than n tokens.
  return overlap / Math.max(1, candidate.size);
}

// Helper: the set of unique n-grams (space-joined, lowercased tokens) in text.
function ngramsFromText(text: string, n: number): Set<string> {
  const words = text.toLowerCase().split(/\s+/);
  const result = new Set<string>();

  // Slide a window of width n over the token list; shorter texts yield no grams.
  const lastStart = words.length - n;
  for (let start = 0; start <= lastStart; start++) {
    result.add(words.slice(start, start + n).join(' '));
  }

  return result;
}

// Helper: length of the longest common subsequence of two token arrays,
// computed with two rolling DP rows (O(|b|) space instead of a full table).
function longestCommonSubsequence(a: string[], b: string[]): number {
  let previous = new Array<number>(b.length + 1).fill(0);
  let current = new Array<number>(b.length + 1).fill(0);

  for (let i = 1; i <= a.length; i++) {
    for (let j = 1; j <= b.length; j++) {
      current[j] =
        a[i - 1] === b[j - 1]
          ? previous[j - 1] + 1
          : Math.max(previous[j], current[j - 1]);
    }
    // Reuse the old row as scratch space for the next iteration.
    [previous, current] = [current, previous];
  }

  return previous[b.length];
}

/**
 * Runs the full retrieve→generate loop over a golden dataset and scores the
 * generated answers against the expected ones.
 *
 * NOTE(review): `bertScore` and `parseability` are placeholders (hardcoded) —
 * wire up an embedding model / domain validator before relying on them.
 *
 * @param goldenSamples - labeled questions with expected answers
 * @param generator - produces an answer from a question plus joined context
 * @param retriever - returns top-k context strings for a question
 * @param topK - number of context chunks per query (default 5)
 * @returns metrics averaged over all samples (zeros for an empty dataset)
 */
async function evaluateGeneration(
  goldenSamples: GoldenSample[],
  generator: (query: string, context: string) => Promise<string>,
  retriever: (query: string, topK: number) => Promise<string[]>,
  topK: number = 5
): Promise<GenerationEvaluationMetrics> {
  // Guard: averaging over an empty dataset would yield NaN (0/0).
  if (goldenSamples.length === 0) {
    return { rougeL: 0, bleu: 0, bertScore: 0, exactMatch: 0, parseability: 1.0 };
  }

  const rougeL: number[] = [];
  const bleu: number[] = [];
  const exactMatch: number[] = [];

  for (const sample of goldenSamples) {
    // Retrieve context and generate an answer end-to-end.
    const context = (await retriever(sample.question, topK)).join('\n\n');
    const generated = await generator(sample.question, context);

    rougeL.push(computeRougeL(generated, sample.expectedAnswer));
    bleu.push(computeBLEU(generated, sample.expectedAnswer));
    // Case-insensitive exact match only; no whitespace/punctuation normalization.
    exactMatch.push(
      generated.toLowerCase() === sample.expectedAnswer.toLowerCase() ? 1 : 0
    );
  }

  const mean = (xs: number[]): number => xs.reduce((a, b) => a + b, 0) / xs.length;

  return {
    rougeL: mean(rougeL),
    bleu: mean(bleu),
    bertScore: 0, // Implement via sentence-transformers
    exactMatch: mean(exactMatch),
    parseability: 1.0, // Implement domain-specific validation
  };
}

Continuous Evaluation in CI/CD

Monitor quality regressions automatically:

/** Outcome of one CI/CD evaluation run against the golden dataset. */
interface EvaluationResult {
  timestamp: number; // When the evaluation ran (ms since epoch)
  version: string; // Code version under test (COMMIT_SHA or 'unknown')
  metrics: RAGASMetrics & RetrievalEvaluationResult & GenerationEvaluationMetrics;
  regressions: string[]; // Human-readable descriptions of threshold violations
}

async function continuousEvaluation(
  goldenDataset: GoldenSample[],
  ragPipeline: {
    retriever: (q: string, k: number) => Promise<Array<{ id: string; text: string }>>;
    generator: (q: string, c: string) => Promise<string>;
  },
  thresholds: Record<string, number> = {
    faithfulness: 0.8,
    answerRelevancy: 0.75,
    contextPrecision: 0.7,
    hitRate: 0.85,
  }
): Promise<EvaluationResult> {
  const regressions: string[] = [];

  // Evaluate on golden dataset
  const sample = goldenDataset[0];
  const retrieved = await ragPipeline.retriever(sample.question, 5);
  const context = retrieved.map(r => r.text).join('\n\n');
  const generated = await ragPipeline.generator(sample.question, context);

  const ragasSample: RAGSample = {
    query: sample.question,
    groundTruth: sample.expectedAnswer,
    retrievedContext: retrieved.map(r => r.text),
    generatedAnswer: generated,
  };

  const metrics = await computeRAGAS(ragasSample, llm);

  // Check thresholds
  if (metrics.faithfulness < thresholds.faithfulness) {
    regressions.push(
      `Faithfulness ${metrics.faithfulness.toFixed(2)} &lt; threshold ${thresholds.faithfulness}`
    );
  }

  if (metrics.answerRelevancy < thresholds.answerRelevancy) {
    regressions.push(
      `Answer relevancy ${metrics.answerRelevancy.toFixed(2)} &lt; threshold ${thresholds.answerRelevancy}`
    );
  }

  return {
    timestamp: Date.now(),
    version: process.env.COMMIT_SHA || 'unknown',
    metrics: metrics as any,
    regressions,
  };
}

// Integration with GitHub Actions: runs the evaluation suite in CI and fails
// the build (non-zero exit) when any quality threshold is breached.
async function cicdEvaluation(): Promise<void> {
  const goldenDataset = await loadGoldenDataset('v1.0');

  // Stub pipeline adapters — point these at the deployed services.
  const pipeline = {
    retriever: async (q: string, k: number) => {
      // Call deployed retriever
      return [];
    },
    generator: async (q: string, c: string) => {
      // Call deployed generator
      return '';
    },
  };

  const result = await continuousEvaluation(goldenDataset, pipeline);

  if (result.regressions.length === 0) {
    console.log('✓ All quality thresholds passed');
    return;
  }

  console.error('Quality regressions detected:');
  for (const regression of result.regressions) {
    console.error(`  - ${regression}`);
  }
  process.exit(1);
}

Production Monitoring

Track quality metrics continuously:

/** Rolling production counters; avg* fields hold running SUMS that are divided at report time. */
interface ProductionMetrics {
  timestamp: number; // Window start (ms since epoch)
  queryCount: number; // All queries observed, whether sampled or not
  avgFaithfulness: number;
  avgContextPrecision: number;
  avgRetrievalLatency: number;
  avgGenerationLatency: number;
  hallucinations: number; // Sampled answers whose faithfulness score fell below 0.5
  noResultQueries: number; // Queries with empty retrieval
}

async function monitorProduction(
  sampleRate: number = 0.1 // Evaluate 10% of production queries
): Promise<void> {
  const metrics: Partial<ProductionMetrics> = {
    timestamp: Date.now(),
    queryCount: 0,
    hallucinations: 0,
    noResultQueries: 0,
  };

  // Listen to production queries
  eventBus.on('rag.query', async (event: {
    query: string;
    context: string[];
    answer: string;
    latencies: { retrieval: number; generation: number };
  }) => {
    metrics.queryCount = (metrics.queryCount || 0) + 1;

    // Sample-based evaluation
    if (Math.random() < sampleRate) {
      const sample: RAGSample = {
        query: event.query,
        groundTruth: '', // Not available in production
        retrievedContext: event.context,
        generatedAnswer: event.answer,
      };

      const faithfulness = await computeFaithfulness(sample, llm);

      if (event.context.length === 0) {
        metrics.noResultQueries = (metrics.noResultQueries || 0) + 1;
      }

      if (faithfulness &lt; 0.5) {
        metrics.hallucinations = (metrics.hallucinations || 0) + 1;
      }

      metrics.avgFaithfulness = (metrics.avgFaithfulness || 0) + faithfulness;
      metrics.avgRetrievalLatency =
        (metrics.avgRetrievalLatency || 0) + event.latencies.retrieval;
      metrics.avgGenerationLatency =
        (metrics.avgGenerationLatency || 0) + event.latencies.generation;
    }
  });

  // Periodic reporting
  setInterval(() => {
    const querysSampled = Math.floor((metrics.queryCount || 0) * sampleRate);

    console.log('Production Metrics:', {
      ...metrics,
      avgFaithfulness: (metrics.avgFaithfulness || 0) / Math.max(1, querysSampled),
      avgRetrievalLatency: (metrics.avgRetrievalLatency || 0) / Math.max(1, querysSampled),
      avgGenerationLatency: (metrics.avgGenerationLatency || 0) / Math.max(1, querysSampled),
      halluccinationRate: (metrics.hallucinations || 0) / querysSampled,
    });
  }, 60000); // Every minute
}

Checklist

  • Build golden dataset with 100+ Q&A pairs
  • Implement RAGAS metrics (faithfulness, relevancy, precision, recall)
  • Set quality thresholds for each metric
  • Measure retrieval hit rate baseline
  • Track ROUGE-L and BLEU scores for generation
  • Add CI/CD evaluation on every commit
  • Implement production sampling (5-10%)
  • Set up alerts for metric regressions
  • Version your golden dataset
  • Monitor hallucination rate via faithfulness score

Conclusion

Quality measurement is the foundation of RAG improvement. Without RAGAS or similar metrics, you're flying blind. Start with a golden dataset and automated evaluation, then layer in production monitoring. The investment pays dividends: you'll catch regressions before users do and have data-driven decisions about where to optimize next.