Published on

AI Output Moderation — Filtering Harmful Content Before It Reaches Users

Authors

Introduction

Your LLM generates a response with hateful content, misinformation, or sexually explicit material. A user sees it before you can catch it, and social media erupts.

Output moderation is your last line of defense. This post covers APIs, open-source models, and custom classifiers to filter harmful LLM outputs.

OpenAI Moderation API

The simplest approach: call a moderation API on LLM output:

/**
 * Normalized verdict from the OpenAI Moderation API for one piece of text.
 */
interface ModerationResult {
  // True when the API flags the text in at least one category.
  isFlagged: boolean;
  // Per-category boolean flags (true = text falls in that category).
  categories: {
    harassment: boolean;
    harassment_threatening: boolean;
    hate: boolean;
    hate_threatening: boolean;
    self_harm: boolean;
    sexual: boolean;
    sexual_minors: boolean;
    violence: boolean;
    violence_graphic: boolean;
  };
  // Per-category confidence scores in [0, 1]; higher = more likely harmful.
  categoryScores: {
    harassment: number;
    harassment_threatening: number;
    hate: number;
    hate_threatening: number;
    self_harm: number;
    sexual: number;
    sexual_minors: number;
    violence: number;
    violence_graphic: number;
  };
}

/**
 * Wraps the OpenAI Moderation API to flag or block harmful LLM output.
 */
class OutputModerator {
  /**
   * Sends `text` to the OpenAI moderation endpoint and normalizes the result.
   *
   * @param text - The LLM output to classify.
   * @returns Normalized verdict with per-category flags and scores.
   * @throws Error when the API key is missing or the HTTP request fails.
   */
  async moderateContent(text: string): Promise<ModerationResult> {
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      // Fail fast with a clear message instead of sending "Bearer undefined".
      throw new Error("OPENAI_API_KEY is not set");
    }

    const response = await fetch("https://api.openai.com/v1/moderations", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({ input: text }),
    });

    // BUG FIX: the original never checked the HTTP status, so rate limits or
    // auth failures crashed later on `data.results[0]` with a cryptic error.
    if (!response.ok) {
      throw new Error(`Moderation API request failed: ${response.status}`);
    }

    // Narrow the untyped JSON to the shape we actually read.
    const data = (await response.json()) as {
      results: Array<{
        flagged: boolean;
        categories: ModerationResult["categories"];
        category_scores: ModerationResult["categoryScores"];
      }>;
    };

    const results = data.results[0];

    return {
      isFlagged: results.flagged,
      categories: results.categories,
      categoryScores: results.category_scores,
    };
  }

  /**
   * Returns true when the text should be blocked: either the API flags it
   * outright, or any single category score exceeds `severityThreshold`.
   *
   * @param severityThreshold - Score above which any category triggers a block.
   */
  async shouldBlockResponse(
    text: string,
    severityThreshold: number = 0.5
  ): Promise<boolean> {
    const result = await this.moderateContent(text);

    if (result.isFlagged) {
      return true;
    }

    // Custom threshold: if any category exceeds threshold, block.
    const maxScore = Math.max(...Object.values(result.categoryScores));
    return maxScore > severityThreshold;
  }
}

export { OutputModerator, ModerationResult };

Llama Guard for Open-Source Moderation

For teams that want a self-hostable moderation model such as Llama Guard, the same classification pattern applies. Note that the example below uses Claude as a stand-in classifier for illustration — in production, point the client at your own Llama Guard deployment instead of a third-party API:

import { Anthropic } from "@anthropic-ai/sdk";

/** Parsed safety verdict returned by the LLM classifier. */
interface LlamaGuardClassification {
  isSafe: boolean; // false when any safety category applies
  categories: string[]; // names of violated categories (see system prompt)
  riskScore: number; // 0 (benign) to 1 (high risk), as reported by the model
  reasoning: string; // model explanation when provided; empty otherwise
}

/**
 * Moderates text with an LLM-as-classifier prompt.
 *
 * NOTE(review): despite the name, this example calls Claude rather than a
 * self-hosted Llama Guard model — swap the client for your own deployment.
 */
class LlamaGuardModerator {
  private client: Anthropic;

  constructor(apiKey: string) {
    this.client = new Anthropic({ apiKey });
  }

  /**
   * Classifies `text` against the safety categories in the system prompt.
   *
   * Fails CLOSED: if the classifier's output cannot be parsed or lacks an
   * explicit `is_safe: true`, the text is reported unsafe. (BUG FIX: the
   * original returned `isSafe: true` on parse errors and `isSafe: undefined`
   * for non-text responses — a fail-open default that let unverified content
   * through the moderation layer.)
   */
  async moderateWithLlamaGuard(
    text: string
  ): Promise<LlamaGuardClassification> {
    const message = await this.client.messages.create({
      model: "claude-3-5-sonnet-20241022",
      max_tokens: 256,
      system: `You are a content safety classifier. Classify the following text.

Categories:
- violence: Contains violent content
- harassment: Contains harassment or bullying
- hate: Contains hate speech
- sexual: Contains sexual content
- illegal: Promotes illegal activities
- self_harm: Encourages self-harm
- misinformation: Contains factually false claims

Respond with JSON: {"is_safe": bool, "categories": [], "risk_score": 0-1}`,
      messages: [
        {
          role: "user",
          content: text,
        },
      ],
    });

    const responseText =
      message.content[0].type === "text" ? message.content[0].text : "";

    // The model may wrap its JSON in prose or code fences; extract the
    // outermost object before parsing.
    const jsonMatch = responseText.match(/\{[\s\S]*\}/);

    try {
      const parsed = JSON.parse(jsonMatch ? jsonMatch[0] : responseText);
      return {
        // Only an explicit `true` counts as safe — fail closed otherwise.
        isSafe: parsed.is_safe === true,
        categories: Array.isArray(parsed.categories) ? parsed.categories : [],
        riskScore:
          typeof parsed.risk_score === "number" ? parsed.risk_score : 0,
        reasoning:
          typeof parsed.reasoning === "string" ? parsed.reasoning : "",
      };
    } catch {
      return {
        isSafe: false, // fail closed: unparseable verdict = treat as unsafe
        categories: [],
        riskScore: 1,
        reasoning: "Parse error",
      };
    }
  }
}

export { LlamaGuardModerator, LlamaGuardClassification };

Topic Restriction: Block Off-Topic Responses

Prevent the model from generating responses outside your domain:

/** Configuration for the topic-restriction layer. */
interface TopicConfig {
  allowedTopics: string[]; // topics the assistant may discuss
  blockedTopics: string[]; // topics that always cause a block
  strictMode: boolean; // Stricter off-topic detection: response must share a topic with the query
}

/**
 * Keeps LLM responses inside an allowed topic domain and rejects responses
 * that mention explicitly blocked topics.
 */
class TopicRestrictor {
  private config: TopicConfig;

  constructor(config: TopicConfig) {
    this.config = config;
  }

  /**
   * Checks whether `llmResponse` stays on topic relative to `userQuery`.
   *
   * @returns `isOnTopic` verdict plus which topics were detected and why.
   */
  async isOnTopic(
    userQuery: string,
    llmResponse: string
  ): Promise<{
    isOnTopic: boolean;
    detectedTopics: string[];
    explanation: string;
  }> {
    const queryTopics = this.extractTopics(userQuery);
    const responseTopics = this.extractTopics(llmResponse);

    // Blocked topics take precedence over everything else.
    const blockedFound = responseTopics.filter((topic) =>
      this.config.blockedTopics.some((blocked) =>
        topic.toLowerCase().includes(blocked.toLowerCase())
      )
    );

    if (blockedFound.length > 0) {
      return {
        isOnTopic: false,
        detectedTopics: blockedFound,
        explanation: `Response contains blocked topics: ${blockedFound.join(", ")}`,
      };
    }

    // In strict mode, the response must share at least one topic with the query.
    if (this.config.strictMode) {
      const relevantTopics = responseTopics.filter((topic) =>
        queryTopics.some((qt) =>
          topic.toLowerCase().includes(qt.toLowerCase())
        )
      );

      if (relevantTopics.length === 0) {
        return {
          isOnTopic: false,
          detectedTopics: responseTopics,
          explanation: "Response is off-topic (strict mode)",
        };
      }
    }

    return {
      isOnTopic: true,
      detectedTopics: responseTopics,
      explanation: "Response is on-topic",
    };
  }

  /**
   * Returns every configured topic (allowed AND blocked) mentioned in `text`.
   *
   * BUG FIX: the original scanned only `allowedTopics`, so blocked topics in
   * a response were never extracted and the blocked-topic check above could
   * effectively never fire.
   */
  private extractTopics(text: string): string[] {
    const haystack = text.toLowerCase();
    const topics: string[] = [];

    for (const topic of [
      ...this.config.allowedTopics,
      ...this.config.blockedTopics,
    ]) {
      if (haystack.includes(topic.toLowerCase())) {
        topics.push(topic);
      }
    }

    return topics;
  }
}

export { TopicRestrictor, TopicConfig };

Toxicity Scoring and Filtering

Use fine-grained toxicity scores to decide whether to show responses:

/** Fine-grained toxicity scores; all values are expected in [0, 1]. */
interface ToxicityScore {
  overallScore: number; // headline score used for block/warn decisions
  toxicity: number;
  severe_toxicity: number;
  obscene: number;
  threat: number;
  insult: number;
  identity_attack: number;
  sexual_explicit: number;
}

/**
 * Produces fine-grained toxicity scores for a piece of text and exposes
 * block/warn decisions based on configurable thresholds.
 */
class ToxicityScorer {
  /**
   * Scores `text` across several toxicity dimensions.
   *
   * NOTE: mock implementation — in production back this with a real
   * classifier (Perspective API, a hosted toxicity model, etc.).
   */
  async scoreToxicity(text: string): Promise<ToxicityScore> {
    const base = this.calculateToxicityScore(text);

    // Each dimension is derived from the base score with a fixed weight.
    return {
      overallScore: base,
      toxicity: base * 0.8,
      severe_toxicity: base * 0.6,
      obscene: base * 0.7,
      threat: base * 0.5,
      insult: base * 0.9,
      identity_attack: base * 0.4,
      sexual_explicit: base * 0.3,
    };
  }

  /** True when the overall score exceeds the hard-block threshold. */
  async shouldBlockBasedOnToxicity(
    text: string,
    threshold: number = 0.7
  ): Promise<boolean> {
    const { overallScore } = await this.scoreToxicity(text);
    return overallScore > threshold;
  }

  /** True when the overall score exceeds the softer warning threshold. */
  async shouldWarnBasedOnToxicity(
    text: string,
    warnThreshold: number = 0.4
  ): Promise<boolean> {
    const { overallScore } = await this.scoreToxicity(text);
    return overallScore > warnThreshold;
  }

  /** Placeholder heuristic: flags a small word list; swap in a real model. */
  private calculateToxicityScore(text: string): number {
    return /\b(word1|word2|word3)\b/i.test(text) ? 0.8 : 0.1;
  }
}

export { ToxicityScorer, ToxicityScore };

Custom Domain Classifier

For domain-specific content policies (e.g., no financial advice in a health app):

/** Result of checking a response against domain-specific content policies. */
interface DomainClassificationResult {
  isDomainCompliant: boolean; // true when no violation pattern matched
  detectedViolations: string[]; // one entry per matched violation pattern
  confidenceScore: number; // heuristic confidence in the verdict
}

/**
 * Pattern-based classifier for domain-specific content policies
 * (e.g. blocking financial advice in a health app).
 */
class DomainComplianceClassifier {
  // Regex patterns per policy domain; any match counts as a violation.
  private violationPatterns: Record<string, RegExp[]> = {
    financial_advice: [
      /(?:buy|sell|invest)\s+(?:this|that)\s+stock/i,
      /guaranteed\s+(?:profit|return|gains)/i,
      /get\s+rich\s+quick/i,
    ],
    medical_diagnosis: [
      /you\s+(?:have|must have)\s+\w+\s+(?:disease|condition)/i,
      /this\s+will\s+cure\s+your/i,
      /take\s+this\s+medication\s+instead/i,
    ],
    legal_advice: [
      /you\s+should\s+(?:sue|not sue|file a claim)/i,
      /the\s+law\s+says\s+you/i,
    ],
  };

  /**
   * Checks `response` against the violation patterns registered for `domain`.
   * Unknown domains have no patterns and therefore always pass.
   */
  async classifyCompliance(
    response: string,
    domain: string
  ): Promise<DomainClassificationResult> {
    const patterns = this.violationPatterns[domain] ?? [];

    const violations = patterns
      .filter((pattern) => pattern.test(response))
      .map(() => `Detected pattern violation in ${domain}`);

    const compliant = violations.length === 0;

    return {
      isDomainCompliant: compliant,
      detectedViolations: violations,
      confidenceScore: compliant ? 0.95 : 0.3,
    };
  }
}

export { DomainComplianceClassifier, DomainClassificationResult };

Multi-Layer Moderation Pipeline

Combine input and output moderation with fallbacks:

/** Aggregate verdict from the multi-layer moderation pipeline. */
interface ModerationPipelineResult {
  passed: boolean; // true when every layer accepted the response
  blockedReason?: string; // present only when a layer blocked the response
  warnings: string[]; // non-blocking issues (e.g. moderate toxicity)
}

/**
 * Runs LLM output through several moderation layers in order:
 * 1. OpenAI moderation API (hard block), 2. topic restriction,
 * 3. toxicity hard block, 4. toxicity soft warning.
 * The first failing layer blocks the response.
 */
class MultiLayerModerationPipeline {
  private outputModerator: OutputModerator;
  private topicRestrictor: TopicRestrictor;
  private toxicityScorer: ToxicityScorer;

  constructor(topicConfig: TopicConfig) {
    this.outputModerator = new OutputModerator();
    this.topicRestrictor = new TopicRestrictor(topicConfig);
    this.toxicityScorer = new ToxicityScorer();
  }

  /**
   * Moderates `llmResponse` (in the context of `userQuery`) and returns a
   * pass/block verdict plus any non-blocking warnings.
   *
   * BUG FIX: the blocked-path returns previously omitted the required
   * `warnings` field of ModerationPipelineResult (a type error under strict
   * TypeScript); every return now includes it.
   */
  async moderateOutput(
    userQuery: string,
    llmResponse: string
  ): Promise<ModerationPipelineResult> {
    const warnings: string[] = [];

    // Layer 1: OpenAI Moderation API.
    const failedModeration = await this.outputModerator.shouldBlockResponse(
      llmResponse,
      0.6
    );
    if (failedModeration) {
      return {
        passed: false,
        blockedReason: "Failed OpenAI moderation check",
        warnings,
      };
    }

    // Layer 2: Topic restriction.
    const topicResult = await this.topicRestrictor.isOnTopic(
      userQuery,
      llmResponse
    );
    if (!topicResult.isOnTopic) {
      return {
        passed: false,
        blockedReason: topicResult.explanation,
        warnings,
      };
    }

    // Layer 3: Toxicity scoring (hard block).
    const isToxic = await this.toxicityScorer.shouldBlockBasedOnToxicity(
      llmResponse,
      0.7
    );
    if (isToxic) {
      return {
        passed: false,
        blockedReason: "Content toxicity too high",
        warnings,
      };
    }

    // Layer 4: Soft warning for moderate toxicity (does not block).
    const shouldWarn = await this.toxicityScorer.shouldWarnBasedOnToxicity(
      llmResponse,
      0.4
    );
    if (shouldWarn) {
      warnings.push("Response has moderate toxicity");
    }

    return {
      passed: true,
      warnings,
    };
  }
}

export { MultiLayerModerationPipeline, ModerationPipelineResult };

Conclusion

Output moderation prevents harmful content from reaching users. Use OpenAI's API for simplicity, Llama Guard for privacy, combine with toxicity scoring and domain classifiers, and layer multiple checks. When content fails moderation, show a fallback response or ask the user to rephrase their request.

The goal isn't perfection—it's continuous improvement. Monitor moderation errors, adjust thresholds, and refine classifiers over time.