Published on

AI Agent Security — Prompt Injection, Tool Abuse, and Sandboxing

Authors

Introduction

Agents with tool access are powerful and dangerous. Attackers can inject prompts to manipulate agents, poison tool results to trick them, abuse tools for unauthorized actions, or extract sensitive data. Production agents need multiple security layers: prompt validation, tool permission models, sandboxed execution, and comprehensive audit logs.

Prompt Injection Detection

Detect when user input tries to manipulate agent behavior.

/**
 * Screens raw user input for prompt-injection attempts before it is
 * forwarded to the model. Combines regex heuristics (known override
 * phrases) with structural checks (markdown smuggling, directive lists).
 */
class PromptInjectionDetector {
  // Phrases commonly used to override or escape system instructions.
  private dangerousPatterns = [
    /ignore\s+(previous\s+)?instructions?/i,
    /forget\s+(everything\s+)?above/i,
    /pretend\s+you\s+are/i,
    /roleplay\s+as/i,
    /you\s+are\s+now/i,
    /system\s+override/i,
    /admin\s+mode/i,
    /execute\s+(this|the)\s+command/i,
    /bypass\s+security/i,
  ];

  /**
   * Analyzes one piece of user input.
   * @param userInput - raw text exactly as the user submitted it.
   * @returns risk classification, the reasons for it, and a sanitized copy.
   */
  detect(userInput: string): InjectionDetection {
    const suspicions: string[] = [];
    let riskLevel: 'low' | 'medium' | 'high' = 'low';

    // Heuristic 1: any known override phrase is an immediate high risk.
    for (const pattern of this.dangerousPatterns) {
      if (pattern.test(userInput)) {
        suspicions.push(`Matches dangerous pattern: ${pattern}`);
        riskLevel = 'high';
      }
    }

    // Heuristic 2: markdown / code structure can smuggle instructions.
    if (this.hasStructuralMarkers(userInput)) {
      suspicions.push('Input contains markdown/code blocks');
      if (riskLevel !== 'high') riskLevel = 'medium';
    }

    // Heuristic 3: several stand-alone directives look like an injected script.
    if (this.hasMultipleDirectives(userInput)) {
      suspicions.push('Input contains multiple independent directives');
      if (riskLevel !== 'high') riskLevel = 'medium';
    }

    return {
      isSuspicious: riskLevel !== 'low',
      riskLevel,
      suspicions,
      sanitized: this.sanitize(userInput),
    };
  }

  /** True when the input embeds code fences, headings, rules, or markdown links. */
  private hasStructuralMarkers(input: string): boolean {
    const structural = /```|##|---|\[.*?\]\(.*?\)/m;
    return structural.test(input);
  }

  /**
   * True when the input looks like a list of commands rather than a question:
   * more than three non-empty lines, of which at least two are capitalized
   * lines ending in punctuation.
   */
  private hasMultipleDirectives(input: string): boolean {
    const nonEmpty = input.split('\n').filter((line) => line.trim().length > 0);
    if (nonEmpty.length <= 3) return false;

    const directiveCount = nonEmpty.filter((line) =>
      /^[A-Z].*[:.!]$/.test(line),
    ).length;
    return directiveCount > 1;
  }

  /** Strips markdown code fences and HTML tags, then trims whitespace. */
  private sanitize(input: string): string {
    return input
      .replace(/```[\s\S]*?```/g, '') // markdown code fences
      .replace(/<[^>]+>/g, '') // HTML tags
      .trim();
  }
}

/** Result of one injection scan. */
interface InjectionDetection {
  isSuspicious: boolean;
  riskLevel: 'low' | 'medium' | 'high';
  suspicions: string[];
  sanitized: string;
}

Detect injection attempts before they reach the LLM.

Indirect Prompt Injection via Tool Results

Tool results can contain injected prompts that manipulate the agent.

/**
 * Guards against indirect prompt injection: every tool result is validated
 * and, when necessary, sanitized before it is fed back into the model context.
 */
class ToolResultValidator {
  /**
   * Validates a single tool result.
   * @param toolName - tool that produced the output (determines trust and size cap).
   * @param input - original tool input (unused today; kept for auditing hooks).
   * @param output - raw text returned by the tool.
   */
  async validateToolResult(
    toolName: string,
    input: Record<string, unknown>,
    output: string,
  ): Promise<ValidatedResult> {
    // Untrusted sources (e.g. the open web) are always sanitized.
    if (!this.isTrustedSource(toolName)) {
      return {
        original: output,
        sanitized: this.sanitizeUntrustedOutput(output),
        isTrusted: false,
        warnings: ['Output from untrusted source, sanitized'],
      };
    }

    // Even trusted output is scanned for injection phrasing.
    const injectionMarkers = this.detectInjectionMarkers(output);
    if (injectionMarkers.length > 0) {
      return {
        original: output,
        sanitized: this.sanitizeInjectionAttempt(output),
        isTrusted: false,
        warnings: injectionMarkers,
      };
    }

    // Enforce a per-tool size cap so huge outputs cannot flood the context.
    const maxSize = this.getMaxSizeForTool(toolName);
    if (output.length > maxSize) {
      return {
        original: output,
        sanitized: output.substring(0, maxSize),
        isTrusted: true,
        warnings: [`Output truncated from ${output.length} to ${maxSize} chars`],
      };
    }

    return {
      original: output,
      sanitized: output,
      isTrusted: true,
      warnings: [],
    };
  }

  /** Only internal tools are considered trustworthy. */
  private isTrustedSource(toolName: string): boolean {
    const trustedTools = new Set(['internal_database', 'internal_api', 'calculate']);
    return trustedTools.has(toolName);
  }

  /** Returns one warning per suspicious signal found in the output. */
  private detectInjectionMarkers(output: string): string[] {
    const checks: Array<[RegExp, string]> = [
      // Prompt-injection phrasing
      [/ignore.*instruction|forget.*above|you are now/i,
        'Detected potential prompt injection in tool result'],
      // Executable code blocks
      [/```[a-z]*\n[\s\S]+?```/,
        'Tool result contains code blocks (potential code injection)'],
      // Suspicious command/privilege keywords
      [/execute|system|admin|sudo/i,
        'Tool result contains potentially dangerous keywords'],
    ];

    return checks.filter(([regex]) => regex.test(output)).map(([, message]) => message);
  }

  /** Removes code blocks and URLs, then applies a hard length cap. */
  private sanitizeUntrustedOutput(output: string): string {
    return output
      .replace(/```[\s\S]*?```/g, '[CODE REMOVED]') // strip code blocks
      .replace(/https?:\/\/[^\s]+/g, '[URL REMOVED]') // strip links (phishing risk)
      .substring(0, 5000); // hard length cap
  }

  /** Neutralizes known override phrases while keeping the rest of the text. */
  private sanitizeInjectionAttempt(output: string): string {
    return output
      .replace(/ignore\s+instructions?/gi, '[ATTEMPT BLOCKED]')
      .replace(/system\s+override/gi, '[ATTEMPT BLOCKED]');
  }

  /** Per-tool output size cap; unknown tools get a conservative default. */
  private getMaxSizeForTool(toolName: string): number {
    const limits: Record<string, number> = {
      web_search: 10000,
      database_query: 100000,
      file_read: 1000000,
    };
    return limits[toolName] ?? 50000;
  }
}

/** Outcome of validating one tool result. */
interface ValidatedResult {
  original: string;
  sanitized: string;
  isTrusted: boolean;
  warnings: string[];
}

Tool results from web sources are particularly vulnerable to injection.

Tool Permission Model

Implement least privilege: only allow necessary tools for each agent.

/** Per-tool grant for an agent, with optional guardrails. */
interface ToolPermission {
  toolName: string;
  allowed: boolean;
  requiresApproval?: boolean;
  maxCallsPerHour?: number;
  allowedParameters?: string[];
}

/** Complete security policy for one agent. */
interface AgentSecurityPolicy {
  agentName: string;
  permissions: ToolPermission[];
  allowedDomains?: string[];
  maxMonthlySpend?: number;
  dataAccessLevel: 'public' | 'internal' | 'sensitive';
}

/**
 * Gatekeeper consulted before every tool call. Denies by default:
 * a missing policy, or a tool absent from the allowlist, means "blocked".
 */
class PermissionChecker {
  /**
   * Decides whether an agent may invoke a tool with the given input.
   * @param agentName - agent requesting the call.
   * @param toolName - tool being invoked.
   * @param input - tool arguments; keys are checked against the parameter allowlist.
   * @returns verdict; `requiresApproval: true` means allowed pending human sign-off.
   */
  async checkToolUsage(
    agentName: string,
    toolName: string,
    input: Record<string, unknown>,
  ): Promise<ToolUsageApproval> {
    const policy = this.getAgentPolicy(agentName);

    if (!policy) {
      return {
        allowed: false,
        reason: `No security policy for agent: ${agentName}`,
      };
    }

    const permission = policy.permissions.find((p) => p.toolName === toolName);

    if (!permission) {
      return {
        allowed: false,
        reason: `Tool not in allowlist: ${toolName}`,
      };
    }

    if (!permission.allowed) {
      return {
        allowed: false,
        reason: `Tool blocked by security policy: ${toolName}`,
      };
    }

    // Parameter allowlist: reject any input key the policy does not name.
    const allowedParameters = permission.allowedParameters;
    if (allowedParameters) {
      const disallowed = Object.keys(input).filter((k) => !allowedParameters.includes(k));

      if (disallowed.length > 0) {
        return {
          allowed: false,
          reason: `Parameters not allowed: ${disallowed.join(', ')}`,
        };
      }
    }

    // Rate limit. The audit-log query runs only when a limit is configured,
    // and `!= null` (not truthiness) is used so that `maxCallsPerHour: 0`
    // blocks all calls instead of silently disabling the limit.
    if (permission.maxCallsPerHour != null) {
      const callsThisHour = await this.getCallCountThisHour(agentName, toolName);

      if (callsThisHour >= permission.maxCallsPerHour) {
        return {
          allowed: false,
          reason: `Rate limit exceeded for ${toolName} (${permission.maxCallsPerHour}/hour)`,
        };
      }
    }

    // Sensitive tools pass the check but still need human sign-off.
    if (permission.requiresApproval) {
      return {
        allowed: true,
        requiresApproval: true,
        reason: `Tool usage requires approval`,
      };
    }

    return {
      allowed: true,
    };
  }

  /** Loads the agent's policy. In production: load from database. */
  private getAgentPolicy(agentName: string): AgentSecurityPolicy | undefined {
    return undefined;
  }

  /** Counts this agent's calls to the tool in the past hour (from the audit log). */
  private async getCallCountThisHour(agentName: string, toolName: string): Promise<number> {
    return 0;
  }
}

/** Verdict returned by {@link PermissionChecker.checkToolUsage}. */
interface ToolUsageApproval {
  allowed: boolean;
  reason?: string;
  requiresApproval?: boolean;
}

// Example: a read-mostly research agent. It may search the web freely,
// needs human sign-off for database reads, and is denied email entirely.
// `satisfies` keeps literal inference while still checking the shape.
const researchAgentPolicy = {
  agentName: 'research-agent',
  permissions: [
    // Web search is low risk; cap call volume instead of gating it.
    {
      toolName: 'web_search',
      allowed: true,
      maxCallsPerHour: 100,
    },
    // Database access touches internal data: human approval required,
    // and only these query parameters may be supplied.
    {
      toolName: 'database_query',
      allowed: true,
      requiresApproval: true,
      allowedParameters: ['table', 'where_clause'],
    },
    // Research never needs to send email — deny outright.
    {
      toolName: 'send_email',
      allowed: false,
    },
  ],
  dataAccessLevel: 'internal',
} satisfies AgentSecurityPolicy;

A least-privilege permission model prevents agents from calling tools they do not need—and limits the blast radius if an agent is compromised.

Sandboxed Code Execution

Execute code generated or requested by agents in isolated environments.

/** Parameters for one sandboxed code run. */
interface SandboxExecution {
  code: string;
  language: string;
  timeout: number; // milliseconds
  maxMemory: number; // MB
  allowedImports?: string[];
}

/** Raw result reported by the container runner. */
interface ContainerResult {
  exitCode: number;
  stdout: string;
  stderr: string;
  duration: number;
}

/**
 * Runs agent-generated code inside an isolated environment with time and
 * memory limits. In production back this with Docker or E2B; the runner
 * and temp-file helpers here are stubs for demonstration.
 */
class CodeSandbox {
  /**
   * Executes the given code and reports stdout/stderr plus timing.
   * Never throws: failures are reported via `success: false`.
   */
  async executeCode(execution: SandboxExecution): Promise<SandboxResult> {
    // Declared outside the try so the finally block can reference it for
    // cleanup; a path scoped inside the try would be unreachable there.
    let tempFile: string | undefined;

    try {
      tempFile = await this.writeTempFile(execution.code, execution.language);

      // Execute with timeout and resource limits.
      const result = await this.runInContainer(
        tempFile,
        execution.timeout,
        execution.maxMemory,
      );

      return {
        success: result.exitCode === 0,
        output: result.stdout,
        error: result.stderr,
        executionTime: result.duration,
      };
    } catch (error) {
      // `catch` variables are `unknown`; narrow instead of asserting.
      const message = error instanceof Error ? error.message : String(error);
      return {
        success: false,
        output: '',
        error: message,
        executionTime: 0,
      };
    } finally {
      // Clean up the temp file whether execution succeeded or failed.
      if (tempFile !== undefined) {
        await this.removeTempFile(tempFile);
      }
    }
  }

  /** Executes the file under resource limits. Stubbed for demo. */
  private async runInContainer(
    filePath: string,
    timeout: number,
    maxMemory: number,
  ): Promise<ContainerResult> {
    // Use Docker: docker run --rm -m maxMemory --cpus=1 ...
    // Or E2B for better isolation
    return { exitCode: 0, stdout: '', stderr: '', duration: 0 };
  }

  /** Writes the code to a temp file and returns its path. Stubbed for demo. */
  private async writeTempFile(code: string, language: string): Promise<string> {
    // Write to temp directory
    return '/tmp/code';
  }

  /** Deletes the temp file. In production: fs.promises.unlink(filePath). */
  private async removeTempFile(filePath: string): Promise<void> {
    // no-op in this demo
  }
}

/** Outcome of one sandboxed execution. */
interface SandboxResult {
  success: boolean;
  output: string;
  error: string;
  executionTime: number;
}

/**
 * Defense-in-depth source filter applied before sandbox execution.
 * NOTE: textual filtering is best-effort only (e.g. Python `__import__`
 * or JS dynamic `import()` bypass it) — the sandbox is the real boundary.
 */
class CodeRestrictor {
  /** Applies the language-appropriate restriction pass; other languages pass through. */
  restrictCode(code: string, language: string): string {
    if (language === 'python') {
      return this.restrictPython(code);
    }

    if (language === 'typescript' || language === 'javascript') {
      return this.restrictJavaScript(code);
    }

    return code;
  }

  /** Comments out imports of modules that grant process/network access. */
  private restrictPython(code: string): string {
    const dangerous = [
      'os',
      'subprocess',
      'socket',
      'requests',
      'urllib',
    ];

    let restricted = code;

    for (const module of dangerous) {
      // `\b` after the module name so only the exact module matches:
      // without it, `import osmodule` was falsely blocked (prefix match).
      restricted = restricted.replace(
        new RegExp(`import\\s+${module}\\b|from\\s+${module}\\b`, 'g'),
        `# BLOCKED: import ${module}`,
      );
    }

    return restricted;
  }

  /** Replaces require() of dangerous Node modules with a throwing expression. */
  private restrictJavaScript(code: string): string {
    const dangerous = ['fs', 'child_process', 'net', 'http'];

    let restricted = code;

    for (const module of dangerous) {
      // `\s*` inside the parens so `require( 'fs' )` is caught as well.
      restricted = restricted.replace(
        new RegExp(`require\\(\\s*['"]${module}['"]\\s*\\)`, 'g'),
        `(() => { throw new Error('Module ${module} is blocked') })()`,
      );
    }

    return restricted;
  }
}

Sandboxing prevents code from escaping and accessing the host system.

Audit Logging

Log all agent actions for security review.

/** One record of an agent action, suitable for forensics and compliance. */
interface AuditLog {
  timestamp: number;
  agentName: string;
  userId: string;
  action: string;
  toolName?: string;
  input?: Record<string, unknown>;
  output?: string;
  status: 'success' | 'blocked' | 'failed';
  reason?: string;
  metadata: Record<string, unknown>;
}

/**
 * Records every agent action for security review. Entries are buffered
 * in memory and forwarded to persistent storage as they arrive.
 */
class AuditLogger {
  private logs: AuditLog[] = [];

  /** Records a tool invocation (successful, blocked, or failed). */
  logToolCall(
    agentName: string,
    userId: string,
    toolName: string,
    input: Record<string, unknown>,
    status: 'success' | 'blocked' | 'failed',
    reason?: string,
  ): void {
    this.record({
      timestamp: Date.now(),
      agentName,
      userId,
      action: 'tool_call',
      toolName,
      input,
      status,
      reason,
      metadata: {
        ip: 'user-ip', // Would extract from request
        userAgent: 'user-agent',
      },
    });
  }

  /** Records that an agent read (or was denied access to) a data set. */
  logDataAccess(
    agentName: string,
    userId: string,
    dataType: string,
    recordCount: number,
    status: 'success' | 'blocked',
  ): void {
    this.record({
      timestamp: Date.now(),
      agentName,
      userId,
      action: 'data_access',
      status,
      metadata: {
        dataType,
        recordCount,
      },
    });
  }

  /** Returns the logs matching every filter field that was provided. */
  async queryLogs(filter: {
    agentName?: string;
    status?: string;
    since?: number;
  }): Promise<AuditLog[]> {
    const matches = (log: AuditLog): boolean =>
      (!filter.agentName || log.agentName === filter.agentName) &&
      (!filter.status || log.status === filter.status) &&
      (!filter.since || log.timestamp >= filter.since);

    return this.logs.filter(matches);
  }

  /** Flags coarse suspicious patterns in the buffered logs. */
  async detectAnomalies(): Promise<string[]> {
    const anomalies: string[] = [];

    // Repeated tool-call failures may indicate probing or a broken agent.
    const failedCount = this.logs.filter((log) => log.status === 'failed').length;
    if (failedCount > 10) {
      anomalies.push(
        `High failure rate: ${failedCount} failed tool calls`,
      );
    }

    // Many blocked attempts suggest an agent pushing against its policy.
    const blockedCount = this.logs.filter((log) => log.status === 'blocked').length;
    if (blockedCount > 5) {
      anomalies.push(`Multiple blocked attempts: ${blockedCount}`);
    }

    return anomalies;
  }

  /** Appends to the in-memory buffer and forwards to durable storage. */
  private record(log: AuditLog): void {
    this.logs.push(log);
    this.persistLog(log);
  }

  private persistLog(log: AuditLog): void {
    // In production: write to secure, immutable log storage
    // E.g., AWS CloudTrail, GCP Cloud Audit Logs, or local log aggregation
  }
}

Audit logs enable forensics and compliance.

PII Handling in Tool Inputs/Outputs

Prevent agents from leaking personally identifiable information.

/** Result of scanning a text for personally identifiable information. */
interface PIIDetection {
  hasPII: boolean;
  piiTypes: string[];
  locations: Array<{ type: string; value: string; position: number }>;
}

/**
 * Regex-based scanner for common PII: emails, US phone numbers, SSNs,
 * and credit-card numbers. Used to block or redact text before it
 * leaves the system.
 */
class PIIDetector {
  /** Scans the text and reports every PII occurrence with its position. */
  detect(text: string): PIIDetection {
    // Patterns are built per call so no lastIndex state leaks between scans.
    const patterns: ReadonlyArray<[string, RegExp]> = [
      ['email', /[\w.-]+@[\w.-]+\.\w+/g],
      ['phone', /\d{3}-\d{3}-\d{4}|\(\d{3}\)\s*\d{3}-\d{4}/g],
      ['ssn', /\d{3}-\d{2}-\d{4}/g],
      ['credit_card', /\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}/g],
    ];

    const locations: PIIDetection['locations'] = [];
    const piiTypes = new Set<string>();

    for (const [type, regex] of patterns) {
      for (const match of text.matchAll(regex)) {
        locations.push({
          type,
          value: match[0],
          position: match.index ?? 0, // always defined for a global regex
        });
        piiTypes.add(type);
      }
    }

    return {
      hasPII: locations.length > 0,
      piiTypes: Array.from(piiTypes),
      locations,
    };
  }

  /** Returns the text with each PII occurrence replaced by a `[TYPE]` tag. */
  redact(text: string): string {
    const { locations } = this.detect(text);

    // Replace right-to-left so earlier positions stay valid.
    const ordered = [...locations].sort((a, b) => b.position - a.position);

    let result = text;
    for (const loc of ordered) {
      const tag = `[${loc.type.toUpperCase()}]`;
      result =
        result.slice(0, loc.position) +
        tag +
        result.slice(loc.position + loc.value.length);
    }

    return result;
  }
}

/**
 * Blocks tool calls whose string arguments carry PII, so agents cannot
 * forward personal data to external tools.
 */
class PIIRequestValidator {
  /** Checks every string-valued parameter for PII; non-strings are skipped. */
  async validateToolInput(input: Record<string, unknown>): Promise<ValidationResult> {
    const detector = new PIIDetector();

    const issues = Object.entries(input)
      .filter((entry): entry is [string, string] => typeof entry[1] === 'string')
      .flatMap(([key, value]) => {
        const found = detector.detect(value);
        return found.hasPII
          ? [`Parameter "${key}" contains PII: ${found.piiTypes.join(', ')}`]
          : [];
      });

    return {
      valid: issues.length === 0,
      issues,
    };
  }
}

/** Outcome of validating one tool input. */
interface ValidationResult {
  valid: boolean;
  issues: string[];
}

Detect and redact PII before it leaves the system.

Kill Switch Implementation

Agents that go rogue need a kill switch.

/**
 * Emergency stop for running agents. Once the switch is thrown for an
 * agent, its runtime is stopped and `canAgentContinue` reports false.
 */
class AgentKillSwitch {
  private activeAgents: Map<string, AgentRuntime> = new Map();
  private killSwitchActivated: Map<string, boolean> = new Map();

  /** Registers a runtime so it can be monitored and killed. */
  registerAgent(agentId: string, runtime: AgentRuntime): void {
    this.activeAgents.set(agentId, runtime);
    this.killSwitchActivated.set(agentId, false);
  }

  /** Stops the agent immediately and records a security incident. */
  activateKillSwitch(agentId: string, reason: string): void {
    console.log(`Kill switch activated for ${agentId}: ${reason}`);

    this.killSwitchActivated.set(agentId, true);

    const runtime = this.activeAgents.get(agentId);
    if (runtime) {
      runtime.stop();
    }

    this.logIncident(agentId, reason);
  }

  /**
   * False once the kill switch has been activated for this agent.
   * NOTE(review): unknown agentIds return true — confirm that is intended.
   */
  canAgentContinue(agentId: string): boolean {
    return !this.killSwitchActivated.get(agentId);
  }

  /**
   * Runs the health checks and throws the kill switch on the first
   * violation. The checks short-circuit so a single pass can never
   * activate the switch twice (which would stop the runtime and log an
   * incident twice), and already-killed agents are skipped entirely.
   */
  async checkHealthAndActivateIfNeeded(agentId: string): Promise<void> {
    const runtime = this.activeAgents.get(agentId);
    if (!runtime) {
      return;
    }
    if (this.killSwitchActivated.get(agentId)) {
      return; // already killed — nothing more to check
    }

    if (await this.detectRunawayBehavior(runtime)) {
      this.activateKillSwitch(agentId, 'Runaway behavior detected');
      return;
    }

    if (await this.checkResourceExhaustion(runtime)) {
      this.activateKillSwitch(agentId, 'Resource exhaustion detected');
    }
  }

  /** Heuristic: more than 20 recent errors suggests a loop or repeated failure. */
  private async detectRunawayBehavior(runtime: AgentRuntime): Promise<boolean> {
    const recentErrors = runtime.getRecentErrors();
    return recentErrors.length > 20;
  }

  /** Checks token usage, elapsed time, etc. Stubbed for demo. */
  private async checkResourceExhaustion(runtime: AgentRuntime): Promise<boolean> {
    return false;
  }

  /** Forwards the incident to the security logging system. */
  private logIncident(agentId: string, reason: string): void {
    console.log(`SECURITY INCIDENT: Agent ${agentId} killed - ${reason}`);
  }
}

/** Minimal control surface the kill switch needs from an agent runtime. */
interface AgentRuntime {
  stop: () => void;
  getRecentErrors: () => any[]; // NOTE(review): prefer unknown[]; kept for caller compatibility
}

Kill switches stop agents from causing damage.

Checklist

  • Injection detection: pattern matching on user input
  • Tool result validation: sanitize untrusted output
  • Permission model: least privilege tool access
  • Sandboxing: execute code in isolated containers
  • Audit logging: log all actions for review
  • PII detection: identify and redact personal data
  • Rate limiting: prevent abuse
  • Kill switch: stop rogue agents

Conclusion

Agent security requires defense in depth. Validate user input for injection attempts, sanitize tool results that might be malicious, enforce permission models to limit tool access, sandbox code execution, maintain audit logs for forensics, and implement kill switches for runaway agents. Security is not optional—it's foundational.