Published on

Multimodal API Integration — Vision, Audio, and Document Processing in Production

Authors

Introduction

Multimodal models unlock new capabilities but introduce complexity: different modalities have different costs, latencies, and failure modes. This guide covers production patterns for reliable multimodal processing.

Image Analysis With Claude/GPT-4o Vision

Process images using vision models with proper encoding and error handling.

import fetch from 'node-fetch';
import fs from 'fs';
import path from 'path';

class VisionAnalyzer {
  /**
   * Analyzes a base64-encoded JPEG with an OpenAI vision model.
   *
   * @throws Error when the API responds with a non-2xx status or returns no choices.
   */
  async analyzeImageBase64(
    base64Image: string,
    prompt: string,
    model: string = 'gpt-4-vision'
  ): Promise<string> {
    // Data-URL form embeds the image bytes directly in the request payload.
    return this.requestVisionCompletion(`data:image/jpeg;base64,${base64Image}`, prompt, model);
  }

  /**
   * Analyzes an image referenced by a publicly reachable URL.
   *
   * @throws Error when the API responds with a non-2xx status or returns no choices.
   */
  async analyzeImageURL(
    imageUrl: string,
    prompt: string,
    model: string = 'gpt-4-vision'
  ): Promise<string> {
    return this.requestVisionCompletion(imageUrl, prompt, model);
  }

  /** Reads a local file and returns its base64 encoding. */
  async encodeImageToBase64(imagePath: string): Promise<string> {
    // FIX: the original used fs.readFileSync inside an async method,
    // blocking the event loop for the duration of the read.
    const imageBuffer = await fs.promises.readFile(imagePath);
    return imageBuffer.toString('base64');
  }

  /**
   * Analyzes a local image either inline (base64 data URL) or by uploading it
   * and passing a URL reference.
   */
  async analyzeLocalImage(
    imagePath: string,
    prompt: string,
    useBase64: boolean = true
  ): Promise<string> {
    if (useBase64) {
      const base64 = await this.encodeImageToBase64(imagePath);
      return this.analyzeImageBase64(base64, prompt);
    }
    // Upload to temporary storage and use URL
    const uploadedUrl = await this.uploadImageTemporarily(imagePath);
    return this.analyzeImageURL(uploadedUrl, prompt);
  }

  private async uploadImageTemporarily(imagePath: string): Promise<string> {
    // In production, use actual file upload service
    return `https://example.com/temp/${path.basename(imagePath)}`;
  }

  /**
   * Shared request path for both base64 and URL inputs (the original
   * duplicated this entire request body in two methods).
   */
  private async requestVisionCompletion(
    imageUrl: string,
    prompt: string,
    model: string
  ): Promise<string> {
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model,
        messages: [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: imageUrl } },
              { type: 'text', text: prompt },
            ],
          },
        ],
        max_tokens: 1024,
      }),
    });

    // FIX: the original never checked HTTP status, so API errors surfaced as
    // confusing failures at data.choices[0] instead of a meaningful message.
    if (!response.ok) {
      throw new Error(`Vision API error ${response.status}: ${await response.text()}`);
    }

    const data = (await response.json()) as { choices?: Array<{ message: { content: string } }> };
    if (!data.choices || data.choices.length === 0) {
      throw new Error('Vision API returned no choices');
    }
    return data.choices[0].message.content;
  }
}

const analyzer = new VisionAnalyzer();

const analysis = await analyzer.analyzeImageURL(
  'https://example.com/image.jpg',
  'What objects are in this image? List them.'
);

console.log('Analysis:', analysis);

Base64 vs URL for Image Passing

Choose between encoding strategies based on performance and constraints.

/**
 * Strategy for getting an image into a form a vision API accepts, along with
 * coarse cost/latency characteristics used when picking a strategy.
 */
interface ImageEncodingStrategy {
  /** Produces either a base64 payload or a URL, depending on the strategy. */
  encode(imagePath: string): Promise<string>;
  /** Relative request cost of this strategy. */
  cost: 'low' | 'medium' | 'high';
  /** Relative end-to-end latency of this strategy. */
  latency: 'low' | 'medium' | 'high';
  /** Human-readable scenarios this strategy is suited for. */
  suitableFor: string[];
}

/** Inlines image bytes as base64 — simple and private, but inflates payloads ~33%. */
class Base64EncodingStrategy implements ImageEncodingStrategy {
  cost = 'high' as const; // Larger requests
  latency = 'medium' as const;
  suitableFor = ['small images', 'private content', 'offline processing'];

  /** Reads the file and returns its base64 encoding. */
  async encode(imagePath: string): Promise<string> {
    // FIX: the original used fs.readFileSync inside an async method,
    // blocking the event loop; use the promise-based API instead.
    const imageBuffer = await fs.promises.readFile(imagePath);
    return imageBuffer.toString('base64');
  }
}

/** Passes images by reference: upload once, send only a lightweight URL. */
class URLEncodingStrategy implements ImageEncodingStrategy {
  cost = 'low' as const; // Reference, not data
  latency = 'low' as const;
  suitableFor = ['large images', 'public URLs', 'high throughput'];

  /** Resolves the image to a publicly reachable CDN URL. */
  async encode(imagePath: string): Promise<string> {
    const cdnUrl = await this.uploadToCDN(imagePath);
    return cdnUrl;
  }

  private async uploadToCDN(imagePath: string): Promise<string> {
    // Simulate CDN upload
    const fileName = path.basename(imagePath);
    const token = Date.now();
    return `https://cdn.example.com/${fileName}?token=${token}`;
  }
}

/** Picks and applies an encoding strategy from image size/visibility/latency needs. */
class ImageEncodingSelector {
  /**
   * Base64 is chosen only when the image is small (<1 MiB), private, and
   * latency-tolerant; every other combination gets the URL strategy.
   */
  selectStrategy(imageSize: number, isPublic: boolean, isLatencySensitive: boolean): ImageEncodingStrategy {
    const oneMiB = 1024 * 1024;
    const base64IsViable = imageSize < oneMiB && !isPublic && !isLatencySensitive;
    return base64IsViable ? new Base64EncodingStrategy() : new URLEncodingStrategy();
  }

  /** Encodes the image with the selected strategy, logging which one was used. */
  async encodeImage(
    imagePath: string,
    imageSize: number,
    isPublic: boolean,
    isLatencySensitive: boolean
  ): Promise<string> {
    const strategy = this.selectStrategy(imageSize, isPublic, isLatencySensitive);
    const { cost, latency } = strategy;
    console.log(`Using ${strategy.constructor.name} (cost: ${cost}, latency: ${latency})`);
    return strategy.encode(imagePath);
  }
}

const selector = new ImageEncodingSelector();

// 2 MB, private, latency-sensitive: both size and latency rule out base64,
// so the URL strategy is selected.
const encoded = await selector.encodeImage('./image.jpg', 2000000, false, true);

PDF Extraction With Vision

Extract content from PDFs page by page using vision models.

class PDFVisionExtractor {
  /**
   * Renders each PDF page to an image and runs a vision prompt over it.
   * Failed pages are recorded with empty content and an error in metadata so
   * page numbering stays intact.
   */
  async extractFromPDF(
    pdfPath: string,
    prompt: string = 'Extract all text and structured data from this page'
  ): Promise<Array<{ pageNumber: number; content: string; metadata: Record<string, unknown> }>> {
    // Simulate PDF conversion to images
    const pageImages = await this.convertPDFToImages(pdfPath);

    const results: Array<{ pageNumber: number; content: string; metadata: Record<string, unknown> }> = [];

    for (let i = 0; i < pageImages.length; i++) {
      try {
        const content = await this.analyzePageImage(pageImages[i], prompt);

        results.push({
          pageNumber: i + 1,
          content,
          metadata: {
            extractedAt: new Date(),
            model: 'gpt-4-vision',
            // NOTE(review): hard-coded confidence; the API does not report one.
            confidence: 0.95,
          },
        });

        // Simple fixed-delay rate limiting between pages.
        if (i < pageImages.length - 1) {
          await new Promise((resolve) => setTimeout(resolve, 500));
        }
      } catch (error) {
        results.push({
          pageNumber: i + 1,
          content: '',
          metadata: {
            error: error instanceof Error ? error.message : String(error),
            extractedAt: new Date(),
          },
        });
      }
    }

    return results;
  }

  /**
   * Extracts JSON matching `schema` from every page.
   *
   * FIX: the original called JSON.parse(page.content) unconditionally, which
   * threw on any failed page (content '') and on responses wrapped in
   * markdown code fences. Unparseable pages now yield data: null instead of
   * aborting the whole extraction.
   */
  async extractStructuredData(
    pdfPath: string,
    schema: Record<string, unknown>
  ): Promise<Array<{ pageNumber: number; data: unknown }>> {
    const schemaJson = JSON.stringify(schema);

    const extractedPages = await this.extractFromPDF(
      pdfPath,
      `Extract data matching this schema: ${schemaJson}\n\nReturn valid JSON.`
    );

    return extractedPages.map((page) => ({
      pageNumber: page.pageNumber,
      data: this.parseJsonLoose(page.content),
    }));
  }

  /** Parses model output as JSON, tolerating ```json fences; null on failure. */
  private parseJsonLoose(content: string): unknown {
    const stripped = content
      .replace(/^\s*```(?:json)?\s*/i, '')
      .replace(/\s*```\s*$/, '')
      .trim();
    if (stripped === '') return null;
    try {
      return JSON.parse(stripped);
    } catch {
      return null;
    }
  }

  private async convertPDFToImages(pdfPath: string): Promise<string[]> {
    // In production, use pdf-lib or similar
    // Returns array of base64 encoded page images
    return Array(3).fill(`data:image/png;base64,${Buffer.alloc(100).toString('base64')}`);
  }

  /** Sends one page image to the vision endpoint and returns the text reply. */
  private async analyzePageImage(base64Image: string, prompt: string): Promise<string> {
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: 'gpt-4-vision',
        messages: [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: base64Image } },
              { type: 'text', text: prompt },
            ],
          },
        ],
        max_tokens: 2000,
      }),
    });

    // FIX: surface HTTP errors explicitly instead of crashing at data.choices[0].
    if (!response.ok) {
      throw new Error(`Vision API error ${response.status}: ${await response.text()}`);
    }

    const data = (await response.json()) as { choices: Array<{ message: { content: string } }> };
    return data.choices[0].message.content;
  }
}

const pdfExtractor = new PDFVisionExtractor();

// One vision call per page; requires OPENAI_API_KEY to be set in the environment.
const extracted = await pdfExtractor.extractFromPDF('./invoice.pdf');
console.log(`Extracted ${extracted.length} pages`);

Audio Transcription With Whisper

Transcribe audio files using OpenAI's Whisper API, including batch processing and cost estimation; real-time streaming is covered in the next section.

class WhisperTranscriber {
  /**
   * Transcribes a single audio file with whisper-1.
   *
   * FIX: requests `response_format: verbose_json` so the detected language and
   * real audio duration come from the API (the original mocked duration with
   * Math.random), and checks HTTP status before reading the body.
   *
   * @throws Error when the API responds with a non-2xx status.
   */
  async transcribeFile(
    audioPath: string,
    language?: string
  ): Promise<{ text: string; language: string; duration: number }> {
    // FIX: non-blocking read (original used readFileSync in an async method).
    const audioBuffer = await fs.promises.readFile(audioPath);

    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer]), path.basename(audioPath));
    formData.append('model', 'whisper-1');
    // verbose_json includes detected language and audio duration in the response.
    formData.append('response_format', 'verbose_json');
    if (language) formData.append('language', language);

    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: formData,
    });

    if (!response.ok) {
      throw new Error(`Whisper API error ${response.status}: ${await response.text()}`);
    }

    const data = (await response.json()) as { text: string; language?: string; duration?: number };

    return {
      text: data.text,
      language: data.language ?? 'unknown',
      duration: data.duration ?? 0,
    };
  }

  /**
   * Approximates per-word timestamps by spreading words evenly over time.
   * NOTE(review): heuristic only — a fixed 0.4 s per word; for real word-level
   * timestamps use the API's timestamp options instead.
   */
  async transcribeWithTimestamps(audioPath: string): Promise<Array<{ text: string; start: number; end: number }>> {
    const result = await this.transcribeFile(audioPath);

    const words = result.text.split(/\s+/);
    const avgWordDuration = 0.4; // assumed seconds per word

    return words.map((word, i) => ({
      text: word,
      start: i * avgWordDuration,
      end: (i + 1) * avgWordDuration,
    }));
  }

  /**
   * Transcribes files in limited-concurrency batches, pausing 1 s between
   * batches as crude rate limiting. Returns a map of input path -> transcript.
   */
  async transcribeBatch(
    audioPaths: string[],
    maxConcurrent: number = 3
  ): Promise<Map<string, string>> {
    const results = new Map<string, string>();

    for (let i = 0; i < audioPaths.length; i += maxConcurrent) {
      const batch = audioPaths.slice(i, i + maxConcurrent);
      // FIX: callback param renamed from `path`, which shadowed the imported path module.
      const batchResults = await Promise.all(batch.map((audioPath) => this.transcribeFile(audioPath)));

      batchResults.forEach((result, idx) => {
        results.set(batch[idx], result.text);
      });

      // Rate limiting between batches
      if (i + maxConcurrent < audioPaths.length) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }

    return results;
  }

  /**
   * Estimated cost in USD for a clip of the given length.
   * NOTE(review): rate hard-coded at $0.02/minute — verify against current pricing.
   */
  estimateCost(durationSeconds: number): number {
    return (durationSeconds / 60) * 0.02;
  }
}

const transcriber = new WhisperTranscriber();

// Example (requires OPENAI_API_KEY and a local audio file):
// const result = await transcriber.transcribeFile('./recording.mp3', 'en');
// console.log('Transcription:', result.text);

Streaming Transcription

Stream audio in real-time for live transcription.

class StreamingTranscriber {
  // Accumulated, not-yet-transcribed audio chunks.
  private buffer: Buffer[] = [];
  // Bytes currently held in `buffer` (kept in sync with it).
  private bufferedBytes = 0;
  // Minimum number of BYTES to accumulate before transcribing an interim chunk.
  private chunkSize = 4096;

  /**
   * Consumes an audio stream, emitting interim transcriptions via `onChunk`
   * and resolving with the full transcript once the stream ends.
   *
   * FIXES vs original:
   * - `buffer.length >= chunkSize` compared the NUMBER of buffered chunks to a
   *   byte threshold (4096 chunks, not 4096 bytes), so interim output was
   *   effectively never produced; we now track accumulated bytes.
   * - Interim transcriptions were fire-and-forget, racing the 'end' handler
   *   and each other; they are now chained and awaited before resolving, so no
   *   interim text is lost and chunks are emitted in order.
   */
  async transcribeStream(
    audioStream: NodeJS.ReadableStream,
    onChunk: (text: string, isFinal: boolean) => void
  ): Promise<string> {
    let fullText = '';
    // Serializes in-flight chunk transcriptions; awaited on 'end'.
    let pending: Promise<void> = Promise.resolve();

    audioStream.on('data', (chunk: Buffer) => {
      this.buffer.push(chunk);
      this.bufferedBytes += chunk.length;

      if (this.bufferedBytes >= this.chunkSize) {
        const toProcess = Buffer.concat(this.buffer);
        this.buffer = [];
        this.bufferedBytes = 0;

        pending = pending.then(async () => {
          const text = await this.transcribeChunk(toProcess);
          fullText += text + ' ';
          onChunk(text, false);
        });
      }
    });

    return new Promise((resolve, reject) => {
      audioStream.on('end', async () => {
        try {
          // Flush in-flight interim chunks before the final one.
          await pending;

          if (this.buffer.length > 0) {
            const remaining = Buffer.concat(this.buffer);
            this.buffer = [];
            this.bufferedBytes = 0;
            const finalText = await this.transcribeChunk(remaining);
            fullText += finalText;
            onChunk(finalText, true);
          }

          resolve(fullText.trim());
        } catch (err) {
          reject(err);
        }
      });

      audioStream.on('error', reject);
    });
  }

  private async transcribeChunk(audioBuffer: Buffer): Promise<string> {
    // Simulate Whisper API call
    return `Transcribed chunk of ${audioBuffer.length} bytes`;
  }
}

// The callback receives interim chunks (isFinal=false) and one final chunk.
const streamingTranscriber = new StreamingTranscriber();

// Usage with actual stream
// const audioStream = fs.createReadStream('./audio.wav');
// const fullTranscript = await streamingTranscriber.transcribeStream(
//   audioStream,
//   (text, isFinal) => {
//     console.log(`[${isFinal ? 'FINAL' : 'INTERIM'}] ${text}`);
//   }
// );

Document Understanding for Structured Extraction

Extract structured data from documents using vision models.

/** Result of a structured-extraction pass over a single document image. */
interface ExtractedDocument {
  /** Caller-supplied document category, e.g. 'invoice'. */
  type: string;
  /** Parsed field values keyed by the requested field names. */
  fields: Record<string, unknown>;
  /** Extraction confidence in [0, 1]. */
  confidence: number;
  /** Unparsed model response, kept for auditing/debugging. */
  rawText: string;
}

class DocumentExtractor {
  /**
   * Extracts the named fields from a document image via a vision model.
   *
   * FIXES vs original: checks HTTP status before reading the body, sets
   * max_tokens explicitly (vision endpoints have defaulted to very small
   * completion limits), and strips markdown code fences before JSON.parse so
   * fenced model responses don't throw.
   *
   * @throws Error on non-2xx responses or unparseable JSON output.
   */
  async extractStructuredData(
    imagePath: string,
    documentType: string,
    expectedFields: string[]
  ): Promise<ExtractedDocument> {
    // Non-blocking read (original used readFileSync in an async method).
    const base64 = (await fs.promises.readFile(imagePath)).toString('base64');

    const fieldDescriptions = expectedFields.join(', ');

    const prompt = `Extract the following fields from this ${documentType}: ${fieldDescriptions}

Return as JSON with keys matching the field names.`;

    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: 'gpt-4-vision',
        messages: [
          {
            role: 'user',
            content: [
              {
                type: 'image_url',
                image_url: {
                  url: `data:image/jpeg;base64,${base64}`,
                },
              },
              { type: 'text', text: prompt },
            ],
          },
        ],
        temperature: 0,
        max_tokens: 1024,
      }),
    });

    if (!response.ok) {
      throw new Error(`Vision API error ${response.status}: ${await response.text()}`);
    }

    const data = (await response.json()) as { choices: Array<{ message: { content: string } }> };
    const responseText = data.choices[0].message.content;

    const extracted = JSON.parse(this.stripCodeFences(responseText)) as Record<string, unknown>;

    return {
      type: documentType,
      fields: extracted,
      // NOTE(review): hard-coded confidence; the API does not report one.
      confidence: 0.95,
      rawText: responseText,
    };
  }

  /** Removes leading/trailing ``` fences that models often wrap JSON in. */
  private stripCodeFences(text: string): string {
    return text
      .replace(/^\s*```(?:json)?\s*/i, '')
      .replace(/\s*```\s*$/, '')
      .trim();
  }

  /**
   * Returns true iff every schema field exists in document.fields AND the
   * runtime typeof of its value matches the schema's type string.
   */
  async validateExtraction(document: ExtractedDocument, schema: Record<string, string>): Promise<boolean> {
    for (const [field, type] of Object.entries(schema)) {
      if (!(field in document.fields)) {
        return false;
      }

      const value = document.fields[field];
      if (typeof value !== type) {
        return false;
      }
    }

    return true;
  }
}

const docExtractor = new DocumentExtractor();

// Field names plus their typeof-compatible type strings: used both to drive
// the extraction prompt and to validate the parsed result afterwards.
const schema = {
  invoice_number: 'string',
  total_amount: 'number',
  date: 'string',
};

// const extracted = await docExtractor.extractStructuredData(
//   './invoice.png',
//   'invoice',
//   Object.keys(schema)
// );

// console.log('Extracted:', extracted.fields);

Cost Comparison for Modalities

Compare costs across different modalities to optimize spending.

/** Cost/performance profile of one modality option, used for comparisons. */
interface ModalityCost {
  /** Model or option identifier, e.g. 'gpt-4-vision'. */
  name: string;
  /** Cost per input unit (token/minute, depending on modality). */
  costPerInput: number;
  /** Cost per output unit; 0 when the modality bills input only. */
  costPerOutput: number;
  /** Typical end-to-end latency in milliseconds. */
  latencyMs: number;
  /** Accuracy estimate in [0, 1], used as a selection floor. */
  accuracy: number;
  /** Coarse qualitative label ('high' | 'good' | 'fair'). */
  quality: string;
}

class ModalityCostAnalyzer {
  // Static catalog of modality options.
  // NOTE(review): illustrative numbers — verify against current provider pricing.
  private costs: ModalityCost[] = [
    {
      name: 'gpt-4-vision',
      costPerInput: 0.01,
      costPerOutput: 0.03,
      latencyMs: 2000,
      accuracy: 0.95,
      quality: 'high',
    },
    {
      name: 'claude-vision',
      costPerInput: 0.008,
      costPerOutput: 0.024,
      latencyMs: 1500,
      accuracy: 0.93,
      quality: 'high',
    },
    {
      name: 'whisper',
      costPerInput: 0.02,
      costPerOutput: 0,
      latencyMs: 3000,
      accuracy: 0.92,
      quality: 'good',
    },
    {
      name: 'local-model',
      costPerInput: 0,
      costPerOutput: 0,
      latencyMs: 5000,
      accuracy: 0.85,
      quality: 'fair',
    },
  ];

  /**
   * Returns the CHEAPEST qualifying option, or undefined if none qualifies.
   *
   * FIX: the original used Array.find, which returned the FIRST option in
   * declaration order that fit the constraints — not the cheapest (e.g.
   * budget 0.05 returned gpt-4-vision @ 0.01 instead of claude-vision @ 0.008).
   */
  selectCheapest(budget: number, minAccuracy: number = 0.9): ModalityCost | undefined {
    return this.costs
      .filter((cost) => cost.costPerInput <= budget && cost.accuracy >= minAccuracy)
      .sort((a, b) => a.costPerInput - b.costPerInput)[0];
  }

  /** Returns the lowest-latency option within the latency/accuracy bounds. */
  selectFastest(maxLatency: number, minAccuracy: number = 0.9): ModalityCost | undefined {
    return this.costs
      .filter((cost) => cost.latencyMs <= maxLatency && cost.accuracy >= minAccuracy)
      .sort((a, b) => a.latencyMs - b.latencyMs)[0];
  }

  /** Projects per-request and monthly cost for every option at a given volume. */
  compareCosts(
    volumePerMonth: number,
    avgInputTokens: number,
    avgOutputTokens: number
  ): Array<{ name: string; monthlyCost: number; costPerRequest: number }> {
    return this.costs.map((cost) => {
      // Compute once instead of duplicating the expression (original repeated it).
      const costPerRequest = cost.costPerInput * avgInputTokens + cost.costPerOutput * avgOutputTokens;
      return {
        name: cost.name,
        costPerRequest,
        monthlyCost: costPerRequest * volumePerMonth,
      };
    });
  }
}

// NOTE(review): `analyzer` collides with the VisionAnalyzer instance declared
// earlier if all snippets share one module scope — rename one of them.
const analyzer = new ModalityCostAnalyzer();

const cheapest = analyzer.selectCheapest(0.05, 0.9);
console.log('Cheapest option:', cheapest?.name);

const fastest = analyzer.selectFastest(2000, 0.9);
console.log('Fastest option:', fastest?.name);

const comparison = analyzer.compareCosts(10000, 500, 200);
console.log('Monthly costs:', comparison);

Fallback When Vision Extraction Fails

Implement graceful degradation when vision models fail.

class ResilientMultimodalProcessor {
  /**
   * Runs primary → fallback → optional last-resort processors in order,
   * returning the first success, which tier produced it, and the error thrown
   * by the tier before it (when that error is an Error instance).
   *
   * @throws the fallback error when no last resort is given; an aggregate
   *         Error when every tier fails.
   */
  async processWithFallback(
    imagePath: string,
    primaryProcessor: (path: string) => Promise<string>,
    fallbackProcessor: (path: string) => Promise<string>,
    lastResortProcessor?: (path: string) => Promise<string>
  ): Promise<{ result: string; source: 'primary' | 'fallback' | 'last_resort'; error?: Error }> {
    try {
      const result = await primaryProcessor(imagePath);
      return { result, source: 'primary' };
    } catch (primaryError) {
      try {
        const result = await fallbackProcessor(imagePath);
        return {
          result,
          source: 'fallback',
          error: primaryError instanceof Error ? primaryError : undefined,
        };
      } catch (fallbackError) {
        // Guard clause: without a last resort, propagate the fallback failure.
        if (!lastResortProcessor) {
          throw fallbackError;
        }
        try {
          const result = await lastResortProcessor(imagePath);
          return {
            result,
            source: 'last_resort',
            error: fallbackError instanceof Error ? fallbackError : undefined,
          };
        } catch (lastResortError) {
          throw new Error(`All processors failed: ${lastResortError}`);
        }
      }
    }
  }

  /**
   * Races `fn` against a timeout, rejecting with 'Processing timeout' when
   * `fn` does not settle in time.
   *
   * FIX: the original never cleared the timer, so a live timeout lingered for
   * up to timeoutMs after fn settled, keeping the Node event loop alive.
   */
  async processWithTimeout<T>(
    fn: () => Promise<T>,
    timeoutMs: number = 10000
  ): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    try {
      return await Promise.race([
        fn(),
        new Promise<T>((_, reject) => {
          timer = setTimeout(() => reject(new Error('Processing timeout')), timeoutMs);
        }),
      ]);
    } finally {
      if (timer !== undefined) clearTimeout(timer);
    }
  }
}

const processor = new ResilientMultimodalProcessor();

// Fallback chain demo: GPT-4 Vision -> Claude Vision -> OCR (stubbed results).
const result = await processor.processWithFallback(
  './image.jpg',
  async (path) => {
    // Try GPT-4 Vision
    return 'GPT-4 Vision result';
  },
  async (path) => {
    // Fall back to Claude Vision
    return 'Claude Vision result';
  },
  async (path) => {
    // Last resort: OCR
    return 'OCR result';
  }
);

console.log(`Result from ${result.source}:`, result.result);

Checklist

  • Implement retry logic with exponential backoff for vision APIs
  • Use base64 encoding for small/private images, URLs for large/public
  • Extract PDFs page-by-page with vision to preserve structure
  • Stream audio transcription for real-time applications
  • Validate structured extraction against expected schemas
  • Cache image embeddings to avoid re-processing
  • Compare costs across vision models and choose based on accuracy needs
  • Implement fallback chains (primary → fallback → OCR/local)
  • Set timeouts for vision requests (max 30s)
  • Monitor multimodal API costs per feature separately
  • Test document extraction with golden datasets
  • Handle missing/corrupted media gracefully with meaningful errors

Conclusion

Multimodal processing requires careful orchestration: choose the right model for each modality based on cost and accuracy, implement intelligent fallbacks for failures, and validate outputs against expected schemas. Start with vision for structured document extraction, add Whisper for audio transcription, and implement cost-aware model selection as your volume grows.