- Published on
Multimodal API Integration — Vision, Audio, and Document Processing in Production
- Authors

- Name
- Sanjeev Sharma
- @webcoderspeed1
Introduction
Multimodal models unlock new capabilities but introduce complexity: different modalities have different costs, latencies, and failure modes. This guide covers production patterns for reliable multimodal processing.
- Image Analysis With Claude/GPT-4o Vision
- Base64 vs URL for Image Passing
- PDF Extraction With Vision
- Audio Transcription With Whisper
- Streaming Transcription
- Document Understanding for Structured Extraction
- Cost Comparison for Modalities
- Fallback When Vision Extraction Fails
- Checklist
- Conclusion
Image Analysis With Claude/GPT-4o Vision
Process images using vision models with proper encoding and error handling.
import fetch from 'node-fetch';
import fs from 'fs';
import path from 'path';
/**
 * Thin client for vision-capable chat models: analyzes images supplied either
 * inline (base64 data URL) or by reference (hosted URL).
 *
 * Fixes vs. the original:
 * - HTTP errors are now surfaced (the original ignored non-2xx responses and
 *   returned `undefined` content).
 * - The duplicated request-building code is factored into one private helper.
 * - Local files are read without blocking the event loop.
 */
class VisionAnalyzer {
  private static readonly API_URL = 'https://api.openai.com/v1/chat/completions';

  /**
   * Analyze a base64-encoded JPEG.
   * @param base64Image raw base64 (no data-URL prefix; it is added here)
   * @param prompt instruction for the model
   * @param model model id (NOTE(review): verify against current provider model names)
   */
  async analyzeImageBase64(
    base64Image: string,
    prompt: string,
    model: string = 'gpt-4-vision'
  ): Promise<string> {
    return this.requestVisionCompletion(`data:image/jpeg;base64,${base64Image}`, prompt, model);
  }

  /** Analyze an image the provider can fetch by URL. */
  async analyzeImageURL(
    imageUrl: string,
    prompt: string,
    model: string = 'gpt-4-vision'
  ): Promise<string> {
    return this.requestVisionCompletion(imageUrl, prompt, model);
  }

  /** Read a local file and return its base64 encoding (non-blocking read). */
  async encodeImageToBase64(imagePath: string): Promise<string> {
    const imageBuffer = await fs.promises.readFile(imagePath);
    return imageBuffer.toString('base64');
  }

  /**
   * Analyze a local image, either inlined as base64 or uploaded and passed by URL.
   * @param useBase64 true = inline the bytes; false = upload and reference
   */
  async analyzeLocalImage(
    imagePath: string,
    prompt: string,
    useBase64: boolean = true
  ): Promise<string> {
    if (useBase64) {
      const base64 = await this.encodeImageToBase64(imagePath);
      return this.analyzeImageBase64(base64, prompt);
    }
    // Upload to temporary storage and use URL
    const uploadedUrl = await this.uploadImageTemporarily(imagePath);
    return this.analyzeImageURL(uploadedUrl, prompt);
  }

  // Placeholder; in production, use an actual file upload service.
  private async uploadImageTemporarily(imagePath: string): Promise<string> {
    return `https://example.com/temp/${path.basename(imagePath)}`;
  }

  /**
   * Shared request path: posts image + prompt, validates the HTTP status and
   * response shape before extracting the completion text.
   * @throws Error on non-2xx responses or an empty choices array
   */
  private async requestVisionCompletion(imageUrl: string, prompt: string, model: string): Promise<string> {
    const response = await fetch(VisionAnalyzer.API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model,
        messages: [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: imageUrl } },
              { type: 'text', text: prompt },
            ],
          },
        ],
        max_tokens: 1024,
      }),
    });
    if (!response.ok) {
      throw new Error(`Vision API request failed: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json()) as { choices?: Array<{ message: { content: string } }> };
    const content = data.choices?.[0]?.message?.content;
    if (content === undefined) {
      throw new Error('Vision API returned no choices');
    }
    return content;
  }
}
const analyzer = new VisionAnalyzer();
const analysis = await analyzer.analyzeImageURL(
'https://example.com/image.jpg',
'What objects are in this image? List them.'
);
console.log('Analysis:', analysis);
Base64 vs URL for Image Passing
Choose between encoding strategies based on performance and constraints.
/**
 * Strategy contract for preparing a local image for a vision API call,
 * annotated with rough cost/latency traits used for runtime selection.
 */
interface ImageEncodingStrategy {
// Returns the request payload for the image: base64 data or a hosted URL.
encode(imagePath: string): Promise<string>;
// Relative request-size / spend impact of this strategy.
cost: 'low' | 'medium' | 'high';
// Relative end-to-end latency impact.
latency: 'low' | 'medium' | 'high';
// Human-readable scenarios this strategy suits.
suitableFor: string[];
}
/**
 * Inline-data strategy: embeds the image bytes directly in the request as
 * base64. Requests get large (hence the high cost), but the image is never
 * published anywhere.
 */
class Base64EncodingStrategy implements ImageEncodingStrategy {
  cost = 'high' as const; // Larger requests
  latency = 'medium' as const;
  suitableFor = ['small images', 'private content', 'offline processing'];

  /** Read the file from disk and return its base64 representation. */
  async encode(imagePath: string): Promise<string> {
    const fileBytes = fs.readFileSync(imagePath);
    return fileBytes.toString('base64');
  }
}
/**
 * Reference strategy: pushes the image to a CDN and passes only its URL.
 * Keeps requests tiny, but the image must be reachable by the API provider.
 */
class URLEncodingStrategy implements ImageEncodingStrategy {
  cost = 'low' as const; // Reference, not data
  latency = 'low' as const;
  suitableFor = ['large images', 'public URLs', 'high throughput'];

  /** Upload the file and resolve to its public URL. */
  async encode(imagePath: string): Promise<string> {
    return this.uploadToCDN(imagePath);
  }

  // Placeholder upload; swap in a real CDN/object-storage client in production.
  private async uploadToCDN(imagePath: string): Promise<string> {
    const fileName = path.basename(imagePath);
    return `https://cdn.example.com/${fileName}?token=${Date.now()}`;
  }
}
/**
 * Picks an encoding strategy from image size, visibility, and latency needs,
 * then applies it.
 */
class ImageEncodingSelector {
  /**
   * Base64 only when the image is small (< 1 MiB), private, and latency-tolerant;
   * everything else goes through a URL reference.
   */
  selectStrategy(imageSize: number, isPublic: boolean, isLatencySensitive: boolean): ImageEncodingStrategy {
    const oneMiB = 1024 * 1024;
    const inlineFits = imageSize < oneMiB && !isPublic && !isLatencySensitive;
    return inlineFits ? new Base64EncodingStrategy() : new URLEncodingStrategy();
  }

  /** Select a strategy, log the choice, and encode the image with it. */
  async encodeImage(
    imagePath: string,
    imageSize: number,
    isPublic: boolean,
    isLatencySensitive: boolean
  ): Promise<string> {
    const strategy = this.selectStrategy(imageSize, isPublic, isLatencySensitive);
    console.log(`Using ${strategy.constructor.name} (cost: ${strategy.cost}, latency: ${strategy.latency})`);
    return strategy.encode(imagePath);
  }
}
// Example: a 2 MB image that is latency-sensitive resolves to the URL strategy.
const selector = new ImageEncodingSelector();
const encoded = await selector.encodeImage('./image.jpg', 2000000, false, true);
PDF Extraction With Vision
Extract content from PDFs page by page using vision models.
/**
 * Extracts PDF content page-by-page by converting pages to images and running
 * a vision prompt over each.
 *
 * Fixes vs. the original:
 * - `extractStructuredData` no longer crashes when a page failed extraction
 *   (content `''`) or when the model wraps its JSON in markdown code fences;
 *   such pages yield `data: null` instead of throwing.
 * - `analyzePageImage` now surfaces HTTP errors.
 */
class PDFVisionExtractor {
  /**
   * Run the vision prompt over every page. A failed page is recorded with an
   * `error` in its metadata rather than aborting the whole document.
   */
  async extractFromPDF(
    pdfPath: string,
    prompt: string = 'Extract all text and structured data from this page'
  ): Promise<Array<{ pageNumber: number; content: string; metadata: Record<string, unknown> }>> {
    const pageImages = await this.convertPDFToImages(pdfPath);
    const results: Array<{ pageNumber: number; content: string; metadata: Record<string, unknown> }> = [];
    for (let i = 0; i < pageImages.length; i++) {
      try {
        const content = await this.analyzePageImage(pageImages[i], prompt);
        results.push({
          pageNumber: i + 1,
          content,
          metadata: {
            extractedAt: new Date(),
            model: 'gpt-4-vision',
            // NOTE(review): hard-coded placeholder, not a model-reported score.
            confidence: 0.95,
          },
        });
        // Throttle between pages to stay under vision-API rate limits.
        if (i < pageImages.length - 1) {
          await new Promise((resolve) => setTimeout(resolve, 500));
        }
      } catch (error) {
        // One bad page should not lose the rest of the document.
        results.push({
          pageNumber: i + 1,
          content: '',
          metadata: {
            error: error instanceof Error ? error.message : String(error),
            extractedAt: new Date(),
          },
        });
      }
    }
    return results;
  }

  /**
   * Extract schema-shaped JSON per page. Pages whose content is empty or not
   * valid JSON yield `data: null`.
   */
  async extractStructuredData(
    pdfPath: string,
    schema: Record<string, unknown>
  ): Promise<Array<{ pageNumber: number; data: unknown }>> {
    const schemaJson = JSON.stringify(schema);
    const extractedPages = await this.extractFromPDF(
      pdfPath,
      `Extract data matching this schema: ${schemaJson}\n\nReturn valid JSON.`
    );
    return extractedPages.map((page) => ({
      pageNumber: page.pageNumber,
      data: this.parseJsonSafely(page.content),
    }));
  }

  // Tolerant JSON parse: strips the markdown code fences models often add;
  // returns null for empty or unparseable text instead of throwing.
  private parseJsonSafely(text: string): unknown {
    const stripped = text
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/```\s*$/, '');
    if (!stripped) return null;
    try {
      return JSON.parse(stripped);
    } catch {
      return null;
    }
  }

  // Stub: in production, use pdf-lib or similar. Returns base64 page images.
  private async convertPDFToImages(pdfPath: string): Promise<string[]> {
    return Array(3).fill(`data:image/png;base64,${Buffer.alloc(100).toString('base64')}`);
  }

  /**
   * Send one page image + prompt to the vision model.
   * @throws Error on non-2xx responses
   */
  private async analyzePageImage(base64Image: string, prompt: string): Promise<string> {
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: 'gpt-4-vision',
        messages: [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: base64Image } },
              { type: 'text', text: prompt },
            ],
          },
        ],
        max_tokens: 2000,
      }),
    });
    if (!response.ok) {
      throw new Error(`Vision API request failed: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json()) as { choices: Array<{ message: { content: string } }> };
    return data.choices[0].message.content;
  }
}
// Example: page-by-page extraction of an invoice PDF (performs live API calls).
const pdfExtractor = new PDFVisionExtractor();
const extracted = await pdfExtractor.extractFromPDF('./invoice.pdf');
console.log(`Extracted ${extracted.length} pages`);
Audio Transcription With Whisper
Transcribe audio files using OpenAI's Whisper API, with approximate timestamps, bounded-concurrency batching, and cost estimation. (Streaming is covered in the next section.)
/**
 * Client for the Whisper transcription API with batching and cost estimation.
 *
 * Fixes vs. the original:
 * - HTTP errors are surfaced (the original ignored non-2xx responses and
 *   returned `undefined` text).
 * - The batch callback parameter no longer shadows the `path` module import.
 */
class WhisperTranscriber {
  /**
   * Transcribe a single audio file.
   * @param language optional ISO-639-1 hint passed to the API
   * @throws Error on non-2xx responses
   * NOTE(review): `duration` is a mock value — derive the real duration from
   * the audio metadata before using it for anything that matters.
   */
  async transcribeFile(
    audioPath: string,
    language?: string
  ): Promise<{ text: string; language: string; duration: number }> {
    const audioBuffer = fs.readFileSync(audioPath);
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer]), path.basename(audioPath));
    formData.append('model', 'whisper-1');
    if (language) formData.append('language', language);
    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: formData,
    });
    if (!response.ok) {
      throw new Error(`Whisper API request failed: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json()) as { text: string; language?: string };
    return {
      text: data.text,
      language: data.language ?? 'unknown', // ?? so only a missing field defaults
      duration: Math.random() * 300, // Mock duration
    };
  }

  /**
   * Approximate word-level timestamps by spreading words evenly over time.
   * NOTE(review): for real timestamps, request the API's verbose response
   * format with timestamp granularities instead of this heuristic.
   */
  async transcribeWithTimestamps(audioPath: string): Promise<Array<{ text: string; start: number; end: number }>> {
    const result = await this.transcribeFile(audioPath);
    const words = result.text.split(/\s+/);
    const avgWordDuration = 0.4; // seconds per word
    return words.map((word, i) => ({
      text: word,
      start: i * avgWordDuration,
      end: (i + 1) * avgWordDuration,
    }));
  }

  /** Transcribe many files with bounded concurrency and a pause between batches. */
  async transcribeBatch(
    audioPaths: string[],
    maxConcurrent: number = 3
  ): Promise<Map<string, string>> {
    const results = new Map<string, string>();
    for (let i = 0; i < audioPaths.length; i += maxConcurrent) {
      const batch = audioPaths.slice(i, i + maxConcurrent);
      // `audioPath` (was `path`) avoids shadowing the path module import.
      const batchResults = await Promise.all(batch.map((audioPath) => this.transcribeFile(audioPath)));
      batchResults.forEach((result, idx) => {
        results.set(batch[idx], result.text);
      });
      // Rate limiting between batches
      if (i + maxConcurrent < audioPaths.length) {
        await new Promise((resolve) => setTimeout(resolve, 1000));
      }
    }
    return results;
  }

  /** Estimated cost in USD at $0.02 per minute of audio. */
  estimateCost(durationSeconds: number): number {
    return (durationSeconds / 60) * 0.02;
  }
}
// Example usage (calls commented out because they hit the live API):
const transcriber = new WhisperTranscriber();
// const result = await transcriber.transcribeFile('./recording.mp3', 'en');
// console.log('Transcription:', result.text);
Streaming Transcription
Buffer incoming audio and transcribe it chunk by chunk, emitting interim results to approximate real-time transcription.
/**
 * Buffers an incoming audio stream and transcribes it roughly every
 * `chunkSize` bytes, emitting interim results as it goes.
 *
 * Fixes vs. the original:
 * - Chunking is by *bytes*: the original compared the number of buffered
 *   Buffer objects against the 4096-byte threshold, so 4096 stream chunks had
 *   to arrive before anything was processed.
 * - Interim transcriptions are chained and awaited, so text is appended in
 *   order and the promise resolves only after all of them finish (the
 *   original fired them without awaiting, racing against stream end).
 * - Buffer state is reset per call, so an instance can be reused.
 */
class StreamingTranscriber {
  // Raw audio not yet transcribed, plus its total byte count.
  private buffer: Buffer[] = [];
  private bufferedBytes = 0;
  private chunkSize = 4096; // bytes per transcription request

  /**
   * Consume `audioStream`, calling `onChunk(text, isFinal)` for each interim
   * chunk and once for the final remainder; resolves with the full transcript.
   */
  async transcribeStream(
    audioStream: NodeJS.ReadableStream,
    onChunk: (text: string, isFinal: boolean) => void
  ): Promise<string> {
    this.buffer = [];
    this.bufferedBytes = 0;
    let fullText = '';
    // Serialized chain of interim transcriptions; preserves ordering.
    let pending: Promise<void> = Promise.resolve();

    audioStream.on('data', (chunk: Buffer) => {
      this.buffer.push(chunk);
      this.bufferedBytes += chunk.length;
      if (this.bufferedBytes >= this.chunkSize) {
        const toProcess = Buffer.concat(this.buffer);
        this.buffer = [];
        this.bufferedBytes = 0;
        pending = pending.then(async () => {
          const text = await this.transcribeChunk(toProcess);
          fullText += text + ' ';
          onChunk(text, false);
        });
      }
    });

    return new Promise((resolve, reject) => {
      audioStream.on('end', async () => {
        try {
          await pending; // wait for every interim chunk before finalizing
          if (this.bufferedBytes > 0) {
            const remaining = Buffer.concat(this.buffer);
            this.buffer = [];
            this.bufferedBytes = 0;
            const finalText = await this.transcribeChunk(remaining);
            fullText += finalText;
            onChunk(finalText, true);
          }
          resolve(fullText.trim());
        } catch (err) {
          reject(err);
        }
      });
      audioStream.on('error', reject);
    });
  }

  // Simulated Whisper call; replace with a real chunked-transcription request.
  private async transcribeChunk(audioBuffer: Buffer): Promise<string> {
    return `Transcribed chunk of ${audioBuffer.length} bytes`;
  }
}
// Example: pipe a WAV file through the streaming transcriber, logging interim
// and final chunks as they arrive (commented out — needs a real audio file).
const streamingTranscriber = new StreamingTranscriber();
// Usage with actual stream
// const audioStream = fs.createReadStream('./audio.wav');
// const fullTranscript = await streamingTranscriber.transcribeStream(
// audioStream,
// (text, isFinal) => {
// console.log(`[${isFinal ? 'FINAL' : 'INTERIM'}] ${text}`);
// }
// );
Document Understanding for Structured Extraction
Extract structured data from documents using vision models.
/** Result of a structured document extraction. */
interface ExtractedDocument {
// Document category supplied by the caller (e.g. 'invoice').
type: string;
// Field name → extracted value, parsed from the model's JSON reply.
fields: Record<string, unknown>;
// NOTE(review): currently a hard-coded placeholder (0.95), not a model score.
confidence: number;
// The model's raw textual response, kept for auditing/debugging.
rawText: string;
}
/**
 * Extracts named fields from a document image via a vision model and
 * validates the result against a simple typeof-based schema.
 *
 * Fixes vs. the original:
 * - HTTP errors are surfaced instead of silently producing `undefined`.
 * - The model reply is stripped of markdown code fences before JSON.parse
 *   (models frequently wrap JSON in ``` fences, which crashed the original).
 */
class DocumentExtractor {
  /**
   * Ask the vision model to extract `expectedFields` from the image.
   * @throws Error on HTTP failure or if the reply is not parseable JSON
   * NOTE(review): `confidence` is a hard-coded placeholder, not a model score.
   */
  async extractStructuredData(
    imagePath: string,
    documentType: string,
    expectedFields: string[]
  ): Promise<ExtractedDocument> {
    const base64 = fs.readFileSync(imagePath).toString('base64');
    const fieldDescriptions = expectedFields.join(', ');
    const prompt = `Extract the following fields from this ${documentType}: ${fieldDescriptions}
Return as JSON with keys matching the field names.`;
    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      },
      body: JSON.stringify({
        model: 'gpt-4-vision',
        messages: [
          {
            role: 'user',
            content: [
              {
                type: 'image_url',
                image_url: {
                  url: `data:image/jpeg;base64,${base64}`,
                },
              },
              {
                type: 'text',
                text: prompt,
              },
            ],
          },
        ],
        temperature: 0, // deterministic output for extraction
      }),
    });
    if (!response.ok) {
      throw new Error(`Vision API request failed: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json()) as { choices: Array<{ message: { content: string } }> };
    const responseText = data.choices[0].message.content;
    const extracted = this.parseModelJson(responseText);
    return {
      type: documentType,
      fields: extracted,
      confidence: 0.95,
      rawText: responseText,
    };
  }

  // Strip optional markdown code fences, then parse the model's JSON reply.
  private parseModelJson(text: string): Record<string, unknown> {
    const stripped = text
      .trim()
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/```\s*$/, '');
    return JSON.parse(stripped) as Record<string, unknown>;
  }

  /** True iff every schema field exists in the extraction with the expected typeof. */
  async validateExtraction(document: ExtractedDocument, schema: Record<string, string>): Promise<boolean> {
    for (const [field, expectedType] of Object.entries(schema)) {
      if (!(field in document.fields)) {
        return false;
      }
      if (typeof document.fields[field] !== expectedType) {
        return false;
      }
    }
    return true;
  }
}
// Example: extract invoice fields matching a simple typeof-based schema
// (calls commented out because they hit the live API).
const docExtractor = new DocumentExtractor();
const schema = {
invoice_number: 'string',
total_amount: 'number',
date: 'string',
};
// const extracted = await docExtractor.extractStructuredData(
// './invoice.png',
// 'invoice',
// Object.keys(schema)
// );
// console.log('Extracted:', extracted.fields);
Cost Comparison for Modalities
Compare costs across different modalities to optimize spending.
/**
 * Cost/latency/accuracy profile for one modality option.
 * NOTE(review): cost units are illustrative and mixed (per-token rates for
 * vision models, per-minute for whisper) — confirm against current pricing.
 */
interface ModalityCost {
name: string;
// Cost per input unit (see unit caveat above).
costPerInput: number;
// Cost per output unit; 0 for options with no output charge.
costPerOutput: number;
// Typical request latency in milliseconds.
latencyMs: number;
// Expected accuracy in [0, 1].
accuracy: number;
// Qualitative tier: 'high' | 'good' | 'fair' in the table below.
quality: string;
}
/**
 * Compares modality options on cost, latency, and accuracy.
 *
 * Fix vs. the original: `selectCheapest` used `Array.find`, which returned
 * the first *declared* option within budget rather than the lowest-cost one
 * (e.g. gpt-4-vision at $0.01 was returned even though claude-vision at
 * $0.008 also qualified). It now filters and sorts by input cost.
 */
class ModalityCostAnalyzer {
  private costs: ModalityCost[] = [
    {
      name: 'gpt-4-vision',
      costPerInput: 0.01,
      costPerOutput: 0.03,
      latencyMs: 2000,
      accuracy: 0.95,
      quality: 'high',
    },
    {
      name: 'claude-vision',
      costPerInput: 0.008,
      costPerOutput: 0.024,
      latencyMs: 1500,
      accuracy: 0.93,
      quality: 'high',
    },
    {
      name: 'whisper',
      costPerInput: 0.02,
      costPerOutput: 0,
      latencyMs: 3000,
      accuracy: 0.92,
      quality: 'good',
    },
    {
      name: 'local-model',
      costPerInput: 0,
      costPerOutput: 0,
      latencyMs: 5000,
      accuracy: 0.85,
      quality: 'fair',
    },
  ];

  /** Lowest-input-cost option within budget that meets the accuracy floor. */
  selectCheapest(budget: number, minAccuracy: number = 0.9): ModalityCost | undefined {
    return this.costs
      .filter((cost) => cost.costPerInput <= budget && cost.accuracy >= minAccuracy)
      .sort((a, b) => a.costPerInput - b.costPerInput)[0];
  }

  /** Lowest-latency option under the latency cap that meets the accuracy floor. */
  selectFastest(maxLatency: number, minAccuracy: number = 0.9): ModalityCost | undefined {
    return this.costs
      .filter((cost) => cost.latencyMs <= maxLatency && cost.accuracy >= minAccuracy)
      .sort((a, b) => a.latencyMs - b.latencyMs)[0];
  }

  /** Per-request and monthly cost projections for every option. */
  compareCosts(
    volumePerMonth: number,
    avgInputTokens: number,
    avgOutputTokens: number
  ): Array<{ name: string; monthlyCost: number; costPerRequest: number }> {
    return this.costs.map((cost) => ({
      name: cost.name,
      costPerRequest: cost.costPerInput * avgInputTokens + cost.costPerOutput * avgOutputTokens,
      monthlyCost:
        (cost.costPerInput * avgInputTokens + cost.costPerOutput * avgOutputTokens) * volumePerMonth,
    }));
  }
}
// Example: pick options by cost and by latency, then project monthly spend.
// Renamed from `analyzer` — the vision example earlier also declares a
// top-level `const analyzer`, a duplicate declaration if these snippets
// share one module.
const costAnalyzer = new ModalityCostAnalyzer();
const cheapest = costAnalyzer.selectCheapest(0.05, 0.9);
console.log('Cheapest option:', cheapest?.name);
const fastest = costAnalyzer.selectFastest(2000, 0.9);
console.log('Fastest option:', fastest?.name);
const comparison = costAnalyzer.compareCosts(10000, 500, 200);
console.log('Monthly costs:', comparison);
Fallback When Vision Extraction Fails
Implement graceful degradation when vision models fail.
/**
 * Runs a chain of processors (primary → fallback → optional last resort),
 * returning the first success tagged with its source tier.
 *
 * Fixes vs. the original:
 * - `processWithTimeout` now clears its timer once the work settles; the
 *   original left the timeout pending, keeping the event loop alive until it
 *   elapsed even after a successful result.
 * - The "all processors failed" error preserves the whole failure chain
 *   instead of only the last error.
 */
class ResilientMultimodalProcessor {
  /**
   * Try each processor in order; return the first success. The `error` on a
   * successful result is the error thrown by the previous tier, when any.
   * @throws the fallback error when no last resort is given, or an aggregate
   *         Error describing all failures when every tier fails
   */
  async processWithFallback(
    imagePath: string,
    primaryProcessor: (path: string) => Promise<string>,
    fallbackProcessor: (path: string) => Promise<string>,
    lastResortProcessor?: (path: string) => Promise<string>
  ): Promise<{ result: string; source: 'primary' | 'fallback' | 'last_resort'; error?: Error }> {
    try {
      return { result: await primaryProcessor(imagePath), source: 'primary' };
    } catch (primaryError) {
      try {
        return {
          result: await fallbackProcessor(imagePath),
          source: 'fallback',
          error: primaryError instanceof Error ? primaryError : undefined,
        };
      } catch (fallbackError) {
        if (!lastResortProcessor) {
          throw fallbackError;
        }
        try {
          return {
            result: await lastResortProcessor(imagePath),
            source: 'last_resort',
            error: fallbackError instanceof Error ? fallbackError : undefined,
          };
        } catch (lastResortError) {
          // Report every tier's failure, not just the last one.
          throw new Error(
            `All processors failed: ${lastResortError}; fallback: ${fallbackError}; primary: ${primaryError}`
          );
        }
      }
    }
  }

  /**
   * Race `fn` against a timeout; the timer is cleared in `finally` so a fast
   * success doesn't leave a stray pending timeout.
   * @throws Error('Processing timeout') when `fn` exceeds `timeoutMs`
   */
  async processWithTimeout<T>(
    fn: () => Promise<T>,
    timeoutMs: number = 10000
  ): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<T>((_, reject) => {
      timer = setTimeout(() => reject(new Error('Processing timeout')), timeoutMs);
    });
    try {
      return await Promise.race([fn(), timeout]);
    } finally {
      clearTimeout(timer);
    }
  }
}
// Example: chain GPT-4 Vision → Claude Vision → OCR and log which tier
// produced the result. (Processors are stubbed with canned strings here.)
const processor = new ResilientMultimodalProcessor();
const result = await processor.processWithFallback(
'./image.jpg',
async (path) => {
// Try GPT-4 Vision
return 'GPT-4 Vision result';
},
async (path) => {
// Fall back to Claude Vision
return 'Claude Vision result';
},
async (path) => {
// Last resort: OCR
return 'OCR result';
}
);
console.log(`Result from ${result.source}:`, result.result);
Checklist
- Implement retry logic with exponential backoff for vision APIs
- Use base64 encoding for small/private images, URLs for large/public
- Extract PDFs page-by-page with vision to preserve structure
- Stream audio transcription for real-time applications
- Validate structured extraction against expected schemas
- Cache image embeddings to avoid re-processing
- Compare costs across vision models and choose based on accuracy needs
- Implement fallback chains (primary → fallback → OCR/local)
- Set timeouts for vision requests (max 30s)
- Monitor multimodal API costs per feature separately
- Test document extraction with golden datasets
- Handle missing/corrupted media gracefully with meaningful errors
Conclusion
Multimodal processing requires careful orchestration: choose the right model for each modality based on cost and accuracy, implement intelligent fallbacks for failures, and validate outputs against expected schemas. Start with vision for structured document extraction, add Whisper for audio transcription, and implement cost-aware model selection as your volume grows.