Multimodal AI Guide 2026: Text, Images, Audio and Video in One Model
Multimodal AI 2026: Beyond Text
The most powerful AI applications in 2026 combine multiple modalities. Multimodal models can see, hear, and read — in one API call.
- Image Analysis with GPT-4o
- OCR and Document Extraction
- Speech to Text with Whisper
- Text to Speech
- Video Understanding with Gemini
- Build a Visual Q&A App
Image Analysis with GPT-4o
import base64
from openai import OpenAI

client = OpenAI()

def analyze_image(image_path: str, question: str) -> str:
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")
    ext = image_path.split(".")[-1].lower()
    mime = {"jpg": "jpeg", "jpeg": "jpeg", "png": "png", "gif": "gif", "webp": "webp"}
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/{mime.get(ext, ext)};base64,{image_data}"}},
            ],
        }],
        max_tokens=500,
    )
    return response.choices[0].message.content
# Real-world use cases:
print(analyze_image("chart.png", "Extract all data points from this chart as a table"))
print(analyze_image("receipt.jpg", "Extract: store name, date, items, and total cost as JSON"))
print(analyze_image("code_screenshot.png", "What bugs or issues do you see in this code?"))
print(analyze_image("whiteboard.jpg", "Transcribe the text and equations on this whiteboard"))
OCR and Document Extraction
def extract_invoice_data(pdf_path: str) -> dict:
    """Extract structured data from invoice PDFs using GPT-4o vision."""
    import fitz  # PyMuPDF
    import json

    doc = fitz.open(pdf_path)
    page = doc[0]
    mat = fitz.Matrix(2, 2)  # 2x zoom so small print stays legible
    pix = page.get_pixmap(matrix=mat)
    img_bytes = pix.tobytes("png")
    img_base64 = base64.b64encode(img_bytes).decode()

    prompt = """Extract invoice data as JSON with these exact keys:
    invoice_number, date, vendor_name, vendor_address,
    line_items (array of {description, quantity, unit_price, total}),
    subtotal, tax, total_amount, payment_due_date"""

    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
            ],
        }],
    )
    return json.loads(response.choices[0].message.content)
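A quick usage sketch; the file name is a placeholder, and note that the function reads only the first page of the PDF:
invoice = extract_invoice_data("invoice_march.pdf")
print(invoice["vendor_name"], invoice["total_amount"])
for item in invoice["line_items"]:
    print(f'{item["quantity"]} x {item["description"]} @ {item["unit_price"]}')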
Speech to Text with Whisper
# Speech to text with the OpenAI Whisper API (whisper-1)
def transcribe_audio(audio_path: str, language: str = "en") -> dict:
    with open(audio_path, "rb") as f:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language=language,
            response_format="verbose_json",    # includes timestamps
            timestamp_granularities=["word"],  # word-level timestamps
        )
    return {
        "text": transcript.text,
        "words": [(w.word, w.start, w.end) for w in transcript.words],
        "duration": transcript.duration,
    }
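# Example usage (the file name below is a placeholder):
result = transcribe_audio("meeting.mp3")
print(result["text"])
print(result["words"][:5])  # first five (word, start, end) tuples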
# Near-real-time transcription: record a short microphone clip, then send it to Whisper
import io
import wave

import pyaudio

def transcribe_realtime():
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    RECORD_SECONDS = 5

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    print("Recording...")
    frames = [stream.read(CHUNK) for _ in range(int(RATE / CHUNK * RECORD_SECONDS))]
    print("Transcribing...")
    stream.stop_stream()
    stream.close()
    sample_width = p.get_sample_size(FORMAT)
    p.terminate()

    # Whisper only accepts supported containers (wav, mp3, m4a, ...), so wrap the raw PCM in a WAV header
    audio_file = io.BytesIO()
    with wave.open(audio_file, "wb") as wav:
        wav.setnchannels(CHANNELS)
        wav.setsampwidth(sample_width)
        wav.setframerate(RATE)
        wav.writeframes(b"".join(frames))
    audio_file.seek(0)
    audio_file.name = "audio.wav"  # the SDK uses the file name to infer the format

    transcript = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
    return transcript.text
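For a rough live-captioning loop you can simply call it repeatedly; a minimal sketch (each pass records about five seconds, so this is near-real-time rather than true streaming):
try:
    while True:
        print(transcribe_realtime())
except KeyboardInterrupt:
    print("Stopped.")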
Text to Speech
from pathlib import Path

def text_to_speech(text: str, output_path: str = "output.mp3", voice: str = "alloy"):
    """
    Voices: alloy, echo, fable, onyx, nova, shimmer
    Models: tts-1 (fast) or tts-1-hd (higher quality)
    """
    response = client.audio.speech.create(
        model="tts-1-hd",
        voice=voice,
        input=text,
        speed=1.0,  # 0.25 to 4.0
    )
    Path(output_path).write_bytes(response.content)
    print(f"Audio saved to {output_path}")
text_to_speech("Welcome to webcoderspeed.com! Let's learn AI together.", voice="nova")
Video Understanding with Gemini
import time

import google.generativeai as genai

genai.configure(api_key="your-key")
model = genai.GenerativeModel("gemini-2.0-flash")

def analyze_video(video_path: str, questions: list[str]) -> dict:
    """Analyze video with Gemini's 1M-token context window."""
    # Upload the video to the Files API
    print(f"Uploading {video_path}...")
    video_file = genai.upload_file(path=video_path)

    # Wait for server-side processing to finish
    while video_file.state.name == "PROCESSING":
        time.sleep(2)
        video_file = genai.get_file(video_file.name)
    if video_file.state.name == "FAILED":
        raise ValueError("Video processing failed")

    answers = {}
    for question in questions:
        response = model.generate_content([video_file, question])
        answers[question] = response.text
    return answers
results = analyze_video("lecture.mp4", [
    "Create a detailed transcript of the lecture",
    "List the main topics covered in order",
    "Identify any diagrams or code shown on screen",
    "Generate quiz questions based on the content",
])
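Uploaded videos count against your Files API storage quota, so it is worth saving the answers and cleaning up afterwards; a small sketch (the output path is a placeholder):
import json

with open("lecture_analysis.json", "w") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

# Optional cleanup: remove files previously uploaded with genai.upload_file
for uploaded in genai.list_files():
    genai.delete_file(uploaded.name)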
Build a Visual Q&A App
from fastapi import FastAPI, UploadFile, File, Form
import io
from PIL import Image

app = FastAPI()

@app.post("/visual-qa")
async def visual_qa(
    image: UploadFile = File(...),
    question: str = Form(...),
) -> dict:
    # Read and validate the uploaded image (reuses `client` and `base64` from the sections above)
    img_bytes = await image.read()
    img = Image.open(io.BytesIO(img_bytes))

    # Resize if too large (cost optimization)
    max_size = (1024, 1024)
    img.thumbnail(max_size, Image.LANCZOS)

    # Normalize the mode so e.g. CMYK uploads can still be saved as PNG
    if img.mode not in ("RGB", "RGBA"):
        img = img.convert("RGB")

    # Convert back to bytes
    output = io.BytesIO()
    img.save(output, format="PNG")
    img_base64 = base64.b64encode(output.getvalue()).decode()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
            ],
        }],
        max_tokens=500,
    )
    return {
        "question": question,
        "answer": response.choices[0].message.content,
        "tokens_used": response.usage.total_tokens,
    }
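To try the endpoint locally, run the app with uvicorn and post an image to it; a sketch using the requests library (file name, port, and question are placeholders):
# Start the server first: uvicorn app:app --reload  (assuming this file is saved as app.py)
import requests

with open("chart.png", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/visual-qa",
        files={"image": ("chart.png", f, "image/png")},
        data={"question": "What is the overall trend in this chart?"},
    )
print(resp.json())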