Multimodal AI Guide 2026: Text, Images, Audio and Video in One Model
Multimodal AI 2026: Beyond Text
The most powerful AI applications in 2026 combine multiple modalities. Multimodal models can see, hear, and read — in one API call.
- Image Analysis with GPT-4o
- OCR and Document Extraction
- Speech to Text with Whisper
- Text to Speech
- Video Understanding with Gemini
- Build a Visual Q&A App
Image Analysis with GPT-4o
import base64
from openai import OpenAI

client = OpenAI()

def analyze_image(image_path: str, question: str) -> str:
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")
    ext = image_path.split(".")[-1].lower()
    mime = {"jpg": "jpeg", "jpeg": "jpeg", "png": "png", "gif": "gif", "webp": "webp"}
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/{mime.get(ext, ext)};base64,{image_data}"}},
            ],
        }],
        max_tokens=500,
    )
    return response.choices[0].message.content
# Real-world use cases:
print(analyze_image("chart.png", "Extract all data points from this chart as a table"))
print(analyze_image("receipt.jpg", "Extract: store name, date, items, and total cost as JSON"))
print(analyze_image("code_screenshot.png", "What bugs or issues do you see in this code?"))
print(analyze_image("whiteboard.jpg", "Transcribe the text and equations on this whiteboard"))
OCR and Document Extraction
def extract_invoice_data(pdf_path: str) -> dict:
    """Extract structured data from invoice PDFs using GPT-4o vision."""
    import fitz  # PyMuPDF
    import json

    doc = fitz.open(pdf_path)
    page = doc[0]
    mat = fitz.Matrix(2, 2)  # 2x zoom so small print stays legible
    pix = page.get_pixmap(matrix=mat)
    img_bytes = pix.tobytes("png")
    img_base64 = base64.b64encode(img_bytes).decode()

    prompt = """Extract invoice data as JSON with these exact keys:
    invoice_number, date, vendor_name, vendor_address,
    line_items (array of {description, quantity, unit_price, total}),
    subtotal, tax, total_amount, payment_due_date"""

    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
            ],
        }],
    )
    return json.loads(response.choices[0].message.content)
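A quick usage sketch; the file name is a placeholder, and note that the function reads only the first page of the PDF:
invoice = extract_invoice_data("invoice_march.pdf")
print(invoice["vendor_name"], invoice["total_amount"])
for item in invoice["line_items"]:
    print(f'{item["quantity"]} x {item["description"]} @ {item["unit_price"]}')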
Speech to Text with Whisper
# Speech to text with the OpenAI Whisper API (whisper-1)
def transcribe_audio(audio_path: str, language: str = "en") -> dict:
    with open(audio_path, "rb") as f:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language=language,
            response_format="verbose_json",    # includes timestamps
            timestamp_granularities=["word"],  # word-level timestamps
        )
    return {
        "text": transcript.text,
        "words": [(w.word, w.start, w.end) for w in transcript.words],
        "duration": transcript.duration,
    }
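# Example usage (the file name below is a placeholder):
result = transcribe_audio("meeting.mp3")
print(result["text"])
print(result["words"][:5])  # first five (word, start, end) tuples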
# Near-real-time transcription: record a short microphone clip, then send it to Whisper
import io
import wave

import pyaudio

def transcribe_realtime():
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    RECORD_SECONDS = 5

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    print("Recording...")
    frames = [stream.read(CHUNK) for _ in range(int(RATE / CHUNK * RECORD_SECONDS))]
    print("Transcribing...")
    stream.stop_stream()
    stream.close()
    sample_width = p.get_sample_size(FORMAT)
    p.terminate()

    # Whisper only accepts supported containers (wav, mp3, m4a, ...), so wrap the raw PCM in a WAV header
    audio_file = io.BytesIO()
    with wave.open(audio_file, "wb") as wav:
        wav.setnchannels(CHANNELS)
        wav.setsampwidth(sample_width)
        wav.setframerate(RATE)
        wav.writeframes(b"".join(frames))
    audio_file.seek(0)
    audio_file.name = "audio.wav"  # the SDK uses the file name to infer the format

    transcript = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
    return transcript.text
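For a rough live-captioning loop you can simply call it repeatedly; a minimal sketch (each pass records about five seconds, so this is near-real-time rather than true streaming):
try:
    while True:
        print(transcribe_realtime())
except KeyboardInterrupt:
    print("Stopped.")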
Text to Speech
from pathlib import Path

def text_to_speech(text: str, output_path: str = "output.mp3", voice: str = "alloy"):
    """
    Voices: alloy, echo, fable, onyx, nova, shimmer
    Models: tts-1 (fast) or tts-1-hd (higher quality)
    """
    response = client.audio.speech.create(
        model="tts-1-hd",
        voice=voice,
        input=text,
        speed=1.0,  # 0.25 to 4.0
    )
    Path(output_path).write_bytes(response.content)
    print(f"Audio saved to {output_path}")
text_to_speech("Welcome to webcoderspeed.com! Let's learn AI together.", voice="nova")
Video Understanding with Gemini
import time

import google.generativeai as genai

genai.configure(api_key="your-key")
model = genai.GenerativeModel("gemini-2.0-flash")

def analyze_video(video_path: str, questions: list[str]) -> dict:
    """Analyze video with Gemini's 1M-token context window."""
    # Upload the video to the Files API
    print(f"Uploading {video_path}...")
    video_file = genai.upload_file(path=video_path)

    # Wait for server-side processing to finish
    while video_file.state.name == "PROCESSING":
        time.sleep(2)
        video_file = genai.get_file(video_file.name)
    if video_file.state.name == "FAILED":
        raise ValueError("Video processing failed")

    answers = {}
    for question in questions:
        response = model.generate_content([video_file, question])
        answers[question] = response.text
    return answers
results = analyze_video("lecture.mp4", [
    "Create a detailed transcript of the lecture",
    "List the main topics covered in order",
    "Identify any diagrams or code shown on screen",
    "Generate quiz questions based on the content",
])
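Uploaded videos count against your Files API storage quota, so it is worth saving the answers and cleaning up afterwards; a small sketch (the output path is a placeholder):
import json

with open("lecture_analysis.json", "w") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

# Optional cleanup: remove files previously uploaded with genai.upload_file
for uploaded in genai.list_files():
    genai.delete_file(uploaded.name)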
Build a Visual Q&A App
from fastapi import FastAPI, UploadFile, File, Form
import io
from PIL import Image

app = FastAPI()

@app.post("/visual-qa")
async def visual_qa(
    image: UploadFile = File(...),
    question: str = Form(...),
) -> dict:
    # Read and validate the uploaded image (reuses `client` and `base64` from the sections above)
    img_bytes = await image.read()
    img = Image.open(io.BytesIO(img_bytes))

    # Resize if too large (cost optimization)
    max_size = (1024, 1024)
    img.thumbnail(max_size, Image.LANCZOS)

    # Normalize the mode so e.g. CMYK uploads can still be saved as PNG
    if img.mode not in ("RGB", "RGBA"):
        img = img.convert("RGB")

    # Convert back to bytes
    output = io.BytesIO()
    img.save(output, format="PNG")
    img_base64 = base64.b64encode(output.getvalue()).decode()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
            ],
        }],
        max_tokens=500,
    )
    return {
        "question": question,
        "answer": response.choices[0].message.content,
        "tokens_used": response.usage.total_tokens,
    }
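To try the endpoint locally, run the app with uvicorn and post an image to it; a sketch using the requests library (file name, port, and question are placeholders):
# Start the server first: uvicorn app:app --reload  (assuming this file is saved as app.py)
import requests

with open("chart.png", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/visual-qa",
        files={"image": ("chart.png", f, "image/png")},
        data={"question": "What is the overall trend in this chart?"},
    )
print(resp.json())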