OpenAI API Complete Guide 2026: GPT-4o, Assistants, Vision, Function Calling
Advertisement
OpenAI API Complete Guide 2026
The OpenAI API powers millions of AI applications. This guide covers every major feature with production-ready examples.
- Setup
- Chat Completions (Core API)
- Vision: Analyze Images
- Function Calling (Tool Use)
- Embeddings
- Structured Outputs (Pydantic)
- Rate Limiting and Retries
- Cost Optimization Tips
Setup
pip install openai
from openai import OpenAI
client = OpenAI(api_key="sk-...") # or set OPENAI_API_KEY env var
Chat Completions (Core API)
# Basic completion: one-shot request/response.
# A system message sets assistant behavior; the user message is the query.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Explain async/await in Python"},
    ],
    max_tokens=500,   # cap on generated (output) tokens only
    temperature=0.7,  # moderate sampling randomness; lower = more deterministic
)
# choices[0] is the first (and, with default n=1, only) candidate answer.
print(response.choices[0].message.content)
# usage.total_tokens = prompt tokens + completion tokens (what you are billed for).
print(f"Tokens used: {response.usage.total_tokens}")
# Streaming response.
# BUG FIX: the original called client.chat.completions.stream(...) and read
# stream.text_stream — that is the Anthropic SDK's streaming API. The OpenAI
# SDK streams by passing stream=True to create() and iterating delta chunks.
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a quicksort in Go"}],
    stream=True,
)
for chunk in stream:
    # Some chunks carry no content (e.g. the initial role chunk) — skip them.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
Vision: Analyze Images
import base64

# Analyze local image: binary image data must be base64-encoded and wrapped
# in a data: URL before being sent as an image_url content part.
with open("screenshot.png", "rb") as f:
    image_data = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        # Multimodal content: a list mixing text parts and image parts.
        "content": [
            {"type": "text", "text": "Describe any bugs or issues you see in this UI screenshot"},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}},
        ],
    }],
)
# Analyze image from URL: a plain https URL is passed through as-is,
# no base64 encoding needed.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What programming language is this code written in?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/code-screenshot.png"}},
        ],
    }],
)
Function Calling (Tool Use)
Function calling lets the LLM decide when to call your functions:
import json

# Define tools: JSON Schema descriptions the model reads to decide when and
# how to call our functions. "required" lists the mandatory arguments; the
# model fills in arguments as a JSON string we must parse ourselves.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City name, e.g. 'Delhi'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],  # unit is optional
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_database",
            "description": "Search the product database",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "limit": {"type": "integer", "default": 5},
                },
                "required": ["query"],  # limit is optional
            },
        },
    },
]
def get_weather(location: str, unit: str = "celsius") -> dict:
    """Stub weather lookup — swap in a real weather-service call here."""
    report = {"temperature": 28, "condition": "sunny"}
    report["location"] = location
    return report
def search_database(query: str, limit: int = 5) -> list:
    """Stub product search — always yields a single canned result row."""
    sample_row = dict(id=1, name="Example Product", price=99.99)
    return [sample_row]
# Agentic loop: keep calling the model, executing any tools it requests,
# until it answers with plain text.
messages = [{"role": "user", "content": "What's the weather in Mumbai and search for monsoon gear?"}]
while True:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        tools=tools,
        tool_choice="auto",  # let the model decide whether to call a tool
    )
    message = response.choices[0].message
    messages.append(message)  # the assistant turn must stay in the history
    if response.choices[0].finish_reason == "tool_calls":
        for tool_call in message.tool_calls:
            fn_name = tool_call.function.name
            fn_args = json.loads(tool_call.function.arguments)
            if fn_name == "get_weather":
                result = get_weather(**fn_args)
            elif fn_name == "search_database":
                result = search_database(**fn_args)
            else:
                # BUG FIX: an unrecognized tool name previously left `result`
                # unbound (NameError) or stale from the prior iteration.
                # Report the problem back to the model instead.
                result = {"error": f"Unknown tool: {fn_name}"}
            # Every tool_call must be answered by a matching "tool" message
            # carrying the same tool_call_id.
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result),
            })
    else:
        # No tool requested — this is the final answer.
        print(message.content)
        break
Embeddings
# Single embedding
response = client.embeddings.create(
    model="text-embedding-3-large",
    input="The quick brown fox jumps over the lazy dog",
)
vector = response.data[0].embedding  # 3072-dimensional list of floats

# Batch embeddings: pass a list of strings to embed them in one request.
texts = ["Hello world", "How are you?", "Python is great"]
response = client.embeddings.create(model="text-embedding-3-large", input=texts)
# One data item per input string; vectors[i] corresponds to texts[i].
vectors = [item.embedding for item in response.data]
# Semantic search
import numpy as np
def cosine_similarity(a, b):
    """Cosine of the angle between vectors a and b (1.0 = same direction)."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator
# Embed the query, then rank the corpus by cosine similarity.
query_vec = client.embeddings.create(model="text-embedding-3-large", input="python tutorial").data[0].embedding
scores = [(cosine_similarity(query_vec, v), t) for v, t in zip(vectors, texts)]
scores.sort(reverse=True)  # tuples sort by score first; highest similarity on top
print("Most similar:", scores[0][1])
Structured Outputs (Pydantic)
from pydantic import BaseModel
from typing import Optional
class CodeReview(BaseModel):
    """Schema the model must conform to when producing a structured review."""

    overall_quality: str  # "good" | "needs_work" | "critical"
    bugs_found: int  # count of distinct bugs identified
    suggestions: list[str]  # free-form improvement suggestions
    refactored_code: Optional[str] = None  # optional full rewrite of the code
# parse() constrains generation to the schema and validates the result,
# returning a typed object instead of raw JSON text.
response = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": "Review this Python code:\n\ndef add(a, b):\n return a + b"
    }],
    response_format=CodeReview,
)
review = response.choices[0].message.parsed  # a validated CodeReview instance
print(f"Quality: {review.overall_quality}")
print(f"Bugs: {review.bugs_found}")
for s in review.suggestions:
    print(f" - {s}")
Rate Limiting and Retries
from openai import RateLimitError, APIError
import time
def call_with_retry(prompt: str, max_retries: int = 3) -> str:
    """Send *prompt* to gpt-4o, retrying rate-limit errors with exponential backoff.

    Args:
        prompt: User message content.
        max_retries: Total number of attempts before giving up.

    Returns:
        The assistant's reply text.

    Raises:
        APIError: re-raised immediately (not retryable here).
        Exception: "Max retries exceeded", chained to the last RateLimitError.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
            )
            return response.choices[0].message.content
        except RateLimitError as e:
            last_error = e
            # BUG FIX: don't sleep after the final failed attempt — there is
            # no retry left to wait for.
            if attempt == max_retries - 1:
                break
            wait = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
            print(f"Rate limited. Waiting {wait}s...")
            time.sleep(wait)
        except APIError as e:
            print(f"API error: {e}")
            raise
    # BUG FIX: chain the causal RateLimitError instead of discarding it.
    raise Exception("Max retries exceeded") from last_error
Cost Optimization Tips
# 1. Use gpt-4o-mini for simple tasks (90% cheaper)
response = client.chat.completions.create(
    model="gpt-4o-mini",  # vs "gpt-4o"
    # NOTE(review): long_text must be defined earlier in your program.
    messages=[{"role": "user", "content": "Summarize in 1 sentence: " + long_text}],
    max_tokens=50,  # limit output tokens
)
# 2. Cache frequent prompts (same input = same output at temp=0)
import hashlib, functools


@functools.lru_cache(maxsize=1000)
def cached_completion(prompt_hash: str, prompt: str) -> str:
    """Memoized wrapper around call_with_retry.

    NOTE(review): lru_cache already keys on the full (prompt_hash, prompt)
    argument tuple, so the md5 digest adds no correctness here — the prompt
    alone would be a sufficient cache key.
    """
    return call_with_retry(prompt)


def smart_complete(prompt: str) -> str:
    """Complete *prompt*, serving repeated prompts from the in-process cache."""
    h = hashlib.md5(prompt.encode()).hexdigest()  # md5 used as a key, not for security
    return cached_completion(h, prompt)
Advertisement