Build Semantic Search with Embeddings 2026: Complete Python Guide
Traditional keyword search breaks down on synonyms, paraphrases, and intent. Semantic search matches on meaning: "car" matches "automobile", and "Python tutorial" matches "learn programming".
- How Embeddings Work
- Build a Document Search Engine
- Hybrid Search: Vector + BM25
- FastAPI Search Service
- Cost Optimization
How Embeddings Work
An embedding converts text into a dense vector, a list of numbers that represents the text's meaning. Similar meanings produce similar vectors, and similar vectors sit close together in vector space.
from openai import OpenAI
import numpy as np

client = OpenAI()

def embed(text: str) -> list[float]:
    return client.embeddings.create(
        model="text-embedding-3-small",  # 1536 dims, cheaper than -large
        input=text,
    ).data[0].embedding

# Semantically similar texts produce similar vectors
vec1 = embed("Python programming language")
vec2 = embed("coding in Python")
vec3 = embed("recipe for chocolate cake")

def cosine_similarity(a, b):
    a, b = np.array(a), np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine_similarity(vec1, vec2))  # noticeably higher: near-paraphrases
print(cosine_similarity(vec1, vec3))  # much lower: unrelated topics
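One useful property: OpenAI's embedding vectors come back normalized to unit length, so cosine similarity reduces to a plain dot product, and a whole set of vectors can be scored against a query with a single matrix multiplication. A quick sketch using the vectors from above:

# Unit-length vectors: cosine similarity == dot product
print(np.dot(vec1, vec2))

# Score several embeddings against one query in a single matmul
matrix = np.array([vec1, vec2, vec3])
print(matrix @ np.array(vec1))  # similarity of each row to vec1

This is the same trick the vectorized search in the engine below relies on.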
Build a Document Search Engine
from openai import OpenAI
import numpy as np
import json
from pathlib import Path

client = OpenAI()

class SemanticSearchEngine:
    def __init__(self, model: str = "text-embedding-3-small"):
        self.model = model
        self.documents: list[str] = []
        self.embeddings: np.ndarray | None = None
        self.metadata: list[dict] = []

    def embed_batch(self, texts: list[str]) -> np.ndarray:
        """Embed multiple texts in a single API call."""
        response = client.embeddings.create(model=self.model, input=texts)
        return np.array([item.embedding for item in response.data])

    def index(self, documents: list[str], metadata: list[dict] | None = None):
        """Index documents for search."""
        self.documents = documents
        self.metadata = metadata or [{} for _ in documents]
        # Embed in batches of 100 per request (the API allows larger
        # batches, but smaller ones keep payloads manageable)
        all_embeddings = []
        for i in range(0, len(documents), 100):
            batch = documents[i:i + 100]
            batch_embeddings = self.embed_batch(batch)
            all_embeddings.append(batch_embeddings)
            print(f"Indexed {min(i + 100, len(documents))}/{len(documents)}")
        self.embeddings = np.vstack(all_embeddings)

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Find the most semantically similar documents."""
        if self.embeddings is None:
            raise ValueError("Index documents first with .index()")
        query_embedding = self.embed_batch([query])[0]
        # Compute all cosine similarities in one vectorized pass
        norms = np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
        similarities = self.embeddings @ query_embedding / norms
        # Take the top-k results
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            {
                "document": self.documents[i],
                "score": float(similarities[i]),
                "metadata": self.metadata[i],
            }
            for i in top_indices
        ]

    def save(self, path: str):
        Path(path).mkdir(parents=True, exist_ok=True)
        np.save(f"{path}/embeddings.npy", self.embeddings)
        with open(f"{path}/data.json", "w") as f:
            json.dump({"documents": self.documents, "metadata": self.metadata}, f)

    def load(self, path: str):
        self.embeddings = np.load(f"{path}/embeddings.npy")
        with open(f"{path}/data.json") as f:
            data = json.load(f)
        self.documents = data["documents"]
        self.metadata = data["metadata"]
# Usage
engine = SemanticSearchEngine()

docs = [
    "Python is a high-level programming language known for its simplicity",
    "JavaScript is the primary language for web development",
    "Machine learning uses algorithms to learn from data",
    "Docker containerizes applications for consistent deployment",
    "React is a JavaScript library for building user interfaces",
]
metadata = [
    {"topic": "python", "difficulty": "beginner"},
    {"topic": "javascript", "difficulty": "beginner"},
    {"topic": "ml", "difficulty": "intermediate"},
    {"topic": "devops", "difficulty": "intermediate"},
    {"topic": "react", "difficulty": "intermediate"},
]

engine.index(docs, metadata)
results = engine.search("how to build web apps", top_k=3)
for r in results:
    print(f"Score: {r['score']:.3f} | {r['document'][:60]}...")
Hybrid Search: Vector + BM25
Pure semantic search sometimes misses exact keyword matches such as product codes, function names, or rare terms. Hybrid search combines vector similarity with BM25 keyword scoring:
from rank_bm25 import BM25Okapi

class HybridSearchEngine(SemanticSearchEngine):
    def __init__(self, *args, alpha: float = 0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.bm25 = None
        self.alpha = alpha  # weight: 1.0 = pure semantic, 0.0 = pure BM25

    def _build_bm25(self):
        tokenized = [doc.lower().split() for doc in self.documents]
        self.bm25 = BM25Okapi(tokenized)

    def index(self, documents: list[str], metadata: list[dict] | None = None):
        super().index(documents, metadata)
        self._build_bm25()

    def load(self, path: str):
        # Only embeddings and documents are persisted to disk,
        # so rebuild the BM25 index after loading
        super().load(path)
        self._build_bm25()

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        # Semantic scores for every document
        semantic_results = super().search(query, top_k=len(self.documents))
        semantic_scores = {r["document"]: r["score"] for r in semantic_results}
        # BM25 scores, normalized to [0, 1]
        bm25_scores = self.bm25.get_scores(query.lower().split())
        bm25_max = max(bm25_scores) if max(bm25_scores) > 0 else 1
        # Combine with a weighted linear blend of the normalized scores
        combined = []
        for i, doc in enumerate(self.documents):
            sem_score = semantic_scores.get(doc, 0)
            bm25_score = bm25_scores[i] / bm25_max
            combined_score = self.alpha * sem_score + (1 - self.alpha) * bm25_score
            combined.append((combined_score, i))
        combined.sort(reverse=True)
        return [
            {"document": self.documents[i], "score": score, "metadata": self.metadata[i]}
            for score, i in combined[:top_k]
        ]
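A popular alternative to blending normalized scores is reciprocal rank fusion (RRF), which combines rankings instead of raw scores and so needs no normalization at all. A minimal sketch; the helper name is mine, `rankings` would hold the document-index order produced by each retriever, and `k=60` is the conventional constant from the original RRF paper:

def reciprocal_rank_fusion(rankings: list[list[int]], k: int = 60) -> list[int]:
    """Fuse several ranked lists of document indices into one ranking.

    Each document's fused score is the sum of 1 / (k + rank) over
    every list it appears in.
    """
    scores: dict[int, float] = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

To use it here, you would pass the document indices ordered by semantic score and by BM25 score, then take the top of the fused list.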
FastAPI Search Service
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
engine = HybridSearchEngine()
engine.load("./search_index")  # rebuilds BM25 from the persisted documents

class SearchRequest(BaseModel):
    query: str
    top_k: int = 5
    alpha: float = 0.5

@app.post("/search")
async def search(req: SearchRequest):
    # Per-request alpha mutates shared engine state; fine for a demo,
    # but pass it through explicitly for concurrent production traffic
    engine.alpha = req.alpha
    results = engine.search(req.query, req.top_k)
    return {"query": req.query, "results": results}

@app.post("/index")
async def index(documents: list[str]):
    engine.index(documents)
    engine.save("./search_index")
    return {"indexed": len(documents)}
Cost Optimization
# Use smaller, cheaper models for bulk indexing:
# text-embedding-3-small costs $0.02/1M tokens (vs $0.13/1M for -large).
# Cache embeddings to avoid re-computing the same text twice.
import hashlib

embedding_cache: dict[str, list[float]] = {}

def cached_embed(text: str) -> list[float]:
    key = hashlib.md5(text.encode()).hexdigest()
    if key not in embedding_cache:
        embedding_cache[key] = embed(text)
    return embedding_cache[key]
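Storage and downstream compute scale with vector size, and the text-embedding-3 models accept a `dimensions` parameter that natively shortens the returned vectors, trading a little quality for a smaller index. A sketch reusing the `client` from earlier; the 512 value is an illustrative choice, not a recommendation:

def embed_short(text: str, dims: int = 512) -> list[float]:
    # Shorter vectors mean ~3x less storage than the default 1536 dims
    return client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
        dimensions=dims,
    ).data[0].embedding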