Chroma DB — Open Source Vector DB Guide
Advertisement
Introduction
Chroma is an open-source vector database that runs locally or in-memory. Perfect for development, testing, and small-scale deployments, Chroma offers simplicity without sacrificing functionality. This guide covers everything you need to build semantic search into your applications.
- Installation and Setup
- In-Memory Client
- Persistent Client
- HTTP Client (Remote)
- Creating and Managing Collections
- Adding Documents and Embeddings
- With Automatic Embeddings
- With Pre-computed Embeddings
- Querying Collections
- Simple Query
- Query by Embedding
- Multiple Queries
- Metadata Filtering
- Updating and Deleting
- Update Documents
- Delete Documents
- Working with Different Embedding Models
- Building a RAG Application
- Integration with LangChain
- Performance Tips
- Conclusion
- FAQ
Installation and Setup
pip install chromadb
In-Memory Client
import chromadb
# Create in-memory client (data lost on restart)
client = chromadb.Client()
# List collections
print(client.list_collections())
Persistent Client
import chromadb
# Create persistent client (saves to disk)
client = chromadb.PersistentClient(path="/path/to/data")
# Data persists between sessions
HTTP Client (Remote)
# Run Chroma server in one terminal
# chroma run --host 0.0.0.0 --port 8000
import chromadb
client = chromadb.HttpClient(host="localhost", port=8000)
Creating and Managing Collections
# Create collection
collection = client.create_collection(
name="documents",
metadata={"hnsw:space": "cosine"}
)
# Get existing collection
collection = client.get_collection(name="documents")
# Get or create
collection = client.get_or_create_collection(name="documents")
# Delete collection
client.delete_collection(name="documents")
# List all collections
print(client.list_collections())
Adding Documents and Embeddings
With Automatic Embeddings
# Chroma generates embeddings automatically
collection.add(
ids=["id1", "id2", "id3"],
documents=[
"This is a document about machine learning",
"This is about deep learning networks",
"This is about natural language processing"
],
metadatas=[
{"source": "doc1.txt", "category": "AI"},
{"source": "doc2.txt", "category": "AI"},
{"source": "doc3.txt", "category": "NLP"}
]
)
With Pre-computed Embeddings
from sentence_transformers import SentenceTransformer
# Generate embeddings locally
model = SentenceTransformer('all-MiniLM-L6-v2')
documents = [
"Machine learning model",
"Neural network architecture",
"Data preprocessing"
]
embeddings = model.encode(documents).tolist()
# Add with pre-computed embeddings
collection.add(
ids=["1", "2", "3"],
embeddings=embeddings,
documents=documents,
metadatas=[{"type": "ML"}, {"type": "DL"}, {"type": "Data"}]
)
Querying Collections
Simple Query
# Query by document text (Chroma embeds the query)
results = collection.query(
query_texts=["Tell me about machine learning"],
n_results=3
)
print(results['documents'])
print(results['distances'])
print(results['metadatas'])
Query by Embedding
# Query using pre-computed embeddings
query_embedding = model.encode(["machine learning models"]).tolist()
results = collection.query(
query_embeddings=query_embedding,
n_results=3,
include=["documents", "metadatas", "distances"]
)
Multiple Queries
results = collection.query(
query_texts=[
"machine learning",
"neural networks",
"data science"
],
n_results=2
)
# Returns results for each query
for i, docs in enumerate(results['documents']):
print(f"Query {i}: {docs}")
Metadata Filtering
# Query with metadata filter
results = collection.query(
query_texts=["machine learning"],
where={"category": "AI"},
n_results=5
)
# Complex filters
results = collection.query(
query_texts=["topic"],
where={
"$or": [
{"category": "AI"},
{"category": "ML"}
]
},
n_results=5
)
# Numeric filters
collection.add(
ids=["1", "2", "3"],
documents=["doc1", "doc2", "doc3"],
metadatas=[
{"score": 0.9},
{"score": 0.7},
{"score": 0.5}
]
)
results = collection.query(
query_texts=["query"],
where={"score": {"$gte": 0.7}},
n_results=3
)
Updating and Deleting
Update Documents
# Upsert: updates items with matching IDs, inserts them if missing
# (use collection.update(...) to modify only existing items)
collection.upsert(
ids=["id1"],
documents=["Updated document content"],
metadatas=[{"updated": True}]
)
Delete Documents
# Delete by ID
collection.delete(ids=["id1"])
# Delete by filter
collection.delete(where={"category": "outdated"})
Working with Different Embedding Models
from chromadb.utils import embedding_functions
# OpenAI embeddings
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
api_key="your-api-key",
model_name="text-embedding-3-small"
)
collection = client.get_or_create_collection(
name="openai-docs",
embedding_function=openai_ef
)
# Sentence Transformers (local, no API key needed)
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
model_name="all-MiniLM-L6-v2"
)
collection = client.get_or_create_collection(
name="local-docs",
embedding_function=sentence_transformer_ef
)
Building a RAG Application
from chromadb.utils import embedding_functions
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# Setup Chroma
client = chromadb.PersistentClient(path="./chroma_data")
collection = client.get_or_create_collection(
name="knowledge_base",
embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction()
)
# Add documents
collection.add(
ids=["1", "2"],
documents=[
"Python is a programming language",
"Machine learning requires data"
]
)
# Query
def answer_question(question: str) -> str:
# Retrieve relevant documents
results = collection.query(
query_texts=[question],
n_results=2
)
context = "\n".join(results['documents'][0])
# Generate answer
llm = ChatOpenAI(model="gpt-4")
prompt = ChatPromptTemplate.from_template(
"""Using this context, answer the question:
Context: {context}
Question: {question}
Answer:"""
)
response = llm.invoke(prompt.format_messages(
context=context,
question=question
))
return response.content
# Use it
answer = answer_question("What is Python?")
print(answer)
Integration with LangChain
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
# Create Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
documents=docs,
embedding=embeddings,
persist_directory="./chroma_data"
)
# Use as retriever
retriever = vectorstore.as_retriever()
# Query (get_relevant_documents() is deprecated in recent LangChain versions)
docs = retriever.invoke("query")
Performance Tips
- Batch Operations: Add multiple items at once
- Use Appropriate Embedding Model: Smaller models for speed, larger for accuracy
- Metadata Filtering: Filter before similarity search
- Persistence: Use persistent client for production
- Index Configuration: Adjust HNSW parameters if needed
# Configure HNSW index
collection = client.create_collection(
name="optimized",
metadata={
"hnsw:space": "cosine",
"hnsw:construction_ef": 200,
"hnsw:search_ef": 10
}
)
Conclusion
Chroma excels for development and embedded applications. Its simplicity and local-first design make it ideal for prototyping before migrating to larger-scale systems.
FAQ
Q: Can I switch from Chroma to Pinecone later? A: Yes, export vectors from Chroma and import to Pinecone. The migration is straightforward.
Q: How much data can Chroma handle? A: Depends on system memory. For millions of vectors, consider Pinecone or Weaviate.
Q: Is Chroma suitable for production? A: For single-machine deployments and small teams, yes. For high availability and scale, use managed solutions.
Advertisement