Chroma DB — Open Source Vector DB Guide

Sanjeev Sharma

Introduction

Chroma is an open-source vector database that can run in-memory, persist to local disk, or run as a standalone server. Well suited to development, testing, and small-scale deployments, Chroma offers simplicity without sacrificing functionality. This guide covers everything you need to build semantic search into your applications.

Installation and Setup

pip install chromadb

In-Memory Client

import chromadb

# Create in-memory client (data lost on restart)
client = chromadb.Client()

# List collections
print(client.list_collections())

Persistent Client

import chromadb

# Create persistent client (saves to disk)
client = chromadb.PersistentClient(path="/path/to/data")

# Data persists between sessions

HTTP Client (Remote)

# Run Chroma server in one terminal
# chroma run --host 0.0.0.0 --port 8000

import chromadb

client = chromadb.HttpClient(host="localhost", port=8000)
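
To confirm that the client can actually reach the server, Chroma exposes a heartbeat call:

# Returns a nanosecond timestamp if the server is reachable
print(client.heartbeat())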

Creating and Managing Collections

# Create collection
collection = client.create_collection(
    name="documents",
    metadata={"hnsw:space": "cosine"}
)

# Get existing collection
collection = client.get_collection(name="documents")

# Get or create
collection = client.get_or_create_collection(name="documents")

# Delete collection
client.delete_collection(name="documents")

# List all collections
print(client.list_collections())
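
For quick inspection, collections also expose count() and peek():

# Inspect a collection's contents
collection = client.get_or_create_collection(name="documents")
print(collection.count())  # number of items stored
print(collection.peek())   # a small sample of items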

Adding Documents and Embeddings

With Automatic Embeddings

# Chroma generates embeddings automatically
collection.add(
    ids=["id1", "id2", "id3"],
    documents=[
        "This is a document about machine learning",
        "This is about deep learning networks",
        "This is about natural language processing"
    ],
    metadatas=[
        {"source": "doc1.txt", "category": "AI"},
        {"source": "doc2.txt", "category": "AI"},
        {"source": "doc3.txt", "category": "NLP"}
    ]
)

With Pre-computed Embeddings

from sentence_transformers import SentenceTransformer

# Generate embeddings locally
model = SentenceTransformer('all-MiniLM-L6-v2')

documents = [
    "Machine learning model",
    "Neural network architecture",
    "Data preprocessing"
]

embeddings = model.encode(documents).tolist()

# Add with pre-computed embeddings
collection.add(
    ids=["1", "2", "3"],
    embeddings=embeddings,
    documents=documents,
    metadatas=[{"type": "ML"}, {"type": "DL"}, {"type": "Data"}]
)

Querying Collections

Simple Query

# Query by document text (Chroma embeds the query)
results = collection.query(
    query_texts=["Tell me about machine learning"],
    n_results=3
)

print(results['documents'])
print(results['distances'])
print(results['metadatas'])
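
Each field in the result is a list of lists, one inner list per query. A small sketch pairing the fields for the first query:

# Index [0] selects results for the first (here, the only) query
for doc, dist, meta in zip(
    results["documents"][0],
    results["distances"][0],
    results["metadatas"][0],
):
    print(f"{dist:.4f}  {meta}  {doc}")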

Query by Embedding

# Query using pre-computed embeddings
query_embedding = model.encode(["machine learning models"]).tolist()

results = collection.query(
    query_embeddings=query_embedding,
    n_results=3,
    include=["documents", "metadatas", "distances"]
)

Multiple Queries

results = collection.query(
    query_texts=[
        "machine learning",
        "neural networks",
        "data science"
    ],
    n_results=2
)

# Returns results for each query
for i, docs in enumerate(results['documents']):
    print(f"Query {i}: {docs}")

Metadata Filtering

# Query with metadata filter
results = collection.query(
    query_texts=["machine learning"],
    where={"category": "AI"},
    n_results=5
)

# Complex filters
results = collection.query(
    query_texts=["topic"],
    where={
        "$or": [
            {"category": "AI"},
            {"category": "ML"}
        ]
    },
    n_results=5
)

# Numeric filters (upsert avoids duplicate-ID errors, since IDs "1"-"3" were added earlier)
collection.upsert(
    ids=["1", "2", "3"],
    documents=["doc1", "doc2", "doc3"],
    metadatas=[
        {"score": 0.9},
        {"score": 0.7},
        {"score": 0.5}
    ]
)

results = collection.query(
    query_texts=["query"],
    where={"score": {"$gte": 0.7}},
    n_results=3
)
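
The filter language also supports list operators such as $in, which matches any of several values:

# $in matches documents whose metadata value is in the given list
results = collection.query(
    query_texts=["query"],
    where={"category": {"$in": ["AI", "NLP"]}},
    n_results=3
)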

Updating and Deleting

Update Documents

# Upsert: updates items with matching IDs, inserts new ones
collection.upsert(
    ids=["id1"],
    documents=["Updated document content"],
    metadatas=[{"updated": True}]
)
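
Chroma also provides update(), which modifies existing IDs only rather than inserting new ones:

# Strict update: only touches IDs that already exist
collection.update(
    ids=["id1"],
    metadatas=[{"reviewed": True}]
)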

Delete Documents

# Delete by ID
collection.delete(ids=["id1"])

# Delete by filter
collection.delete(where={"category": "outdated"})

Working with Different Embedding Models

from chromadb.utils import embedding_functions

# OpenAI embeddings
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="your-api-key",
    model_name="text-embedding-3-small"
)

collection = client.get_or_create_collection(
    name="openai-docs",
    embedding_function=openai_ef
)

# Sentence Transformers (local, no API key needed)
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

collection = client.get_or_create_collection(
    name="local-docs",
    embedding_function=sentence_transformer_ef
)

Building a RAG Application

import chromadb
from chromadb.utils import embedding_functions
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Setup Chroma
client = chromadb.PersistentClient(path="./chroma_data")
collection = client.get_or_create_collection(
    name="knowledge_base",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction()
)

# Add documents
collection.add(
    ids=["1", "2"],
    documents=[
        "Python is a programming language",
        "Machine learning requires data"
    ]
)

# Query
def answer_question(question: str) -> str:
    # Retrieve relevant documents
    results = collection.query(
        query_texts=[question],
        n_results=2
    )

    context = "\n".join(results['documents'][0])

    # Generate answer
    llm = ChatOpenAI(model="gpt-4")
    prompt = ChatPromptTemplate.from_template(
        """Using this context, answer the question:

Context: {context}

Question: {question}

Answer:"""
    )

    response = llm.invoke(prompt.format_messages(
        context=context,
        question=question
    ))

    return response.content

# Use it
answer = answer_question("What is Python?")
print(answer)

Integration with LangChain

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# Sample documents to index
docs = [
    Document(page_content="Python is a programming language"),
    Document(page_content="Chroma is a vector database"),
]

# Create Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory="./chroma_data"
)

# Use as retriever
retriever = vectorstore.as_retriever()

# Query (get_relevant_documents is deprecated; invoke is the current API)
docs = retriever.invoke("query")

Performance Tips

  1. Batch Operations: Add multiple items per call instead of one at a time (see the batching sketch at the end of this section)
  2. Use Appropriate Embedding Model: Smaller models for speed, larger for accuracy
  3. Metadata Filtering: Filter before similarity search
  4. Persistence: Use persistent client for production
  5. Index Configuration: Adjust HNSW parameters if needed, as shown below

# Configure HNSW index parameters at creation time
collection = client.create_collection(
    name="optimized",
    metadata={
        "hnsw:space": "cosine",        # distance metric
        "hnsw:construction_ef": 200,   # higher = better index quality, slower build
        "hnsw:search_ef": 100          # higher = better recall, slower queries
    }
)
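
As a rough illustration of tip 1, this sketch adds documents in chunks; the batch size of 500 is an arbitrary choice, not a Chroma recommendation:

# Add documents in chunks rather than one call per document
batch_size = 500
ids = [f"doc-{i}" for i in range(2000)]
documents = [f"document number {i}" for i in range(2000)]

for start in range(0, len(ids), batch_size):
    collection.add(
        ids=ids[start:start + batch_size],
        documents=documents[start:start + batch_size]
    )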

Conclusion

Chroma excels for development and embedded applications. Its simplicity and local-first design make it ideal for prototyping before migrating to larger-scale systems.

FAQ

Q: Can I switch from Chroma to Pinecone later?
A: Yes, export the vectors from Chroma and import them into Pinecone. The migration is straightforward.
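
A minimal export sketch; the Pinecone side is shown only in comments, since client setup and index names vary:

# Pull everything out of a Chroma collection (ids are always returned)
data = collection.get(include=["embeddings", "documents", "metadatas"])

# data["ids"], data["embeddings"], and data["metadatas"] can then be
# re-upserted into another store, e.g. with the Pinecone client:
# index.upsert(vectors=zip(data["ids"], data["embeddings"], data["metadatas"]))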

Q: How much data can Chroma handle?
A: It depends on available system memory. For many millions of vectors, consider a dedicated service such as Pinecone or Weaviate.

Q: Is Chroma suitable for production?
A: For single-machine deployments and small teams, yes. For high availability and scale, use managed solutions.

Written by Sanjeev Sharma
Full Stack Engineer · E-mopro