Weaviate Vector DB — Getting Started

Sanjeev Sharma
4 min read

Advertisement

Introduction

Weaviate is an open-source, distributed vector database built for scale. With GraphQL APIs, hybrid search, and modular ML capabilities, Weaviate powers production systems handling millions of vectors. This guide gets you started quickly.

Installation Options

Local Docker

# Start Weaviate locally.
# Publish BOTH ports: 8080 (REST/GraphQL) and 50051 (gRPC) — the v4
# Python client talks to Weaviate over gRPC and fails to connect if
# only 8080 is exposed. Anonymous access is enabled for local dev only.
docker run -d -p 8080:8080 -p 50051:50051 \
  -e AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
  -e TRANSFORMERS_INFERENCE_API="http://host.docker.internal:8000" \
  cr.weaviate.io/semitechnologies/weaviate:latest

Weaviate Cloud Service (Managed)

import weaviate
from weaviate.auth import AuthApiKey

# Connect to a managed Weaviate Cloud instance.
# Both values come from the cluster's details page in the Weaviate console.
cluster_url = "https://your-cluster.weaviate.network"
credentials = AuthApiKey("your-api-key")

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=cluster_url,
    auth_credentials=credentials,
)

Python Client Setup

# Install the v4 client first (shell command — it must not sit inside
# the Python source, where it is a SyntaxError):
#   pip install -U weaviate-client
import weaviate
from weaviate.connect import ConnectionParams

# Connect to a local Weaviate instance
# (defaults: REST on localhost:8080, gRPC on localhost:50051).
client = weaviate.connect_to_local()

# Verify the server is reachable before issuing queries.
if client.is_ready():
    print("Connected to Weaviate")

Creating Collections (Schema)

import weaviate.classes as wvc

# Create a collection. In the v4 client, Property/DataType/Configure
# all live under wvc.config — there is no wvc.Property or wvc.DataType.
client.collections.create(
    name="Article",
    description="Articles about AI and machine learning",
    properties=[
        wvc.config.Property(
            name="title",
            data_type=wvc.config.DataType.TEXT,
            description="Article title",
        ),
        wvc.config.Property(
            name="content",
            # TEXT (not TEXT_ARRAY): the insert examples store the body
            # as a single string, so the schema must match.
            data_type=wvc.config.DataType.TEXT,
            description="Article body",
        ),
        wvc.config.Property(
            name="published_date",
            data_type=wvc.config.DataType.DATE,
            description="Publication date",
        ),
        wvc.config.Property(
            name="author",
            data_type=wvc.config.DataType.TEXT,
            description="Author name",
        ),
    ],
    # Server-side vectorization via OpenAI embeddings.
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small"
    ),
)

print("Collection created successfully")

Adding Data

Batch Insert

# Sample documents to index.
articles = [
    {
        "title": "Introduction to LLMs",
        "content": "Large language models are neural networks...",
        "published_date": "2024-01-15",
        "author": "John Doe"
    },
    {
        "title": "RAG Systems Explained",
        "content": "Retrieval-augmented generation combines retrieval...",
        "published_date": "2024-02-20",
        "author": "Jane Smith"
    }
]

collection = client.collections.get("Article")

# Insert all articles in one dynamically-sized batch; the context
# manager flushes any pending objects when the block exits.
with collection.batch.dynamic() as batch:
    for props in articles:
        batch.add_object(properties=props)

print(f"Added {len(articles)} articles")

Batch with Custom Vectors

from sentence_transformers import SentenceTransformer

# Embed locally instead of relying on a server-side vectorizer.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Bring-your-own-vector insert: encode each article body and attach
# the embedding explicitly to the object.
with collection.batch.dynamic() as batch:
    for item in articles:
        embedding = encoder.encode(item["content"]).tolist()
        batch.add_object(properties=item, vector=embedding)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Vector search: embed the query text with the same model used at
# ingest time, then find the nearest stored vectors.
query_text = "machine learning models"
query_vector = model.encode(query_text).tolist()

collection = client.collections.get("Article")

# v4 near_vector() has no `where=` keyword — optional filters are
# passed via the `filters` argument instead.
results = collection.query.near_vector(
    near_vector=query_vector,
    limit=3,
    return_metadata=wvc.query.MetadataQuery(distance=True),
    filters=None,  # Optional: supply a wvc.query.Filter here
)

for item in results.objects:
    print(f"Title: {item.properties['title']}")
    print(f"Distance: {item.metadata.distance}")
    print("---")
# Text search (keyword-based): BM25 scoring over the inverted index,
# no vectors involved.
results = collection.query.bm25(
    query="machine learning",
    limit=3
)

# Hybrid search: fuses BM25 keyword relevance with vector similarity
# into a single ranked result list.
results = collection.query.hybrid(
    query="machine learning fundamentals",
    limit=5,
    alpha=0.5  # 0=pure keyword, 1=pure vector, 0.5=balanced
)

Filtering with Where Clauses

from datetime import datetime

# Simple filter: restrict the vector search to a single author.
# NOTE: v4 near_vector() takes `filters=`, not `where=` — the `where`
# keyword does not exist on this method and raises a TypeError.
results = collection.query.near_vector(
    near_vector=query_vector,
    limit=3,
    filters=wvc.query.Filter.by_property("author").equal("John Doe"),
)

# Range filter on a DATE property.
results = collection.query.near_vector(
    near_vector=query_vector,
    limit=3,
    filters=wvc.query.Filter.by_property("published_date").greater_than(
        datetime(2024, 1, 1)
    ),
)

# AND conditions: all_of() requires every sub-filter to match.
results = collection.query.near_vector(
    near_vector=query_vector,
    limit=3,
    filters=wvc.query.Filter.all_of([
        wvc.query.Filter.by_property("author").equal("John Doe"),
        wvc.query.Filter.by_property("published_date").greater_than(
            datetime(2024, 1, 1)
        ),
    ]),
)

Updating and Deleting

Update Objects

# Look up the UUID of an existing object (updates are addressed by UUID).
first_hit = collection.query.fetch_objects(limit=1).objects[0]
obj_id = first_hit.uuid

# Partial update: only the listed properties are changed; the rest of
# the object (including its vector) is left as-is.
collection.data.update(
    uuid=obj_id,
    properties={"author": "Updated Author"},
)

Delete Objects

# Remove a single object by its UUID.
collection.data.delete_by_id(uuid=obj_id)

# Bulk delete: every object matching the filter is removed.
author_filter = wvc.query.Filter.by_property("author").equal("Old Author")
collection.data.delete_many(where=author_filter)

Using GraphQL API

# Raw GraphQL query: bypasses the Python query builder and sends
# GraphQL straight to the server. Note the server-side filter syntax
# (path/operator/valueString) differs from the Python Filter API.
query = """
{
  Get {
    Article(
      limit: 3
      where: {
        path: "author"
        operator: Equal
        valueString: "John Doe"
      }
    ) {
      title
      content
      author
    }
  }
}
"""

results = client.graphql_raw_query(query)
print(results)

Integration with LangChain

# langchain_weaviate exports WeaviateVectorStore (there is no class
# named `Weaviate` in langchain_weaviate.vectorstores).
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain_openai import OpenAIEmbeddings

# Create a vector store backed by the existing "Article" collection.
vectorstore = WeaviateVectorStore(
    client=client,
    index_name="Article",
    text_key="content",
    embedding=OpenAIEmbeddings(),
)

# Use as a retriever. invoke() replaces the deprecated
# get_relevant_documents() in current LangChain versions.
retriever = vectorstore.as_retriever()
docs = retriever.invoke("machine learning")

Best Practices

  1. Schema Design: Define properties and types clearly
  2. Vectorizer Selection: Choose appropriate embedding model
  3. Batch Operations: Use batch API for efficiency
  4. Indexing: Monitor and tune HNSW parameters
  5. Backups: Regular backups of persistent data
# Export data. fetch_objects() already returns all regular properties;
# QueryReference is only for cross-references and has no ALL member,
# so no return_properties argument is needed here.
objects = collection.query.fetch_objects(limit=10000)

# Save to file. v4 result objects expose .uuid and .properties (there
# is no .dict()); default=str serializes UUIDs and datetime values.
import json
with open("backup.json", "w") as f:
    json.dump(
        [{"uuid": str(obj.uuid), "properties": obj.properties}
         for obj in objects.objects],
        f,
        default=str,
    )

Conclusion

Weaviate provides enterprise-grade vector search with flexibility and scale. Its GraphQL API, hybrid search, and modular architecture make it ideal for complex, production systems.

FAQ

Q: Should I use Weaviate or managed services? A: Weaviate is self-hosted, giving full control but requiring infrastructure management. Use managed Weaviate Cloud for zero-ops.

Q: Can I migrate data between vector databases? A: Yes, export vectors from Weaviate and import elsewhere. The migration process depends on target database.

Q: How does Weaviate handle scaling? A: Weaviate supports multi-node clusters with replication and sharding for horizontal scaling.

Advertisement

Sanjeev Sharma

Written by

Sanjeev Sharma

Full Stack Engineer · E-mopro