Embeddings Search at Scale — Approximate Nearest Neighbor Beyond Simple Similarity

Introduction

Exhaustive cosine similarity over every embedding stops meeting interactive latency budgets once you reach hundreds of thousands of vectors. Approximate Nearest Neighbor (ANN) indexes like HNSW and IVFFlat deliver millisecond-scale search even on billions of vectors. This post covers index tradeoffs, batch embedding generation, incremental updates, hybrid vector+keyword search, filtering strategies, caching, and dimension reduction for production-scale search.

HNSW vs IVFFlat Tradeoffs

HNSW (Hierarchical Navigable Small World) and IVFFlat (Inverted File with Flat quantization) are the two ANN index types most commonly available in vector databases such as pgvector. Choose between them based on data size, latency requirements, and memory budget.

// HNSW: Best for <5M vectors, excellent latency
// - O(log N) search complexity
// - p50 latency: 2-10ms
// - Memory intensive (~200 bytes per vector beyond the embedding)
// - Better for real-time search

// IVFFlat: Best for >5M vectors, good throughput
// - Scans `probes` of `lists` clusters: roughly O(probes * N / lists) distance computations
// - p50 latency: 5-50ms
// - Memory efficient (~50 bytes per vector beyond the embedding)
// - Better for batch/analytical queries

interface ANNIndexConfig {
  algorithm: 'hnsw' | 'ivfflat';
  vectorDimension: number;
  vectorCount: number;
  targetLatencyMs: number;
  memoryLimitGB: number;
}

function selectOptimalIndex(config: ANNIndexConfig): 'hnsw' | 'ivfflat' {
  // HNSW if small dataset
  if (config.vectorCount < 100_000) return 'hnsw';

  // HNSW if latency-critical
  if (config.targetLatencyMs < 10) return 'hnsw';

  // IVFFlat if large dataset and memory-constrained
  if (config.vectorCount > 5_000_000 && config.memoryLimitGB < 100) return 'ivfflat';

  // Default to IVFFlat for huge datasets
  if (config.vectorCount > 100_000_000) return 'ivfflat';

  return 'hnsw';
}

// Index creation SQL
const hnswIndexSQL = `
CREATE INDEX ON vectors USING hnsw (embedding vector_cosine_ops)
WITH (
  m = 16,              -- Connections per node (default 16; 16-48 typical)
  ef_construction = 200 -- Build-time candidate list (default 64, max 1000)
);
`;

const ivfflatIndexSQL = `
CREATE INDEX ON vectors USING ivfflat (embedding vector_cosine_ops)
WITH (
  lists = 100          -- Number of clusters (rule of thumb: rows/1000; ~sqrt(rows) past 1M rows)
);
-- ivfflat has no ef parameter; recall is tuned at query time instead
SET ivfflat.probes = 10; -- Clusters scanned per query (1 up to lists)
`;
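HNSW also has a query-time knob, separate from the build-time m and ef_construction above. Assuming pgvector (which the CREATE INDEX statements here suggest), the session-level setting looks like this; the value is illustrative:

```typescript
// HNSW query-time tuning (pgvector session setting; raise for better recall,
// lower for better latency)
const hnswTuningSQL = `
SET hnsw.ef_search = 100; -- Size of the candidate list at query time (default 40)
`;
```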

interface IndexMetrics {
  algorithm: string;
  vectorCount: number;
  indexSizeGB: number;
  buildTimeSecs: number;
  p50SearchLatencyMs: number;
  p95SearchLatencyMs: number;
  recallAt10: number; // Fraction of the true top-10 neighbors the index returns
}

const benchmarks: IndexMetrics[] = [
  {
    algorithm: 'hnsw',
    vectorCount: 1_000_000,
    indexSizeGB: 5.2,
    buildTimeSecs: 45,
    p50SearchLatencyMs: 3,
    p95SearchLatencyMs: 8,
    recallAt10: 0.98,
  },
  {
    algorithm: 'ivfflat',
    vectorCount: 10_000_000,
    indexSizeGB: 12.1,
    buildTimeSecs: 120,
    p50SearchLatencyMs: 15,
    p95SearchLatencyMs: 45,
    recallAt10: 0.95,
  },
  {
    algorithm: 'hnsw',
    vectorCount: 100_000_000,
    indexSizeGB: 520,
    buildTimeSecs: 3600,
    p50SearchLatencyMs: 8,
    p95SearchLatencyMs: 25,
    recallAt10: 0.98,
  },
];

Batch Embedding Generation

Generate embeddings for many documents efficiently.

interface BatchEmbeddingConfig {
  batchSize: number;
  concurrency: number;
  retryStrategy: { maxRetries: number; backoffMs: number };
}

class BatchEmbeddingGenerator {
  private config: BatchEmbeddingConfig;
  private embeddingService: { embed(text: string): Promise<number[]> };

  constructor(
    config: BatchEmbeddingConfig,
    embeddingService: { embed(text: string): Promise<number[]> }
  ) {
    this.config = config;
    this.embeddingService = embeddingService;
  }

  async generateBatch(
    items: Array<{ id: string; text: string; metadata?: Record<string, any> }>
  ): Promise<Array<{ id: string; embedding: number[]; metadata?: Record<string, any> }>> {
    const results: Array<{ id: string; embedding: number[]; metadata?: Record<string, any> }> = [];
    const errors: Array<{ id: string; error: string }> = [];

    // Process in batches with concurrency control
    for (let i = 0; i < items.length; i += this.config.batchSize) {
      const batch = items.slice(i, i + this.config.batchSize);

      // Process batch items concurrently (the whole batch fires at once;
      // enforce config.concurrency with a worker pool if the provider rate-limits)
      const batchResults = await Promise.all(
        batch.map(item => this.generateWithRetry(item))
      );

      for (const result of batchResults) {
        if ('error' in result) {
          errors.push(result);
        } else {
          results.push(result);
        }
      }

      // Progress logging
      console.log(`Processed ${Math.min(i + this.config.batchSize, items.length)}/${items.length} embeddings`);

      // Rate limiting between batches
      await new Promise(resolve => setTimeout(resolve, 100));
    }

    if (errors.length > 0) {
      console.warn(`${errors.length} embeddings failed to generate`);
    }

    return results;
  }

  private async generateWithRetry(
    item: { id: string; text: string; metadata?: Record<string, any> }
  ): Promise<{ id: string; embedding: number[]; metadata?: Record<string, any> } | { id: string; error: string }> {
    let lastError: Error | null = null;

    for (let attempt = 0; attempt <= this.config.retryStrategy.maxRetries; attempt++) {
      try {
        const embedding = await this.embeddingService.embed(item.text);

        return {
          id: item.id,
          embedding,
          metadata: item.metadata,
        };
      } catch (error) {
        lastError = error as Error;

        if (attempt < this.config.retryStrategy.maxRetries) {
          const backoffMs = this.config.retryStrategy.backoffMs * Math.pow(2, attempt);
          await new Promise(resolve => setTimeout(resolve, backoffMs));
          continue;
        }
      }
    }

    return {
      id: item.id,
      error: lastError?.message || 'Unknown error',
    };
  }

  async estimateCost(itemCount: number, avgTextLength: number): Promise<number> {
    // OpenAI text-embedding-3-small: $0.02 per 1M tokens (check current pricing)
    const estimatedTokens = (itemCount * avgTextLength) / 4; // ~4 characters per token heuristic
    return (estimatedTokens / 1_000_000) * 0.02;
  }
}
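As a sanity check on estimateCost's arithmetic (a rough 4-characters-per-token heuristic at $0.02 per 1M tokens), here is the same formula as a standalone function; the function name is hypothetical:

```typescript
// Standalone mirror of estimateCost's arithmetic:
// tokens ≈ chars / 4, price = $0.02 per 1M tokens (text-embedding-3-small)
function estimateEmbeddingCostUSD(itemCount: number, avgTextLength: number): number {
  const estimatedTokens = (itemCount * avgTextLength) / 4;
  return (estimatedTokens / 1_000_000) * 0.02;
}

// 1M documents averaging 1,000 characters ≈ 250M tokens ≈ $5
const cost = estimateEmbeddingCostUSD(1_000_000, 1_000);
```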

Incremental Index Updates

Add new vectors without full reindexing.

interface IndexUpdateStrategy {
  strategy: 'incremental' | 'batch_rebuild' | 'dual_index';
  threshold: number; // When to switch from incremental to rebuild
}

class IncrementalIndexUpdater {
  private vectorStore: any;
  private updateQueue: Array<{ id: string; vector: number[]; action: 'add' | 'update' | 'delete' }> = [];
  private queuedUpdates = 0;

  async queueUpdate(
    id: string,
    vector: number[],
    action: 'add' | 'update' | 'delete' = 'add'
  ): Promise<void> {
    this.updateQueue.push({ id, vector, action });
    this.queuedUpdates++;

    // Flush if queue exceeds threshold
    if (this.queuedUpdates > 1000) {
      await this.flushUpdates();
    }
  }

  async flushUpdates(): Promise<{ addedCount: number; updatedCount: number; deletedCount: number }> {
    if (this.updateQueue.length === 0) {
      return { addedCount: 0, updatedCount: 0, deletedCount: 0 };
    }

    const updates = this.updateQueue.splice(0);
    const stats = {
      addedCount: updates.filter(u => u.action === 'add').length,
      updatedCount: updates.filter(u => u.action === 'update').length,
      deletedCount: updates.filter(u => u.action === 'delete').length,
    };

    // Use pgvector's ON CONFLICT upsert; placeholders and the params array
    // must be built in matching order
    const addUpdates = updates.filter(u => u.action === 'add' || u.action === 'update');

    if (addUpdates.length > 0) {
      const placeholders = addUpdates.map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`);
      const params = addUpdates.flatMap(u => [u.id, JSON.stringify(u.vector)]);

      await this.vectorStore.query(`
        INSERT INTO vectors (id, embedding) VALUES ${placeholders.join(',')}
        ON CONFLICT (id) DO UPDATE SET
        embedding = EXCLUDED.embedding,
        updated_at = NOW()
      `, params);
    }

    // Handle deletes
    const deletes = updates.filter(u => u.action === 'delete').map(u => u.id);
    if (deletes.length > 0) {
      await this.vectorStore.query('DELETE FROM vectors WHERE id = ANY($1)', [deletes]);
    }

    this.queuedUpdates = 0;
    return stats;
  }

  // Pass the number of updates applied since the last rebuild (e.g. a running
  // total of flushUpdates() results); the in-memory queue alone undercounts
  // fragmentation because it is cleared on every flush
  async rebuildIndexIfNeeded(appliedUpdates: number, totalVectors: number): Promise<boolean> {
    const indexFragmentation = appliedUpdates / totalVectors;

    // Rebuild if >10% of vectors have changed since the last rebuild
    if (indexFragmentation > 0.1) {
      console.log(`Index fragmentation at ${(indexFragmentation * 100).toFixed(1)}%. Triggering rebuild...`);

      // Drop old index
      await this.vectorStore.query('DROP INDEX IF EXISTS vector_hnsw_idx');

      // Create new index
      await this.vectorStore.query(`
        CREATE INDEX ON vectors USING hnsw (embedding vector_cosine_ops)
        WITH (m = 16, ef_construction = 200)
      `);

      return true;
    }

    return false;
  }
}
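The placeholder/params pairing used when flushing upserts is easy to get wrong, so it can help to isolate it. This sketch (helper name hypothetical) shows the flattened params array that must accompany the generated placeholders; pgvector accepts vectors serialized as '[1,2,3]', which JSON.stringify produces:

```typescript
// Build a multi-row parameterized upsert: one ($a, $b) placeholder pair per
// row, with a flattened params array in matching order
function buildVectorUpsert(
  rows: Array<{ id: string; vector: number[] }>
): { sql: string; params: unknown[] } {
  const placeholders = rows.map((_, i) => `($${i * 2 + 1}, $${i * 2 + 2})`);
  const params = rows.flatMap(r => [r.id, JSON.stringify(r.vector)]);

  const sql =
    `INSERT INTO vectors (id, embedding) VALUES ${placeholders.join(', ')} ` +
    `ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding, updated_at = NOW()`;

  return { sql, params };
}
```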

Hybrid Search: Vector + Keyword BM25

Combine semantic similarity with traditional full-text search.

interface HybridSearchResult {
  id: string;
  text: string;
  semanticScore: number;
  keywordScore: number;
  combinedScore: number;
  rank: number;
}

class HybridSearchEngine {
  private vectorStore: any;
  private fullTextIndex: any;

  async hybridSearch(
    query: string,
    embedding: number[],
    options: {
      limit: number;
      semanticWeight: number; // 0-1, typically 0.5-0.7
      keywordWeight: number;
    }
  ): Promise<HybridSearchResult[]> {
    // 1. Vector search (semantic)
    const semanticResults = await this.vectorSearch(embedding, options.limit * 2);

    // 2. Full-text search (keyword)
    const keywordResults = await this.keywordSearch(query, options.limit * 2);

    // 3. Normalize via rank: top result scores 1, decreasing linearly
    const normalized = new Map<string, HybridSearchResult>();

    // Add semantic results
    for (let i = 0; i < semanticResults.length; i++) {
      const result = semanticResults[i];
      const normalizedScore = 1 - i / semanticResults.length; // Higher rank = higher score

      normalized.set(result.id, {
        ...result,
        semanticScore: normalizedScore,
        keywordScore: 0,
        combinedScore: 0,
        rank: 0,
      });
    }

    // Add keyword results
    for (let i = 0; i < keywordResults.length; i++) {
      const result = keywordResults[i];
      const normalizedScore = 1 - i / keywordResults.length;

      const entry = normalized.get(result.id);
      if (entry) {
        entry.keywordScore = normalizedScore;
      } else {
        normalized.set(result.id, {
          ...result,
          semanticScore: 0,
          keywordScore: normalizedScore,
          combinedScore: 0,
          rank: 0,
        });
      }
    }

    // 4. Combine scores
    const combined = Array.from(normalized.values()).map(result => ({
      ...result,
      combinedScore: result.semanticScore * options.semanticWeight + result.keywordScore * options.keywordWeight,
    }));

    // 5. Sort by combined score and limit
    return combined
      .sort((a, b) => b.combinedScore - a.combinedScore)
      .slice(0, options.limit)
      .map((result, idx) => ({ ...result, rank: idx + 1 }));
  }

  private async vectorSearch(embedding: number[], limit: number): Promise<Array<{ id: string; text: string }>> {
    const results = await this.vectorStore.search(embedding, limit);
    return results.map((r: any) => ({ id: r.id, text: r.text }));
  }

  private async keywordSearch(query: string, limit: number): Promise<Array<{ id: string; text: string }>> {
    const results = await this.fullTextIndex.search(query, limit);
    return results.map((r: any) => ({ id: r.id, text: r.text }));
  }
}
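The fusion step above boils down to a small amount of arithmetic. This standalone sketch (hypothetical helper; ranked ID arrays stand in for real search results) applies the same rank-based scoring, where each list contributes (1 - rank/length) * weight to an ID's combined score:

```typescript
// Rank-based score fusion: IDs appearing high in either list score well;
// IDs appearing in both lists accumulate both contributions
function fuseRanks(
  semantic: string[],
  keyword: string[],
  semanticWeight: number,
  keywordWeight: number
): Array<{ id: string; score: number }> {
  const scores = new Map<string, number>();

  semantic.forEach((id, i) => {
    scores.set(id, (scores.get(id) ?? 0) + (1 - i / semantic.length) * semanticWeight);
  });
  keyword.forEach((id, i) => {
    scores.set(id, (scores.get(id) ?? 0) + (1 - i / keyword.length) * keywordWeight);
  });

  return Array.from(scores, ([id, score]) => ({ id, score }))
    .sort((a, b) => b.score - a.score);
}
```

With weights 0.6/0.4, a document ranked second semantically but first by keyword can outscore the top semantic hit, which is exactly the behavior hybrid search is after.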

Filtering Before/After ANN

Pre-filtering shrinks the candidate set before the ANN step; post-filtering applies predicates the index cannot express, at the cost of potentially returning fewer than the requested number of results.

class FilteredANNSearch {
  private db: any; // Database client (e.g. a pg Pool), injected elsewhere

  async search(
    embedding: number[],
    options: {
      preFilter?: { category: string; active: boolean };
      postFilter?: (item: any) => boolean;
      limit: number;
    }
  ): Promise<any[]> {
    // 1. Pre-filter: Reduce candidate set BEFORE ANN
    let candidates: any[] = [];

    if (options.preFilter) {
      candidates = await this.getFilteredCandidates(options.preFilter);
    } else {
      candidates = await this.getAllCandidates();
    }

    // 2. ANN search on filtered candidates
    const results = this.annSearchInMemory(embedding, candidates, options.limit * 2);

    // 3. Post-filter: Apply complex filters AFTER ANN
    let filtered = results;
    if (options.postFilter) {
      filtered = results.filter(options.postFilter);
    }

    return filtered.slice(0, options.limit);
  }

  private async getFilteredCandidates(filter: { category: string; active: boolean }): Promise<any[]> {
    // Use index on metadata for fast lookup
    const sql = `
      SELECT id, embedding, text, metadata FROM vectors
      WHERE metadata->>'category' = $1
      AND (metadata->>'active')::boolean = $2
    `;

    return await this.db.query(sql, [filter.category, filter.active]);
  }

  private async getAllCandidates(): Promise<any[]> {
    // Warning: loads every vector into memory; only viable for small tables
    return await this.db.query('SELECT id, embedding, text, metadata FROM vectors');
  }

  private annSearchInMemory(embedding: number[], candidates: any[], limit: number): any[] {
    // Exact brute-force scoring over the (pre-filtered) candidate set
    const scored = candidates.map(candidate => ({
      ...candidate,
      score: this.cosineSimilarity(embedding, candidate.embedding),
    }));

    return scored.sort((a, b) => b.score - a.score).slice(0, limit);
  }

  private cosineSimilarity(a: number[], b: number[]): number {
    const dotProduct = a.reduce((sum, x, i) => sum + x * b[i], 0);
    const normA = Math.sqrt(a.reduce((sum, x) => sum + x * x, 0));
    const normB = Math.sqrt(b.reduce((sum, x) => sum + x * x, 0));
    return dotProduct / (normA * normB);
  }
}

Embedding Cache

Cache frequently accessed embeddings to avoid recomputation.

class EmbeddingCache {
  private cache: Map<string, { embedding: number[]; timestamp: number }> = new Map();
  private hitCount = 0;
  private missCount = 0;
  private readonly ttlMs = 86400000; // 24 hours

  get(text: string): number[] | null {
    const cached = this.cache.get(text);

    if (!cached) {
      this.missCount++;
      return null;
    }

    // Check expiration
    if (Date.now() - cached.timestamp > this.ttlMs) {
      this.cache.delete(text);
      this.missCount++;
      return null;
    }

    this.hitCount++;
    return cached.embedding;
  }

  set(text: string, embedding: number[]): void {
    this.cache.set(text, {
      embedding,
      timestamp: Date.now(),
    });

    // Evict oldest entries if cache exceeds size
    if (this.cache.size > 100000) {
      const entries = Array.from(this.cache.entries()).sort((a, b) => a[1].timestamp - b[1].timestamp);

      for (let i = 0; i < 10000; i++) {
        this.cache.delete(entries[i][0]);
      }
    }
  }

  getStats(): { hits: number; misses: number; hitRate: string; size: number } {
    const total = this.hitCount + this.missCount;

    return {
      hits: this.hitCount,
      misses: this.missCount,
      hitRate: total === 0 ? '0%' : `${((this.hitCount / total) * 100).toFixed(1)}%`,
      size: this.cache.size,
    };
  }

  clear(): void {
    this.cache.clear();
    this.hitCount = 0;
    this.missCount = 0;
  }
}
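A cache-aside wrapper ties the get/set pattern above to the embedding call itself, so callers never have to check the cache manually. A minimal sketch, where `embedFn` is a hypothetical embedding call injected by the caller:

```typescript
// Cache-aside embedder: compute each distinct text's embedding once,
// serve repeats from the in-memory map
class CachedEmbedder {
  private cache = new Map<string, number[]>();
  private computeCount = 0;

  constructor(private embedFn: (text: string) => Promise<number[]>) {}

  async embed(text: string): Promise<number[]> {
    const hit = this.cache.get(text);
    if (hit) return hit;

    // Miss: compute once, then serve subsequent lookups from the cache
    this.computeCount++;
    const embedding = await this.embedFn(text);
    this.cache.set(text, embedding);
    return embedding;
  }

  get computations(): number {
    return this.computeCount;
  }
}
```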

Dimension Reduction

Reduce embedding dimensions to save storage and speed up search.

// Chunked mean-pooling for dimension reduction (illustrative only)
class DimensionReducer {
  async reduce(embeddings: number[][], targetDimension: number): Promise<number[][]> {
    // Averages contiguous chunks of each vector down to targetDimension.
    // This is NOT PCA; in production, use a real PCA or UMAP library.

    if (embeddings[0].length <= targetDimension) {
      return embeddings;
    }

    const originalDim = embeddings[0].length;
    const reductionFactor = targetDimension / originalDim;

    const reduced = embeddings.map(embedding => {
      const result: number[] = [];

      for (let i = 0; i < targetDimension; i++) {
        const startIdx = Math.floor(i / reductionFactor);
        const endIdx = Math.floor((i + 1) / reductionFactor);

        const chunk = embedding.slice(startIdx, endIdx);
        const mean = chunk.reduce((a, b) => a + b, 0) / chunk.length;

        result.push(mean);
      }

      return result;
    });

    return reduced;
  }

  // Measure quality loss: compare the similarity of a vector PAIR before and
  // after reduction, since search quality depends on preserved similarities
  measureQualityLoss(
    originalA: number[],
    originalB: number[],
    reducedA: number[],
    reducedB: number[]
  ): number {
    const originalSim = this.cosineSimilarity(originalA, originalB);
    const reducedSim = this.cosineSimilarity(reducedA, reducedB);

    return Math.abs(originalSim - reducedSim);
  }

  private cosineSimilarity(a: number[], b: number[]): number {
    const dotProduct = a.reduce((sum, x, i) => sum + x * b[i], 0);
    const normA = Math.sqrt(a.reduce((sum, x) => sum + x * x, 0));
    const normB = Math.sqrt(b.reduce((sum, x) => sum + x * x, 0));
    return dotProduct / (normA * normB);
  }
}
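To make the chunked mean-pooling concrete, this tiny standalone version (function name hypothetical) reduces a single vector the same way DimensionReducer.reduce does:

```typescript
// Reduce one vector by averaging contiguous chunks down to targetDimension
function meanPool(embedding: number[], targetDimension: number): number[] {
  const factor = targetDimension / embedding.length;
  const result: number[] = [];

  for (let i = 0; i < targetDimension; i++) {
    const start = Math.floor(i / factor);
    const end = Math.floor((i + 1) / factor);
    const chunk = embedding.slice(start, end);
    result.push(chunk.reduce((a, b) => a + b, 0) / chunk.length);
  }

  return result;
}
```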

Embeddings Search at Scale Checklist

  • Select HNSW for <5M vectors with latency requirements
  • Select IVFFlat for >5M vectors with memory constraints
  • Use batch embedding generation with retry logic
  • Queue incremental updates; rebuild index at 10% fragmentation
  • Implement hybrid search combining vector + keyword BM25
  • Use pre-filtering to reduce ANN candidate set
  • Cache frequently searched embeddings
  • Reduce embedding dimensions if storage is critical (measure quality loss)
  • Monitor index fragmentation and rebuild frequency
  • Test recall quality (p@10) against your accuracy requirements
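The last checklist item, testing recall quality, can be done offline by comparing ANN results against exact brute-force neighbors. A sketch (hypothetical evaluation helper, operating on result IDs only):

```typescript
// Recall@k: fraction of the true (brute-force) top-k neighbors that the
// ANN index actually returned
function recallAtK(annIds: string[], exactIds: string[], k: number): number {
  const exact = new Set(exactIds.slice(0, k));
  const hits = annIds.slice(0, k).filter(id => exact.has(id)).length;
  return exact.size === 0 ? 0 : hits / exact.size;
}
```

Run this over a sample of representative queries and compare the average against the recall@10 targets in the benchmark table above.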

Conclusion

Scaling embeddings search requires choosing the right index algorithm, batching generation efficiently, managing incremental updates, combining vector search with keyword search, filtering strategically, caching aggressively, and optimizing dimensions. Master these patterns and you can search billions of vectors in single-digit milliseconds.