Published on

GraphRAG — Combining Knowledge Graphs With Vector Search

Authors

Introduction

Vector search treats each chunk as an isolated island, so it misses the relationships between entities. Answering "What projects has Jane worked on?" requires first finding Jane and then following her connections to projects.

Knowledge graphs capture these relationships. Combined with vector search, they enable reasoning that flat retrieval cannot.

Why Flat Vector Search Misses Relationships

Vector similarity operates on isolated chunks. Relationships and multi-hop reasoning require explicit graph structures:

// Problem: Vector search finds isolated answers
/**
 * Illustrates the limitation of flat vector retrieval with a hard-coded example.
 *
 * @param query - The user question (unused here; the chunks are a fixed illustration).
 * @returns The retrieved chunks joined with newlines. This is the raw, unconnected
 *   context a vector-only pipeline would hand to the LLM.
 */
async function vectorSearchOnly(query: string): Promise<string> {
  // Query: "What projects has Jane worked on since 2023?"

  // Vector search returns chunks mentioning "Jane" and "projects"
  // But cannot connect Jane → Projects without explicit relationships
  const relevantChunks = [
    "Jane Smith is a software engineer at Acme Corp",
    "Acme Corp's projects include Project X, Project Y",
    "Jane was assigned to Project X starting March 2023"
  ];

  // Cannot answer without explicit relationship traversal
  // These chunks don't clearly connect Jane to specific projects.
  // Return the chunks so the declared Promise<string> contract is honoured
  // (the original fell off the end and resolved to undefined).
  return relevantChunks.join('\n');
}

// Solution: Build explicit entity relationships
/** A node in the knowledge graph. */
interface Entity {
  /** Unique identifier; KnowledgeGraph keys its entity map by this value. */
  id: string;
  /** Human-readable name of the entity. */
  name: string;
  /** Category of the entity; one of the four supported kinds. */
  type: 'person' | 'project' | 'organization' | 'technology';
  /** Free-form attributes attached to the entity (schema is not constrained here). */
  properties: Record<string, any>;
}

/** A directed, typed edge between two entities, referenced by their IDs. */
interface Relationship {
  /** ID of the entity the edge starts from. */
  fromEntity: string; // Entity ID
  /** ID of the entity the edge points to. */
  toEntity: string;
  /** Kind of relationship; one of the five supported edge types. */
  type: 'works_on' | 'manages' | 'uses' | 'part_of' | 'created_by';
  /** Optional edge attributes. Date strings are not validated here (format unspecified). */
  properties?: {
    startDate?: string;
    endDate?: string;
    role?: string;
  };
}

/**
 * Minimal in-memory knowledge graph: entities keyed by ID plus a flat list of
 * directed relationships that reference entities by ID.
 */
class KnowledgeGraph {
  private entities: Map<string, Entity> = new Map();
  private relationships: Relationship[] = [];

  /** Adds an entity; an existing entity with the same ID is replaced. */
  addEntity(entity: Entity): void {
    this.entities.set(entity.id, entity);
  }

  /** Appends a relationship. Endpoints are not checked for existence. */
  addRelationship(rel: Relationship): void {
    this.relationships.push(rel);
  }

  /** Returns every stored entity whose `type` equals the given type. */
  findEntitiesByType(type: Entity['type']): Entity[] {
    return Array.from(this.entities.values()).filter(e => e.type === type);
  }

  /**
   * Follows outgoing relationships from an entity (a single hop).
   *
   * @param entityId - ID of the entity to start from.
   * @param relationshipType - When provided (and non-empty), only edges of this type are followed.
   * @returns Target entities in relationship insertion order. Edges whose target ID
   *   is not present in the graph are skipped; a target reached via several edges
   *   appears once per edge.
   */
  traverseFromEntity(entityId: string, relationshipType?: string): Entity[] {
    const targets: Entity[] = [];

    for (const rel of this.relationships) {
      if (rel.fromEntity !== entityId) continue;
      if (relationshipType && rel.type !== relationshipType) continue;

      // Explicit undefined check instead of `!` + filter(Boolean): dangling
      // edges are dropped without lying to the type checker.
      const target = this.entities.get(rel.toEntity);
      if (target !== undefined) targets.push(target);
    }

    return targets;
  }

  /**
   * Finds the projects a person works on: person → works_on → project (one hop).
   *
   * @param personId - ID of an entity of type 'person'.
   * @returns Project entities reachable via `works_on`; an empty array when the ID
   *   is unknown or does not refer to a person.
   */
  async findProjectsByPerson(personId: string): Promise<Entity[]> {
    const person = this.entities.get(personId);
    if (!person || person.type !== 'person') return [];

    // `works_on` edges may point at non-project entities; keep only projects so
    // the result matches what the method name promises.
    return this.traverseFromEntity(personId, 'works_on').filter(e => e.type === 'project');
  }
}

Entity Extraction from Documents

Extract entities and relationships automatically:

/**
 * Asks the LLM to extract entities and relationships from a piece of text.
 *
 * The model output is treated as untrusted: a surrounding Markdown code fence is
 * stripped before parsing, and the parsed value is checked to be an object whose
 * `entities` / `relationships` fields are arrays. A missing or non-array field is
 * returned as an empty array. Individual items are not validated further.
 *
 * @param text - Source text to analyse; embedded verbatim in the prompt.
 * @param llm - Client used to generate the extraction.
 * @returns The extracted entities and relationships.
 * @throws SyntaxError when the response is not valid JSON or is not a JSON object.
 */
async function extractEntitiesAndRelationships(
  text: string,
  llm: LLMClient
): Promise<{ entities: Entity[]; relationships: Relationship[] }> {
  const extractPrompt = `
Extract all entities and relationships from this text.

Text:
"${text}"

Respond with JSON:
{
  "entities": [
    {
      "id": "entity_1",
      "name": "Jane Smith",
      "type": "person",
      "properties": {}
    }
  ],
  "relationships": [
    {
      "fromEntity": "entity_1",
      "toEntity": "entity_2",
      "type": "works_on",
      "properties": { "startDate": "2023-01" }
    }
  ]
}`;

  const response = await llm.generate({
    messages: [{ role: 'user', content: extractPrompt }],
    maxTokens: 500,
  });

  // Models frequently wrap JSON answers in a ```json fence; unwrap it first.
  const raw = response.text.trim();
  const fenced = raw.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
  const parsed: unknown = JSON.parse(fenced?.[1] ?? raw);

  if (typeof parsed !== 'object' || parsed === null) {
    throw new SyntaxError('Entity extraction response is not a JSON object');
  }

  const { entities, relationships } = parsed as {
    entities?: unknown;
    relationships?: unknown;
  };

  return {
    entities: Array.isArray(entities) ? (entities as Entity[]) : [],
    relationships: Array.isArray(relationships) ? (relationships as Relationship[]) : [],
  };
}

// Entity resolution: merge duplicates
/**
 * Asks the LLM to group entity names that refer to the same thing and builds an
 * alias → canonical-name mapping.
 *
 * Example: ["Jane Smith", "Jane S.", "J. Smith"] → all map to "Jane Smith".
 *
 * The response is treated as untrusted: a surrounding Markdown code fence is
 * stripped, non-array `groups` / `canonical` fields are ignored, and when a group
 * has no usable canonical name at the same index its first alias is used instead
 * (the original stored `undefined` in that case).
 *
 * @param entities - Entities whose names should be deduplicated.
 * @param llm - Client used to generate the grouping.
 * @returns Map from each alias to its canonical name; empty when `entities` is empty
 *   (no LLM call is made).
 * @throws SyntaxError when the response is not valid JSON.
 */
async function resolveEntityDuplicates(
  entities: Entity[],
  llm: LLMClient
): Promise<Map<string, string>> {
  const mapping = new Map<string, string>();

  // Nothing to group: skip the LLM round-trip entirely.
  if (entities.length === 0) return mapping;

  const deduplicationPrompt = `
These entities likely refer to the same person/concept:
${entities.map(e => e.name).join(', ')}

Group them and provide the canonical name. Respond with JSON:
{
  "groups": [
    ["Jane Smith", "Jane S.", "J. Smith"]
  ],
  "canonical": ["Jane Smith"]
}`;

  const response = await llm.generate({
    messages: [{ role: 'user', content: deduplicationPrompt }],
    maxTokens: 200,
  });

  // Models frequently wrap JSON answers in a ```json fence; unwrap it first.
  const raw = response.text.trim();
  const fenced = raw.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
  const parsed: unknown = JSON.parse(fenced?.[1] ?? raw);

  const { groups, canonical } = (parsed ?? {}) as { groups?: unknown; canonical?: unknown };
  const groupList: unknown[] = Array.isArray(groups) ? groups : [];
  const canonicalList: unknown[] = Array.isArray(canonical) ? canonical : [];

  groupList.forEach((group, idx) => {
    if (!Array.isArray(group)) return;

    const aliases = group.filter((alias): alias is string => typeof alias === 'string');
    if (aliases.length === 0) return;

    // Fall back to the first alias when the model returned fewer canonical names
    // than groups (or a non-string entry).
    const candidate = canonicalList[idx];
    const canonicalName = typeof candidate === 'string' && candidate !== '' ? candidate : aliases[0];

    for (const alias of aliases) {
      mapping.set(alias, canonicalName);
    }
  });

  return mapping;
}

Knowledge Graph Storage (Neo4j)

Store and query the graph:

/**
 * Minimal database abstraction used by the examples: runs one Cypher statement
 * with optional named parameters and resolves to an array of results.
 * The shape of each result is not constrained by this interface.
 */
interface Neo4jDriver {
  query(cypher: string, params?: Record<string, any>): Promise<any[]>;
}

/**
 * Writes entities and relationships to the graph database, one statement each.
 *
 * Node labels and relationship types cannot be passed as Cypher parameters, so
 * they are interpolated into the statement text. Because entities may originate
 * from parsed LLM output, every `type` is checked against the known set before
 * anything is written; an unexpected value would otherwise be a Cypher injection
 * vector. Validation happens up front so a bad item cannot leave a partial write.
 *
 * @param driver - Executes the Cypher statements.
 * @param entities - Nodes to create (label is the upper-cased entity type).
 * @param relationships - Edges to create between nodes matched by `id`.
 * @throws Error when an entity or relationship has an unsupported `type`.
 */
async function storeInNeo4j(
  driver: Neo4jDriver,
  entities: Entity[],
  relationships: Relationship[]
): Promise<void> {
  const allowedEntityTypes: ReadonlySet<string> = new Set([
    'person',
    'project',
    'organization',
    'technology',
  ]);
  const allowedRelationshipTypes: ReadonlySet<string> = new Set([
    'works_on',
    'manages',
    'uses',
    'part_of',
    'created_by',
  ]);

  for (const entity of entities) {
    if (!allowedEntityTypes.has(entity.type)) {
      throw new Error(`Unsupported entity type: ${JSON.stringify(entity.type)}`);
    }
  }
  for (const rel of relationships) {
    if (!allowedRelationshipTypes.has(rel.type)) {
      throw new Error(`Unsupported relationship type: ${JSON.stringify(rel.type)}`);
    }
  }

  // NOTE(review): `properties` is stored as a single map-valued property. Neo4j
  // normally restricts property values to primitives and arrays of primitives —
  // confirm the driver in use serialises or flattens this map.

  // Create entities
  for (const entity of entities) {
    const createEntityQuery = `
CREATE (:${entity.type.toUpperCase()} {
  id: $id,
  name: $name,
  properties: $properties
})`;

    await driver.query(createEntityQuery, {
      id: entity.id,
      name: entity.name,
      properties: entity.properties,
    });
  }

  // Create relationships
  for (const rel of relationships) {
    const createRelQuery = `
MATCH (a { id: $fromId }), (b { id: $toId })
CREATE (a)-[r:${rel.type} {properties: $properties}]->(b)`;

    await driver.query(createRelQuery, {
      fromId: rel.fromEntity,
      toId: rel.toEntity,
      properties: rel.properties || {},
    });
  }
}

// Query example: Find all projects Jane works on (with dates)
/**
 * Looks up the projects a named person is connected to via `works_on`.
 *
 * @param driver - Executes the Cypher statement.
 * @param personName - Matched against the `name` property of PERSON nodes.
 * @returns Whatever the driver yields for the statement: one entry per match,
 *   with the project name and the relationship's start and end dates.
 */
async function queryProjectsByPerson(
  driver: Neo4jDriver,
  personName: string
): Promise<
  Array<{
    projectName: string;
    startDate?: string;
    endDate?: string;
  }>
> {
  // Leading empty element reproduces the newline the statement starts with.
  const cypherLines = [
    '',
    'MATCH (p:PERSON {name: $personName})-[r:works_on]->(proj:PROJECT)',
    'RETURN proj.name AS projectName, r.properties.startDate AS startDate, r.properties.endDate AS endDate',
  ];

  return driver.query(cypherLines.join('\n'), { personName });
}

// Graph traversal: multi-hop queries
/**
 * Finds entities reachable from a start entity within a bounded number of hops,
 * using the shortest outgoing path to each.
 *
 * @param driver - Executes the Cypher statement.
 * @param startEntityId - `id` property of the node to start from.
 * @param maxHops - Upper bound on path length; must be a positive integer. It is
 *   interpolated into the statement text, so it is validated first to rule out
 *   malformed or injected Cypher.
 * @returns Up to 20 results ordered by distance, each with the reached entity's
 *   ID, the hop count, and the node names along the path.
 * @throws RangeError when `maxHops` is not a positive integer.
 */
async function graphTraversal(
  driver: Neo4jDriver,
  startEntityId: string,
  maxHops: number = 3
): Promise<Array<{ entityId: string; distance: number; path: string[] }>> {
  if (!Number.isInteger(maxHops) || maxHops < 1) {
    throw new RangeError(`maxHops must be a positive integer, received ${String(maxHops)}`);
  }

  const query = `
MATCH (start { id: $startId }), (end)
WHERE start <> end
MATCH p = shortestPath((start)-[*..${maxHops}]->(end))
RETURN DISTINCT
  end.id AS entityId,
  length(p) AS distance,
  [node IN nodes(p) | node.name] AS path
ORDER BY distance
LIMIT 20`;

  const results = await driver.query(query, { startId: startEntityId });
  return results;
}

Combined Vector + Graph Retrieval

Merge vector and graph results intelligently:

/** One ranked item produced by merging vector-search hits and graph-traversal hits. */
interface CombinedRetrievalResult {
  /** Which retrieval path produced this item. */
  source: 'vector' | 'graph';
  /** Vector-store record ID for vector hits; reached entity ID for graph hits. */
  entityId: string;
  /** Display name: a text prefix for vector hits, the last node on the path for graph hits. */
  entityName: string;
  /** Text handed to the LLM: the chunk text, or a description of the connecting path. */
  context: string;
  /** Ranking score; higher is better. */
  relevance: number;
  distance?: number; // For graph results (hops)
}

/**
 * Retrieves context by combining vector search with knowledge-graph traversal.
 *
 * 1. Runs a vector search for `topK * 2` chunks.
 * 2. Links entities: every entity in `graph` whose name occurs (case-insensitively)
 *    in the query or in a retrieved chunk is treated as mentioned. This replaces
 *    the original placeholder, which built an unused prompt, made a throw-away
 *    embedding call and always ended up with zero mentioned entities — so the
 *    graph step never ran.
 * 3. Traverses up to 2 hops from each mentioned entity and scores each reached
 *    entity as `1 / (distance + 1)`.
 * 4. Merges both result sets, keeps the highest-relevance item per `entityId`,
 *    sorts by relevance descending and returns the first `topK`.
 *
 * NOTE(review): name matching is a plain substring test, so short names can
 * produce false positives; vector scores and hop-based scores are compared
 * directly even though they may be on different scales.
 *
 * @param query - User question.
 * @param vectorStore - Similarity search over text chunks.
 * @param graph - In-memory graph used to recognise entity names.
 * @param graphDriver - Executes the traversal queries.
 * @param embedModel - Retained for signature compatibility; not used.
 * @param topK - Maximum number of results to return.
 * @returns Merged, deduplicated and ranked results.
 */
async function combineVectorAndGraphRetrieval(
  query: string,
  vectorStore: {
    search: (q: string, k: number) => Promise<Array<{ id: string; text: string; score: number }>>;
  },
  graph: KnowledgeGraph,
  graphDriver: Neo4jDriver,
  embedModel: EmbedModel,
  topK: number = 5
): Promise<CombinedRetrievalResult[]> {
  // Step 1: Vector search for relevant chunks
  const vectorResults = await vectorStore.search(query, topK * 2);

  // Step 2: Identify graph entities mentioned in the query or the retrieved chunks
  const searchableText = [query, ...vectorResults.map(r => r.text)].join('\n').toLowerCase();
  const entityTypes: Array<Entity['type']> = ['person', 'project', 'organization', 'technology'];

  const mentionedEntities: Entity[] = entityTypes
    .flatMap(type => graph.findEntitiesByType(type))
    // Skip blank names: an empty string is a substring of everything.
    .filter(e => e.name.trim() !== '' && searchableText.includes(e.name.toLowerCase()));

  // Step 3: Traverse graph from mentioned entities
  const graphResults: CombinedRetrievalResult[] = [];

  for (const entity of mentionedEntities) {
    const traversalResults = await graphTraversal(graphDriver, entity.id, 2);

    for (const result of traversalResults) {
      graphResults.push({
        source: 'graph',
        entityId: result.entityId,
        entityName: result.path[result.path.length - 1],
        context: `Connected via ${result.path.join(' → ')}`,
        relevance: 1 / (result.distance + 1), // Decay by distance
        distance: result.distance,
      });
    }
  }

  // Step 4: Combine and deduplicate
  const combined: CombinedRetrievalResult[] = [
    ...vectorResults.map(r => ({
      source: 'vector' as const,
      entityId: r.id,
      entityName: r.text.substring(0, 50),
      context: r.text,
      relevance: r.score,
    })),
    ...graphResults,
  ];

  // Deduplicate by entityId, keeping the most relevant occurrence
  const deduped = new Map<string, CombinedRetrievalResult>();

  for (const result of combined) {
    const existing = deduped.get(result.entityId);
    if (!existing || result.relevance > existing.relevance) {
      deduped.set(result.entityId, result);
    }
  }

  return Array.from(deduped.values())
    .sort((a, b) => b.relevance - a.relevance)
    .slice(0, topK);
}

Microsoft GraphRAG Community Detection

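Structure knowledge into communities for better retrieval. Microsoft's GraphRAG clusters the graph with the hierarchical Leiden algorithm; the example below uses the closely related Louvain algorithm from Neo4j Graph Data Science as a stand-in: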

/** A cluster of related entities, optionally enriched with a summary and its embedding. */
interface Community {
  /** Community identifier. */
  id: string;
  /** Entities that belong to the community. */
  nodes: Entity[];
  /** Relationships associated with the community. */
  relationships: Relationship[];
  /** LLM-generated description of the community; absent until summarised. */
  summary?: string;
  /** Embedding vector of `summary`; absent until summarised. */
  embedding?: number[];
}

/**
 * Groups graph nodes into communities using the Louvain procedure of the Neo4j
 * Graph Data Science library.
 *
 * The statement refers to a graph named 'myGraph'; presumably that projection
 * must already exist — confirm against your GDS setup.
 *
 * @param graph - In-memory graph (not consulted by this function).
 * @param driver - Executes the community-detection statement.
 * @returns One community per result, with ID `community_<communityId>`, the
 *   collected nodes as returned by the driver, and an empty relationship list.
 */
async function detectCommunities(
  graph: KnowledgeGraph,
  driver: Neo4jDriver
): Promise<Community[]> {
  // Leading empty element reproduces the newline the statement starts with.
  const louvainCypher = [
    '',
    "CALL gds.louvain.stream('myGraph')",
    'YIELD nodeId, communityId',
    'RETURN communityId, collect(gds.util.asNode(nodeId)) as nodes',
    'ORDER BY communityId',
  ].join('\n');

  const rows = await driver.query(louvainCypher);

  const detected: Community[] = [];
  for (const row of rows) {
    detected.push({
      id: `community_${row.communityId}`,
      nodes: row.nodes,
      relationships: [], // Not populated yet: would hold the edges inside the community
    });
  }

  return detected;
}

// Summarize each community
/**
 * Produces a short LLM summary of a community and embeds that summary.
 *
 * @param community - Community to describe; it is not mutated.
 * @param llm - Generates the summary from the entity names and relationship types.
 * @param embedModel - Embeds the generated summary text.
 * @returns A copy of the community with `summary` and `embedding` set.
 */
async function summarizeCommunity(
  community: Community,
  llm: LLMClient,
  embedModel: EmbedModel
): Promise<Community> {
  const entityList = community.nodes.map(node => node.name).join(', ');

  const relationLabels: string[] = [];
  for (const rel of community.relationships) {
    relationLabels.push(`${rel.type} relationship`);
  }

  // Leading empty element reproduces the newline the prompt starts with.
  const promptLines = [
    '',
    'Summarize this community of entities:',
    `Entities: ${entityList}`,
    `Relationships: ${relationLabels.join('; ')}`,
    '',
    'Summary (1-2 sentences):',
  ];

  const { text: summary } = await llm.generate({
    messages: [{ role: 'user', content: promptLines.join('\n') }],
    maxTokens: 100,
  });

  const embedding = await embedModel.embed(summary);

  return { ...community, summary, embedding };
}

// Query communities instead of raw entities
/**
 * Ranks summarised communities by similarity between the query embedding and
 * each community's summary embedding.
 *
 * @param query - User question; embedded once.
 * @param communities - Candidates; those lacking an embedding or summary are skipped.
 * @param embedModel - Embeds the query.
 * @param topK - Maximum number of communities to return.
 * @returns The best-scoring communities, highest similarity first.
 */
async function queryCommunities(
  query: string,
  communities: Community[],
  embedModel: EmbedModel,
  topK: number = 3
): Promise<Community[]> {
  const queryVector = await embedModel.embed(query);

  const ranked: Array<{ community: Community; score: number }> = [];
  for (const candidate of communities) {
    // Only communities that have been summarised and embedded can be scored.
    if (!(candidate.embedding && candidate.summary)) continue;

    ranked.push({
      community: candidate,
      score: cosineSimilarity(queryVector, candidate.embedding),
    });
  }

  ranked.sort((left, right) => right.score - left.score);

  return ranked.slice(0, topK).map(entry => entry.community);
}

/**
 * Cosine similarity of two equal-length vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns A value in [-1, 1], or 0 when either vector has zero magnitude
 *   (including empty vectors). The original produced NaN there, which makes
 *   sort comparators built on the score behave unpredictably.
 * @throws RangeError when the vectors differ in length. The original returned NaN
 *   when `b` was shorter and a silently wrong score when `b` was longer.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity requires vectors of equal length, received ${a.length} and ${b.length}`
    );
  }

  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    squaredNormA += a[i] * a[i];
    squaredNormB += b[i] * b[i];
  }

  const denominator = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);
  return denominator === 0 ? 0 : dotProduct / denominator;
}

Relationship-Aware Context Building

Build answer context using relationship chains:

/**
 * Builds a textual context for an entity by walking its outgoing relationships.
 *
 * Output starts with `About <name>:` followed by one `- <entity> <type> <target>`
 * line per relationship, nested relationships appearing directly after the line
 * that led to them.
 *
 * Fixes over the original sketch:
 * - recursion now continues from the relationship's target instead of re-querying
 *   the root entity (which duplicated lines and multiplied queries per level);
 * - the relationship type is read with `type(r)`; `r.type` reads a property named
 *   "type", which is never stored on relationships;
 * - each entity is expanded at most once, so cycles cannot cause repeated work.
 *
 * @param queryEntity - Entity to describe.
 * @param graph - In-memory graph (not consulted by this function).
 * @param driver - Executes the relationship queries.
 * @param embedModel - Not used by this function.
 * @param maxDepth - Deepest recursion level to expand; level 0 is the entity itself.
 * @returns The assembled context text.
 */
async function buildContextFromRelationships(
  queryEntity: Entity,
  graph: KnowledgeGraph,
  driver: Neo4jDriver,
  embedModel: EmbedModel,
  maxDepth: number = 2
): Promise<string> {
  let context = `About ${queryEntity.name}:\n`;

  // IDs that have already been expanded; guards against cycles and repeats.
  const expanded = new Set<string>();

  // Lists an entity's outgoing relationships, then recurses into each target.
  const directRels = async (entityId: string, depth: number = 0): Promise<string> => {
    if (depth > maxDepth) return '';
    expanded.add(entityId);

    const query = `
MATCH (entity {id: $entityId})-[r]->(target)
RETURN entity.name AS entityName, type(r) AS relationType, target.name AS targetName, target.id AS targetId
LIMIT 10`;

    const results = await driver.query(query, { entityId });
    let text = '';

    for (const result of results) {
      text += `\n- ${result.entityName} ${result.relationType} ${result.targetName}`;

      const targetId: unknown = result.targetId;
      if (depth < maxDepth && typeof targetId === 'string' && !expanded.has(targetId)) {
        text += await directRels(targetId, depth + 1);
      }
    }

    return text;
  };

  context += await directRels(queryEntity.id);
  return context;
}

// Build context considering relationship importance
/**
 * Builds context from the most connected entities in the graph.
 *
 * Entities labelled PERSON, PROJECT or ORGANIZATION are ranked by how many
 * relationships touch them; for each of the top `topK`, up to five outgoing
 * relationships are listed.
 *
 * Fixes over the original sketch:
 * - the relationship type is read with `type(r)`; `r.type` reads a property named
 *   "type", which is never stored on relationships;
 * - `topK` is validated before being interpolated into the `LIMIT` clause.
 *
 * NOTE(review): the ranking is global — `query` does not influence which entities
 * are selected.
 *
 * @param query - User question (currently unused).
 * @param graph - In-memory graph (not consulted by this function).
 * @param driver - Executes the Cypher statements.
 * @param topK - Number of entities to include; must be a non-negative integer.
 * @returns One header line per entity followed by indented relationship lines.
 * @throws RangeError when `topK` is not a non-negative integer.
 */
async function buildContextWithWeight(
  query: string,
  graph: KnowledgeGraph,
  driver: Neo4jDriver,
  topK: number = 5
): Promise<string> {
  if (!Number.isInteger(topK) || topK < 0) {
    throw new RangeError(`topK must be a non-negative integer, received ${String(topK)}`);
  }

  // Rank entities by their number of relationships (either direction)
  const importanceQuery = `
MATCH (e)-[r]-(other)
WHERE any(label in labels(e) WHERE label IN ['PERSON', 'PROJECT', 'ORGANIZATION'])
RETURN e.id AS entityId, e.name AS entityName, count(r) AS importance
ORDER BY importance DESC
LIMIT ${topK}`;

  const importantEntities = await driver.query(importanceQuery);

  let context = '';
  for (const entity of importantEntities) {
    context += `${entity.entityName} (${entity.importance} connections)\n`;

    // Get relationships
    const relQuery = `
MATCH (e {id: $entityId})-[r]->(target)
RETURN e.name AS from, type(r) AS type, target.name AS to
LIMIT 5`;

    const relationships = await driver.query(relQuery, { entityId: entity.entityId });
    for (const rel of relationships) {
      context += `  - ${rel.from} ${rel.type} ${rel.to}\n`;
    }
  }

  return context;
}

When GraphRAG Pays Off

Evaluate whether GraphRAG is worth the complexity:

/** Outcome of a cost/benefit assessment for adopting GraphRAG. */
interface GraphRAGROI {
  /** Overall complexity class of the typical queries. */
  queryComplexity: 'simple' | 'medium' | 'complex';
  /** Whether a meaningful share of queries needs multi-hop reasoning. */
  requiresMultiHop: boolean;
  /** Whether answering is expected to depend on relationships between entities. */
  requiresRelationshipReasoning: boolean;
  expectedQualityGain: number; // 0-1, estimated improvement
  implementationCost: number; // hours
  /** Suggested retrieval architecture. */
  recommendation: 'use_vector_only' | 'hybrid' | 'graphrag_required';
}

/**
 * Heuristically estimates whether GraphRAG is worth its complexity for a corpus
 * and a set of typical queries.
 *
 * - Multi-hop need: share of queries containing a relationship-style keyword.
 *   More than 30% sets `requiresMultiHop`; more than 50% yields 'complex', any
 *   match yields 'medium', otherwise 'simple'.
 * - Entity density: share of documents containing at least two distinct
 *   capitalised phrases — a crude, deterministic stand-in for NER that replaces
 *   the original `Math.random()` placeholder, which made the result differ
 *   between identical calls. Sentence-initial words count as capitalised, so
 *   treat it as a rough signal only.
 *
 * Empty inputs are handled: with no queries the multi-hop share is 0, and with no
 * documents the entity density is 0.
 *
 * @param documentCorpus - Document or chunk texts.
 * @param typicalQueries - Representative user queries.
 * @returns The assessment, including a recommendation.
 */
function analyzeGraphRAGFeasibility(
  documentCorpus: string[],
  typicalQueries: string[]
): GraphRAGROI {
  // Estimate multi-hop requirement
  const multiHopKeywords = ['how many', 'what projects', 'who worked', 'relationship', 'connected'];
  const multiHopCount = typicalQueries.filter(q =>
    multiHopKeywords.some(kw => q.toLowerCase().includes(kw))
  ).length;

  // Guard the division so an empty query list yields 0 rather than NaN.
  const multiHopRatio = typicalQueries.length > 0 ? multiHopCount / typicalQueries.length : 0;
  const requiresMultiHop = multiHopRatio > 0.3;

  // Estimate entity density: a document is "entity-rich" when it names at least
  // two distinct capitalised phrases, i.e. it could contain a relationship.
  const capitalizedPhrase = /\b[A-Z][\w-]*(?:\s+[A-Z][\w-]*)*/g;
  const entityRichDocs = documentCorpus.filter(
    doc => new Set(doc.match(capitalizedPhrase) ?? []).size >= 2
  ).length;
  const entityDensity = documentCorpus.length > 0 ? entityRichDocs / documentCorpus.length : 0;

  const queryComplexity: GraphRAGROI['queryComplexity'] =
    multiHopRatio > 0.5
      ? 'complex'
      : multiHopCount > 0
        ? 'medium'
        : 'simple';

  // Recommendation logic
  let recommendation: GraphRAGROI['recommendation'] = 'use_vector_only';

  if (queryComplexity === 'complex') {
    recommendation = 'graphrag_required';
  } else if (queryComplexity === 'medium') {
    recommendation = 'hybrid';
  }

  return {
    queryComplexity,
    requiresMultiHop,
    requiresRelationshipReasoning: entityDensity > 0.5,
    expectedQualityGain: requiresMultiHop ? 0.3 : 0.1,
    implementationCost: requiresMultiHop ? 40 : 20,
    recommendation,
  };
}

Checklist

  • Extract entities and relationships from your corpus
  • Implement entity deduplication (resolve aliases)
  • Store graph in Neo4j or alternative
  • Build baseline vector-only RAG first
  • Measure multi-hop query success rate
  • Implement combined vector + graph retrieval
  • Add community detection for large graphs
  • Summarize communities for efficient search
  • Track NDCG improvement from graph addition
  • Document when GraphRAG complexity is justified

Conclusion

GraphRAG is powerful but not always necessary. Use vector-only retrieval as your baseline. If your golden dataset shows a hit rate below 60% on multi-hop questions, invest in knowledge graphs. The key metric is whether relationship traversal meaningfully improves your NDCG@5. If it does, the complexity is justified; if it does not, simpler techniques such as query decomposition and HyDE likely suffice.