Published on

Building an AI Knowledge Base — Internal Documentation Search That Actually Works

Authors

Introduction

Most companies have documentation scattered across Confluence, Notion, GitHub, and Google Docs. When an employee searches, they find outdated guides, duplicate information, or nothing at all. AI knowledge bases solve this by unifying fragmented documentation into a single conversational interface. However, building one requires solving ingestion, synchronization, access control, and quality assurance challenges.

Documentation Ingestion Pipeline

Build a unified ingestion system across multiple sources:

/** Identifies the external system a document is ingested from; `ingestDocumentation` dispatches on this value. */
type DocumentSource = 'confluence' | 'notion' | 'github' | 'gdocs';

/**
 * Source-agnostic representation of one ingested document.
 * Each connector maps its native page format onto this shape.
 */
interface Document {
  /** Knowledge-base-wide id; the visible connectors build it as `<source>-<sourceId>`. */
  id: string;
  /** System the document was ingested from. */
  source: DocumentSource;
  /** Identifier of the document inside its source system. */
  sourceId: string;
  title: string;
  /** Document text; this is what gets hashed for change detection and written to the vector store. */
  content: string;
  /** Link back to the document in its source system. */
  url: string;
  lastModified: Date;
  /** Owner as reported by the source (Confluence: username on the current version; Notion: creator id). */
  owner: string;
  /** Coarse visibility tier evaluated by `checkDocAccess`. */
  accessLevel: 'public' | 'team' | 'restricted';
  tags: string[];
}

/**
 * Dispatches ingestion to the connector that matches `source`.
 *
 * @param source - External system to pull documents from.
 * @param config - Connector-specific settings, forwarded unchanged.
 * @returns The documents produced by the selected connector.
 * @throws Error when `source` is not a supported value. The type system rules
 *   this out, but the value may originate from untyped input (config files,
 *   request payloads), and silently resolving `undefined` would break callers
 *   that iterate the result.
 */
async function ingestDocumentation(
  source: DocumentSource,
  config: Record<string, any>
): Promise<Document[]> {
  switch (source) {
    case 'confluence':
      return ingestConfluence(config);
    case 'notion':
      return ingestNotion(config);
    case 'github':
      return ingestGitHub(config);
    case 'gdocs':
      return ingestGoogleDocs(config);
    default:
      // Reached only when the runtime value is outside the DocumentSource union.
      throw new Error(`Unsupported document source: ${String(source)}`);
  }
}

/**
 * Fetches the pages of every configured Confluence space and converts each
 * page into the shared `Document` shape.
 *
 * Spaces and pages are processed one at a time, in the order given.
 *
 * @param config - `domain` is used as the prefix of each page URL, `token` is
 *   passed to the Confluence API, `spaceKeys` lists the spaces to read.
 * @returns One `Document` per page, across all spaces.
 */
async function ingestConfluence(config: {
  domain: string;
  token: string;
  spaceKeys: string[];
}): Promise<Document[]> {
  const { domain, token, spaceKeys } = config;
  const collected: Document[] = [];

  for (const key of spaceKeys) {
    const spacePages = await confluenceAPI.getPages(key, token);

    for (const page of spacePages) {
      const body = await parseConfluenceContent(page);
      const pageUrl = `${domain}/wiki/spaces/${key}/pages/${page.id}`;

      collected.push({
        id: `confluence-${page.id}`,
        source: 'confluence',
        sourceId: page.id,
        title: page.title,
        content: body,
        url: pageUrl,
        lastModified: new Date(page.version.when),
        owner: page.version.by.username,
        accessLevel: inferAccessLevel(page.restrictions),
        // Pages without labels get an empty tag list.
        tags: page.labels || []
      });
    }
  }

  return collected;
}

/**
 * Ingests every page of the configured Notion databases.
 *
 * Notion list endpoints are paginated: a single call returns one page of
 * results plus `has_more` / `next_cursor`. Both the database query and the
 * block listing are therefore followed until exhausted; reading only the
 * first response would silently drop pages and truncate long documents.
 *
 * @param config - `token` authenticates the Notion client, `databases` lists
 *   the database ids to read.
 * @returns One `Document` per database page.
 */
async function ingestNotion(config: {
  token: string;
  databases: string[];
}): Promise<Document[]> {
  const docs: Document[] = [];
  const notion = new NotionClient({ auth: config.token });

  for (const dbId of config.databases) {
    let pageCursor: string | undefined;

    do {
      const response = await notion.databases.query({
        database_id: dbId,
        // Only send start_cursor once we actually have one.
        ...(pageCursor !== undefined ? { start_cursor: pageCursor } : {})
      });

      for (const page of response.results) {
        // Collect all child blocks of the page, following pagination.
        const pageBlocks = [];
        let blockCursor: string | undefined;

        do {
          const blocks = await notion.blocks.children.list({
            block_id: page.id,
            ...(blockCursor !== undefined ? { start_cursor: blockCursor } : {})
          });
          pageBlocks.push(...blocks.results);
          blockCursor = blocks.has_more ? (blocks.next_cursor ?? undefined) : undefined;
        } while (blockCursor !== undefined);

        const content = serializeNotionBlocks(pageBlocks);

        docs.push({
          id: `notion-${page.id}`,
          source: 'notion',
          sourceId: page.id,
          title: extractPageTitle(page),
          content,
          url: page.url,
          lastModified: new Date(page.last_edited_time),
          owner: page.created_by.id,
          accessLevel: 'team', // Notion access handled separately
          tags: extractTags(page.properties)
        });
      }

      pageCursor = response.has_more ? (response.next_cursor ?? undefined) : undefined;
    } while (pageCursor !== undefined);
  }

  return docs;
}

Incremental Sync on Document Update

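Don't re-index everything. Fetch the current documents, compare content checksums against the previous run, and write only the additions, updates, and deletions to the vector store: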

/** Per-source bookkeeping carried from one sync run to the next. */
interface SyncState {
  source: DocumentSource;
  /** Set to the current time at the end of each `incrementalSync` run. */
  lastSyncTime: Date;
  /** Presumably the number of documents tracked for this source — confirm with the code that persists this state. */
  documentCount: number;
  /** Document id -> content checksum recorded by the previous sync. */
  checksumMap: Map<string, string>;
}

/**
 * Computes which documents were added, updated, or deleted since the last
 * sync by comparing content checksums.
 *
 * Note that the source is still fetched in full via `ingestDocumentation`;
 * the saving is downstream, because only changed documents are reported.
 *
 * Side effect: `state` is mutated in place — `checksumMap`, `documentCount`
 * and `lastSyncTime` are replaced with the values of this run.
 *
 * @param source - Source system to sync.
 * @param state - Result of the previous run; updated by this call.
 * @param config - Connector settings forwarded to `ingestDocumentation`.
 * @returns Added and updated documents, plus the ids of documents that are
 *   no longer present in the source.
 */
async function incrementalSync(
  source: DocumentSource,
  state: SyncState,
  config: Record<string, any>
): Promise<{ added: Document[]; updated: Document[]; deleted: string[] }> {
  const allDocs = await ingestDocumentation(source, config);
  const changes = {
    added: [] as Document[],
    updated: [] as Document[],
    deleted: [] as string[]
  };

  const newChecksumMap = new Map<string, string>();

  for (const doc of allDocs) {
    const currentChecksum = hashContent(doc.content);
    newChecksumMap.set(doc.id, currentChecksum);

    const previousChecksum = state.checksumMap.get(doc.id);

    // Compare against undefined rather than falsiness so that a stored
    // empty-string checksum is not mistaken for "never seen".
    if (previousChecksum === undefined) {
      changes.added.push(doc);
    } else if (previousChecksum !== currentChecksum) {
      changes.updated.push(doc);
    }
  }

  // Anything known from the previous run but absent now has been deleted.
  for (const docId of state.checksumMap.keys()) {
    if (!newChecksumMap.has(docId)) {
      changes.deleted.push(docId);
    }
  }

  state.checksumMap = newChecksumMap;
  // Keep the count in step with the checksum map it summarizes.
  state.documentCount = newChecksumMap.size;
  state.lastSyncTime = new Date();

  return changes;
}

/**
 * Writes the outcome of `incrementalSync` to the vector store.
 *
 * The parameter is the *resolved* result of `incrementalSync`, hence
 * `Awaited<...>`: `ReturnType` alone is a `Promise`, which has no
 * `added` / `updated` / `deleted` properties.
 *
 * Operations run sequentially: inserts, then updates, then deletes.
 *
 * @param changes - Change set produced by `incrementalSync`.
 */
async function applySync(
  changes: Awaited<ReturnType<typeof incrementalSync>>
): Promise<void> {
  // Insert new documents together with the metadata needed at query time.
  for (const doc of changes.added) {
    await vectorDb.insert(doc.id, doc.content, {
      title: doc.title,
      source: doc.source,
      url: doc.url,
      lastModified: doc.lastModified
    });
  }

  // Update modified documents. Only id and content are passed here.
  for (const doc of changes.updated) {
    await vectorDb.update(doc.id, doc.content);
  }

  // Remove documents that no longer exist in the source.
  for (const docId of changes.deleted) {
    await vectorDb.delete(docId);
  }
}

Access-Controlled Retrieval

Users only see docs they can read:

/** Identity and group membership of the person issuing a search. */
interface UserContext {
  userId: string;
  /** Compared against the allow-list of `restricted` documents. */
  email: string;
  /** A `team`-level document is visible when any of these matches one of the document's teams. */
  teams: string[];
  roles: string[];
}

/**
 * Runs a vector search and returns only the documents `user` may read.
 *
 * Three times `limit` candidates are requested so that enough results are
 * likely to survive access filtering. Candidates are checked in ranking
 * order and checking stops as soon as `limit` readable documents have been
 * found, which avoids access lookups whose result would be discarded.
 *
 * @param query - Search text.
 * @param user - Requesting user; passed to `checkDocAccess`.
 * @param limit - Maximum number of documents to return.
 * @returns At most `limit` readable documents; fewer if too many candidates
 *   were filtered out.
 */
async function searchWithAccess(
  query: string,
  user: UserContext,
  limit: number = 5
): Promise<Document[]> {
  // Over-fetch: some candidates will be dropped by the access check.
  const candidates = await vectorDb.search(query, limit * 3);

  const accessible: Document[] = [];

  for (const doc of candidates) {
    if (accessible.length >= limit) {
      break;
    }

    if (await checkDocAccess(doc, user)) {
      accessible.push(doc);
    }
  }

  return accessible;
}

/**
 * Decides whether `user` may read `doc`, based on the document's access level.
 *
 * - `public`: always readable.
 * - `team`: readable when the user belongs to at least one of the document's teams.
 * - `restricted`: readable when the user's email is on the document's allow-list.
 * - anything else: denied.
 *
 * @param doc - Document being requested.
 * @param user - Requesting user.
 * @returns `true` when access is granted.
 */
async function checkDocAccess(
  doc: Document,
  user: UserContext
): Promise<boolean> {
  switch (doc.accessLevel) {
    case 'public':
      return true;

    case 'team': {
      // Granted when the user's teams overlap with the document's teams.
      const owningTeams = await getDocumentTeams(doc.id);
      return user.teams.some(team => owningTeams.includes(team));
    }

    case 'restricted': {
      const permittedEmails = await getRestrictedUsers(doc.id);
      return permittedEmails.includes(user.email);
    }

    default:
      // Unknown access level: deny.
      return false;
  }
}

Conversational Search UX

Build a chat interface for knowledge discovery:

/** State of one chat session between a user and the knowledge-base assistant. */
interface SearchConversation {
  userId: string;
  /** Turns exchanged so far; `answerQuestion` renders them as `role: content` lines in its prompt. */
  messages: Array<{
    role: 'user' | 'assistant';
    content: string;
    timestamp: Date;
  }>;
  /** Documents and free-form metadata associated with the conversation. */
  context: { docs: Document[]; metadata: any };
}

/**
 * Answers a question from documentation the user is allowed to read.
 *
 * Retrieves accessible documents, builds a prompt from the question, the
 * prior conversation, and an excerpt of each document, and returns the
 * model's reply. When no document is found a fixed fallback message is
 * returned and the model is not called.
 *
 * @param query - The user's question.
 * @param user - Requesting user; used for access-filtered retrieval.
 * @param conversationHistory - Earlier turns, included verbatim in the prompt.
 * @returns The generated answer, or the fallback message.
 */
async function answerQuestion(
  query: string,
  user: UserContext,
  conversationHistory: SearchConversation['messages'] = []
): Promise<string> {
  // Search for relevant documents
  const docs = await searchWithAccess(query, user);

  if (docs.length === 0) {
    return "I couldn't find relevant documentation for that question. Try rephrasing or contact your knowledge manager.";
  }

  // Build context from documents. Each excerpt is capped at 500 characters;
  // the ellipsis is added only when text was actually cut off.
  const context = docs
    .map(d => {
      const excerpt =
        d.content.length > 500 ? `${d.content.slice(0, 500)}...` : d.content;
      return `[${d.title}](${d.url}):\n${excerpt}`;
    })
    .join('\n\n');

  const answerPrompt = `
    You are a helpful knowledge base assistant. Answer this question using the provided documentation.

    Question: ${query}

    Previous conversation context:
    ${conversationHistory.map(m => `${m.role}: ${m.content}`).join('\n')}

    Relevant documentation:
    ${context}

    Rules:
    - Answer concisely (2-3 sentences max)
    - Always cite the source document with a link
    - If docs don't fully answer the question, say so
    - Ask clarifying questions if needed
  `;

  return llm.generate(answerPrompt);
}

Attribution to Source Documents

Always show where answers come from:

/** One cited source shown alongside a generated answer. */
interface SourceAttribution {
  /** Id of the cited `Document`. */
  documentId: string;
  title: string;
  url: string;
  relevanceScore: number;
  /** Passage taken from the document's content for the given query. */
  excerpt: string;
}

/**
 * Generates an answer together with the list of documents it cites.
 *
 * The model is asked for JSON containing the answer and the ids of the
 * sources it used. Each source line in the prompt carries the document id,
 * because the model can only return ids it has been shown. The reply is
 * validated before use, and cited ids that do not belong to the retrieved
 * documents are dropped rather than dereferenced.
 *
 * @param query - The user's question.
 * @param user - Requesting user; used for access-filtered retrieval.
 * @returns The answer text and one attribution per distinct, valid cited id.
 * @throws SyntaxError when the model output is not valid JSON.
 * @throws Error when the parsed output has no string `answer` field.
 */
async function generateAnswerWithSources(
  query: string,
  user: UserContext
): Promise<{ answer: string; sources: SourceAttribution[] }> {
  const docs = await searchWithAccess(query, user);

  const attributionPrompt = `
    Answer this question: "${query}"

    Using these sources:
    ${docs.map(d => `- [id: ${d.id}] "${d.title}" (${d.url}): ${d.content.slice(0, 300)}`).join('\n')}

    Respond in JSON with:
    {
      "answer": "your answer here",
      "sourceIds": ["id values", "of the sources", "you cited"]
    }
  `;

  // Model output is untrusted: parse as unknown and narrow explicitly.
  const parsed: unknown = JSON.parse(await llm.generate(attributionPrompt));

  if (
    typeof parsed !== 'object' ||
    parsed === null ||
    typeof (parsed as { answer?: unknown }).answer !== 'string'
  ) {
    throw new Error('LLM response is missing a string "answer" field');
  }

  const { answer, sourceIds } = parsed as { answer: string; sourceIds?: unknown };
  const citedIds: string[] = Array.isArray(sourceIds)
    ? sourceIds.filter((id): id is string => typeof id === 'string')
    : [];

  const sources: SourceAttribution[] = [];

  for (const id of new Set(citedIds)) {
    const doc = docs.find(d => d.id === id);
    if (doc === undefined) {
      // The model cited an id that was not among the retrieved documents.
      continue;
    }

    sources.push({
      documentId: id,
      title: doc.title,
      url: doc.url,
      relevanceScore: 0.95, // Placeholder: no real relevance score is available here
      excerpt: extractRelevantExcerpt(doc.content, query)
    });
  }

  return { answer, sources };
}

Staleness Detection

Flag outdated documentation:

/** Entry produced by `detectStaleDocuments` for a document that looks outdated. */
interface StaleDocumentReport {
  documentId: string;
  title: string;
  lastModified: Date;
  /** Whole days between `lastModified` and the time of the check. */
  daysSinceUpdate: number;
  /** Confidence value assigned by the staleness rule that flagged the document. */
  confidence: number;
  /** Human-readable follow-up actions for the document owner. */
  suggestions: string[];
}

/**
 * Scans all indexed documents and reports those that look outdated.
 *
 * A document is flagged when at least one rule matches:
 * - it was last modified more than 365 days ago (confidence 0.95), or
 * - its text contains "as of <year>" with a year before the current one
 *   (confidence 0.85).
 *
 * When both rules match, the higher confidence is reported. The year rule
 * compares against the current calendar year rather than a fixed list, so
 * the check does not itself go stale.
 *
 * @returns One report per flagged document.
 */
async function detectStaleDocuments(): Promise<StaleDocumentReport[]> {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;
  const now = new Date();
  const currentYear = now.getFullYear();

  const allDocs = await vectorDb.getAllDocuments();
  const staleReports: StaleDocumentReport[] = [];

  for (const doc of allDocs) {
    // NOTE(review): assumes lastModified comes back from the store as a Date — confirm.
    const daysSince = Math.floor(
      (now.getTime() - doc.lastModified.getTime()) / MS_PER_DAY
    );

    let confidence = 0;

    // Hard rule: older than 1 year
    if (daysSince > 365) {
      confidence = Math.max(confidence, 0.95);
    }

    // Outdated pattern: "as of <year>" referring to a past year
    let mentionsPastYear = false;
    for (const match of doc.content.matchAll(/as of (\d{4})/gi)) {
      if (Number(match[1]) < currentYear) {
        mentionsPastYear = true;
        break;
      }
    }
    if (mentionsPastYear) {
      confidence = Math.max(confidence, 0.85);
    }

    if (confidence > 0) {
      staleReports.push({
        documentId: doc.id,
        title: doc.title,
        lastModified: doc.lastModified,
        daysSinceUpdate: daysSince,
        confidence,
        suggestions: [
          `Review and update if still accurate (${daysSince} days old)`,
          'Check if this process or tool has changed',
          'Consider archiving if no longer relevant'
        ]
      });
    }
  }

  return staleReports;
}

Multi-Workspace Support

Handle multiple separate knowledge bases:

/** A separate knowledge base with its own sources and members. */
interface Workspace {
  /** Passed to the vector store as `workspaceId` to scope a search. */
  id: string;
  name: string;
  sources: DocumentSource[];
  accessControl: 'open' | 'restricted';
  users: UserContext[];
}

/**
 * Searches every workspace the user belongs to and returns the readable
 * results grouped by workspace id.
 *
 * The access check is asynchronous, so each result is awaited explicitly.
 * Passing an async predicate to `Array.prototype.filter` does not work:
 * the predicate returns a Promise, which is always truthy, so every
 * document would pass regardless of its access level.
 *
 * @param query - Search text.
 * @param user - Requesting user.
 * @returns Map from workspace id to the documents the user may read there.
 */
async function searchAcrossWorkspaces(
  query: string,
  user: UserContext
): Promise<Map<string, Document[]>> {
  const userWorkspaces = await getUserWorkspaces(user.userId);
  const results = new Map<string, Document[]>();

  for (const workspace of userWorkspaces) {
    const docs = await vectorDb.search(query, 5, { workspaceId: workspace.id });

    const permitted: Document[] = [];
    for (const doc of docs) {
      if (await checkDocAccess(doc, user)) {
        permitted.push(doc);
      }
    }

    results.set(workspace.id, permitted);
  }

  return results;
}

Analytics: What Are Employees Searching For?

Track search patterns to identify gaps:

/** One recorded search event, used to analyse what employees look for. */
interface SearchAnalytics {
  query: string;
  userId: string;
  timestamp: Date;
  /** Number of documents returned for the query. */
  resultsCount: number;
  userClicked: boolean;
  dwellTime: number; // seconds
  /** Optional rating of the results. */
  feedback?: 'helpful' | 'unhelpful' | 'irrelevant';
}

/**
 * Records a search event, and additionally records the query as a failed
 * search when it returned no results.
 *
 * @param query - Search text entered by the user.
 * @param user - User who searched; the first team is stored as `workspace`.
 * @param results - Documents returned for the query.
 * @param dwellTime - Seconds spent on the results.
 */
async function logSearch(
  query: string,
  user: UserContext,
  results: Document[],
  dwellTime: number
): Promise<void> {
  await analyticsDb.insert('search_analytics', {
    query,
    userId: user.userId,
    timestamp: new Date(),
    resultsCount: results.length,
    // Heuristic: more than 5 seconds of dwell time is counted as a click.
    userClicked: dwellTime > 5,
    dwellTime,
    workspace: user.teams[0]
  });

  // Track zero-result queries separately to expose documentation gaps.
  if (results.length === 0) {
    await analyticsDb.insert('failed_searches', {
      query,
      timestamp: new Date()
    });
  }
}

/**
 * Returns the 20 most frequent queries within the last `days` days, each
 * with its success rate (share of searches that returned at least one
 * result).
 *
 * `days` is interpolated into the SQL text, so it is validated first: only
 * a finite, positive number is accepted. This keeps non-numeric runtime
 * values (for example an unparsed request parameter) out of the query.
 *
 * @param days - Size of the look-back window in days.
 * @returns Queries ordered by descending frequency.
 * @throws RangeError when `days` is not a finite positive number.
 */
async function getTopSearches(days: number = 30): Promise<Array<{
  query: string;
  count: number;
  successRate: number;
}>> {
  if (!Number.isFinite(days) || days <= 0) {
    throw new RangeError(
      `days must be a finite positive number, received: ${String(days)}`
    );
  }

  return analyticsDb.query(`
    SELECT query, COUNT(*) as count,
      AVG(CASE WHEN resultsCount > 0 THEN 1 ELSE 0 END) as successRate
    FROM search_analytics
    WHERE timestamp > NOW() - INTERVAL '${days} days'
    GROUP BY query
    ORDER BY count DESC
    LIMIT 20
  `);
}

Feedback Loop for Document Quality

Let users improve documentation:

/** A single piece of user feedback on a document. */
interface DocumentFeedback {
  documentId: string;
  userId: string;
  /** Whether the user found the document helpful; aggregated by `recordFeedback`. */
  helpful: boolean;
  comment?: string;
  suggestedEdit?: string;
  timestamp: Date;
}

/**
 * Stores a feedback entry and flags the document for review when recent
 * feedback is predominantly negative.
 *
 * A document is flagged when it has at least 5 feedback entries in the last
 * 30 days and fewer than 60% of them are marked helpful.
 *
 * @param feedback - The feedback entry to store.
 */
async function recordFeedback(
  feedback: DocumentFeedback
): Promise<void> {
  await feedbackDb.insert('document_feedback', feedback);

  // If multiple people mark as unhelpful, flag for review
  // NOTE(review): assumes the query resolves to a single aggregate row
  // object with `total` and `helpful` — confirm against the feedbackDb client.
  const recentFeedback = await feedbackDb.query(`
    SELECT COUNT(*) as total, SUM(CASE WHEN helpful THEN 1 ELSE 0 END) as helpful
    FROM document_feedback
    WHERE documentId = ? AND timestamp > NOW() - INTERVAL '30 days'
  `, [feedback.documentId]);

  // Aggregates may arrive as strings or null depending on the driver.
  const total = Number(recentFeedback.total);
  const helpfulCount = Number(recentFeedback.helpful);

  // Check the sample size first so the rate is never computed on zero rows.
  if (total >= 5 && helpfulCount / total < 0.6) {
    await flagDocumentForReview(feedback.documentId);
  }
}

Checklist

  • Ingest documentation from Confluence, Notion, GitHub, and Google Docs
  • Implement incremental sync based on content checksums, not timestamps
  • Enforce access control: public, team, restricted
  • Build conversational search with multi-turn context
  • Always attribute answers to source documents with links
  • Detect and flag stale documents (1+ year old or date references)
  • Support multi-workspace knowledge bases
  • Track search analytics to identify gaps and user needs
  • Implement feedback loop: helpful/unhelpful and suggested edits
  • Set alerts when documents receive consistently negative feedback
  • Schedule weekly review of failed searches and stale docs
  • Monitor vector search quality and tune similarity thresholds

Conclusion

An AI knowledge base transforms how teams find information. By unifying fragmented documentation, enforcing access controls, and building conversational interfaces, you create a single source of truth. The key is keeping it up-to-date through automated staleness detection and quality feedback loops. Start with your most-searched documentation sources, measure search success rate, and expand gradually.