Published on

AI Document Intelligence — Extracting Structured Data From Unstructured Documents

Authors

Introduction

Document extraction is a critical workflow for automation: invoices to accounting systems, contracts to legal databases, forms to backend services. This guide covers the full pipeline from PDF ingestion to high-confidence structured extraction.

PDF Text Extraction Fundamentals

Different libraries handle PDFs differently. Choose based on your document types:

import * as fs from 'fs';

/**
 * Normalized result of a PDF extraction pass: one entry per page plus
 * document-level metadata pulled from the PDF's info dictionary.
 */
interface PDFContent {
  pages: Array<{
    pageNumber: number;
    text: string;
    // Each table is a grid: an array of rows, each row an array of cell strings.
    tables: Array<Array<string[]>>;
    images: string[]; // Base64 encoded images
  }>;
  metadata: {
    title?: string;
    author?: string;
    creationDate?: string;
    pageCount: number;
  };
}

/**
 * Placeholder for pdfplumber-based extraction. In production this would
 * shell out to Python (pdfplumber) or use a JS port; for now it returns
 * an empty PDFContent skeleton.
 *
 * pdfplumber strengths:
 * - Excellent table detection
 * - Preserves layout and coordinates
 * - Good for structured documents (invoices, forms)
 * - Python-based (requires subprocess integration)
 *
 * Example usage (Python):
 *   import pdfplumber
 *   with pdfplumber.open(pdf_path) as pdf:
 *     for page in pdf.pages:
 *       text = page.extract_text()
 *       tables = page.extract_tables()
 */
async function extractPDFWithPdfplumber(
  pdfPath: string
): Promise<PDFContent> {
  const emptyResult: PDFContent = {
    pages: [],
    metadata: {
      pageCount: 0
    }
  };
  return emptyResult;
}

/**
 * Placeholder for PyMuPDF (fitz) extraction — returns an empty skeleton.
 *
 * PyMuPDF advantages:
 * - Faster for large documents
 * - Better rendering of complex PDFs
 * - Access to document structure
 * - Can extract images directly
 *
 * Example usage (Python):
 *   import fitz
 *   doc = fitz.open(pdf_path)
 *   for page in doc:
 *     text = page.get_text()
 *     images = page.get_images()
 */
async function extractPDFWithPyMuPDF(
  pdfPath: string
): Promise<PDFContent> {
  const skeleton: PDFContent = {
    pages: [],
    metadata: { pageCount: 0 }
  };
  return skeleton;
}

// Selection guide:
// - Structured documents (invoices, forms): pdfplumber
// - Scanned documents (need OCR): PyMuPDF + Tesseract
// - Speed-critical: PyMuPDF
// - Complex layouts: pdfplumber or Vision API

Table Extraction from PDFs

Tables are common in documents but hard to parse:

/**
 * One detected table, keyed by its headers.
 * `rows` maps each header to the cell value in that column.
 */
interface ExtractedTable {
  pageNumber: number;
  headers: string[];
  rows: Array<{ [key: string]: string }>;
  confidence: number; // 0..1 extraction confidence
}

/**
 * Placeholder for structured table extraction — returns an empty list.
 * In production this delegates to pdfplumber via a Python subprocess.
 *
 * Python integration (pseudocode):
 *   import pdfplumber
 *   with pdfplumber.open(pdf_path) as pdf:
 *     for page_num, page in enumerate(pdf.pages):
 *       extracted = page.extract_tables()
 *       if extracted:
 *         for table in extracted:
 *           headers = table[0]
 *           rows = table[1:]
 *           structure = {
 *             pageNumber: page_num,
 *             headers: headers,
 *             rows: rows.map(row => dict(zip(headers, row))),
 *             confidence: 0.95
 *           }
 */
async function extractTablesWithStructure(
  pdfPath: string
): Promise<ExtractedTable[]> {
  const detected: ExtractedTable[] = [];
  return detected;
}

/**
 * Repairs a malformed table grid (cells split across rows/columns) by
 * asking an LLM to infer the correct headers and row alignment.
 *
 * @param table Raw table cells, possibly mis-aligned.
 * @returns A single normalized ExtractedTable (pageNumber unknown, set to 0).
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function fixMalformedTables(
  table: ExtractedTable[][]
): Promise<ExtractedTable> {
  // Some tables span multiple rows or columns unnaturally.
  // Use an LLM to understand the table semantics.
  const tableStr = JSON.stringify(table);

  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      // NOTE(review): 'gpt-4-vision' was never a published model id;
      // use a current multimodal model instead.
      model: 'gpt-4o',
      messages: [
        {
          role: 'user',
          content: `Fix this malformed table structure. Identify correct headers and align rows.
Table: ${tableStr}
Respond with valid JSON: { headers: [...], rows: [...] }`
        }
      ],
      temperature: 0
    })
  });

  // Surface HTTP failures instead of JSON.parse-ing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const fixed = JSON.parse(data.choices[0].message.content);

  return {
    pageNumber: 0,
    headers: fixed.headers,
    rows: fixed.rows,
    confidence: 0.88
  };
}

Invoice/Receipt Parsing with LLM

Extract key fields from financial documents:

/**
 * Structured fields extracted from an invoice or receipt.
 * Monetary values are plain numbers in the document's `currency`.
 */
interface InvoiceData {
  invoiceNumber: string;
  date: string;
  dueDate?: string;
  vendor: {
    name: string;
    address?: string;
    taxId?: string;
  };
  customer: {
    name: string;
    address?: string;
  };
  lineItems: Array<{
    description: string;
    quantity: number;
    unitPrice: number;
    total: number;
  }>;
  subtotal: number;
  tax: number;
  total: number;
  currency: string; // e.g. 'USD'
  confidence: number; // 0..1 overall extraction confidence
}

async function extractInvoiceData(
  pdfText: string,
  pdfImages?: string[] // Base64 encoded
): Promise<InvoiceData> {
  const messages: Array<{
    role: 'user' | 'assistant';
    content: string | Array<any>;
  }> = [
    {
      role: 'user',
      content: [
        {
          type: 'text',
          text: `Extract invoice data and respond with JSON only. Fields: invoiceNumber, date, dueDate, vendor{name, address, taxId}, customer{name, address}, lineItems[{description, quantity, unitPrice, total}], subtotal, tax, total, currency.
Invoice text:
${pdfText}`
        }
      ]
    }
  ];

  // Add images if provided
  if (pdfImages && pdfImages.length &gt; 0) {
    messages[0].content.push({
      type: 'image_url',
      image_url: { url: `data:image/png;base64,${pdfImages[0]}` }
    });
  }

  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-4-vision',
      messages,
      temperature: 0
    })
  });

  const data = await response.json();
  const extracted = JSON.parse(data.choices[0].message.content);

  return {
    invoiceNumber: extracted.invoiceNumber || '',
    date: extracted.date || '',
    dueDate: extracted.dueDate,
    vendor: extracted.vendor || {},
    customer: extracted.customer || {},
    lineItems: extracted.lineItems || [],
    subtotal: parseFloat(extracted.subtotal) || 0,
    tax: parseFloat(extracted.tax) || 0,
    total: parseFloat(extracted.total) || 0,
    currency: extracted.currency || 'USD',
    confidence: 0.92
  };
}

// Improvements:
// - Use vision model for scanned PDFs (has OCR)
// - Extract tables separately, then invoice fields
// - Use field-specific regex for post-processing (dates, amounts)

Form Field Extraction

Extract data from structured forms:

/**
 * A single extracted form field with its declared type and confidence.
 */
interface FormField {
  name: string;
  value: string;
  fieldType: 'text' | 'checkbox' | 'radio' | 'date' | 'currency' | 'number';
  confidence: number; // 0..1
}

/**
 * All fields extracted from one form, plus how they were obtained.
 * NOTE(review): this shadows the DOM/Node global `FormData` type —
 * consider renaming (e.g. ExtractedFormData) to avoid confusion.
 */
interface FormData {
  formType: string;
  fields: FormField[];
  extractionMethod: 'pdf-native' | 'ocr' | 'llm';
}

/**
 * Extracts named form-field values from a document via LLM.
 *
 * NOTE(review): the request currently sends only the expected field names —
 * the document at `pdfPath` is never read or attached, so the model has no
 * content to extract from. The intended flow (native PDF form extraction
 * first, vision+LLM fallback) is also not implemented yet. TODO: wire in
 * the extracted text or page image before relying on this.
 *
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function extractFormFields(
  pdfPath: string,
  formDefinition: {
    fieldNames: string[];
    expectedTypes: { [field: string]: string };
  }
): Promise<FormData> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      // NOTE(review): 'gpt-4-vision' was never a published model id.
      model: 'gpt-4o',
      messages: [
        {
          role: 'user',
          content: `Extract form field values. Expected fields: ${formDefinition.fieldNames.join(', ')}
          Format response as JSON array of {name, value, type}`
        }
      ],
      // Deterministic output for extraction tasks (consistent with the
      // other extraction calls in this file).
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const parsed = JSON.parse(data.choices[0].message.content);
  // Guard: .map below requires an array; tolerate a non-array response.
  const rawFields: any[] = Array.isArray(parsed) ? parsed : [];

  return {
    formType: 'application_form',
    fields: rawFields.map((f: any) => ({
      name: f.name,
      value: f.value,
      fieldType: f.type || 'text',
      confidence: 0.88
    })),
    extractionMethod: 'llm'
  };
}

Entity Extraction and Normalization

Extract specific entities and normalize to standard formats:

/**
 * One entity found in the text, with both the raw surface form (`value`)
 * and a canonicalized representation (`normalized`).
 */
interface ExtractedEntity {
  type: 'name' | 'email' | 'phone' | 'date' | 'amount' | 'address' | 'url';
  value: string;
  normalized: string;
  confidence: number; // 0..1
  // Optional layout coordinates — page index plus x/y position.
  position?: { page: number; x: number; y: number };
}

/**
 * Extracts entities (names, emails, phones, dates, amounts, addresses,
 * URLs) from text via LLM and normalizes each value with normalizeEntity.
 *
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function extractAndNormalizeEntities(
  text: string
): Promise<ExtractedEntity[]> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-4',
      messages: [
        {
          role: 'user',
          content: `Extract entities: names, emails, phone numbers, dates, amounts, addresses, URLs.
Format as JSON array: [{ type, value, normalized }]
Text: ${text}`
        }
      ],
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const entities = JSON.parse(data.choices[0].message.content);
  // Guard: .map below requires an array; tolerate a non-array response.
  const entityList: any[] = Array.isArray(entities) ? entities : [];

  return entityList.map((e: any) => ({
    type: e.type,
    value: e.value,
    normalized: normalizeEntity(e.type, e.value),
    confidence: 0.91
  }));
}

/**
 * Normalizes an extracted entity value to a canonical string form.
 *
 * Falls back to the trimmed raw value instead of throwing when
 * normalization is impossible: the original version threw RangeError on
 * invalid dates (`new Date(x).toISOString()`) and TypeError on invalid
 * URLs (`new URL(x)`), and returned the string "NaN" for non-numeric
 * amounts — all realistic outputs from noisy extraction.
 *
 * @param type  Entity category (email, phone, date, amount, url, ...).
 * @param value Raw surface form from the document.
 * @returns Canonical representation, or trimmed input if unparsable.
 */
function normalizeEntity(type: string, value: string): string {
  switch (type) {
    case 'email':
      return value.toLowerCase().trim();
    case 'phone':
      return value.replace(/\D/g, ''); // Remove non-digits
    case 'date': {
      const parsed = new Date(value);
      // Invalid dates have NaN time; toISOString() would throw.
      return Number.isNaN(parsed.getTime())
        ? value.trim()
        : parsed.toISOString().split('T')[0]; // YYYY-MM-DD
    }
    case 'amount': {
      const num = parseFloat(value.replace(/[^0-9.]/g, ''));
      return Number.isNaN(num) ? value.trim() : num.toFixed(2);
    }
    case 'url':
      try {
        return new URL(value).toString();
      } catch {
        return value.trim(); // Not a valid absolute URL
      }
    default:
      return value.trim();
  }
}

Relationship Extraction

Extract how entities relate to each other:

/**
 * A directed (entity1 --relation--> entity2) triple mined from text,
 * e.g. ("Invoice ABC123", "issued_by", "Company XYZ").
 */
interface EntityRelationship {
  entity1: string;
  relation: string;
  entity2: string;
  confidence: number; // 0..1
}

/**
 * Extracts (entity1, relation, entity2) triples from free text via LLM.
 *
 * @param text        Source text to mine for relationships.
 * @param entityTypes Entity categories the model should consider.
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function extractRelationships(
  text: string,
  entityTypes: string[]
): Promise<EntityRelationship[]> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-4',
      messages: [
        {
          role: 'user',
          content: `Extract relationships between entities.
Entity types: ${entityTypes.join(', ')}
Format as JSON: [{ entity1, relation, entity2, confidence }]
Text: ${text}`
        }
      ],
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const parsed = JSON.parse(data.choices[0].message.content);
  // Callers expect an array; tolerate a non-array response shape.
  return Array.isArray(parsed) ? parsed : [];
}

// Example: Invoice extraction
// "Invoice ABC123 from Company XYZ to Customer John Doe, due 2026-03-22"
// Extracts: {
//   entity1: "Invoice ABC123",
//   relation: "issued_by",
//   entity2: "Company XYZ"
// }, {
//   entity1: "Invoice ABC123",
//   relation: "sent_to",
//   entity2: "Customer John Doe"
// }

Data Validation Pipeline

Validate extracted data against business rules:

/**
 * One business-rule check applied to an extracted field.
 * The shape of `constraint` depends on `type`:
 *  - 'required': unused (pass '')
 *  - 'format':   a RegExp the stringified value must match
 *  - 'range':    a "min-max" string, e.g. '0-999999'
 *  - 'custom':   a predicate (value) => boolean
 */
interface ValidationRule {
  field: string;
  type: 'required' | 'format' | 'range' | 'custom';
  constraint: string | RegExp | ((value: any) => boolean);
}

/**
 * Outcome of applying a single ValidationRule to a field.
 */
interface ValidationResult {
  field: string;
  valid: boolean;
  error?: string; // Present only when valid === false
}

function validateExtraction(
  data: any,
  rules: ValidationRule[]
): ValidationResult[] {
  return rules.map(rule => {
    const value = data[rule.field];

    if (rule.type === 'required') {
      if (!value) {
        return { field: rule.field, valid: false, error: 'Required field missing' };
      }
    }

    if (rule.type === 'format') {
      const regex = rule.constraint as RegExp;
      if (!regex.test(String(value))) {
        return { field: rule.field, valid: false, error: `Format mismatch: ${rule.constraint}` };
      }
    }

    if (rule.type === 'range') {
      const [min, max] = (rule.constraint as string).split('-').map(Number);
      const num = parseFloat(value);
      if (num &lt; min || num &gt; max) {
        return { field: rule.field, valid: false, error: `Value outside range ${min}-${max}` };
      }
    }

    if (rule.type === 'custom') {
      const validator = rule.constraint as (v: any) => boolean;
      if (!validator(value)) {
        return { field: rule.field, valid: false, error: 'Custom validation failed' };
      }
    }

    return { field: rule.field, valid: true };
  });
}

const invoiceRules: ValidationRule[] = [
  { field: 'invoiceNumber', type: 'required', constraint: '' },
  { field: 'invoiceNumber', type: 'format', constraint: /^[A-Z0-9]{4,}$/ },
  { field: 'date', type: 'format', constraint: /^\d{4}-\d{2}-\d{2}$/ },
  { field: 'total', type: 'range', constraint: '0-999999' },
  { field: 'total', type: 'custom', constraint: (v: number) => v &gt; 0 }
];

Confidence Scoring

Score extraction quality per field:

/**
 * Per-field confidence score with a human-readable audit trail of why
 * the score was raised or lowered.
 */
interface FieldConfidence {
  field: string;
  extractionMethod: 'native' | 'ocr' | 'llm';
  confidence: number; // 0..1, rounded to 2 decimals
  reasons: string[]; // Adjustments applied to the base method score
}

function scoreFieldConfidence(
  field: string,
  value: string,
  extractionMethod: string,
  sourceText: string
): FieldConfidence {
  let confidence = 0.5; // Base score
  const reasons: string[] = [];

  // Method-based scoring
  const methodScores: { [key: string]: number } = {
    native: 0.99, // PDF form fields are most reliable
    ocr: 0.85, // OCR has errors
    llm: 0.90 // LLM is very good but not perfect
  };

  confidence = methodScores[extractionMethod] || 0.5;

  // Boost if value appears multiple times (cross-validation)
  const occurrences = (sourceText.match(new RegExp(value, 'g')) || []).length;
  if (occurrences &gt; 1) {
    confidence = Math.min(0.99, confidence + 0.05);
    reasons.push(`Value appears ${occurrences} times in source`);
  }

  // Reduce if field seems ambiguous
  if (value.length &lt; 2) {
    confidence *= 0.8;
    reasons.push('Value too short, may be error');
  }

  // Reduce if extraction found alternatives
  if (sourceText.includes('or') || sourceText.includes('approximately')) {
    confidence *= 0.9;
    reasons.push('Source text contains uncertainty indicators');
  }

  return {
    field,
    extractionMethod: extractionMethod as any,
    confidence: Math.round(confidence * 100) / 100,
    reasons
  };
}

Human Review Queue

Route low-confidence extractions for human verification:

/**
 * A human-review work item for a document whose extraction had one or
 * more low-confidence fields.
 */
interface ReviewTask {
  id: string; // "review_<documentId>"
  documentPath: string;
  extractedData: any;
  lowConfidenceFields: string[];
  priority: 'high' | 'medium' | 'low';
  createdAt: Date;
}

async function createReviewQueue(
  extractions: Array<{
    documentId: string;
    data: any;
    confidences: Map<string, number>;
  }>,
  confidenceThreshold: number = 0.8
): Promise<ReviewTask[]> {
  const queue: ReviewTask[] = [];

  extractions.forEach(extraction => {
    const lowConfidenceFields = Array.from(
      extraction.confidences.entries()
    )
      .filter(([_, conf]) => conf &lt; confidenceThreshold)
      .map(([field, _]) => field);

    if (lowConfidenceFields.length &gt; 0) {
      const priority =
        lowConfidenceFields.length &gt; 3
          ? 'high'
          : lowConfidenceFields.length &gt; 1
            ? 'medium'
            : 'low';

      queue.push({
        id: `review_${extraction.documentId}`,
        documentPath: extraction.documentId,
        extractedData: extraction.data,
        lowConfidenceFields,
        priority,
        createdAt: new Date()
      });
    }
  });

  return queue.sort((a, b) => {
    const priorityOrder = { high: 0, medium: 1, low: 2 };
    return priorityOrder[a.priority] - priorityOrder[b.priority];
  });
}

// SLA: High priority reviewed within 1 hour
// Medium within 4 hours, Low within 24 hours

Document Processing Pipeline

End-to-end workflow:

/**
 * Final outcome of the end-to-end pipeline for one document.
 */
interface ProcessingResult {
  documentId: string;
  extractedData: any;
  confidence: number; // Average field confidence, 0..1
  requiresReview: boolean; // True when confidence is low or validation failed
  processingTimeMs: number;
}

/**
 * End-to-end pipeline: extract → classify → structure → validate → score
 * → route. Never throws; any failure produces a zero-confidence result
 * flagged for human review.
 *
 * Fixes vs. the previous version: the review-threshold comparison was
 * HTML-escaped (`&lt;`) and did not compile; `extractedData` stayed
 * undefined for unrecognized document types (crashing Object.keys);
 * `pages[0].images` threw on empty documents; and the average was NaN
 * when no fields were extracted.
 */
async function processDocument(
  pdfPath: string
): Promise<ProcessingResult> {
  const startTime = Date.now();

  try {
    // Step 1: Extract text and images
    const pdfContent = await extractPDFWithPdfplumber(pdfPath);
    const pdfText = pdfContent.pages.map(p => p.text).join('\n');

    // Step 2: Detect document type
    const documentType = await detectDocumentType(pdfText);

    // Step 3: Extract structured data based on type. Default to {} so
    // later steps don't crash on unrecognized document types.
    let extractedData: any = {};
    if (documentType === 'invoice') {
      extractedData = await extractInvoiceData(
        pdfText,
        pdfContent.pages[0]?.images
      );
    } else if (documentType === 'form') {
      extractedData = await extractFormFields(pdfPath, {
        fieldNames: ['name', 'date', 'amount'],
        expectedTypes: {}
      });
    }

    // Step 4: Validate extracted data (no rules wired up here yet)
    const validationResults = validateExtraction(extractedData, []);

    // Step 5: Score confidence per extracted field
    const confidenceScores = new Map<string, number>();
    Object.keys(extractedData).forEach(field => {
      const score = scoreFieldConfidence(
        field,
        extractedData[field],
        'llm',
        pdfText
      );
      confidenceScores.set(field, score.confidence);
    });

    // Guard against division by zero when nothing was extracted.
    const avgConfidence =
      confidenceScores.size > 0
        ? Array.from(confidenceScores.values()).reduce((a, b) => a + b, 0) /
          confidenceScores.size
        : 0;

    // Step 6: Determine if human review is needed
    const requiresReview =
      avgConfidence < 0.85 || validationResults.some(r => !r.valid);

    return {
      documentId: pdfPath,
      extractedData,
      confidence: Math.round(avgConfidence * 100) / 100,
      requiresReview,
      processingTimeMs: Date.now() - startTime
    };
  } catch (error) {
    // Fail closed: unexpected errors route the document to human review.
    return {
      documentId: pdfPath,
      extractedData: {},
      confidence: 0,
      requiresReview: true,
      processingTimeMs: Date.now() - startTime
    };
  }
}

/**
 * Classifies a document as invoice, receipt, form, contract, or other by
 * showing the first 500 characters to a cheap LLM.
 *
 * @returns Lowercased first label from the model's reply.
 * @throws Error on HTTP failure.
 */
async function detectDocumentType(text: string): Promise<string> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: [
        {
          role: 'user',
          content: `Classify document type: invoice, receipt, form, contract, or other.
Text: ${text.substring(0, 500)}`
        }
      ],
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  // Keep only the first comma-separated label, lowercased and trimmed.
  return data.choices[0].message.content.toLowerCase().split(',')[0].trim();
}

Checklist

  • Selected PDF extraction library based on document type
  • Extracted text, images, and tables from sample documents
  • Built table detection and parsing for structured data
  • Implemented LLM-based extraction for unstructured fields
  • Normalized entities (dates, amounts, phone numbers)
  • Set up validation rules for domain-specific constraints
  • Scored confidence per field and per document
  • Routed low-confidence extractions to human review queue
  • Built end-to-end pipeline with error handling
  • Tested on 100+ diverse document samples
  • Set up monitoring for extraction accuracy and SLA

Conclusion

Document extraction requires a layered approach: OCR or PDF extraction, LLM-based entity extraction, validation, and human review for edge cases. Target 90%+ accuracy with a human review rate below 5% by combining native PDF parsing with vision-based LLM extraction.