Published on

AI Document Intelligence — Extracting Structured Data From Unstructured Documents

Authors

Introduction

Document extraction is a critical workflow for automation: invoices to accounting systems, contracts to legal databases, forms to backend services. This guide covers the full pipeline from PDF ingestion to high-confidence structured extraction.

PDF Text Extraction Fundamentals

Different libraries handle PDFs differently. Choose based on your document types:

import * as fs from 'fs';

/**
 * Normalized result of a PDF extraction pass: one entry per page plus
 * document-level metadata pulled from the PDF's info dictionary.
 */
interface PDFContent {
  pages: Array<{
    pageNumber: number;
    text: string;
    // Each table is a grid: an array of rows, each row an array of cell strings.
    tables: Array<Array<string[]>>;
    images: string[]; // Base64 encoded images
  }>;
  metadata: {
    title?: string;
    author?: string;
    creationDate?: string;
    pageCount: number;
  };
}

/**
 * Placeholder for pdfplumber-based extraction. In production this would
 * shell out to Python (pdfplumber) or use a JS port; for now it returns
 * an empty PDFContent skeleton.
 *
 * pdfplumber strengths:
 * - Excellent table detection
 * - Preserves layout and coordinates
 * - Good for structured documents (invoices, forms)
 * - Python-based (requires subprocess integration)
 *
 * Example usage (Python):
 *   import pdfplumber
 *   with pdfplumber.open(pdf_path) as pdf:
 *     for page in pdf.pages:
 *       text = page.extract_text()
 *       tables = page.extract_tables()
 */
async function extractPDFWithPdfplumber(
  pdfPath: string
): Promise<PDFContent> {
  const emptyResult: PDFContent = {
    pages: [],
    metadata: {
      pageCount: 0
    }
  };
  return emptyResult;
}

/**
 * Placeholder for PyMuPDF (fitz) extraction — returns an empty skeleton.
 *
 * PyMuPDF advantages:
 * - Faster for large documents
 * - Better rendering of complex PDFs
 * - Access to document structure
 * - Can extract images directly
 *
 * Example usage (Python):
 *   import fitz
 *   doc = fitz.open(pdf_path)
 *   for page in doc:
 *     text = page.get_text()
 *     images = page.get_images()
 */
async function extractPDFWithPyMuPDF(
  pdfPath: string
): Promise<PDFContent> {
  const skeleton: PDFContent = {
    pages: [],
    metadata: { pageCount: 0 }
  };
  return skeleton;
}

// Selection guide:
// - Structured documents (invoices, forms): pdfplumber
// - Scanned documents (need OCR): PyMuPDF + Tesseract
// - Speed-critical: PyMuPDF
// - Complex layouts: pdfplumber or Vision API

Table Extraction from PDFs

Tables are common in documents but hard to parse:

/**
 * One detected table, keyed by its headers.
 * `rows` maps each header to the cell value in that column.
 */
interface ExtractedTable {
  pageNumber: number;
  headers: string[];
  rows: Array<{ [key: string]: string }>;
  confidence: number; // 0..1 extraction confidence
}

/**
 * Placeholder for structured table extraction — returns an empty list.
 * In production this delegates to pdfplumber via a Python subprocess.
 *
 * Python integration (pseudocode):
 *   import pdfplumber
 *   with pdfplumber.open(pdf_path) as pdf:
 *     for page_num, page in enumerate(pdf.pages):
 *       extracted = page.extract_tables()
 *       if extracted:
 *         for table in extracted:
 *           headers = table[0]
 *           rows = table[1:]
 *           structure = {
 *             pageNumber: page_num,
 *             headers: headers,
 *             rows: rows.map(row => dict(zip(headers, row))),
 *             confidence: 0.95
 *           }
 */
async function extractTablesWithStructure(
  pdfPath: string
): Promise<ExtractedTable[]> {
  const detected: ExtractedTable[] = [];
  return detected;
}

/**
 * Repairs a malformed table grid (cells split across rows/columns) by
 * asking an LLM to infer the correct headers and row alignment.
 *
 * @param table Raw table cells, possibly mis-aligned.
 * @returns A single normalized ExtractedTable (pageNumber unknown, set to 0).
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function fixMalformedTables(
  table: ExtractedTable[][]
): Promise<ExtractedTable> {
  // Some tables span multiple rows or columns unnaturally.
  // Use an LLM to understand the table semantics.
  const tableStr = JSON.stringify(table);

  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      // NOTE(review): 'gpt-4-vision' was never a published model id;
      // use a current multimodal model instead.
      model: 'gpt-4o',
      messages: [
        {
          role: 'user',
          content: `Fix this malformed table structure. Identify correct headers and align rows.
Table: ${tableStr}
Respond with valid JSON: { headers: [...], rows: [...] }`
        }
      ],
      temperature: 0
    })
  });

  // Surface HTTP failures instead of JSON.parse-ing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const fixed = JSON.parse(data.choices[0].message.content);

  return {
    pageNumber: 0,
    headers: fixed.headers,
    rows: fixed.rows,
    confidence: 0.88
  };
}

Invoice/Receipt Parsing with LLM

Extract key fields from financial documents:

/**
 * Structured fields extracted from an invoice or receipt.
 * Monetary values are plain numbers in the document's `currency`.
 */
interface InvoiceData {
  invoiceNumber: string;
  date: string;
  dueDate?: string;
  vendor: {
    name: string;
    address?: string;
    taxId?: string;
  };
  customer: {
    name: string;
    address?: string;
  };
  lineItems: Array<{
    description: string;
    quantity: number;
    unitPrice: number;
    total: number;
  }>;
  subtotal: number;
  tax: number;
  total: number;
  currency: string; // e.g. 'USD'
  confidence: number; // 0..1 overall extraction confidence
}

async function extractInvoiceData(
  pdfText: string,
  pdfImages?: string[] // Base64 encoded
): Promise<InvoiceData> {
  const messages: Array<{
    role: 'user' | 'assistant';
    content: string | Array<any>;
  }> = [
    {
      role: 'user',
      content: [
        {
          type: 'text',
          text: `Extract invoice data and respond with JSON only. Fields: invoiceNumber, date, dueDate, vendor{name, address, taxId}, customer{name, address}, lineItems[{description, quantity, unitPrice, total}], subtotal, tax, total, currency.
Invoice text:
${pdfText}`
        }
      ]
    }
  ];

  // Add images if provided
  if (pdfImages && pdfImages.length &gt; 0) {
    messages[0].content.push({
      type: 'image_url',
      image_url: { url: `data:image/png;base64,${pdfImages[0]}` }
    });
  }

  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-4-vision',
      messages,
      temperature: 0
    })
  });

  const data = await response.json();
  const extracted = JSON.parse(data.choices[0].message.content);

  return {
    invoiceNumber: extracted.invoiceNumber || '',
    date: extracted.date || '',
    dueDate: extracted.dueDate,
    vendor: extracted.vendor || {},
    customer: extracted.customer || {},
    lineItems: extracted.lineItems || [],
    subtotal: parseFloat(extracted.subtotal) || 0,
    tax: parseFloat(extracted.tax) || 0,
    total: parseFloat(extracted.total) || 0,
    currency: extracted.currency || 'USD',
    confidence: 0.92
  };
}

// Improvements:
// - Use vision model for scanned PDFs (has OCR)
// - Extract tables separately, then invoice fields
// - Use field-specific regex for post-processing (dates, amounts)

Form Field Extraction

Extract data from structured forms:

/**
 * A single extracted form field with its declared type and confidence.
 */
interface FormField {
  name: string;
  value: string;
  fieldType: 'text' | 'checkbox' | 'radio' | 'date' | 'currency' | 'number';
  confidence: number; // 0..1
}

/**
 * All fields extracted from one form, plus how they were obtained.
 * NOTE(review): this shadows the DOM/Node global `FormData` type —
 * consider renaming (e.g. ExtractedFormData) to avoid confusion.
 */
interface FormData {
  formType: string;
  fields: FormField[];
  extractionMethod: 'pdf-native' | 'ocr' | 'llm';
}

/**
 * Extracts named form-field values from a document via LLM.
 *
 * NOTE(review): the request currently sends only the expected field names —
 * the document at `pdfPath` is never read or attached, so the model has no
 * content to extract from. The intended flow (native PDF form extraction
 * first, vision+LLM fallback) is also not implemented yet. TODO: wire in
 * the extracted text or page image before relying on this.
 *
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function extractFormFields(
  pdfPath: string,
  formDefinition: {
    fieldNames: string[];
    expectedTypes: { [field: string]: string };
  }
): Promise<FormData> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      // NOTE(review): 'gpt-4-vision' was never a published model id.
      model: 'gpt-4o',
      messages: [
        {
          role: 'user',
          content: `Extract form field values. Expected fields: ${formDefinition.fieldNames.join(', ')}
          Format response as JSON array of {name, value, type}`
        }
      ],
      // Deterministic output for extraction tasks (consistent with the
      // other extraction calls in this file).
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const parsed = JSON.parse(data.choices[0].message.content);
  // Guard: .map below requires an array; tolerate a non-array response.
  const rawFields: any[] = Array.isArray(parsed) ? parsed : [];

  return {
    formType: 'application_form',
    fields: rawFields.map((f: any) => ({
      name: f.name,
      value: f.value,
      fieldType: f.type || 'text',
      confidence: 0.88
    })),
    extractionMethod: 'llm'
  };
}

Entity Extraction and Normalization

Extract specific entities and normalize to standard formats:

/**
 * One entity found in the text, with both the raw surface form (`value`)
 * and a canonicalized representation (`normalized`).
 */
interface ExtractedEntity {
  type: 'name' | 'email' | 'phone' | 'date' | 'amount' | 'address' | 'url';
  value: string;
  normalized: string;
  confidence: number; // 0..1
  // Optional layout coordinates — page index plus x/y position.
  position?: { page: number; x: number; y: number };
}

/**
 * Extracts entities (names, emails, phones, dates, amounts, addresses,
 * URLs) from text via LLM and normalizes each value with normalizeEntity.
 *
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function extractAndNormalizeEntities(
  text: string
): Promise<ExtractedEntity[]> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-4',
      messages: [
        {
          role: 'user',
          content: `Extract entities: names, emails, phone numbers, dates, amounts, addresses, URLs.
Format as JSON array: [{ type, value, normalized }]
Text: ${text}`
        }
      ],
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const entities = JSON.parse(data.choices[0].message.content);
  // Guard: .map below requires an array; tolerate a non-array response.
  const entityList: any[] = Array.isArray(entities) ? entities : [];

  return entityList.map((e: any) => ({
    type: e.type,
    value: e.value,
    normalized: normalizeEntity(e.type, e.value),
    confidence: 0.91
  }));
}

/**
 * Normalizes an extracted entity value to a canonical string form.
 *
 * Falls back to the trimmed raw value instead of throwing when
 * normalization is impossible: the original version threw RangeError on
 * invalid dates (`new Date(x).toISOString()`) and TypeError on invalid
 * URLs (`new URL(x)`), and returned the string "NaN" for non-numeric
 * amounts — all realistic outputs from noisy extraction.
 *
 * @param type  Entity category (email, phone, date, amount, url, ...).
 * @param value Raw surface form from the document.
 * @returns Canonical representation, or trimmed input if unparsable.
 */
function normalizeEntity(type: string, value: string): string {
  switch (type) {
    case 'email':
      return value.toLowerCase().trim();
    case 'phone':
      return value.replace(/\D/g, ''); // Remove non-digits
    case 'date': {
      const parsed = new Date(value);
      // Invalid dates have NaN time; toISOString() would throw.
      return Number.isNaN(parsed.getTime())
        ? value.trim()
        : parsed.toISOString().split('T')[0]; // YYYY-MM-DD
    }
    case 'amount': {
      const num = parseFloat(value.replace(/[^0-9.]/g, ''));
      return Number.isNaN(num) ? value.trim() : num.toFixed(2);
    }
    case 'url':
      try {
        return new URL(value).toString();
      } catch {
        return value.trim(); // Not a valid absolute URL
      }
    default:
      return value.trim();
  }
}

Relationship Extraction

Extract how entities relate to each other:

/**
 * A directed (entity1 --relation--> entity2) triple mined from text,
 * e.g. ("Invoice ABC123", "issued_by", "Company XYZ").
 */
interface EntityRelationship {
  entity1: string;
  relation: string;
  entity2: string;
  confidence: number; // 0..1
}

/**
 * Extracts (entity1, relation, entity2) triples from free text via LLM.
 *
 * @param text        Source text to mine for relationships.
 * @param entityTypes Entity categories the model should consider.
 * @throws Error on HTTP failure; SyntaxError if the model returns non-JSON.
 */
async function extractRelationships(
  text: string,
  entityTypes: string[]
): Promise<EntityRelationship[]> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-4',
      messages: [
        {
          role: 'user',
          content: `Extract relationships between entities.
Entity types: ${entityTypes.join(', ')}
Format as JSON: [{ entity1, relation, entity2, confidence }]
Text: ${text}`
        }
      ],
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const parsed = JSON.parse(data.choices[0].message.content);
  // Callers expect an array; tolerate a non-array response shape.
  return Array.isArray(parsed) ? parsed : [];
}

// Example: Invoice extraction
// "Invoice ABC123 from Company XYZ to Customer John Doe, due 2026-03-22"
// Extracts: {
//   entity1: "Invoice ABC123",
//   relation: "issued_by",
//   entity2: "Company XYZ"
// }, {
//   entity1: "Invoice ABC123",
//   relation: "sent_to",
//   entity2: "Customer John Doe"
// }

Data Validation Pipeline

Validate extracted data against business rules:

/**
 * One business-rule check applied to an extracted field.
 * The shape of `constraint` depends on `type`:
 *  - 'required': unused (pass '')
 *  - 'format':   a RegExp the stringified value must match
 *  - 'range':    a "min-max" string, e.g. '0-999999'
 *  - 'custom':   a predicate (value) => boolean
 */
interface ValidationRule {
  field: string;
  type: 'required' | 'format' | 'range' | 'custom';
  constraint: string | RegExp | ((value: any) => boolean);
}

/**
 * Outcome of applying a single ValidationRule to a field.
 */
interface ValidationResult {
  field: string;
  valid: boolean;
  error?: string; // Present only when valid === false
}

function validateExtraction(
  data: any,
  rules: ValidationRule[]
): ValidationResult[] {
  return rules.map(rule => {
    const value = data[rule.field];

    if (rule.type === 'required') {
      if (!value) {
        return { field: rule.field, valid: false, error: 'Required field missing' };
      }
    }

    if (rule.type === 'format') {
      const regex = rule.constraint as RegExp;
      if (!regex.test(String(value))) {
        return { field: rule.field, valid: false, error: `Format mismatch: ${rule.constraint}` };
      }
    }

    if (rule.type === 'range') {
      const [min, max] = (rule.constraint as string).split('-').map(Number);
      const num = parseFloat(value);
      if (num &lt; min || num &gt; max) {
        return { field: rule.field, valid: false, error: `Value outside range ${min}-${max}` };
      }
    }

    if (rule.type === 'custom') {
      const validator = rule.constraint as (v: any) => boolean;
      if (!validator(value)) {
        return { field: rule.field, valid: false, error: 'Custom validation failed' };
      }
    }

    return { field: rule.field, valid: true };
  });
}

const invoiceRules: ValidationRule[] = [
  { field: 'invoiceNumber', type: 'required', constraint: '' },
  { field: 'invoiceNumber', type: 'format', constraint: /^[A-Z0-9]{4,}$/ },
  { field: 'date', type: 'format', constraint: /^\d{4}-\d{2}-\d{2}$/ },
  { field: 'total', type: 'range', constraint: '0-999999' },
  { field: 'total', type: 'custom', constraint: (v: number) => v &gt; 0 }
];

Confidence Scoring

Score extraction quality per field:

/**
 * Per-field confidence score with a human-readable audit trail of why
 * the score was raised or lowered.
 */
interface FieldConfidence {
  field: string;
  extractionMethod: 'native' | 'ocr' | 'llm';
  confidence: number; // 0..1, rounded to 2 decimals
  reasons: string[]; // Adjustments applied to the base method score
}

function scoreFieldConfidence(
  field: string,
  value: string,
  extractionMethod: string,
  sourceText: string
): FieldConfidence {
  let confidence = 0.5; // Base score
  const reasons: string[] = [];

  // Method-based scoring
  const methodScores: { [key: string]: number } = {
    native: 0.99, // PDF form fields are most reliable
    ocr: 0.85, // OCR has errors
    llm: 0.90 // LLM is very good but not perfect
  };

  confidence = methodScores[extractionMethod] || 0.5;

  // Boost if value appears multiple times (cross-validation)
  const occurrences = (sourceText.match(new RegExp(value, 'g')) || []).length;
  if (occurrences &gt; 1) {
    confidence = Math.min(0.99, confidence + 0.05);
    reasons.push(`Value appears ${occurrences} times in source`);
  }

  // Reduce if field seems ambiguous
  if (value.length &lt; 2) {
    confidence *= 0.8;
    reasons.push('Value too short, may be error');
  }

  // Reduce if extraction found alternatives
  if (sourceText.includes('or') || sourceText.includes('approximately')) {
    confidence *= 0.9;
    reasons.push('Source text contains uncertainty indicators');
  }

  return {
    field,
    extractionMethod: extractionMethod as any,
    confidence: Math.round(confidence * 100) / 100,
    reasons
  };
}

Human Review Queue

Route low-confidence extractions for human verification:

/**
 * A human-review work item for a document whose extraction had one or
 * more low-confidence fields.
 */
interface ReviewTask {
  id: string; // "review_<documentId>"
  documentPath: string;
  extractedData: any;
  lowConfidenceFields: string[];
  priority: 'high' | 'medium' | 'low';
  createdAt: Date;
}

async function createReviewQueue(
  extractions: Array<{
    documentId: string;
    data: any;
    confidences: Map<string, number>;
  }>,
  confidenceThreshold: number = 0.8
): Promise<ReviewTask[]> {
  const queue: ReviewTask[] = [];

  extractions.forEach(extraction => {
    const lowConfidenceFields = Array.from(
      extraction.confidences.entries()
    )
      .filter(([_, conf]) => conf &lt; confidenceThreshold)
      .map(([field, _]) => field);

    if (lowConfidenceFields.length &gt; 0) {
      const priority =
        lowConfidenceFields.length &gt; 3
          ? 'high'
          : lowConfidenceFields.length &gt; 1
            ? 'medium'
            : 'low';

      queue.push({
        id: `review_${extraction.documentId}`,
        documentPath: extraction.documentId,
        extractedData: extraction.data,
        lowConfidenceFields,
        priority,
        createdAt: new Date()
      });
    }
  });

  return queue.sort((a, b) => {
    const priorityOrder = { high: 0, medium: 1, low: 2 };
    return priorityOrder[a.priority] - priorityOrder[b.priority];
  });
}

// SLA: High priority reviewed within 1 hour
// Medium within 4 hours, Low within 24 hours

Document Processing Pipeline

End-to-end workflow:

/**
 * Final outcome of the end-to-end pipeline for one document.
 */
interface ProcessingResult {
  documentId: string;
  extractedData: any;
  confidence: number; // Average field confidence, 0..1
  requiresReview: boolean; // True when confidence is low or validation failed
  processingTimeMs: number;
}

/**
 * End-to-end pipeline: extract → classify → structure → validate → score
 * → route. Never throws; any failure produces a zero-confidence result
 * flagged for human review.
 *
 * Fixes vs. the previous version: the review-threshold comparison was
 * HTML-escaped (`&lt;`) and did not compile; `extractedData` stayed
 * undefined for unrecognized document types (crashing Object.keys);
 * `pages[0].images` threw on empty documents; and the average was NaN
 * when no fields were extracted.
 */
async function processDocument(
  pdfPath: string
): Promise<ProcessingResult> {
  const startTime = Date.now();

  try {
    // Step 1: Extract text and images
    const pdfContent = await extractPDFWithPdfplumber(pdfPath);
    const pdfText = pdfContent.pages.map(p => p.text).join('\n');

    // Step 2: Detect document type
    const documentType = await detectDocumentType(pdfText);

    // Step 3: Extract structured data based on type. Default to {} so
    // later steps don't crash on unrecognized document types.
    let extractedData: any = {};
    if (documentType === 'invoice') {
      extractedData = await extractInvoiceData(
        pdfText,
        pdfContent.pages[0]?.images
      );
    } else if (documentType === 'form') {
      extractedData = await extractFormFields(pdfPath, {
        fieldNames: ['name', 'date', 'amount'],
        expectedTypes: {}
      });
    }

    // Step 4: Validate extracted data (no rules wired up here yet)
    const validationResults = validateExtraction(extractedData, []);

    // Step 5: Score confidence per extracted field
    const confidenceScores = new Map<string, number>();
    Object.keys(extractedData).forEach(field => {
      const score = scoreFieldConfidence(
        field,
        extractedData[field],
        'llm',
        pdfText
      );
      confidenceScores.set(field, score.confidence);
    });

    // Guard against division by zero when nothing was extracted.
    const avgConfidence =
      confidenceScores.size > 0
        ? Array.from(confidenceScores.values()).reduce((a, b) => a + b, 0) /
          confidenceScores.size
        : 0;

    // Step 6: Determine if human review is needed
    const requiresReview =
      avgConfidence < 0.85 || validationResults.some(r => !r.valid);

    return {
      documentId: pdfPath,
      extractedData,
      confidence: Math.round(avgConfidence * 100) / 100,
      requiresReview,
      processingTimeMs: Date.now() - startTime
    };
  } catch (error) {
    // Fail closed: unexpected errors route the document to human review.
    return {
      documentId: pdfPath,
      extractedData: {},
      confidence: 0,
      requiresReview: true,
      processingTimeMs: Date.now() - startTime
    };
  }
}

/**
 * Classifies a document as invoice, receipt, form, contract, or other by
 * showing the first 500 characters to a cheap LLM.
 *
 * @returns Lowercased first label from the model's reply.
 * @throws Error on HTTP failure.
 */
async function detectDocumentType(text: string): Promise<string> {
  const response = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: [
        {
          role: 'user',
          content: `Classify document type: invoice, receipt, form, contract, or other.
Text: ${text.substring(0, 500)}`
        }
      ],
      temperature: 0
    })
  });

  // Fail loudly on HTTP errors rather than parsing an error payload.
  if (!response.ok) {
    throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  // Keep only the first comma-separated label, lowercased and trimmed.
  return data.choices[0].message.content.toLowerCase().split(',')[0].trim();
}

Checklist

  • Selected PDF extraction library based on document type
  • Extracted text, images, and tables from sample documents
  • Built table detection and parsing for structured data
  • Implemented LLM-based extraction for unstructured fields
  • Normalized entities (dates, amounts, phone numbers)
  • Set up validation rules for domain-specific constraints
  • Scored confidence per field and per document
  • Routed low-confidence extractions to human review queue
  • Built end-to-end pipeline with error handling
  • Tested on 100+ diverse document samples
  • Set up monitoring for extraction accuracy and SLA

Conclusion

Document extraction requires a layered approach: OCR or PDF extraction, LLM-based entity extraction, validation, and human review for edge cases. Target 90%+ accuracy with a human review rate below 5% by combining native PDF parsing with vision-based LLM extraction.