Multimodal RAG — Searching Images, Tables, and PDFs Together
Author: Sanjeev Sharma (@webcoderspeed1)
Introduction
Most RAG systems index only text, yet much of the knowledge you need lives in PDFs alongside tables, charts, and figures. Ignoring this visual information means your RAG system misses critical context.
This post covers parsing multimodal documents, embedding visual elements, and searching across all media types.
- PDF Parsing with Structure Preservation
- Table Extraction and Embedding
- Image Captioning for Searchability
- ColPali: Vision Encoder for Document Retrieval
- Unified Multimodal Retrieval
- Figure-to-Text Conversion
- Multimodal Query Handling
- Checklist
- Conclusion
PDF Parsing with Structure Preservation
Extract text while preserving layout and structure:
// pdfjs-dist exposes per-item text positions; read the file ourselves
// and hand pdfjs the raw bytes
import { readFileSync } from 'node:fs';
import { getDocument } from 'pdfjs-dist';
interface PDFElement {
type: 'text' | 'table' | 'image' | 'figure';
content: string; // text or base64 for images
bbox?: { x: number; y: number; width: number; height: number }; // bounding box
pageNumber: number;
metadata?: Record<string, any>;
}
async function parseStructuredPDF(filePath: string): Promise<PDFElement[]> {
const elements: PDFElement[] = [];
  // Load with pdfjs-dist and walk the pages (pdfjs pages are 1-indexed)
  const pdf = await getDocument({ data: new Uint8Array(readFileSync(filePath)) }).promise;
  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
// Extract text with position info
const textContent = await page.getTextContent();
let currentText = '';
let lastY = 0;
    for (const item of textContent.items) {
      if (!('str' in item)) continue; // skip TextMarkedContent entries
      // pdfjs exposes position via the transform matrix: [a, b, c, d, x, y]
      const x = item.transform[4];
      const y = item.transform[5];
      // Detect headers (larger font)
      if (item.height > 14) {
        if (currentText.trim()) {
          elements.push({
            type: 'text',
            content: currentText.trim(),
            pageNumber: pageNum,
            metadata: { isHeader: false },
          });
          currentText = '';
        }
        elements.push({
          type: 'text',
          content: item.str,
          pageNumber: pageNum,
          metadata: { isHeader: true, fontSize: item.height },
          bbox: { x, y, width: item.width, height: item.height },
        });
      } else {
        currentText += item.str + ' ';
      }
      // Detect new sections (large vertical gap between lines)
      if (lastY - y > 20) {
        if (currentText.trim()) {
          elements.push({
            type: 'text',
            content: currentText.trim(),
            pageNumber: pageNum,
          });
          currentText = '';
        }
      }
      lastY = y;
    }
if (currentText.trim()) {
elements.push({
type: 'text',
content: currentText.trim(),
pageNumber: pageNum,
});
}
}
return elements;
}
// Alternative: Use unstructured library wrapper
async function parseWithUnstructured(filePath: string): Promise<PDFElement[]> {
  // Mock wrapper around the unstructured.io partitioning service.
  // In production, call the hosted unstructured API over HTTP (or run the
  // Python library, `pip install "unstructured[pdf]"`, as a sidecar service).
const elements: PDFElement[] = [];
// Simulated response from unstructured API
const response = {
elements: [
{
type: 'Title',
text: 'Document Title',
metadata: { page_number: 0 },
},
{
type: 'Table',
text: '| Col1 | Col2 |\n|------|------|\n| A | B |',
metadata: { page_number: 0 },
},
{
type: 'Image',
text: '[base64-image-data]',
metadata: { page_number: 1 },
},
],
};
  // Map unstructured element types onto our narrower union
  // ('Title' and 'NarrativeText' both become plain text)
  const typeMap: Record<string, PDFElement['type']> = {
    Title: 'text',
    NarrativeText: 'text',
    Table: 'table',
    Image: 'image',
    Figure: 'figure',
  };
  for (const elem of response.elements) {
    elements.push({
      type: typeMap[elem.type] ?? 'text',
      content: elem.text,
      pageNumber: elem.metadata.page_number,
    });
  }
return elements;
}
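Either parser yields a flat list of typed elements, so filtering by type routes each modality into its own pipeline. A minimal usage sketch (the file path is just an example):
const elements = await parseStructuredPDF('./report.pdf');
const tableElements = elements.filter(e => e.type === 'table');
const imageElements = elements.filter(e => e.type === 'image');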
Table Extraction and Embedding
Extract and embed structured table data:
interface TableData {
id: string;
rows: Array<Record<string, string>>;
metadata: {
title?: string;
pageNumber: number;
rowCount: number;
columnCount: number;
};
}
interface TableEmbedding {
tableId: string;
headerEmbedding: number[];
rowEmbeddings: number[][]; // One embedding per row
tableDescriptionEmbedding: number[];
}
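// Minimal embedding-client contract assumed for the rest of this post;
// back it with OpenAI, Cohere, or any local embedding model.
interface EmbedModel {
  embed(text: string): Promise<number[]>;
}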
async function extractTableStructure(tableText: string): Promise<TableData> {
// Parse markdown/CSV table format
const lines = tableText.trim().split('\n');
const headers = lines[0]
.split('|')
.map(h => h.trim())
.filter(Boolean);
const rows: Array<Record<string, string>> = [];
for (let i = 2; i < lines.length; i++) {
const values = lines[i]
.split('|')
.map(v => v.trim())
.filter(Boolean);
if (values.length === headers.length) {
const row: Record<string, string> = {};
headers.forEach((h, idx) => {
row[h] = values[idx];
});
rows.push(row);
}
}
return {
id: `table_${Date.now()}`,
rows,
metadata: {
rowCount: rows.length,
columnCount: headers.length,
pageNumber: 0,
},
};
}
async function embedTable(
table: TableData,
embedModel: EmbedModel
): Promise<TableEmbedding> {
// Embed table header as query
const headerText = Object.keys(table.rows[0] || {}).join(' | ');
const headerEmbedding = await embedModel.embed(headerText);
// Embed each row as searchable unit
const rowEmbeddings = await Promise.all(
table.rows.map(row => {
const rowText = Object.values(row).join(' ');
return embedModel.embed(rowText);
})
);
// Create table description for context
const description = `
Table with ${table.metadata.columnCount} columns and ${table.metadata.rowCount} rows.
Columns: ${headerText}.
${table.rows.slice(0, 3).map(r => Object.values(r).join(' | ')).join('\n')}
${table.metadata.rowCount > 3 ? '... (more rows)' : ''}`;
const descriptionEmbedding = await embedModel.embed(description);
return {
tableId: table.id,
headerEmbedding,
rowEmbeddings,
tableDescriptionEmbedding: descriptionEmbedding,
};
}
// Retrieve rows by query
async function searchTableRows(
  query: string,
  tables: TableEmbedding[],
  tableData: Map<string, TableData>,
  embedModel: EmbedModel,
  topK: number = 5
): Promise<Array<{ tableId: string; row: Record<string, string>; relevance: number }>> {
  const queryEmbedding = await embedModel.embed(query);
  const results: Array<{ tableId: string; rowIdx: number; score: number; row: Record<string, string> }> = [];
  // Search across all tables
  for (const table of tables) {
    // Gate on the table-level description first, then score individual rows
    const tableScore = cosineSimilarity(queryEmbedding, table.tableDescriptionEmbedding);
    if (tableScore > 0.3) {
      const data = tableData.get(table.tableId);
      table.rowEmbeddings.forEach((rowEmbed, idx) => {
        const score = cosineSimilarity(queryEmbedding, rowEmbed) * tableScore;
        results.push({
          tableId: table.tableId,
          rowIdx: idx,
          score,
          row: data?.rows[idx] ?? {},
        });
      });
    }
  }
  return results
    .sort((a, b) => b.score - a.score)
    .slice(0, topK)
    .map(r => ({ tableId: r.tableId, row: r.row, relevance: r.score }));
}
function cosineSimilarity(a: number[], b: number[]): number {
const dotProduct = a.reduce((sum, x, i) => sum + x * b[i], 0);
const normA = Math.sqrt(a.reduce((sum, x) => sum + x * x, 0));
const normB = Math.sqrt(b.reduce((sum, x) => sum + x * x, 0));
  // Guard against zero vectors so a degenerate embedding can't poison the sort with NaN
  return normA === 0 || normB === 0 ? 0 : dotProduct / (normA * normB);
}
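Putting these together: a markdown table lifted from the parser can be structured, embedded, and queried row by row. A minimal sketch, where embedModel is any client satisfying the EmbedModel interface above and the table contents are made up for illustration:
const table = await extractTableStructure(
  '| Region | Revenue |\n|--------|---------|\n| EMEA | 4.2M |\n| APAC | 3.1M |'
);
const tableEmbedding = await embedTable(table, embedModel);
const hits = await searchTableRows(
  'revenue in EMEA',
  [tableEmbedding],
  new Map([[table.id, table]]),
  embedModel
);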
Image Captioning for Searchability
Generate descriptions for images to make them searchable:
interface ImageAsset {
id: string;
base64: string;
caption?: string;
embedding?: number[];
pageNumber: number;
type: 'photograph' | 'chart' | 'diagram' | 'screenshot';
}
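// Assumed vision-model contract: a chat-style generate() that accepts
// base64 images alongside the prompt (e.g. GPT-4o or Claude behind a thin wrapper).
interface VisionModel {
  generate(request: {
    messages: Array<{ role: string; content: string }>;
    images?: Array<{ base64: string; mediaType: string }>;
    maxTokens?: number;
  }): Promise<{ text: string }>;
}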
async function generateImageCaption(
image: ImageAsset,
visionModel: VisionModel
): Promise<string> {
const captionPrompt = `
Describe this image in detail, focusing on key information, labels, and context.
For charts/graphs, describe the data and trends. For diagrams, describe relationships.
[IMAGE]`;
const response = await visionModel.generate({
messages: [{ role: 'user', content: captionPrompt }],
images: [{ base64: image.base64, mediaType: 'image/png' }],
maxTokens: 200,
});
return response.text;
}
// Batch caption generation. Promise.all fires every request concurrently;
// for large corpora, add a concurrency limiter (e.g. p-limit) to respect rate limits.
async function captionAllImages(
images: ImageAsset[],
visionModel: VisionModel,
embedModel: EmbedModel
): Promise<ImageAsset[]> {
const captioned = await Promise.all(
images.map(async image => {
const caption = await generateImageCaption(image, visionModel);
const embedding = await embedModel.embed(caption);
return {
...image,
caption,
embedding,
};
})
);
return captioned;
}
// Search images by query
async function searchImages(
query: string,
images: ImageAsset[],
embedModel: EmbedModel,
topK: number = 5
): Promise<Array<{ id: string; caption: string; relevance: number }>> {
const queryEmbedding = await embedModel.embed(query);
const scores = images
.filter(img => img.embedding && img.caption)
.map(img => ({
id: img.id,
caption: img.caption!,
relevance: cosineSimilarity(queryEmbedding, img.embedding!),
}));
return scores
.sort((a, b) => b.relevance - a.relevance)
.slice(0, topK);
}
ColPali: Vision Encoder for Document Retrieval
Use vision encoders to retrieve entire pages based on visual content:
interface DocumentPage {
id: string;
documentId: string;
pageNumber: number;
imageBase64: string;
visionEmbedding?: number[];
textContent?: string;
}
// Simplified ColPali-style interface. The real ColPali model emits
// multi-vector (per-patch) embeddings scored with late interaction
// (MaxSim); we collapse each page to a single vector to keep the example short.
interface ColpaliEncoder {
  encodeImage(base64: string): Promise<number[]>;
  encodeQuery(query: string): Promise<number[]>;
}
async function embedPageWithColpali(
page: DocumentPage,
colpali: ColpaliEncoder
): Promise<DocumentPage> {
const embedding = await colpali.encodeImage(page.imageBase64);
return {
...page,
visionEmbedding: embedding,
};
}
async function searchPagesWithColpali(
query: string,
pages: DocumentPage[],
colpali: ColpaliEncoder,
topK: number = 5
): Promise<Array<{ pageId: string; relevance: number; pageNumber: number }>> {
const queryEmbedding = await colpali.encodeQuery(query);
const scores = pages
.filter(p => p.visionEmbedding)
.map(page => ({
pageId: page.id,
relevance: cosineSimilarity(queryEmbedding, page.visionEmbedding!),
pageNumber: page.pageNumber,
}));
return scores
.sort((a, b) => b.relevance - a.relevance)
.slice(0, topK);
}
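For fidelity to the real model, scoring would use late interaction rather than a single cosine: each query-token vector is matched against every page-patch vector and the per-token maxima are summed. A minimal MaxSim sketch, reusing cosineSimilarity from earlier (vectors assumed normalized):
// ColBERT/ColPali-style MaxSim: sum over query vectors of the
// best-matching page-patch vector.
function maxSimScore(queryVecs: number[][], patchVecs: number[][]): number {
  return queryVecs.reduce((total, q) => {
    let best = -Infinity;
    for (const p of patchVecs) best = Math.max(best, cosineSimilarity(q, p));
    return total + best;
  }, 0);
}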
Unified Multimodal Retrieval
Combine text, table, and image search:
interface MultimodalSearchResult {
type: 'text' | 'table' | 'image';
contentId: string;
relevance: number;
content: string; // text excerpt, table markdown, or image caption
pageNumber: number;
}
async function unifiedMultimodalSearch(
query: string,
index: {
textChunks: Array<{ id: string; text: string; embedding: number[]; pageNumber: number }>;
tables: TableEmbedding[];
images: ImageAsset[];
tableData: Map<string, TableData>;
},
embedModel: EmbedModel,
topK: number = 5
): Promise<MultimodalSearchResult[]> {
const queryEmbedding = await embedModel.embed(query);
const results: MultimodalSearchResult[] = [];
// Search text
for (const chunk of index.textChunks) {
const relevance = cosineSimilarity(queryEmbedding, chunk.embedding);
results.push({
type: 'text',
contentId: chunk.id,
relevance,
content: chunk.text,
pageNumber: chunk.pageNumber,
});
}
// Search tables
for (const table of index.tables) {
const tableRelevance = cosineSimilarity(queryEmbedding, table.tableDescriptionEmbedding);
if (tableRelevance > 0.3) {
const tableData = index.tableData.get(table.tableId);
if (tableData) {
results.push({
type: 'table',
contentId: table.tableId,
relevance: tableRelevance,
content: tableData.rows.map(r => Object.values(r).join(' | ')).join('\n'),
pageNumber: tableData.metadata.pageNumber,
});
}
}
}
// Search images
for (const image of index.images) {
if (image.embedding && image.caption) {
const relevance = cosineSimilarity(queryEmbedding, image.embedding);
results.push({
type: 'image',
contentId: image.id,
relevance,
content: image.caption,
pageNumber: image.pageNumber,
});
}
}
  // Normalize scores and return top-k (guard against an empty or zero-score index)
  if (results.length === 0) return [];
  const maxScore = Math.max(...results.map(r => r.relevance), 1e-9);
  return results
    .map(r => ({ ...r, relevance: r.relevance / maxScore }))
.sort((a, b) => b.relevance - a.relevance)
.slice(0, topK);
}
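As a sanity check that the pieces compose, here is a minimal end-to-end sketch; embedModel and visionModel are hypothetical clients satisfying the interfaces defined earlier:
async function demo(
  filePath: string,
  query: string,
  embedModel: EmbedModel,
  visionModel: VisionModel
) {
  // 1. Parse the PDF into typed elements
  const elements = await parseWithUnstructured(filePath);
  // 2. Build per-modality indexes
  const textChunks = await Promise.all(
    elements
      .filter(e => e.type === 'text')
      .map(async (e, i) => ({
        id: `chunk_${i}`,
        text: e.content,
        embedding: await embedModel.embed(e.content),
        pageNumber: e.pageNumber,
      }))
  );
  const tableData = new Map<string, TableData>();
  const tables: TableEmbedding[] = [];
  for (const e of elements.filter(e => e.type === 'table')) {
    const table = await extractTableStructure(e.content);
    tableData.set(table.id, table);
    tables.push(await embedTable(table, embedModel));
  }
  const images = await captionAllImages(
    elements
      .filter(e => e.type === 'image')
      .map((e, i) => ({
        id: `img_${i}`,
        base64: e.content,
        pageNumber: e.pageNumber,
        type: 'photograph' as const,
      })),
    visionModel,
    embedModel
  );
  // 3. Search all modalities at once
  return unifiedMultimodalSearch(query, { textChunks, tables, images, tableData }, embedModel);
}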
Figure-to-Text Conversion
Convert charts and figures into queryable descriptions:
async function convertFigureToText(
image: ImageAsset,
visionModel: VisionModel
): Promise<string> {
// Specialized prompts for different figure types
let prompt = '';
if (image.type === 'chart') {
prompt = `
Analyze this chart and extract:
1. Chart type (bar, line, pie, etc.)
2. Axes labels and ranges
3. Data values and trends
4. Key insights and patterns
Provide the data in a structured text format suitable for analysis.`;
} else if (image.type === 'diagram') {
prompt = `
Describe the structure and relationships shown in this diagram:
1. Main components/entities
2. Connections and relationships
3. Flow or hierarchy
4. Key information labels`;
} else {
prompt = `
Describe the key elements and information in this image in detail.`;
}
const response = await visionModel.generate({
messages: [{ role: 'user', content: prompt }],
images: [{ base64: image.base64, mediaType: 'image/png' }],
maxTokens: 300,
});
return response.text;
}
// Store figure descriptions in searchable index
async function indexFigureTexts(
images: ImageAsset[],
visionModel: VisionModel,
embedModel: EmbedModel
): Promise<
Array<{
imageId: string;
description: string;
structuredData?: Record<string, any>;
embedding: number[];
}>
> {
const indexed = await Promise.all(
images.map(async image => {
const description = await convertFigureToText(image, visionModel);
const embedding = await embedModel.embed(description);
return {
imageId: image.id,
description,
embedding,
};
})
);
return indexed;
}
// Example: Extract data from chart
async function extractChartData(
image: ImageAsset,
visionModel: VisionModel
): Promise<Record<string, number>> {
const extractPrompt = `
From this chart, extract the data in JSON format.
[IMAGE]
Respond with:
{
"labels": [...],
"values": [...],
"title": "..."
}`;
  const response = await visionModel.generate({
    messages: [{ role: 'user', content: extractPrompt }],
    images: [{ base64: image.base64, mediaType: 'image/png' }],
    maxTokens: 200,
  });
  // Models often wrap JSON in markdown fences; strip them before parsing
  const raw = response.text.replace(/```(?:json)?/g, '').trim();
  return JSON.parse(raw);
}
Multimodal Query Handling
Route queries to appropriate modality:
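The classifier below assumes a plain text-only LLM client, the same shape as VisionModel minus the images:
interface LLMClient {
  generate(request: {
    messages: Array<{ role: string; content: string }>;
    maxTokens?: number;
  }): Promise<{ text: string }>;
}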
async function classifyQueryModality(
query: string,
llm: LLMClient
): Promise<'text' | 'visual' | 'both'> {
const classifyPrompt = `
Is this query asking for:
- Text information (facts, definitions, explanations)
- Visual information (charts, diagrams, images, appearance)
- Both
Query: "${query}"
Respond with: text, visual, or both`;
const response = await llm.generate({
messages: [{ role: 'user', content: classifyPrompt }],
maxTokens: 10,
});
const answer = response.text.toLowerCase();
if (answer.includes('visual')) return 'visual';
if (answer.includes('both')) return 'both';
return 'text';
}
async function multimodalQuery(
  query: string,
  index: Parameters<typeof unifiedMultimodalSearch>[1],
  llm: LLMClient,
  embedModel: EmbedModel,
  topK: number = 5
): Promise<MultimodalSearchResult[]> {
  const modality = await classifyQueryModality(query, llm);
  // Over-fetch so filtering by modality can still return topK results
  const results = await unifiedMultimodalSearch(query, index, embedModel, topK * 3);
  switch (modality) {
    case 'visual': {
      // Prioritize image results; fall back to everything if no images match
      const visual = results.filter(r => r.type === 'image');
      return (visual.length > 0 ? visual : results).slice(0, topK);
    }
    case 'text':
      // Prioritize textual results (tables carry text, so they count)
      return results.filter(r => r.type === 'text' || r.type === 'table').slice(0, topK);
    case 'both':
    default:
      return results.slice(0, topK);
  }
}
Checklist
- Implement PDF parsing with structure detection
- Extract and embed tables separately from text
- Generate captions for all images in documents
- Test ColPali vision encoder on your document corpus
- Create unified index for text, tables, images
- Implement multimodal search aggregation
- Add figure-to-text conversion for charts
- Build query modality classifier
- Measure retrieval quality across modalities
- Monitor embedding distribution (text vs visual)
Conclusion
Multimodal RAG surfaces information that would otherwise stay locked in images, tables, and visual layouts. The key insight: visual content needs different embedding strategies than text. Use vision encoders such as ColPali for page-level retrieval and specialized embeddings for tables. Combining them yields richer retrieved context and more accurate answers. Start with PDF parsing, add table extraction, then layer in vision encoders as you scale.