Skip to content

Commit 86a13ae

Browse files
committed
feat: preserve text and image order based on Y-coordinates
- Extract Y-coordinates from PDF.js transform matrices
- Sort content items by vertical position within each page
- Return ordered content parts for optimal AI consumption
- Text and images interleaved as they appear in document
- Automatic line grouping for text on same Y-coordinate
- Improved error handling with descriptive messages

BREAKING CHANGE: Content parts now include individual text blocks instead of concatenated page text when images are enabled

Resolves user requirement for exact document ordering like: page 1 [text, image, text, image, image, text]
1 parent 11f5693 commit 86a13ae

File tree

11 files changed

+474
-135
lines changed

11 files changed

+474
-135
lines changed

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,33 @@
22

33
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
44

5+
## [1.2.0](https://github.com/sylphxltd/pdf-reader-mcp/compare/v1.1.0...v1.2.0) (2025-10-31)
6+
7+
### Features
8+
9+
* **Content Ordering**: Preserve exact text and image order based on Y-coordinates
10+
- Content items within each page are now sorted by their vertical position
11+
- Enables AI to see content in the same order as it appears in the PDF
12+
- Text and images are interleaved based on document layout
13+
- Example: page 1 [text, image, text, image, image, text]
14+
- Uses PDF.js transform matrices to extract Y-coordinates
15+
- Automatically groups text items on the same line
16+
- Returns ordered content parts for optimal AI consumption
17+
18+
### Internal Changes
19+
20+
* New `extractPageContent()` function combines text and image extraction with positioning
21+
* New `PageContentItem` interface tracks content type, position, and data
22+
* Handler updated to generate content parts in document-reading order
23+
* Improved error handling to return descriptive error messages as text content
24+
25+
### Code Quality
26+
27+
* All tests passing (91 tests)
28+
* Coverage maintained at 97.76% statements, 90.95% branches
29+
* TypeScript strict mode compliance
30+
* Zero linting errors
31+
532
## [1.1.0](https://github.com/sylphxltd/pdf-reader-mcp/compare/v1.0.0...v1.1.0) (2025-10-31)
633

734
### Features

dist/handlers/readPdf.js

Lines changed: 63 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// PDF reading handler - orchestrates PDF processing workflow
22
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
33
import { z } from 'zod';
4-
import { buildWarnings, extractImages, extractMetadataAndPageCount, extractPageTexts, } from '../pdf/extractor.js';
4+
import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
55
import { loadPdfDocument } from '../pdf/loader.js';
66
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
77
import { readPdfArgsSchema } from '../schemas/readPdf.js';
@@ -28,9 +28,23 @@ const processSingleSource = async (source, options) => {
2828
if (warnings.length > 0) {
2929
output.warnings = warnings;
3030
}
31-
// Extract text if needed
31+
// Extract content with ordering preserved
3232
if (pagesToProcess.length > 0) {
33-
const extractedPageTexts = await extractPageTexts(pdfDocument, pagesToProcess, sourceDescription);
33+
// Use new extractPageContent to preserve Y-coordinate ordering
34+
const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
35+
// Store page contents for ordered retrieval
36+
output.page_contents = pageContents.map((items, idx) => ({
37+
page: pagesToProcess[idx],
38+
items,
39+
}));
40+
// For backward compatibility, also provide text-only outputs
41+
const extractedPageTexts = pageContents.map((items, idx) => ({
42+
page: pagesToProcess[idx],
43+
text: items
44+
.filter((item) => item.type === 'text')
45+
.map((item) => item.textContent)
46+
.join(''),
47+
}));
3448
if (targetPages) {
3549
// Specific pages requested
3650
output.page_texts = extractedPageTexts;
@@ -39,12 +53,15 @@ const processSingleSource = async (source, options) => {
3953
// Full text requested
4054
output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
4155
}
42-
}
43-
// Extract images if needed
44-
if (options.includeImages && pagesToProcess.length > 0) {
45-
const extractedImages = await extractImages(pdfDocument, pagesToProcess);
46-
if (extractedImages.length > 0) {
47-
output.images = extractedImages;
56+
// Extract image metadata for JSON response
57+
if (options.includeImages) {
58+
const extractedImages = pageContents
59+
.flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
60+
.map((item) => item.imageData)
61+
.filter((img) => img !== undefined);
62+
if (extractedImages.length > 0) {
63+
output.images = extractedImages;
64+
}
4865
}
4966
}
5067
individualResult = { ...individualResult, data: output, success: true };
@@ -89,60 +106,52 @@ export const handleReadPdfFunc = async (args) => {
89106
includePageCount: include_page_count,
90107
includeImages: include_images,
91108
})));
92-
// Build content parts preserving page order
109+
// Build content parts - start with structured JSON for backward compatibility
93110
const content = [];
94-
// Add metadata/summary as first text part
95-
const summaryData = results.map((result) => ({
96-
source: result.source,
97-
success: result.success,
98-
num_pages: result.data?.num_pages,
99-
info: result.data?.info,
100-
metadata: result.data?.metadata,
101-
warnings: result.data?.warnings,
102-
error: result.error,
103-
}));
111+
// Strip image data and page_contents from JSON to keep it manageable
112+
const resultsForJson = results.map((result) => {
113+
if (result.data) {
114+
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
115+
// Include image count and metadata in JSON, but not the base64 data
116+
if (images) {
117+
const imageInfo = images.map((img) => ({
118+
page: img.page,
119+
index: img.index,
120+
width: img.width,
121+
height: img.height,
122+
format: img.format,
123+
}));
124+
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
125+
}
126+
return { ...result, data: dataWithoutBinaryContent };
127+
}
128+
return result;
129+
});
130+
// First content part: Structured JSON results
104131
content.push({
105132
type: 'text',
106-
text: JSON.stringify({ summary: summaryData }, null, 2),
133+
text: JSON.stringify({ results: resultsForJson }, null, 2),
107134
});
108-
// Add page content in order: text then images for each page
135+
// Add page content in exact Y-coordinate order
109136
for (const result of results) {
110-
if (!result.success || !result.data)
137+
if (!result.success || !result.data?.page_contents)
111138
continue;
112-
// Handle page_texts (specific pages requested)
113-
if (result.data.page_texts) {
114-
for (const pageText of result.data.page_texts) {
115-
// Add text for this page
116-
content.push({
117-
type: 'text',
118-
text: `[Page ${pageText.page} from ${result.source}]\n${pageText.text}`,
119-
});
120-
// Add images for this page (if any)
121-
if (result.data.images) {
122-
const pageImages = result.data.images.filter((img) => img.page === pageText.page);
123-
for (const image of pageImages) {
124-
content.push({
125-
type: 'image',
126-
data: image.data,
127-
mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
128-
});
129-
}
139+
// Process each page's content items in order
140+
for (const pageContent of result.data.page_contents) {
141+
for (const item of pageContent.items) {
142+
if (item.type === 'text' && item.textContent) {
143+
// Add text content part
144+
content.push({
145+
type: 'text',
146+
text: item.textContent,
147+
});
130148
}
131-
}
132-
}
133-
// Handle full_text (all pages)
134-
if (result.data.full_text) {
135-
content.push({
136-
type: 'text',
137-
text: `[Full text from ${result.source}]\n${result.data.full_text}`,
138-
});
139-
// Add all images at the end for full text mode
140-
if (result.data.images) {
141-
for (const image of result.data.images) {
149+
else if (item.type === 'image' && item.imageData) {
150+
// Add image content part
142151
content.push({
143152
type: 'image',
144-
data: image.data,
145-
mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
153+
data: item.imageData.data,
154+
mimeType: item.imageData.format === 'rgba' ? 'image/png' : 'image/jpeg',
146155
});
147156
}
148157
}

dist/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import { allToolDefinitions } from './handlers/index.js';
1111
// --- Server Setup ---
1212
const server = new Server({
1313
name: 'pdf-reader-mcp',
14-
version: '1.1.0',
14+
version: '1.2.0',
1515
description: 'MCP Server for reading PDF files and extracting text, metadata, images, and page information.',
1616
}, {
1717
capabilities: { tools: {} },

dist/pdf/extractor.js

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,115 @@ export const buildWarnings = (invalidPages, totalPages) => {
151151
`Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`,
152152
];
153153
};
154+
/**
155+
* Extract all content (text and images) from a single page with Y-coordinate ordering
156+
*/
157+
export const extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
158+
const contentItems = [];
159+
try {
160+
const page = await pdfDocument.getPage(pageNum);
161+
// Extract text content with Y-coordinates
162+
const textContent = await page.getTextContent();
163+
// Group text items by Y-coordinate (items on same line have similar Y values)
164+
const textByY = new Map();
165+
for (const item of textContent.items) {
166+
const textItem = item;
167+
// transform[5] is the Y coordinate
168+
const yCoord = textItem.transform[5];
169+
if (yCoord === undefined)
170+
continue;
171+
const y = Math.round(yCoord);
172+
if (!textByY.has(y)) {
173+
textByY.set(y, []);
174+
}
175+
textByY.get(y)?.push(textItem.str);
176+
}
177+
// Convert grouped text to content items
178+
for (const [y, textParts] of textByY.entries()) {
179+
const textContent = textParts.join('');
180+
if (textContent.trim()) {
181+
contentItems.push({
182+
type: 'text',
183+
yPosition: y,
184+
textContent,
185+
});
186+
}
187+
}
188+
// Extract images with Y-coordinates if requested
189+
if (includeImages) {
190+
const operatorList = await page.getOperatorList();
191+
// Find all image painting operations
192+
const imageIndices = [];
193+
for (let i = 0; i < operatorList.fnArray.length; i++) {
194+
const op = operatorList.fnArray[i];
195+
if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
196+
imageIndices.push(i);
197+
}
198+
}
199+
// Extract each image with its Y-coordinate
200+
const imagePromises = imageIndices.map((imgIndex, arrayIndex) => new Promise((resolve) => {
201+
const argsArray = operatorList.argsArray[imgIndex];
202+
if (!argsArray || argsArray.length === 0) {
203+
resolve(null);
204+
return;
205+
}
206+
const imageName = argsArray[0];
207+
// Get transform matrix from the args (if available)
208+
// The transform is typically in argsArray[1] for some ops
209+
let yPosition = 0;
210+
if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
211+
const transform = argsArray[1];
212+
// transform[5] is the Y coordinate
213+
const yCoord = transform[5];
214+
if (yCoord !== undefined) {
215+
yPosition = Math.round(yCoord);
216+
}
217+
}
218+
// Use callback-based get() as images may not be resolved yet
219+
page.objs.get(imageName, (imageData) => {
220+
if (!imageData || typeof imageData !== 'object') {
221+
resolve(null);
222+
return;
223+
}
224+
const img = imageData;
225+
if (!img.data || !img.width || !img.height) {
226+
resolve(null);
227+
return;
228+
}
229+
// Determine image format based on kind
230+
const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
231+
// Convert Uint8Array to base64
232+
const base64 = Buffer.from(img.data).toString('base64');
233+
resolve({
234+
type: 'image',
235+
yPosition,
236+
imageData: {
237+
page: pageNum,
238+
index: arrayIndex,
239+
width: img.width,
240+
height: img.height,
241+
format,
242+
data: base64,
243+
},
244+
});
245+
});
246+
}));
247+
const resolvedImages = await Promise.all(imagePromises);
248+
contentItems.push(...resolvedImages.filter((item) => item !== null));
249+
}
250+
}
251+
catch (error) {
252+
const message = error instanceof Error ? error.message : String(error);
253+
console.warn(`[PDF Reader MCP] Error extracting page content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
254+
// Return error message as text content
255+
return [
256+
{
257+
type: 'text',
258+
yPosition: 0,
259+
textContent: `Error processing page: ${message}`,
260+
},
261+
];
262+
}
263+
// Sort by Y-position (descending = top to bottom in PDF coordinates)
264+
return contentItems.sort((a, b) => b.yPosition - a.yPosition);
265+
};

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@sylphx/pdf-reader-mcp",
3-
"version": "1.1.0",
3+
"version": "1.2.0",
44
"description": "An MCP server providing tools to read PDF files.",
55
"type": "module",
66
"bin": {

0 commit comments

Comments
 (0)