11// PDF reading handler - orchestrates PDF processing workflow
22import { ErrorCode , McpError } from '@modelcontextprotocol/sdk/types.js' ;
33import { z } from 'zod' ;
4- import { buildWarnings , extractImages , extractMetadataAndPageCount , extractPageTexts , } from '../pdf/extractor.js' ;
4+ import { buildWarnings , extractMetadataAndPageCount , extractPageContent , } from '../pdf/extractor.js' ;
55import { loadPdfDocument } from '../pdf/loader.js' ;
66import { determinePagesToProcess , getTargetPages } from '../pdf/parser.js' ;
77import { readPdfArgsSchema } from '../schemas/readPdf.js' ;
@@ -28,9 +28,23 @@ const processSingleSource = async (source, options) => {
2828 if ( warnings . length > 0 ) {
2929 output . warnings = warnings ;
3030 }
31- // Extract text if needed
31+ // Extract content with ordering preserved
3232 if ( pagesToProcess . length > 0 ) {
33- const extractedPageTexts = await extractPageTexts ( pdfDocument , pagesToProcess , sourceDescription ) ;
33+ // Use new extractPageContent to preserve Y-coordinate ordering
34+ const pageContents = await Promise . all ( pagesToProcess . map ( ( pageNum ) => extractPageContent ( pdfDocument , pageNum , options . includeImages , sourceDescription ) ) ) ;
35+ // Store page contents for ordered retrieval
36+ output . page_contents = pageContents . map ( ( items , idx ) => ( {
37+ page : pagesToProcess [ idx ] ,
38+ items,
39+ } ) ) ;
40+ // For backward compatibility, also provide text-only outputs
41+ const extractedPageTexts = pageContents . map ( ( items , idx ) => ( {
42+ page : pagesToProcess [ idx ] ,
43+ text : items
44+ . filter ( ( item ) => item . type === 'text' )
45+ . map ( ( item ) => item . textContent )
46+ . join ( '' ) ,
47+ } ) ) ;
3448 if ( targetPages ) {
3549 // Specific pages requested
3650 output . page_texts = extractedPageTexts ;
@@ -39,12 +53,15 @@ const processSingleSource = async (source, options) => {
3953 // Full text requested
4054 output . full_text = extractedPageTexts . map ( ( p ) => p . text ) . join ( '\n\n' ) ;
4155 }
42- }
43- // Extract images if needed
44- if ( options . includeImages && pagesToProcess . length > 0 ) {
45- const extractedImages = await extractImages ( pdfDocument , pagesToProcess ) ;
46- if ( extractedImages . length > 0 ) {
47- output . images = extractedImages ;
56+ // Extract image metadata for JSON response
57+ if ( options . includeImages ) {
58+ const extractedImages = pageContents
59+ . flatMap ( ( items ) => items . filter ( ( item ) => item . type === 'image' && item . imageData ) )
60+ . map ( ( item ) => item . imageData )
61+ . filter ( ( img ) => img !== undefined ) ;
62+ if ( extractedImages . length > 0 ) {
63+ output . images = extractedImages ;
64+ }
4865 }
4966 }
5067 individualResult = { ...individualResult , data : output , success : true } ;
@@ -89,60 +106,52 @@ export const handleReadPdfFunc = async (args) => {
89106 includePageCount : include_page_count ,
90107 includeImages : include_images ,
91108 } ) ) ) ;
92- // Build content parts preserving page order
109+ // Build content parts - start with structured JSON for backward compatibility
93110 const content = [ ] ;
94- // Add metadata/summary as first text part
95- const summaryData = results . map ( ( result ) => ( {
96- source : result . source ,
97- success : result . success ,
98- num_pages : result . data ?. num_pages ,
99- info : result . data ?. info ,
100- metadata : result . data ?. metadata ,
101- warnings : result . data ?. warnings ,
102- error : result . error ,
103- } ) ) ;
111+ // Strip image data and page_contents from JSON to keep it manageable
112+ const resultsForJson = results . map ( ( result ) => {
113+ if ( result . data ) {
114+ const { images, page_contents, ...dataWithoutBinaryContent } = result . data ;
115+ // Include image count and metadata in JSON, but not the base64 data
116+ if ( images ) {
117+ const imageInfo = images . map ( ( img ) => ( {
118+ page : img . page ,
119+ index : img . index ,
120+ width : img . width ,
121+ height : img . height ,
122+ format : img . format ,
123+ } ) ) ;
124+ return { ...result , data : { ...dataWithoutBinaryContent , image_info : imageInfo } } ;
125+ }
126+ return { ...result , data : dataWithoutBinaryContent } ;
127+ }
128+ return result ;
129+ } ) ;
130+ // First content part: Structured JSON results
104131 content . push ( {
105132 type : 'text' ,
106- text : JSON . stringify ( { summary : summaryData } , null , 2 ) ,
133+ text : JSON . stringify ( { results : resultsForJson } , null , 2 ) ,
107134 } ) ;
108- // Add page content in order: text then images for each page
135+ // Add page content in exact Y-coordinate order
109136 for ( const result of results ) {
110- if ( ! result . success || ! result . data )
137+ if ( ! result . success || ! result . data ?. page_contents )
111138 continue ;
112- // Handle page_texts (specific pages requested)
113- if ( result . data . page_texts ) {
114- for ( const pageText of result . data . page_texts ) {
115- // Add text for this page
116- content . push ( {
117- type : 'text' ,
118- text : `[Page ${ pageText . page } from ${ result . source } ]\n${ pageText . text } ` ,
119- } ) ;
120- // Add images for this page (if any)
121- if ( result . data . images ) {
122- const pageImages = result . data . images . filter ( ( img ) => img . page === pageText . page ) ;
123- for ( const image of pageImages ) {
124- content . push ( {
125- type : 'image' ,
126- data : image . data ,
127- mimeType : image . format === 'rgba' ? 'image/png' : 'image/jpeg' ,
128- } ) ;
129- }
139+ // Process each page's content items in order
140+ for ( const pageContent of result . data . page_contents ) {
141+ for ( const item of pageContent . items ) {
142+ if ( item . type === 'text' && item . textContent ) {
143+ // Add text content part
144+ content . push ( {
145+ type : 'text' ,
146+ text : item . textContent ,
147+ } ) ;
130148 }
131- }
132- }
133- // Handle full_text (all pages)
134- if ( result . data . full_text ) {
135- content . push ( {
136- type : 'text' ,
137- text : `[Full text from ${ result . source } ]\n${ result . data . full_text } ` ,
138- } ) ;
139- // Add all images at the end for full text mode
140- if ( result . data . images ) {
141- for ( const image of result . data . images ) {
149+ else if ( item . type === 'image' && item . imageData ) {
150+ // Add image content part
142151 content . push ( {
143152 type : 'image' ,
144- data : image . data ,
145- mimeType : image . format === 'rgba' ? 'image/png' : 'image/jpeg' ,
153+ data : item . imageData . data ,
154+ mimeType : item . imageData . format === 'rgba' ? 'image/png' : 'image/jpeg' ,
146155 } ) ;
147156 }
148157 }
0 commit comments