@@ -70,6 +70,48 @@ async function processDocument(
7070 } ) ;
7171 return submitResult ;
7272 } catch ( error ) {
73+ // Handle PDF validation errors with specific guidance
74+ if (
75+ error instanceof Error &&
76+ error . message ?. includes ( 'Not a valid PDF file' )
77+ ) {
78+ return createErrorResponse (
79+ error . message ,
80+ { } ,
81+ {
82+ next_steps : {
83+ immediate : 'The file is not a valid PDF document.' ,
84+ options : [
85+ 'Verify the URL points directly to a PDF file, not a webpage' ,
86+ 'Check if the URL requires authentication or specific headers' ,
87+ 'Try accessing the URL in a browser to see what content it returns' ,
88+ ] ,
89+ auto_retry : 'You can retry with a valid PDF URL' ,
90+ } ,
91+ } ,
92+ ) ;
93+ }
94+
95+ // Handle arxiv-specific retry failures
96+ if ( error instanceof Error && error . name === 'ArxivRetryFailed' ) {
97+ return createErrorResponse (
98+ error . message ,
99+ { } ,
100+ {
101+ next_steps : {
102+ immediate :
103+ 'Failed to retrieve PDF from arXiv. Both original URL and .pdf suffix were tried.' ,
104+ options : [
105+ 'Verify the arXiv paper ID is correct (format: YYMM.NNNNN)' ,
106+ 'Try the direct PDF URL: https://arxiv.org/pdf/PAPER_ID.pdf' ,
107+ 'Check if the paper exists by visiting https://arxiv.org/abs/PAPER_ID' ,
108+ ] ,
109+ auto_retry : 'You can retry with the correct arXiv URL format' ,
110+ } ,
111+ } ,
112+ ) ;
113+ }
114+
73115 return createErrorResponse (
74116 error instanceof Error ? error . message : 'Unknown error occurred' ,
75117 { } ,
@@ -119,6 +161,11 @@ async function readLocalPdf(filePath: string): Promise<FileInfo> {
119161
120162 const buffer = await fs . readFile ( resolvedPath ) ;
121163
164+ // Validate PDF magic bytes
165+ if ( ! buffer . subarray ( 0 , 4 ) . equals ( Buffer . from ( '%PDF' ) ) ) {
166+ throw new Error ( `Not a valid PDF file: ${ fileName } ` ) ;
167+ }
168+
122169 return {
123170 name : fileName ,
124171 size : buffer . length ,
@@ -128,17 +175,56 @@ async function readLocalPdf(filePath: string): Promise<FileInfo> {
128175}
129176
130177/**
131- * Download a PDF from a remote URL
178+ * Download a PDF from a remote URL with arXiv compatibility and validation
132179 */
133180async function downloadPdf ( url : string ) : Promise < FileInfo > {
134181 return pRetry (
135182 async ( ) => {
136- const response = await fetch ( url , {
137- signal : AbortSignal . timeout ( 120000 ) , // 2 minute timeout
138- } ) ;
183+ const fetchWithRetry = async ( fetchUrl : string ) : Promise < Response > => {
184+ const response = await fetch ( fetchUrl , {
185+ signal : AbortSignal . timeout ( 120000 ) , // 2 minute timeout
186+ headers : {
187+ Accept : 'application/pdf, application/octet-stream, */*' ,
188+ 'User-Agent' : 'Mozilla/5.0 (compatible; PDF-Processor/1.0)' ,
189+ } ,
190+ } ) ;
139191
140- if ( ! response . ok ) {
141- throw new Error ( `HTTP ${ response . status } : ${ response . statusText } ` ) ;
192+ if ( ! response . ok ) {
193+ throw new Error ( `HTTP ${ response . status } : ${ response . statusText } ` ) ;
194+ }
195+
196+ return response ;
197+ } ;
198+
199+ let response : Response ;
200+ try {
201+ response = await fetchWithRetry ( url ) ;
202+ } catch ( error : any ) {
203+ // For arxiv.org URLs, try adding .pdf suffix if original request failed
204+ if ( url . includes ( 'arxiv.org' ) && ! url . endsWith ( '.pdf' ) ) {
205+ console . log (
206+ `Initial request failed for arxiv URL: ${ url } , retrying with .pdf suffix` ,
207+ ) ;
208+ const retryUrl = url . endsWith ( '/' ) ? url + 'pdf' : url + '.pdf' ;
209+
210+ try {
211+ response = await fetchWithRetry ( retryUrl ) ;
212+ console . log (
213+ `Successfully retrieved PDF from retry URL: ${ retryUrl } ` ,
214+ ) ;
215+ } catch ( retryError : any ) {
216+ console . log (
217+ `Retry with .pdf suffix also failed: ${ retryError . message } ` ,
218+ ) ;
219+ const enhancedError = new AbortError (
220+ `Failed to retrieve PDF from ${ url } . Tried both original URL and ${ retryUrl } ` ,
221+ ) ;
222+ enhancedError . name = 'ArxivRetryFailed' ;
223+ throw enhancedError ;
224+ }
225+ } else {
226+ throw error ;
227+ }
142228 }
143229
144230 // Check content length
@@ -149,7 +235,7 @@ async function downloadPdf(url: string): Promise<FileInfo> {
149235
150236 // Extract filename from URL or Content-Disposition header
151237 let filename = path . basename ( new URL ( url ) . pathname ) ;
152- const contentDisposition = response . headers [ 'content-disposition' ] ;
238+ const contentDisposition = response . headers . get ( 'content-disposition' ) ;
153239 if ( contentDisposition ) {
154240 const filenameMatch = contentDisposition . match (
155241 / f i l e n a m e [ ^ ; = \n ] * = ( ( [ ' " ] ) .* ?\2| [ ^ ; \n ] * ) / ,
@@ -159,18 +245,34 @@ async function downloadPdf(url: string): Promise<FileInfo> {
159245 }
160246 }
161247
248+ // Ensure filename has .pdf extension if not present
249+ if ( ! filename . toLowerCase ( ) . endsWith ( '.pdf' ) ) {
250+ filename = filename + '.pdf' ;
251+ }
252+
162253 const contentType = response . headers . get ( 'content-type' ) ;
163- if (
164- ! contentType ?. includes ( 'pdf' ) &&
165- ! filename . toLowerCase ( ) . endsWith ( '.pdf' )
166- ) {
254+
255+ const arrayBuffer = await response . arrayBuffer ( ) ;
256+ const buffer = Buffer . from ( arrayBuffer ) ;
257+
258+ // Validate PDF magic bytes
259+ if ( ! buffer . subarray ( 0 , 4 ) . equals ( Buffer . from ( '%PDF' ) ) ) {
167260 throw new AbortError (
168- `File must be a PDF. Got content-type: ${ contentType } , filename: ${ filename } ` ,
261+ `Not a valid PDF file . Got content-type: ${ contentType } , filename: ${ filename } ` ,
169262 ) ;
170263 }
171264
172- const arrayBuffer = await response . arrayBuffer ( ) ;
173- const buffer = Buffer . from ( arrayBuffer ) ;
265+ // Additional content-type validation (more lenient after magic byte check)
266+ if (
267+ contentType &&
268+ ! contentType . includes ( 'pdf' ) &&
269+ ! contentType . includes ( 'octet-stream' ) &&
270+ ! contentType . includes ( 'application/pdf' )
271+ ) {
272+ console . warn (
273+ `Unexpected content-type: ${ contentType } , but PDF magic bytes validated` ,
274+ ) ;
275+ }
174276
175277 return {
176278 name : filename ,
@@ -200,6 +302,11 @@ async function downloadPdf(url: string): Promise<FileInfo> {
200302 }
201303 }
202304
305+ // Don't retry ArxivRetryFailed errors
306+ if ( error . name === 'ArxivRetryFailed' ) {
307+ throw error ;
308+ }
309+
203310 console . warn (
204311 `PDF download attempt ${ error . attemptNumber } failed. ${ error . retriesLeft } retries left. URL: ${ url } ` ,
205312 ) ;
0 commit comments