Skip to content

Commit d6ba2f4

Browse files
committed
feat: improve PDF validation and arXiv URL handling with retry logic
1 parent 0dc9577 commit d6ba2f4

File tree

1 file changed

+121
-14
lines changed

1 file changed

+121
-14
lines changed

src/tools/process-document.ts

Lines changed: 121 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,48 @@ async function processDocument(
7070
});
7171
return submitResult;
7272
} catch (error) {
73+
// Handle PDF validation errors with specific guidance
74+
if (
75+
error instanceof Error &&
76+
error.message?.includes('Not a valid PDF file')
77+
) {
78+
return createErrorResponse(
79+
error.message,
80+
{},
81+
{
82+
next_steps: {
83+
immediate: 'The file is not a valid PDF document.',
84+
options: [
85+
'Verify the URL points directly to a PDF file, not a webpage',
86+
'Check if the URL requires authentication or specific headers',
87+
'Try accessing the URL in a browser to see what content it returns',
88+
],
89+
auto_retry: 'You can retry with a valid PDF URL',
90+
},
91+
},
92+
);
93+
}
94+
95+
// Handle arxiv-specific retry failures
96+
if (error instanceof Error && error.name === 'ArxivRetryFailed') {
97+
return createErrorResponse(
98+
error.message,
99+
{},
100+
{
101+
next_steps: {
102+
immediate:
103+
'Failed to retrieve PDF from arXiv. Both original URL and .pdf suffix were tried.',
104+
options: [
105+
'Verify the arXiv paper ID is correct (format: YYMM.NNNNN)',
106+
'Try the direct PDF URL: https://arxiv.org/pdf/PAPER_ID.pdf',
107+
'Check if the paper exists by visiting https://arxiv.org/abs/PAPER_ID',
108+
],
109+
auto_retry: 'You can retry with the correct arXiv URL format',
110+
},
111+
},
112+
);
113+
}
114+
73115
return createErrorResponse(
74116
error instanceof Error ? error.message : 'Unknown error occurred',
75117
{},
@@ -119,6 +161,11 @@ async function readLocalPdf(filePath: string): Promise<FileInfo> {
119161

120162
const buffer = await fs.readFile(resolvedPath);
121163

164+
// Validate PDF magic bytes
165+
if (!buffer.subarray(0, 4).equals(Buffer.from('%PDF'))) {
166+
throw new Error(`Not a valid PDF file: ${fileName}`);
167+
}
168+
122169
return {
123170
name: fileName,
124171
size: buffer.length,
@@ -128,17 +175,56 @@ async function readLocalPdf(filePath: string): Promise<FileInfo> {
128175
}
129176

130177
/**
131-
* Download a PDF from a remote URL
178+
* Download a PDF from a remote URL; for arXiv links that fail, retry once with a .pdf suffix, and verify the %PDF magic bytes of the downloaded content
132179
*/
133180
async function downloadPdf(url: string): Promise<FileInfo> {
134181
return pRetry(
135182
async () => {
136-
const response = await fetch(url, {
137-
signal: AbortSignal.timeout(120000), // 2 minute timeout
138-
});
183+
const fetchWithRetry = async (fetchUrl: string): Promise<Response> => {
184+
const response = await fetch(fetchUrl, {
185+
signal: AbortSignal.timeout(120000), // 2 minute timeout
186+
headers: {
187+
Accept: 'application/pdf, application/octet-stream, */*',
188+
'User-Agent': 'Mozilla/5.0 (compatible; PDF-Processor/1.0)',
189+
},
190+
});
139191

140-
if (!response.ok) {
141-
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
192+
if (!response.ok) {
193+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
194+
}
195+
196+
return response;
197+
};
198+
199+
let response: Response;
200+
try {
201+
response = await fetchWithRetry(url);
202+
} catch (error: any) {
203+
// For arxiv.org URLs that do not already end in .pdf, retry once with ".pdf" appended (or "pdf" after a trailing slash) when the original request fails
204+
if (url.includes('arxiv.org') && !url.endsWith('.pdf')) {
205+
console.log(
206+
`Initial request failed for arxiv URL: ${url}, retrying with .pdf suffix`,
207+
);
208+
const retryUrl = url.endsWith('/') ? url + 'pdf' : url + '.pdf';
209+
210+
try {
211+
response = await fetchWithRetry(retryUrl);
212+
console.log(
213+
`Successfully retrieved PDF from retry URL: ${retryUrl}`,
214+
);
215+
} catch (retryError: any) {
216+
console.log(
217+
`Retry with .pdf suffix also failed: ${retryError.message}`,
218+
);
219+
const enhancedError = new AbortError(
220+
`Failed to retrieve PDF from ${url}. Tried both original URL and ${retryUrl}`,
221+
);
222+
enhancedError.name = 'ArxivRetryFailed';
223+
throw enhancedError;
224+
}
225+
} else {
226+
throw error;
227+
}
142228
}
143229

144230
// Check content length
@@ -149,7 +235,7 @@ async function downloadPdf(url: string): Promise<FileInfo> {
149235

150236
// Extract filename from URL or Content-Disposition header
151237
let filename = path.basename(new URL(url).pathname);
152-
const contentDisposition = response.headers['content-disposition'];
238+
const contentDisposition = response.headers.get('content-disposition');
153239
if (contentDisposition) {
154240
const filenameMatch = contentDisposition.match(
155241
/filename[^;=\n]*=((['"]).*?\2|[^;\n]*)/,
@@ -159,18 +245,34 @@ async function downloadPdf(url: string): Promise<FileInfo> {
159245
}
160246
}
161247

248+
// Ensure filename has .pdf extension if not present
249+
if (!filename.toLowerCase().endsWith('.pdf')) {
250+
filename = filename + '.pdf';
251+
}
252+
162253
const contentType = response.headers.get('content-type');
163-
if (
164-
!contentType?.includes('pdf') &&
165-
!filename.toLowerCase().endsWith('.pdf')
166-
) {
254+
255+
const arrayBuffer = await response.arrayBuffer();
256+
const buffer = Buffer.from(arrayBuffer);
257+
258+
// Validate PDF magic bytes
259+
if (!buffer.subarray(0, 4).equals(Buffer.from('%PDF'))) {
167260
throw new AbortError(
168-
`File must be a PDF. Got content-type: ${contentType}, filename: ${filename}`,
261+
`Not a valid PDF file. Got content-type: ${contentType}, filename: ${filename}`,
169262
);
170263
}
171264

172-
const arrayBuffer = await response.arrayBuffer();
173-
const buffer = Buffer.from(arrayBuffer);
265+
// Additional content-type validation (more lenient after magic byte check)
266+
if (
267+
contentType &&
268+
!contentType.includes('pdf') &&
269+
!contentType.includes('octet-stream') &&
270+
!contentType.includes('application/pdf')
271+
) {
272+
console.warn(
273+
`Unexpected content-type: ${contentType}, but PDF magic bytes validated`,
274+
);
275+
}
174276

175277
return {
176278
name: filename,
@@ -200,6 +302,11 @@ async function downloadPdf(url: string): Promise<FileInfo> {
200302
}
201303
}
202304

305+
// Don't retry ArxivRetryFailed errors
306+
if (error.name === 'ArxivRetryFailed') {
307+
throw error;
308+
}
309+
203310
console.warn(
204311
`PDF download attempt ${error.attemptNumber} failed. ${error.retriesLeft} retries left. URL: ${url}`,
205312
);

0 commit comments

Comments
 (0)