Skip to content

Commit c4508ac

Browse files
committed
Enhance the PDF saving logic in the executeStep function by intercepting network responses to capture PDF downloads. Add directory creation for saved files and improve error handling during the save process. Update logging for better visibility of PDF retrieval outcomes.
1 parent 64e2f61 commit c4508ac

File tree

1 file changed

+109
-227
lines changed

1 file changed

+109
-227
lines changed

src/step-executor.ts

Lines changed: 109 additions & 227 deletions
Original file line numberDiff line numberDiff line change
@@ -402,258 +402,140 @@ export async function executeStep(
402402

403403
const collectorKey = step.key || step.id || 'file';
404404
let savedPath: string | null = null;
405-
// After the guard above, we can safely treat step.value as string
406405
const targetPathBase: string = step.value as string;
406+
const resolvedPath: string = replaceDataPlaceholders(targetPathBase, collector) || targetPathBase;
407+
const dir = path.dirname(resolvedPath);
408+
if (!fs.existsSync(dir)) {
409+
fs.mkdirSync(dir, { recursive: true });
410+
}
411+
412+
let pdfSaved = false;
413+
const interceptedData: { buffer: Buffer | null } = { buffer: null };
407414

408415
try {
409416
// Ensure the page finished initial navigation
410417
try {
411418
await page.waitForLoadState('domcontentloaded', { timeout: step.wait ?? 600000 });
412419
} catch {}
413420

414-
// Try to resolve the direct PDF URL
415-
let pdfUrl: string | null = null;
416-
417-
// 1) If the current URL points to a PDF (anywhere in the URL), use it or extract from query
418-
const currentUrl = page.url();
419-
console.log(` 📄 Current URL: ${currentUrl}`);
420-
try {
421-
const u = new URL(currentUrl);
422-
const candidates = [
423-
u.searchParams.get('file'),
424-
u.searchParams.get('src'),
425-
u.searchParams.get('document'),
426-
u.searchParams.get('url')
427-
].filter(Boolean) as string[];
428-
const paramPdf = candidates.find(v => /\.pdf/i.test(v));
429-
if (paramPdf) {
430-
pdfUrl = new URL(paramPdf, u.href).toString();
421+
// Intercept responses to capture PDF even when displayed inline
422+
await page.route('**/*', async route => {
423+
const response = await route.fetch();
424+
const contentType = response.headers()['content-type'] || '';
425+
const url = route.request().url();
426+
427+
// Check if this is a PDF response
428+
if (contentType.includes('application/pdf') || url.includes('.pdf')) {
429+
const buffer = await response.body();
430+
if (!pdfSaved && buffer.length > 0) {
431+
interceptedData.buffer = buffer;
432+
// Save immediately when intercepted
433+
try {
434+
fs.writeFileSync(resolvedPath, buffer);
435+
savedPath = resolvedPath;
436+
pdfSaved = true;
437+
console.log(
438+
` 📄 PDF intercepted and saved (${(buffer.length / 1024).toFixed(2)} KB) to ${resolvedPath}`
439+
);
440+
} catch (saveErr: any) {
441+
console.log(` 📄 Failed to save intercepted PDF: ${saveErr.message}`);
442+
}
443+
}
431444
}
432-
} catch {}
433-
if (!pdfUrl && /\.pdf/i.test(currentUrl)) {
434-
pdfUrl = currentUrl;
435-
}
436445

437-
// 2) Otherwise, try to discover PDF source from common viewer elements
438-
if (!pdfUrl) {
439-
try {
440-
pdfUrl = await page.evaluate(() => {
441-
const getAbs = (src?: string | null) => {
442-
if (!src) return null;
443-
try {
444-
return new URL(src, window.location.href).toString();
445-
} catch {
446-
return src;
447-
}
448-
};
449-
450-
const embed = document.querySelector('embed[type="application/pdf"]') as HTMLObjectElement | null;
451-
if (embed && embed.getAttribute('src')) return getAbs(embed.getAttribute('src'));
452-
453-
const objectEl = document.querySelector('object[type="application/pdf"]') as HTMLObjectElement | null;
454-
if (objectEl && objectEl.getAttribute('data')) return getAbs(objectEl.getAttribute('data'));
455-
456-
const iframe = Array.from(document.querySelectorAll('iframe')).find(f => {
457-
const s = f.getAttribute('src') || '';
458-
return /\.pdf/i.test(s) || s.includes('pdf');
459-
}) as HTMLIFrameElement | undefined;
460-
if (iframe && iframe.getAttribute('src')) return getAbs(iframe.getAttribute('src'));
461-
462-
return null;
463-
});
464-
} catch {}
465-
}
466-
467-
// 3) Additional wait if requested (helps some viewers populate 'src')
468-
if (!pdfUrl && step.wait && step.wait > 0) {
469-
await page.waitForTimeout(step.wait);
470-
try {
471-
// Try again once after waiting
472-
pdfUrl = await page.evaluate(() => {
473-
const iframe = Array.from(document.querySelectorAll('iframe')).find(f => f.getAttribute('src')) as HTMLIFrameElement | undefined;
474-
return iframe?.src || null;
475-
});
476-
} catch {}
477-
}
446+
// Continue with the normal response
447+
await route.fulfill({ response });
448+
});
478449

479-
// If we couldn't find a PDF URL, abort instead of rendering HTML with page.pdf
480-
if (!pdfUrl) {
481-
console.log(' 📄 Direct PDF URL not found. Skipping save (no page.pdf fallback).');
482-
break;
483-
}
450+
const currentUrl = page.url();
451+
console.log(` 📄 Current URL: ${currentUrl}`);
484452

485-
// Build candidate URLs and try them until one succeeds
486-
const candidates: string[] = [];
487-
const isAbsolute = /^https?:/i.test(pdfUrl);
488-
if (isAbsolute) {
489-
candidates.push(pdfUrl);
490-
} else {
491-
// 1) Same-origin resolution
492-
candidates.push(new URL(pdfUrl, currentUrl).toString());
453+
// Check if we're already on a PDF URL - wait a bit for interception
454+
const isPdfUrl = currentUrl.includes('.pdf') || /\.pdf(\?|$)/i.test(currentUrl);
455+
if (isPdfUrl && !pdfSaved) {
456+
// Wait a moment for route interception to catch the PDF if it's already loading
457+
await page.waitForTimeout(1000);
493458
}
494459

495-
// Log URLs for debugging
496-
console.log(` 📄 Current URL: ${currentUrl}`);
497-
console.log(` 📄 Candidate PDF URLs:`, candidates);
498-
// Download the first successful candidate
499-
let downloadedBuffer: Buffer | null = null;
500-
for (const candidateUrl of candidates) {
501-
try {
502-
const ctx = page.context();
503-
const cookies = await ctx.cookies(candidateUrl);
504-
const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
505-
const api = await request.newContext({
506-
extraHTTPHeaders: {
507-
...(cookieHeader ? { Cookie: cookieHeader } : {}),
508-
Referer: currentUrl,
509-
'User-Agent': 'Mozilla/5.0'
460+
// Try both approaches: wait for download event OR intercept response
461+
try {
462+
// Reload the page to trigger route interception (unless already saved)
463+
const [response, download] = await Promise.all([
464+
!pdfSaved ? page.reload({ waitUntil: 'networkidle' }).catch(() => null) : Promise.resolve(null),
465+
page.waitForEvent('download', { timeout: 5000 }).catch(() => null)
466+
]);
467+
468+
if (download) {
469+
// If download event occurred, save it
470+
await download.saveAs(resolvedPath);
471+
savedPath = resolvedPath;
472+
pdfSaved = true;
473+
console.log(` 📄 PDF saved via download event to ${resolvedPath}`);
474+
} else if (response) {
475+
// Check if the response itself is a PDF
476+
const contentType = response.headers()['content-type'] || '';
477+
if (contentType.includes('application/pdf') && !pdfSaved) {
478+
const buffer = await response.body();
479+
if (buffer.length > 0) {
480+
fs.writeFileSync(resolvedPath, buffer);
481+
savedPath = resolvedPath;
482+
pdfSaved = true;
483+
console.log(
484+
` 📄 PDF saved via response body (${(buffer.length / 1024).toFixed(2)} KB) to ${resolvedPath}`
485+
);
510486
}
511-
});
512-
const res = await api.get(candidateUrl);
513-
if (res.ok()) {
514-
downloadedBuffer = await res.body();
515-
await api.dispose();
516-
pdfUrl = candidateUrl; // final URL used
517-
break;
518487
} else {
519-
console.log(` 📄 GET ${candidateUrl} -> ${res.status()} ${res.statusText()}`);
520-
await api.dispose();
521-
}
522-
} catch (e: any) {
523-
console.log(` 📄 GET ${candidateUrl} failed: ${e.message}`);
524-
}
525-
}
526-
if (!downloadedBuffer) {
527-
console.log(' 📄 All candidate PDF URLs failed. Trying viewer download fallback...');
528-
529-
// Strategy 1: Try to extract PDF URL from embed element and fetch directly
530-
let saved = false;
531-
532-
try {
533-
const embedPdfUrl = await page.evaluate(() => {
534-
const embed = document.querySelector('embed[type="application/x-google-chrome-pdf"]') as HTMLEmbedElement;
535-
if (embed && embed.getAttribute('original-url')) {
536-
return embed.getAttribute('original-url');
537-
}
538-
return null;
539-
});
540-
541-
if (embedPdfUrl) {
542-
console.log(` 📄 Found PDF URL in embed: ${embedPdfUrl}`);
543-
try {
544-
const ctx = page.context();
545-
const cookies = await ctx.cookies(embedPdfUrl);
546-
const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
547-
const api = await request.newContext({
548-
extraHTTPHeaders: {
549-
...(cookieHeader ? { Cookie: cookieHeader } : {}),
550-
Referer: currentUrl,
551-
'User-Agent': 'Mozilla/5.0'
552-
}
553-
});
554-
const res = await api.get(embedPdfUrl);
555-
if (res.ok()) {
556-
const body = await res.body();
557-
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
558-
const dir = path.dirname(resolvedPath);
559-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
560-
fs.writeFileSync(resolvedPath, body);
561-
savedPath = resolvedPath;
562-
console.log(` 📄 PDF saved from embed URL to ${resolvedPath}`);
563-
await api.dispose();
564-
saved = true;
565-
}
566-
await api.dispose();
567-
} catch (e: any) {
568-
console.log(` 📄 Failed to fetch from embed URL: ${e.message}`);
569-
}
570-
}
571-
} catch (e: any) {
572-
console.log(` 📄 Embed extraction failed: ${e.message}`);
573-
}
574-
575-
// Strategy 2: Wait for PDF viewer and click download button
576-
if (!saved) {
577-
console.log(' 📄 Waiting for PDF viewer to load...');
578-
try {
579-
// Wait for pdf-viewer element to appear
580-
await page.waitForSelector('pdf-viewer', { timeout: 10000 }).catch(() => null);
581-
await page.waitForTimeout(2000); // Additional wait for shadow DOMs to initialize
582-
583-
console.log(' 📄 Searching for download button...');
584-
const clicked = await page.evaluate(() => {
585-
// Try direct selector path
586-
const pdfViewer = document.querySelector('pdf-viewer');
587-
if (pdfViewer && pdfViewer.shadowRoot) {
588-
const toolbar = pdfViewer.shadowRoot.querySelector('viewer-toolbar');
589-
if (toolbar && toolbar.shadowRoot) {
590-
const downloadControls = toolbar.shadowRoot.querySelector('viewer-download-controls');
591-
if (downloadControls && downloadControls.shadowRoot) {
592-
const saveButton = downloadControls.shadowRoot.querySelector('#save') as HTMLElement;
593-
if (saveButton) {
594-
saveButton.click();
595-
return true;
596-
}
597-
}
598-
}
599-
}
600-
601-
// Fallback: recursive search
602-
function findAndClickDownload(root: Element | Document | ShadowRoot): boolean {
603-
const walker = document.createTreeWalker(root as Node, NodeFilter.SHOW_ELEMENT);
604-
let node: Node | null;
605-
while ((node = walker.nextNode())) {
606-
const el = node as HTMLElement;
607-
if (el.id === 'save' || el.id === 'download') {
608-
el.click();
609-
return true;
610-
}
611-
if (el.shadowRoot) {
612-
if (findAndClickDownload(el.shadowRoot)) return true;
613-
}
614-
}
615-
return false;
616-
}
617-
return findAndClickDownload(document);
618-
}).catch(() => false);
619-
620-
if (clicked) {
621-
console.log(' 📄 Found and clicked download button! Waiting for download...');
622-
const dl = await page.waitForEvent('download', { timeout: 10000 }).catch(() => null);
623-
if (dl) {
624-
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
625-
const dir = path.dirname(resolvedPath);
626-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
627-
await dl.saveAs(resolvedPath);
628-
savedPath = resolvedPath;
629-
console.log(` 📄 PDF saved via download button to ${resolvedPath}`);
630-
saved = true;
631-
} else {
632-
console.log(' 📄 Download button clicked but no download event received.');
633-
}
634-
} else {
635-
console.log(' 📄 Download button not found.');
488+
// Wait a bit for route interception to capture it
489+
await page.waitForTimeout(2000);
490+
if (interceptedData.buffer && !pdfSaved && interceptedData.buffer.length > 0) {
491+
fs.writeFileSync(resolvedPath, interceptedData.buffer);
492+
savedPath = resolvedPath;
493+
pdfSaved = true;
494+
console.log(
495+
` 📄 PDF saved via intercepted response (${(interceptedData.buffer.length / 1024).toFixed(2)} KB) to ${resolvedPath}`
496+
);
636497
}
637-
} catch (e: any) {
638-
console.log(` 📄 PDF viewer approach failed: ${e.message}`);
639498
}
499+
} else if (interceptedData.buffer && !pdfSaved && interceptedData.buffer.length > 0) {
500+
// Fallback: use intercepted buffer
501+
fs.writeFileSync(resolvedPath, interceptedData.buffer);
502+
savedPath = resolvedPath;
503+
pdfSaved = true;
504+
console.log(
505+
` 📄 PDF saved via intercepted response (${(interceptedData.buffer.length / 1024).toFixed(2)} KB) to ${resolvedPath}`
506+
);
640507
}
641-
642-
if (!saved) {
643-
console.log(' 📄 All viewer download fallbacks failed.');
508+
} catch (error: any) {
509+
console.log(` 📄 Error during PDF save: ${error.message}`);
510+
// Still try to save intercepted buffer if available
511+
if (interceptedData.buffer && !pdfSaved && interceptedData.buffer.length > 0) {
512+
fs.writeFileSync(resolvedPath, interceptedData.buffer);
513+
savedPath = resolvedPath;
514+
pdfSaved = true;
515+
console.log(
516+
` 📄 PDF saved via intercepted response (${(interceptedData.buffer.length / 1024).toFixed(2)} KB) to ${resolvedPath}`
517+
);
644518
}
645-
} else {
646-
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
647-
const dir = path.dirname(resolvedPath);
648-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
649-
fs.writeFileSync(resolvedPath, downloadedBuffer);
650-
savedPath = resolvedPath;
651-
console.log(` 📄 PDF saved to ${resolvedPath} (from ${pdfUrl})`);
652519
}
653520
} catch (err: any) {
654521
console.log(` 📄 savePDF failed: ${err.message}`);
655522
} finally {
523+
// Unroute to clean up
524+
try {
525+
await page.unroute('**/*');
526+
} catch {}
527+
528+
// Verify file was saved
529+
if (!pdfSaved && savedPath && fs.existsSync(savedPath)) {
530+
pdfSaved = true;
531+
}
532+
656533
collector[collectorKey] = savedPath;
534+
if (pdfSaved || savedPath) {
535+
console.log(` ✓ PDF successfully saved to ${savedPath}`);
536+
} else {
537+
console.log(` ✗ Failed to save PDF`);
538+
}
657539
}
658540
break;
659541
}

0 commit comments

Comments
 (0)