Skip to content

Commit ffd56cc

Browse files
committed
Refactor PDF retrieval logic in the executeStep function to enhance download strategies. Implement extraction of PDF URLs from embed elements and improve handling of viewer download buttons. Add detailed logging for better debugging and ensure proper directory creation for saved files.
1 parent 0c11af3 commit ffd56cc

File tree

1 file changed

+97
-137
lines changed

1 file changed

+97
-137
lines changed

src/step-executor.ts

Lines changed: 97 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -490,47 +490,11 @@ export async function executeStep(
490490
} else {
491491
// 1) Same-origin resolution
492492
candidates.push(new URL(pdfUrl, currentUrl).toString());
493-
494-
// 2) Granicus S3 pattern: <prefix>/<filename>.pdf
495-
// Example filename: queencreekaz_<hash>.pdf -> folder "queencreekaz"
496-
const m = pdfUrl.match(/^([a-z0-9-]+)_(.+\.pdf)$/i);
497-
if (m) {
498-
const city = m[1];
499-
const fileName = `${m[1]}_${m[2]}`; // full filename again
500-
candidates.push(`https://granicus_production_attachments.s3.amazonaws.com/${city}/${fileName}`);
501-
}
502493
}
503494

504-
// 3) If current page is a Granicus DocumentViewer, try explicit download query param variants
505-
try {
506-
const urlObj = new URL(currentUrl);
507-
if (/DocumentViewer\.php$/i.test(urlObj.pathname) && urlObj.searchParams.has('file')) {
508-
const origin = `${urlObj.protocol}//${urlObj.host}`;
509-
const fileParam = urlObj.searchParams.get('file') as string;
510-
const baseViewer = `${origin}${urlObj.pathname}`;
511-
// Add explicit download query attempts
512-
const withDownload = new URL(baseViewer);
513-
withDownload.searchParams.set('file', fileParam);
514-
withDownload.searchParams.set('download', '1');
515-
candidates.push(withDownload.toString());
516-
517-
const withDownloadAndView = new URL(baseViewer);
518-
withDownloadAndView.searchParams.set('file', fileParam);
519-
withDownloadAndView.searchParams.set('view', urlObj.searchParams.get('view') || '1');
520-
withDownloadAndView.searchParams.set('download', '1');
521-
candidates.push(withDownloadAndView.toString());
522-
523-
// Also try direct origin + file param path as a last resort
524-
if (/\.pdf$/i.test(fileParam)) {
525-
candidates.push(`${origin}/${fileParam}`);
526-
}
527-
}
528-
} catch {}
529-
530495
// Log URLs for debugging
531496
console.log(` 📄 Current URL: ${currentUrl}`);
532497
console.log(` 📄 Candidate PDF URLs:`, candidates);
533-
534498
// Download the first successful candidate
535499
let downloadedBuffer: Buffer | null = null;
536500
for (const candidateUrl of candidates) {
@@ -561,126 +525,122 @@ export async function executeStep(
561525
}
562526
if (!downloadedBuffer) {
563527
console.log(' 📄 All candidate PDF URLs failed. Trying viewer download fallback...');
564-
// Main page attempt (deep shadow click only)
528+
529+
// Strategy 1: Try to extract PDF URL from embed element and fetch directly
565530
let saved = false;
566-
const clickedMain = await page.evaluate(async () => {
567-
const targetIds = ['download', 'save'];
568-
const visited = new Set<Node>();
569-
function tryClick(node: Node): boolean {
570-
if (visited.has(node)) return false;
571-
visited.add(node);
572-
const el = node as HTMLElement;
573-
if (el && el.id && targetIds.includes(el.id)) { el.click(); return true; }
574-
const elem = node as Element;
575-
if (!elem) return false;
576-
const sr = (elem as any).shadowRoot as ShadowRoot | undefined;
577-
if (sr) for (const child of Array.from(sr.children)) { if (tryClick(child)) return true; }
578-
for (const child of Array.from(elem.children)) { if (tryClick(child)) return true; }
579-
return false;
580-
}
581-
return tryClick(document.documentElement);
582-
}).catch(() => false as any);
583-
if (clickedMain) {
584-
const dl = await page.waitForEvent('download', { timeout: 5000 }).catch(() => null);
585-
if (dl) {
586-
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
587-
const dir = path.dirname(resolvedPath);
588-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
589-
await dl.saveAs(resolvedPath);
590-
savedPath = resolvedPath;
591-
console.log(` 📄 PDF saved via viewer download to ${resolvedPath}`);
592-
saved = true;
593-
}
594-
}
595-
596-
// Frames attempt
597-
if (!saved) {
598-
for (const frame of page.frames()) {
599-
if (frame === page.mainFrame()) continue;
600-
const clicked = await frame.evaluate(async () => {
601-
const targetIds = ['download', 'save'];
602-
const visited = new Set<Node>();
603-
function tryClick(node: Node): boolean {
604-
if (visited.has(node)) return false;
605-
visited.add(node);
606-
const el = node as HTMLElement;
607-
if (el && el.id && targetIds.includes(el.id)) { el.click(); return true; }
608-
const elem = node as Element;
609-
if (!elem) return false;
610-
const sr = (elem as any).shadowRoot as ShadowRoot | undefined;
611-
if (sr) for (const child of Array.from(sr.children)) { if (tryClick(child)) return true; }
612-
for (const child of Array.from(elem.children)) { if (tryClick(child)) return true; }
613-
return false;
614-
}
615-
return tryClick(document.documentElement);
616-
}).catch(() => false as any);
617-
if (clicked) {
618-
const dl = await page.waitForEvent('download', { timeout: 5000 }).catch(() => null);
619-
if (dl) {
531+
532+
try {
533+
const embedPdfUrl = await page.evaluate(() => {
534+
const embed = document.querySelector('embed[type="application/x-google-chrome-pdf"]') as HTMLEmbedElement;
535+
if (embed && embed.getAttribute('original-url')) {
536+
return embed.getAttribute('original-url');
537+
}
538+
return null;
539+
});
540+
541+
if (embedPdfUrl) {
542+
console.log(` 📄 Found PDF URL in embed: ${embedPdfUrl}`);
543+
try {
544+
const ctx = page.context();
545+
const cookies = await ctx.cookies(embedPdfUrl);
546+
const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
547+
const api = await request.newContext({
548+
extraHTTPHeaders: {
549+
...(cookieHeader ? { Cookie: cookieHeader } : {}),
550+
Referer: currentUrl,
551+
'User-Agent': 'Mozilla/5.0'
552+
}
553+
});
554+
const res = await api.get(embedPdfUrl);
555+
if (res.ok()) {
556+
const body = await res.body();
620557
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
621558
const dir = path.dirname(resolvedPath);
622559
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
623-
await dl.saveAs(resolvedPath);
560+
fs.writeFileSync(resolvedPath, body);
624561
savedPath = resolvedPath;
625-
console.log(` 📄 PDF saved via viewer download to ${resolvedPath}`);
562+
console.log(` 📄 PDF saved from embed URL to ${resolvedPath}`);
563+
await api.dispose();
626564
saved = true;
627-
break;
628565
}
566+
await api.dispose();
567+
} catch (e: any) {
568+
console.log(` 📄 Failed to fetch from embed URL: ${e.message}`);
629569
}
630570
}
571+
} catch (e: any) {
572+
console.log(` 📄 Embed extraction failed: ${e.message}`);
631573
}
632-
633-
// Non-click fallback: try to scrape a direct download link href and fetch it
574+
575+
// Strategy 2: Wait for PDF viewer and click download button
634576
if (!saved) {
577+
console.log(' 📄 Waiting for PDF viewer to load...');
635578
try {
636-
const hrefs = await page.evaluate(() => {
637-
const links: string[] = [];
638-
const anchors = Array.from(document.querySelectorAll('a')) as HTMLAnchorElement[];
639-
for (const a of anchors) {
640-
const text = (a.textContent || '').toLowerCase();
641-
const aria = (a.getAttribute('aria-label') || '').toLowerCase();
642-
if (a.hasAttribute('download') || text.includes('download') || aria.includes('download')) {
643-
if (a.href) links.push(a.href);
579+
// Wait for pdf-viewer element to appear
580+
await page.waitForSelector('pdf-viewer', { timeout: 10000 }).catch(() => null);
581+
await page.waitForTimeout(2000); // Additional wait for shadow DOMs to initialize
582+
583+
console.log(' 📄 Searching for download button...');
584+
const clicked = await page.evaluate(() => {
585+
// Try direct selector path
586+
const pdfViewer = document.querySelector('pdf-viewer');
587+
if (pdfViewer && pdfViewer.shadowRoot) {
588+
const toolbar = pdfViewer.shadowRoot.querySelector('viewer-toolbar');
589+
if (toolbar && toolbar.shadowRoot) {
590+
const downloadControls = toolbar.shadowRoot.querySelector('viewer-download-controls');
591+
if (downloadControls && downloadControls.shadowRoot) {
592+
const saveButton = downloadControls.shadowRoot.querySelector('#save') as HTMLElement;
593+
if (saveButton) {
594+
saveButton.click();
595+
return true;
596+
}
597+
}
644598
}
645599
}
646-
return links.slice(0, 3);
647-
});
648-
if (hrefs && hrefs.length > 0) {
649-
for (const href of hrefs) {
650-
try {
651-
const ctx = page.context();
652-
const cookies = await ctx.cookies(href);
653-
const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
654-
const api = await request.newContext({
655-
extraHTTPHeaders: {
656-
...(cookieHeader ? { Cookie: cookieHeader } : {}),
657-
Referer: currentUrl,
658-
'User-Agent': 'Mozilla/5.0',
659-
Accept: 'application/pdf,*/*'
660-
}
661-
});
662-
const res = await api.get(href);
663-
if (res.ok()) {
664-
const body = await res.body();
665-
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
666-
const dir = path.dirname(resolvedPath);
667-
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
668-
fs.writeFileSync(resolvedPath, body);
669-
savedPath = resolvedPath;
670-
console.log(` 📄 PDF saved via scraped href to ${resolvedPath}`);
671-
await api.dispose();
672-
saved = true;
673-
break;
600+
601+
// Fallback: recursive search
602+
function findAndClickDownload(root: Element | Document | ShadowRoot): boolean {
603+
const walker = document.createTreeWalker(root as Node, NodeFilter.SHOW_ELEMENT);
604+
let node: Node | null;
605+
while ((node = walker.nextNode())) {
606+
const el = node as HTMLElement;
607+
if (el.id === 'save' || el.id === 'download') {
608+
el.click();
609+
return true;
610+
}
611+
if (el.shadowRoot) {
612+
if (findAndClickDownload(el.shadowRoot)) return true;
674613
}
675-
await api.dispose();
676-
} catch {}
614+
}
615+
return false;
677616
}
617+
return findAndClickDownload(document);
618+
}).catch(() => false);
619+
620+
if (clicked) {
621+
console.log(' 📄 Found and clicked download button! Waiting for download...');
622+
const dl = await page.waitForEvent('download', { timeout: 10000 }).catch(() => null);
623+
if (dl) {
624+
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
625+
const dir = path.dirname(resolvedPath);
626+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
627+
await dl.saveAs(resolvedPath);
628+
savedPath = resolvedPath;
629+
console.log(` 📄 PDF saved via download button to ${resolvedPath}`);
630+
saved = true;
631+
} else {
632+
console.log(' 📄 Download button clicked but no download event received.');
633+
}
634+
} else {
635+
console.log(' 📄 Download button not found.');
678636
}
679-
} catch {}
637+
} catch (e: any) {
638+
console.log(` 📄 PDF viewer approach failed: ${e.message}`);
639+
}
680640
}
681-
641+
682642
if (!saved) {
683-
console.log(' 📄 Viewer download fallback failed.');
643+
console.log(' 📄 All viewer download fallbacks failed.');
684644
}
685645
} else {
686646
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);

0 commit comments

Comments
 (0)