Skip to content

Commit 459e304

Browse files
committed
Refactor the savePDF step in the executeStep function to enhance PDF retrieval logic. Improve handling of PDF URLs by checking the current page URL, inspecting common viewer elements, and implementing a fallback for viewer downloads. Add detailed logging for better debugging and ensure proper directory creation for saved files.
1 parent 7e1725b commit 459e304

File tree

1 file changed

+276
-86
lines changed

1 file changed

+276
-86
lines changed

src/step-executor.ts

Lines changed: 276 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -395,114 +395,304 @@ export async function executeStep(
395395
break;
396396
}
397397
case 'savePDF': {
398-
// Save PDF content from current page to file
398+
// Save the actual PDF binary from the current page or embedded viewer
399399
if (!step.value) {
400400
throw new Error(`savePDF step ${step.id} requires 'value' as target filepath`);
401401
}
402402

403403
const collectorKey = step.key || step.id || 'file';
404404
let savedPath: string | null = null;
405+
// After the guard above, we can safely treat step.value as string
406+
const targetPathBase: string = step.value as string;
405407

406408
try {
407-
console.log(` 📄 Waiting for PDF content to load...`);
408-
409-
// Strategy 1: Wait for DOM content loaded first, //timeout 10 minutes
409+
// Ensure the page finished initial navigation
410410
try {
411411
await page.waitForLoadState('domcontentloaded', { timeout: step.wait ?? 600000 });
412-
console.log(` 📄 DOM content loaded`);
413-
} catch (domErr) {
414-
console.log(` 📄 DOM content timeout, continuing anyway`);
412+
} catch {}
413+
414+
// Try to resolve the direct PDF URL
415+
let pdfUrl: string | null = null;
416+
417+
// 1) If the current URL points to a PDF (anywhere in the URL), use it or extract from query
418+
const currentUrl = page.url();
419+
console.log(` 📄 Current URL: ${currentUrl}`);
420+
try {
421+
const u = new URL(currentUrl);
422+
const candidates = [
423+
u.searchParams.get('file'),
424+
u.searchParams.get('src'),
425+
u.searchParams.get('document'),
426+
u.searchParams.get('url')
427+
].filter(Boolean) as string[];
428+
const paramPdf = candidates.find(v => /\.pdf/i.test(v));
429+
if (paramPdf) {
430+
pdfUrl = new URL(paramPdf, u.href).toString();
431+
}
432+
} catch {}
433+
if (!pdfUrl && /\.pdf/i.test(currentUrl)) {
434+
pdfUrl = currentUrl;
415435
}
416-
417-
// Strategy 2: Wait for PDF-specific elements or indicators
418-
let pdfReady = false;
419-
const maxAttempts = 15; // Increased attempts for PDF loading
420-
let attempts = 0;
421-
422-
while (!pdfReady && attempts < maxAttempts) {
423-
attempts++;
424-
console.log(` 📄 Checking PDF readiness (attempt ${attempts}/${maxAttempts})`);
425-
436+
437+
// 2) Otherwise, try to discover PDF source from common viewer elements
438+
if (!pdfUrl) {
426439
try {
427-
// Check if page has PDF content indicators
428-
const hasPdfContent = await page.evaluate(() => {
429-
// Check for PDF viewer elements
430-
const pdfViewer = document.querySelector('embed[type="application/pdf"]') ||
431-
document.querySelector('object[type="application/pdf"]') ||
432-
document.querySelector('iframe[src*=".pdf"]') ||
433-
document.querySelector('.pdf-viewer') ||
434-
document.querySelector('[data-pdf]');
435-
436-
// Check if page content is substantial (not just loading screen)
437-
const bodyText = document.body ? document.body.innerText : '';
438-
const hasSubstantialContent = bodyText.length > 200; // Increased threshold
439-
440-
// Check if page is visible
441-
const isVisible = document.body &&
442-
document.body.style.display !== 'none' &&
443-
document.body.style.visibility !== 'hidden';
444-
445-
// Check for PDF-specific content
446-
const hasPdfText = bodyText.includes('PDF') ||
447-
bodyText.includes('Page') ||
448-
bodyText.includes('Agenda') ||
449-
bodyText.includes('Meeting');
450-
451-
return {
452-
hasPdfViewer: !!pdfViewer,
453-
hasSubstantialContent,
454-
isVisible,
455-
bodyTextLength: bodyText.length,
456-
hasPdfText
440+
pdfUrl = await page.evaluate(() => {
441+
const getAbs = (src?: string | null) => {
442+
if (!src) return null;
443+
try {
444+
return new URL(src, window.location.href).toString();
445+
} catch {
446+
return src;
447+
}
457448
};
449+
450+
const embed = document.querySelector('embed[type="application/pdf"]') as HTMLObjectElement | null;
451+
if (embed && embed.getAttribute('src')) return getAbs(embed.getAttribute('src'));
452+
453+
const objectEl = document.querySelector('object[type="application/pdf"]') as HTMLObjectElement | null;
454+
if (objectEl && objectEl.getAttribute('data')) return getAbs(objectEl.getAttribute('data'));
455+
456+
const iframe = Array.from(document.querySelectorAll('iframe')).find(f => {
457+
const s = f.getAttribute('src') || '';
458+
return /\.pdf/i.test(s) || s.includes('pdf');
459+
}) as HTMLIFrameElement | undefined;
460+
if (iframe && iframe.getAttribute('src')) return getAbs(iframe.getAttribute('src'));
461+
462+
return null;
463+
});
464+
} catch {}
465+
}
466+
467+
// 3) Additional wait if requested (helps some viewers populate 'src')
468+
if (!pdfUrl && step.wait && step.wait > 0) {
469+
await page.waitForTimeout(step.wait);
470+
try {
471+
// Try again once after waiting
472+
pdfUrl = await page.evaluate(() => {
473+
const iframe = Array.from(document.querySelectorAll('iframe')).find(f => f.getAttribute('src')) as HTMLIFrameElement | undefined;
474+
return iframe?.src || null;
475+
});
476+
} catch {}
477+
}
478+
479+
// If we couldn't find a PDF URL, abort instead of rendering HTML with page.pdf
480+
if (!pdfUrl) {
481+
console.log(' 📄 Direct PDF URL not found. Skipping save (no page.pdf fallback).');
482+
break;
483+
}
484+
485+
// Build candidate URLs and try them until one succeeds
486+
const candidates: string[] = [];
487+
const isAbsolute = /^https?:/i.test(pdfUrl);
488+
if (isAbsolute) {
489+
candidates.push(pdfUrl);
490+
} else {
491+
// 1) Same-origin resolution
492+
candidates.push(new URL(pdfUrl, currentUrl).toString());
493+
494+
// 2) Granicus S3 pattern: <prefix>/<filename>.pdf
495+
// Example filename: queencreekaz_<hash>.pdf -> folder "queencreekaz"
496+
const m = pdfUrl.match(/^([a-z0-9-]+)_(.+\.pdf)$/i);
497+
if (m) {
498+
const city = m[1];
499+
const fileName = `${m[1]}_${m[2]}`; // full filename again
500+
candidates.push(`https://granicus_production_attachments.s3.amazonaws.com/${city}/${fileName}`);
501+
}
502+
}
503+
504+
// 3) If current page is a Granicus DocumentViewer, try explicit download query param variants
505+
try {
506+
const urlObj = new URL(currentUrl);
507+
if (/DocumentViewer\.php$/i.test(urlObj.pathname) && urlObj.searchParams.has('file')) {
508+
const origin = `${urlObj.protocol}//${urlObj.host}`;
509+
const fileParam = urlObj.searchParams.get('file') as string;
510+
const baseViewer = `${origin}${urlObj.pathname}`;
511+
// Add explicit download query attempts
512+
const withDownload = new URL(baseViewer);
513+
withDownload.searchParams.set('file', fileParam);
514+
withDownload.searchParams.set('download', '1');
515+
candidates.push(withDownload.toString());
516+
517+
const withDownloadAndView = new URL(baseViewer);
518+
withDownloadAndView.searchParams.set('file', fileParam);
519+
withDownloadAndView.searchParams.set('view', urlObj.searchParams.get('view') || '1');
520+
withDownloadAndView.searchParams.set('download', '1');
521+
candidates.push(withDownloadAndView.toString());
522+
523+
// Also try direct origin + file param path as a last resort
524+
if (/\.pdf$/i.test(fileParam)) {
525+
candidates.push(`${origin}/${fileParam}`);
526+
}
527+
}
528+
} catch {}
529+
530+
// Log URLs for debugging
531+
console.log(` 📄 Current URL: ${currentUrl}`);
532+
console.log(` 📄 Candidate PDF URLs:`, candidates);
533+
534+
// Download the first successful candidate
535+
let downloadedBuffer: Buffer | null = null;
536+
for (const candidateUrl of candidates) {
537+
try {
538+
const ctx = page.context();
539+
const cookies = await ctx.cookies(candidateUrl);
540+
const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
541+
const api = await request.newContext({
542+
extraHTTPHeaders: {
543+
...(cookieHeader ? { Cookie: cookieHeader } : {}),
544+
Referer: currentUrl,
545+
'User-Agent': 'Mozilla/5.0'
546+
}
458547
});
459-
460-
console.log(` 📄 PDF check:`, hasPdfContent);
461-
462-
// Only consider ready if we have substantial content OR PDF text
463-
if (hasPdfContent.hasSubstantialContent || hasPdfContent.hasPdfText) {
464-
pdfReady = true;
465-
console.log(` 📄 PDF content appears ready (substantial content or PDF text found)`);
548+
const res = await api.get(candidateUrl);
549+
if (res.ok()) {
550+
downloadedBuffer = await res.body();
551+
await api.dispose();
552+
pdfUrl = candidateUrl; // final URL used
466553
break;
467554
} else {
468-
console.log(` 📄 PDF not ready yet - content: ${hasPdfContent.hasSubstantialContent}, text length: ${hasPdfContent.bodyTextLength}, hasPdfText: ${hasPdfContent.hasPdfText}`);
555+
console.log(` 📄 GET ${candidateUrl} -> ${res.status()} ${res.statusText()}`);
556+
await api.dispose();
469557
}
470-
471-
// Wait a bit before next check
472-
await page.waitForTimeout(2000); // Increased wait time
473-
474-
} catch (checkErr: any) {
475-
console.log(` 📄 PDF check failed: ${checkErr.message}`);
476-
await page.waitForTimeout(2000);
558+
} catch (e: any) {
559+
console.log(` 📄 GET ${candidateUrl} failed: ${e.message}`);
477560
}
478561
}
479-
480-
// Strategy 3: Additional wait for any dynamic content
481-
if (step.wait && step.wait > 0) {
482-
console.log(` 📄 Additional wait: ${step.wait}ms`);
483-
await page.waitForTimeout(step.wait);
484-
}
485-
486-
console.log(` 📄 Capturing PDF...`);
487-
// Get the PDF content as buffer
488-
const pdfBuffer = await page.pdf({ format: 'A4' });
489-
490-
// Ensure directory exists
491-
const savePath = replaceDataPlaceholders(step.value, collector) || step.value || '';
492-
const dir = path.dirname(savePath);
493-
if (!fs.existsSync(dir)) {
494-
fs.mkdirSync(dir, { recursive: true });
562+
if (!downloadedBuffer) {
563+
console.log(' 📄 All candidate PDF URLs failed. Trying viewer download fallback...');
564+
// Main page attempt (deep shadow click only)
565+
let saved = false;
566+
const clickedMain = await page.evaluate(async () => {
567+
const targetIds = ['download', 'save'];
568+
const visited = new Set<Node>();
569+
function tryClick(node: Node): boolean {
570+
if (visited.has(node)) return false;
571+
visited.add(node);
572+
const el = node as HTMLElement;
573+
if (el && el.id && targetIds.includes(el.id)) { el.click(); return true; }
574+
const elem = node as Element;
575+
if (!elem) return false;
576+
const sr = (elem as any).shadowRoot as ShadowRoot | undefined;
577+
if (sr) for (const child of Array.from(sr.children)) { if (tryClick(child)) return true; }
578+
for (const child of Array.from(elem.children)) { if (tryClick(child)) return true; }
579+
return false;
580+
}
581+
return tryClick(document.documentElement);
582+
}).catch(() => false as any);
583+
if (clickedMain) {
584+
const dl = await page.waitForEvent('download', { timeout: 5000 }).catch(() => null);
585+
if (dl) {
586+
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
587+
const dir = path.dirname(resolvedPath);
588+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
589+
await dl.saveAs(resolvedPath);
590+
savedPath = resolvedPath;
591+
console.log(` 📄 PDF saved via viewer download to ${resolvedPath}`);
592+
saved = true;
593+
}
594+
}
595+
596+
// Frames attempt
597+
if (!saved) {
598+
for (const frame of page.frames()) {
599+
if (frame === page.mainFrame()) continue;
600+
const clicked = await frame.evaluate(async () => {
601+
const targetIds = ['download', 'save'];
602+
const visited = new Set<Node>();
603+
function tryClick(node: Node): boolean {
604+
if (visited.has(node)) return false;
605+
visited.add(node);
606+
const el = node as HTMLElement;
607+
if (el && el.id && targetIds.includes(el.id)) { el.click(); return true; }
608+
const elem = node as Element;
609+
if (!elem) return false;
610+
const sr = (elem as any).shadowRoot as ShadowRoot | undefined;
611+
if (sr) for (const child of Array.from(sr.children)) { if (tryClick(child)) return true; }
612+
for (const child of Array.from(elem.children)) { if (tryClick(child)) return true; }
613+
return false;
614+
}
615+
return tryClick(document.documentElement);
616+
}).catch(() => false as any);
617+
if (clicked) {
618+
const dl = await page.waitForEvent('download', { timeout: 5000 }).catch(() => null);
619+
if (dl) {
620+
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
621+
const dir = path.dirname(resolvedPath);
622+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
623+
await dl.saveAs(resolvedPath);
624+
savedPath = resolvedPath;
625+
console.log(` 📄 PDF saved via viewer download to ${resolvedPath}`);
626+
saved = true;
627+
break;
628+
}
629+
}
630+
}
631+
}
632+
633+
// Non-click fallback: try to scrape a direct download link href and fetch it
634+
if (!saved) {
635+
try {
636+
const hrefs = await page.evaluate(() => {
637+
const links: string[] = [];
638+
const anchors = Array.from(document.querySelectorAll('a')) as HTMLAnchorElement[];
639+
for (const a of anchors) {
640+
const text = (a.textContent || '').toLowerCase();
641+
const aria = (a.getAttribute('aria-label') || '').toLowerCase();
642+
if (a.hasAttribute('download') || text.includes('download') || aria.includes('download')) {
643+
if (a.href) links.push(a.href);
644+
}
645+
}
646+
return links.slice(0, 3);
647+
});
648+
if (hrefs && hrefs.length > 0) {
649+
for (const href of hrefs) {
650+
try {
651+
const ctx = page.context();
652+
const cookies = await ctx.cookies(href);
653+
const cookieHeader = cookies.map(c => `${c.name}=${c.value}`).join('; ');
654+
const api = await request.newContext({
655+
extraHTTPHeaders: {
656+
...(cookieHeader ? { Cookie: cookieHeader } : {}),
657+
Referer: currentUrl,
658+
'User-Agent': 'Mozilla/5.0',
659+
Accept: 'application/pdf,*/*'
660+
}
661+
});
662+
const res = await api.get(href);
663+
if (res.ok()) {
664+
const body = await res.body();
665+
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
666+
const dir = path.dirname(resolvedPath);
667+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
668+
fs.writeFileSync(resolvedPath, body);
669+
savedPath = resolvedPath;
670+
console.log(` 📄 PDF saved via scraped href to ${resolvedPath}`);
671+
await api.dispose();
672+
saved = true;
673+
break;
674+
}
675+
await api.dispose();
676+
} catch {}
677+
}
678+
}
679+
} catch {}
680+
}
681+
682+
if (!saved) {
683+
console.log(' 📄 Viewer download fallback failed.');
684+
}
685+
} else {
686+
const resolvedPath: string = (replaceDataPlaceholders(targetPathBase, collector) || targetPathBase);
687+
const dir = path.dirname(resolvedPath);
688+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
689+
fs.writeFileSync(resolvedPath, downloadedBuffer);
690+
savedPath = resolvedPath;
691+
console.log(` 📄 PDF saved to ${resolvedPath} (from ${pdfUrl})`);
495692
}
496-
497-
// Save the PDF
498-
fs.writeFileSync(savePath, pdfBuffer);
499-
savedPath = savePath;
500-
console.log(` 📄 PDF saved to ${savePath}`);
501693
} catch (err: any) {
502-
console.log(` 📄 PDF save failed: ${err.message}`);
503-
// Don't throw error, just continue
694+
console.log(` 📄 savePDF failed: ${err.message}`);
504695
} finally {
505-
// Record the file path (or null if not saved) in the collector
506696
collector[collectorKey] = savedPath;
507697
}
508698
break;

0 commit comments

Comments
 (0)