@@ -490,47 +490,11 @@ export async function executeStep(
490490 } else {
491491 // 1) Same-origin resolution
492492 candidates . push ( new URL ( pdfUrl , currentUrl ) . toString ( ) ) ;
493-
494- // 2) Granicus S3 pattern: <prefix>/<filename>.pdf
495- // Example filename: queencreekaz_<hash>.pdf -> folder "queencreekaz"
496- const m = pdfUrl . match ( / ^ ( [ a - z 0 - 9 - ] + ) _ ( .+ \. p d f ) $ / i) ;
497- if ( m ) {
498- const city = m [ 1 ] ;
499- const fileName = `${ m [ 1 ] } _${ m [ 2 ] } ` ; // full filename again
500- candidates . push ( `https://granicus_production_attachments.s3.amazonaws.com/${ city } /${ fileName } ` ) ;
501- }
502493 }
503494
504- // 3) If current page is a Granicus DocumentViewer, try explicit download query param variants
505- try {
506- const urlObj = new URL ( currentUrl ) ;
507- if ( / D o c u m e n t V i e w e r \. p h p $ / i. test ( urlObj . pathname ) && urlObj . searchParams . has ( 'file' ) ) {
508- const origin = `${ urlObj . protocol } //${ urlObj . host } ` ;
509- const fileParam = urlObj . searchParams . get ( 'file' ) as string ;
510- const baseViewer = `${ origin } ${ urlObj . pathname } ` ;
511- // Add explicit download query attempts
512- const withDownload = new URL ( baseViewer ) ;
513- withDownload . searchParams . set ( 'file' , fileParam ) ;
514- withDownload . searchParams . set ( 'download' , '1' ) ;
515- candidates . push ( withDownload . toString ( ) ) ;
516-
517- const withDownloadAndView = new URL ( baseViewer ) ;
518- withDownloadAndView . searchParams . set ( 'file' , fileParam ) ;
519- withDownloadAndView . searchParams . set ( 'view' , urlObj . searchParams . get ( 'view' ) || '1' ) ;
520- withDownloadAndView . searchParams . set ( 'download' , '1' ) ;
521- candidates . push ( withDownloadAndView . toString ( ) ) ;
522-
523- // Also try direct origin + file param path as a last resort
524- if ( / \. p d f $ / i. test ( fileParam ) ) {
525- candidates . push ( `${ origin } /${ fileParam } ` ) ;
526- }
527- }
528- } catch { }
529-
530495 // Log URLs for debugging
531496 console . log ( ` 📄 Current URL: ${ currentUrl } ` ) ;
532497 console . log ( ` 📄 Candidate PDF URLs:` , candidates ) ;
533-
534498 // Download the first successful candidate
535499 let downloadedBuffer : Buffer | null = null ;
536500 for ( const candidateUrl of candidates ) {
@@ -561,126 +525,122 @@ export async function executeStep(
561525 }
562526 if ( ! downloadedBuffer ) {
563527 console . log ( ' 📄 All candidate PDF URLs failed. Trying viewer download fallback...' ) ;
564- // Main page attempt (deep shadow click only)
528+
529+ // Strategy 1: Try to extract PDF URL from embed element and fetch directly
565530 let saved = false ;
566- const clickedMain = await page . evaluate ( async ( ) => {
567- const targetIds = [ 'download' , 'save' ] ;
568- const visited = new Set < Node > ( ) ;
569- function tryClick ( node : Node ) : boolean {
570- if ( visited . has ( node ) ) return false ;
571- visited . add ( node ) ;
572- const el = node as HTMLElement ;
573- if ( el && el . id && targetIds . includes ( el . id ) ) { el . click ( ) ; return true ; }
574- const elem = node as Element ;
575- if ( ! elem ) return false ;
576- const sr = ( elem as any ) . shadowRoot as ShadowRoot | undefined ;
577- if ( sr ) for ( const child of Array . from ( sr . children ) ) { if ( tryClick ( child ) ) return true ; }
578- for ( const child of Array . from ( elem . children ) ) { if ( tryClick ( child ) ) return true ; }
579- return false ;
580- }
581- return tryClick ( document . documentElement ) ;
582- } ) . catch ( ( ) => false as any ) ;
583- if ( clickedMain ) {
584- const dl = await page . waitForEvent ( 'download' , { timeout : 5000 } ) . catch ( ( ) => null ) ;
585- if ( dl ) {
586- const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
587- const dir = path . dirname ( resolvedPath ) ;
588- if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
589- await dl . saveAs ( resolvedPath ) ;
590- savedPath = resolvedPath ;
591- console . log ( ` 📄 PDF saved via viewer download to ${ resolvedPath } ` ) ;
592- saved = true ;
593- }
594- }
595-
596- // Frames attempt
597- if ( ! saved ) {
598- for ( const frame of page . frames ( ) ) {
599- if ( frame === page . mainFrame ( ) ) continue ;
600- const clicked = await frame . evaluate ( async ( ) => {
601- const targetIds = [ 'download' , 'save' ] ;
602- const visited = new Set < Node > ( ) ;
603- function tryClick ( node : Node ) : boolean {
604- if ( visited . has ( node ) ) return false ;
605- visited . add ( node ) ;
606- const el = node as HTMLElement ;
607- if ( el && el . id && targetIds . includes ( el . id ) ) { el . click ( ) ; return true ; }
608- const elem = node as Element ;
609- if ( ! elem ) return false ;
610- const sr = ( elem as any ) . shadowRoot as ShadowRoot | undefined ;
611- if ( sr ) for ( const child of Array . from ( sr . children ) ) { if ( tryClick ( child ) ) return true ; }
612- for ( const child of Array . from ( elem . children ) ) { if ( tryClick ( child ) ) return true ; }
613- return false ;
614- }
615- return tryClick ( document . documentElement ) ;
616- } ) . catch ( ( ) => false as any ) ;
617- if ( clicked ) {
618- const dl = await page . waitForEvent ( 'download' , { timeout : 5000 } ) . catch ( ( ) => null ) ;
619- if ( dl ) {
531+
532+ try {
533+ const embedPdfUrl = await page . evaluate ( ( ) => {
534+ const embed = document . querySelector ( 'embed[type="application/x-google-chrome-pdf"]' ) as HTMLEmbedElement ;
535+ if ( embed && embed . getAttribute ( 'original-url' ) ) {
536+ return embed . getAttribute ( 'original-url' ) ;
537+ }
538+ return null ;
539+ } ) ;
540+
541+ if ( embedPdfUrl ) {
542+ console . log ( ` 📄 Found PDF URL in embed: ${ embedPdfUrl } ` ) ;
543+ try {
544+ const ctx = page . context ( ) ;
545+ const cookies = await ctx . cookies ( embedPdfUrl ) ;
546+ const cookieHeader = cookies . map ( c => `${ c . name } =${ c . value } ` ) . join ( '; ' ) ;
547+ const api = await request . newContext ( {
548+ extraHTTPHeaders : {
549+ ...( cookieHeader ? { Cookie : cookieHeader } : { } ) ,
550+ Referer : currentUrl ,
551+ 'User-Agent' : 'Mozilla/5.0'
552+ }
553+ } ) ;
554+ const res = await api . get ( embedPdfUrl ) ;
555+ if ( res . ok ( ) ) {
556+ const body = await res . body ( ) ;
620557 const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
621558 const dir = path . dirname ( resolvedPath ) ;
622559 if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
623- await dl . saveAs ( resolvedPath ) ;
560+ fs . writeFileSync ( resolvedPath , body ) ;
624561 savedPath = resolvedPath ;
625- console . log ( ` 📄 PDF saved via viewer download to ${ resolvedPath } ` ) ;
562+ console . log ( ` 📄 PDF saved from embed URL to ${ resolvedPath } ` ) ;
563+ await api . dispose ( ) ;
626564 saved = true ;
627- break ;
628565 }
566+ await api . dispose ( ) ;
567+ } catch ( e : any ) {
568+ console . log ( ` 📄 Failed to fetch from embed URL: ${ e . message } ` ) ;
629569 }
630570 }
571+ } catch ( e : any ) {
572+ console . log ( ` 📄 Embed extraction failed: ${ e . message } ` ) ;
631573 }
632-
633- // Non-click fallback: try to scrape a direct download link href and fetch it
574+
575+ // Strategy 2: Wait for PDF viewer and click download button
634576 if ( ! saved ) {
577+ console . log ( ' 📄 Waiting for PDF viewer to load...' ) ;
635578 try {
636- const hrefs = await page . evaluate ( ( ) => {
637- const links : string [ ] = [ ] ;
638- const anchors = Array . from ( document . querySelectorAll ( 'a' ) ) as HTMLAnchorElement [ ] ;
639- for ( const a of anchors ) {
640- const text = ( a . textContent || '' ) . toLowerCase ( ) ;
641- const aria = ( a . getAttribute ( 'aria-label' ) || '' ) . toLowerCase ( ) ;
642- if ( a . hasAttribute ( 'download' ) || text . includes ( 'download' ) || aria . includes ( 'download' ) ) {
643- if ( a . href ) links . push ( a . href ) ;
579+ // Wait for pdf-viewer element to appear
580+ await page . waitForSelector ( 'pdf-viewer' , { timeout : 10000 } ) . catch ( ( ) => null ) ;
581+ await page . waitForTimeout ( 2000 ) ; // Additional wait for shadow DOMs to initialize
582+
583+ console . log ( ' 📄 Searching for download button...' ) ;
584+ const clicked = await page . evaluate ( ( ) => {
585+ // Try direct selector path
586+ const pdfViewer = document . querySelector ( 'pdf-viewer' ) ;
587+ if ( pdfViewer && pdfViewer . shadowRoot ) {
588+ const toolbar = pdfViewer . shadowRoot . querySelector ( 'viewer-toolbar' ) ;
589+ if ( toolbar && toolbar . shadowRoot ) {
590+ const downloadControls = toolbar . shadowRoot . querySelector ( 'viewer-download-controls' ) ;
591+ if ( downloadControls && downloadControls . shadowRoot ) {
592+ const saveButton = downloadControls . shadowRoot . querySelector ( '#save' ) as HTMLElement ;
593+ if ( saveButton ) {
594+ saveButton . click ( ) ;
595+ return true ;
596+ }
597+ }
644598 }
645599 }
646- return links . slice ( 0 , 3 ) ;
647- } ) ;
648- if ( hrefs && hrefs . length > 0 ) {
649- for ( const href of hrefs ) {
650- try {
651- const ctx = page . context ( ) ;
652- const cookies = await ctx . cookies ( href ) ;
653- const cookieHeader = cookies . map ( c => `${ c . name } =${ c . value } ` ) . join ( '; ' ) ;
654- const api = await request . newContext ( {
655- extraHTTPHeaders : {
656- ...( cookieHeader ? { Cookie : cookieHeader } : { } ) ,
657- Referer : currentUrl ,
658- 'User-Agent' : 'Mozilla/5.0' ,
659- Accept : 'application/pdf,*/*'
660- }
661- } ) ;
662- const res = await api . get ( href ) ;
663- if ( res . ok ( ) ) {
664- const body = await res . body ( ) ;
665- const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
666- const dir = path . dirname ( resolvedPath ) ;
667- if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
668- fs . writeFileSync ( resolvedPath , body ) ;
669- savedPath = resolvedPath ;
670- console . log ( ` 📄 PDF saved via scraped href to ${ resolvedPath } ` ) ;
671- await api . dispose ( ) ;
672- saved = true ;
673- break ;
600+
601+ // Fallback: recursive search
602+ function findAndClickDownload ( root : Element | Document | ShadowRoot ) : boolean {
603+ const walker = document . createTreeWalker ( root as Node , NodeFilter . SHOW_ELEMENT ) ;
604+ let node : Node | null ;
605+ while ( ( node = walker . nextNode ( ) ) ) {
606+ const el = node as HTMLElement ;
607+ if ( el . id === 'save' || el . id === 'download' ) {
608+ el . click ( ) ;
609+ return true ;
610+ }
611+ if ( el . shadowRoot ) {
612+ if ( findAndClickDownload ( el . shadowRoot ) ) return true ;
674613 }
675- await api . dispose ( ) ;
676- } catch { }
614+ }
615+ return false ;
677616 }
617+ return findAndClickDownload ( document ) ;
618+ } ) . catch ( ( ) => false ) ;
619+
620+ if ( clicked ) {
621+ console . log ( ' 📄 Found and clicked download button! Waiting for download...' ) ;
622+ const dl = await page . waitForEvent ( 'download' , { timeout : 10000 } ) . catch ( ( ) => null ) ;
623+ if ( dl ) {
624+ const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
625+ const dir = path . dirname ( resolvedPath ) ;
626+ if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
627+ await dl . saveAs ( resolvedPath ) ;
628+ savedPath = resolvedPath ;
629+ console . log ( ` 📄 PDF saved via download button to ${ resolvedPath } ` ) ;
630+ saved = true ;
631+ } else {
632+ console . log ( ' 📄 Download button clicked but no download event received.' ) ;
633+ }
634+ } else {
635+ console . log ( ' 📄 Download button not found.' ) ;
678636 }
679- } catch { }
637+ } catch ( e : any ) {
638+ console . log ( ` 📄 PDF viewer approach failed: ${ e . message } ` ) ;
639+ }
680640 }
681-
641+
682642 if ( ! saved ) {
683- console . log ( ' 📄 Viewer download fallback failed.' ) ;
643+ console . log ( ' 📄 All viewer download fallbacks failed.' ) ;
684644 }
685645 } else {
686646 const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
0 commit comments