@@ -395,114 +395,304 @@ export async function executeStep(
395395 break ;
396396 }
397397 case 'savePDF' : {
398- // Save PDF content from current page to file
398+ // Save the actual PDF binary from the current page or embedded viewer
399399 if ( ! step . value ) {
400400 throw new Error ( `savePDF step ${ step . id } requires 'value' as target filepath` ) ;
401401 }
402402
403403 const collectorKey = step . key || step . id || 'file' ;
404404 let savedPath : string | null = null ;
405+ // After the guard above, we can safely treat step.value as string
406+ const targetPathBase : string = step . value as string ;
405407
406408 try {
407- console . log ( ` 📄 Waiting for PDF content to load...` ) ;
408-
409- // Strategy 1: Wait for DOM content loaded first, //timeout 10 minutes
409+ // Ensure the page finished initial navigation
410410 try {
411411 await page . waitForLoadState ( 'domcontentloaded' , { timeout : step . wait ?? 600000 } ) ;
412- console . log ( ` 📄 DOM content loaded` ) ;
413- } catch ( domErr ) {
414- console . log ( ` 📄 DOM content timeout, continuing anyway` ) ;
412+ } catch { }
413+
414+ // Try to resolve the direct PDF URL
415+ let pdfUrl : string | null = null ;
416+
417+ // 1) If the current URL points to a PDF (anywhere in the URL), use it or extract from query
418+ const currentUrl = page . url ( ) ;
419+ console . log ( ` 📄 Current URL: ${ currentUrl } ` ) ;
420+ try {
421+ const u = new URL ( currentUrl ) ;
422+ const candidates = [
423+ u . searchParams . get ( 'file' ) ,
424+ u . searchParams . get ( 'src' ) ,
425+ u . searchParams . get ( 'document' ) ,
426+ u . searchParams . get ( 'url' )
427+ ] . filter ( Boolean ) as string [ ] ;
428+ const paramPdf = candidates . find ( v => / \. p d f / i. test ( v ) ) ;
429+ if ( paramPdf ) {
430+ pdfUrl = new URL ( paramPdf , u . href ) . toString ( ) ;
431+ }
432+ } catch { }
433+ if ( ! pdfUrl && / \. p d f / i. test ( currentUrl ) ) {
434+ pdfUrl = currentUrl ;
415435 }
416-
417- // Strategy 2: Wait for PDF-specific elements or indicators
418- let pdfReady = false ;
419- const maxAttempts = 15 ; // Increased attempts for PDF loading
420- let attempts = 0 ;
421-
422- while ( ! pdfReady && attempts < maxAttempts ) {
423- attempts ++ ;
424- console . log ( ` 📄 Checking PDF readiness (attempt ${ attempts } /${ maxAttempts } )` ) ;
425-
436+
437+ // 2) Otherwise, try to discover PDF source from common viewer elements
438+ if ( ! pdfUrl ) {
426439 try {
427- // Check if page has PDF content indicators
428- const hasPdfContent = await page . evaluate ( ( ) => {
429- // Check for PDF viewer elements
430- const pdfViewer = document . querySelector ( 'embed[type="application/pdf"]' ) ||
431- document . querySelector ( 'object[type="application/pdf"]' ) ||
432- document . querySelector ( 'iframe[src*=".pdf"]' ) ||
433- document . querySelector ( '.pdf-viewer' ) ||
434- document . querySelector ( '[data-pdf]' ) ;
435-
436- // Check if page content is substantial (not just loading screen)
437- const bodyText = document . body ? document . body . innerText : '' ;
438- const hasSubstantialContent = bodyText . length > 200 ; // Increased threshold
439-
440- // Check if page is visible
441- const isVisible = document . body &&
442- document . body . style . display !== 'none' &&
443- document . body . style . visibility !== 'hidden' ;
444-
445- // Check for PDF-specific content
446- const hasPdfText = bodyText . includes ( 'PDF' ) ||
447- bodyText . includes ( 'Page' ) ||
448- bodyText . includes ( 'Agenda' ) ||
449- bodyText . includes ( 'Meeting' ) ;
450-
451- return {
452- hasPdfViewer : ! ! pdfViewer ,
453- hasSubstantialContent,
454- isVisible,
455- bodyTextLength : bodyText . length ,
456- hasPdfText
440+ pdfUrl = await page . evaluate ( ( ) => {
441+ const getAbs = ( src ?: string | null ) => {
442+ if ( ! src ) return null ;
443+ try {
444+ return new URL ( src , window . location . href ) . toString ( ) ;
445+ } catch {
446+ return src ;
447+ }
457448 } ;
449+
450+ const embed = document . querySelector ( 'embed[type="application/pdf"]' ) as HTMLObjectElement | null ;
451+ if ( embed && embed . getAttribute ( 'src' ) ) return getAbs ( embed . getAttribute ( 'src' ) ) ;
452+
453+ const objectEl = document . querySelector ( 'object[type="application/pdf"]' ) as HTMLObjectElement | null ;
454+ if ( objectEl && objectEl . getAttribute ( 'data' ) ) return getAbs ( objectEl . getAttribute ( 'data' ) ) ;
455+
456+ const iframe = Array . from ( document . querySelectorAll ( 'iframe' ) ) . find ( f => {
457+ const s = f . getAttribute ( 'src' ) || '' ;
458+ return / \. p d f / i. test ( s ) || s . includes ( 'pdf' ) ;
459+ } ) as HTMLIFrameElement | undefined ;
460+ if ( iframe && iframe . getAttribute ( 'src' ) ) return getAbs ( iframe . getAttribute ( 'src' ) ) ;
461+
462+ return null ;
463+ } ) ;
464+ } catch { }
465+ }
466+
467+ // 3) Additional wait if requested (helps some viewers populate 'src')
468+ if ( ! pdfUrl && step . wait && step . wait > 0 ) {
469+ await page . waitForTimeout ( step . wait ) ;
470+ try {
471+ // Try again once after waiting
472+ pdfUrl = await page . evaluate ( ( ) => {
473+ const iframe = Array . from ( document . querySelectorAll ( 'iframe' ) ) . find ( f => f . getAttribute ( 'src' ) ) as HTMLIFrameElement | undefined ;
474+ return iframe ?. src || null ;
475+ } ) ;
476+ } catch { }
477+ }
478+
479+ // If we couldn't find a PDF URL, abort instead of rendering HTML with page.pdf
480+ if ( ! pdfUrl ) {
481+ console . log ( ' 📄 Direct PDF URL not found. Skipping save (no page.pdf fallback).' ) ;
482+ break ;
483+ }
484+
485+ // Build candidate URLs and try them until one succeeds
486+ const candidates : string [ ] = [ ] ;
487+ const isAbsolute = / ^ h t t p s ? : / i. test ( pdfUrl ) ;
488+ if ( isAbsolute ) {
489+ candidates . push ( pdfUrl ) ;
490+ } else {
491+ // 1) Same-origin resolution
492+ candidates . push ( new URL ( pdfUrl , currentUrl ) . toString ( ) ) ;
493+
494+ // 2) Granicus S3 pattern: <prefix>/<filename>.pdf
495+ // Example filename: queencreekaz_<hash>.pdf -> folder "queencreekaz"
496+ const m = pdfUrl . match ( / ^ ( [ a - z 0 - 9 - ] + ) _ ( .+ \. p d f ) $ / i) ;
497+ if ( m ) {
498+ const city = m [ 1 ] ;
499+ const fileName = `${ m [ 1 ] } _${ m [ 2 ] } ` ; // full filename again
500+ candidates . push ( `https://granicus_production_attachments.s3.amazonaws.com/${ city } /${ fileName } ` ) ;
501+ }
502+ }
503+
504+ // 3) If current page is a Granicus DocumentViewer, try explicit download query param variants
505+ try {
506+ const urlObj = new URL ( currentUrl ) ;
507+ if ( / D o c u m e n t V i e w e r \. p h p $ / i. test ( urlObj . pathname ) && urlObj . searchParams . has ( 'file' ) ) {
508+ const origin = `${ urlObj . protocol } //${ urlObj . host } ` ;
509+ const fileParam = urlObj . searchParams . get ( 'file' ) as string ;
510+ const baseViewer = `${ origin } ${ urlObj . pathname } ` ;
511+ // Add explicit download query attempts
512+ const withDownload = new URL ( baseViewer ) ;
513+ withDownload . searchParams . set ( 'file' , fileParam ) ;
514+ withDownload . searchParams . set ( 'download' , '1' ) ;
515+ candidates . push ( withDownload . toString ( ) ) ;
516+
517+ const withDownloadAndView = new URL ( baseViewer ) ;
518+ withDownloadAndView . searchParams . set ( 'file' , fileParam ) ;
519+ withDownloadAndView . searchParams . set ( 'view' , urlObj . searchParams . get ( 'view' ) || '1' ) ;
520+ withDownloadAndView . searchParams . set ( 'download' , '1' ) ;
521+ candidates . push ( withDownloadAndView . toString ( ) ) ;
522+
523+ // Also try direct origin + file param path as a last resort
524+ if ( / \. p d f $ / i. test ( fileParam ) ) {
525+ candidates . push ( `${ origin } /${ fileParam } ` ) ;
526+ }
527+ }
528+ } catch { }
529+
530+ // Log URLs for debugging
531+ console . log ( ` 📄 Current URL: ${ currentUrl } ` ) ;
532+ console . log ( ` 📄 Candidate PDF URLs:` , candidates ) ;
533+
534+ // Download the first successful candidate
535+ let downloadedBuffer : Buffer | null = null ;
536+ for ( const candidateUrl of candidates ) {
537+ try {
538+ const ctx = page . context ( ) ;
539+ const cookies = await ctx . cookies ( candidateUrl ) ;
540+ const cookieHeader = cookies . map ( c => `${ c . name } =${ c . value } ` ) . join ( '; ' ) ;
541+ const api = await request . newContext ( {
542+ extraHTTPHeaders : {
543+ ...( cookieHeader ? { Cookie : cookieHeader } : { } ) ,
544+ Referer : currentUrl ,
545+ 'User-Agent' : 'Mozilla/5.0'
546+ }
458547 } ) ;
459-
460- console . log ( ` 📄 PDF check:` , hasPdfContent ) ;
461-
462- // Only consider ready if we have substantial content OR PDF text
463- if ( hasPdfContent . hasSubstantialContent || hasPdfContent . hasPdfText ) {
464- pdfReady = true ;
465- console . log ( ` 📄 PDF content appears ready (substantial content or PDF text found)` ) ;
548+ const res = await api . get ( candidateUrl ) ;
549+ if ( res . ok ( ) ) {
550+ downloadedBuffer = await res . body ( ) ;
551+ await api . dispose ( ) ;
552+ pdfUrl = candidateUrl ; // final URL used
466553 break ;
467554 } else {
468- console . log ( ` 📄 PDF not ready yet - content: ${ hasPdfContent . hasSubstantialContent } , text length: ${ hasPdfContent . bodyTextLength } , hasPdfText: ${ hasPdfContent . hasPdfText } ` ) ;
555+ console . log ( ` 📄 GET ${ candidateUrl } -> ${ res . status ( ) } ${ res . statusText ( ) } ` ) ;
556+ await api . dispose ( ) ;
469557 }
470-
471- // Wait a bit before next check
472- await page . waitForTimeout ( 2000 ) ; // Increased wait time
473-
474- } catch ( checkErr : any ) {
475- console . log ( ` 📄 PDF check failed: ${ checkErr . message } ` ) ;
476- await page . waitForTimeout ( 2000 ) ;
558+ } catch ( e : any ) {
559+ console . log ( ` 📄 GET ${ candidateUrl } failed: ${ e . message } ` ) ;
477560 }
478561 }
479-
480- // Strategy 3: Additional wait for any dynamic content
481- if ( step . wait && step . wait > 0 ) {
482- console . log ( ` 📄 Additional wait: ${ step . wait } ms` ) ;
483- await page . waitForTimeout ( step . wait ) ;
484- }
485-
486- console . log ( ` 📄 Capturing PDF...` ) ;
487- // Get the PDF content as buffer
488- const pdfBuffer = await page . pdf ( { format : 'A4' } ) ;
489-
490- // Ensure directory exists
491- const savePath = replaceDataPlaceholders ( step . value , collector ) || step . value || '' ;
492- const dir = path . dirname ( savePath ) ;
493- if ( ! fs . existsSync ( dir ) ) {
494- fs . mkdirSync ( dir , { recursive : true } ) ;
562+ if ( ! downloadedBuffer ) {
563+ console . log ( ' 📄 All candidate PDF URLs failed. Trying viewer download fallback...' ) ;
564+ // Main page attempt (deep shadow click only)
565+ let saved = false ;
566+ const clickedMain = await page . evaluate ( async ( ) => {
567+ const targetIds = [ 'download' , 'save' ] ;
568+ const visited = new Set < Node > ( ) ;
569+ function tryClick ( node : Node ) : boolean {
570+ if ( visited . has ( node ) ) return false ;
571+ visited . add ( node ) ;
572+ const el = node as HTMLElement ;
573+ if ( el && el . id && targetIds . includes ( el . id ) ) { el . click ( ) ; return true ; }
574+ const elem = node as Element ;
575+ if ( ! elem ) return false ;
576+ const sr = ( elem as any ) . shadowRoot as ShadowRoot | undefined ;
577+ if ( sr ) for ( const child of Array . from ( sr . children ) ) { if ( tryClick ( child ) ) return true ; }
578+ for ( const child of Array . from ( elem . children ) ) { if ( tryClick ( child ) ) return true ; }
579+ return false ;
580+ }
581+ return tryClick ( document . documentElement ) ;
582+ } ) . catch ( ( ) => false as any ) ;
583+ if ( clickedMain ) {
584+ const dl = await page . waitForEvent ( 'download' , { timeout : 5000 } ) . catch ( ( ) => null ) ;
585+ if ( dl ) {
586+ const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
587+ const dir = path . dirname ( resolvedPath ) ;
588+ if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
589+ await dl . saveAs ( resolvedPath ) ;
590+ savedPath = resolvedPath ;
591+ console . log ( ` 📄 PDF saved via viewer download to ${ resolvedPath } ` ) ;
592+ saved = true ;
593+ }
594+ }
595+
596+ // Frames attempt
597+ if ( ! saved ) {
598+ for ( const frame of page . frames ( ) ) {
599+ if ( frame === page . mainFrame ( ) ) continue ;
600+ const clicked = await frame . evaluate ( async ( ) => {
601+ const targetIds = [ 'download' , 'save' ] ;
602+ const visited = new Set < Node > ( ) ;
603+ function tryClick ( node : Node ) : boolean {
604+ if ( visited . has ( node ) ) return false ;
605+ visited . add ( node ) ;
606+ const el = node as HTMLElement ;
607+ if ( el && el . id && targetIds . includes ( el . id ) ) { el . click ( ) ; return true ; }
608+ const elem = node as Element ;
609+ if ( ! elem ) return false ;
610+ const sr = ( elem as any ) . shadowRoot as ShadowRoot | undefined ;
611+ if ( sr ) for ( const child of Array . from ( sr . children ) ) { if ( tryClick ( child ) ) return true ; }
612+ for ( const child of Array . from ( elem . children ) ) { if ( tryClick ( child ) ) return true ; }
613+ return false ;
614+ }
615+ return tryClick ( document . documentElement ) ;
616+ } ) . catch ( ( ) => false as any ) ;
617+ if ( clicked ) {
618+ const dl = await page . waitForEvent ( 'download' , { timeout : 5000 } ) . catch ( ( ) => null ) ;
619+ if ( dl ) {
620+ const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
621+ const dir = path . dirname ( resolvedPath ) ;
622+ if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
623+ await dl . saveAs ( resolvedPath ) ;
624+ savedPath = resolvedPath ;
625+ console . log ( ` 📄 PDF saved via viewer download to ${ resolvedPath } ` ) ;
626+ saved = true ;
627+ break ;
628+ }
629+ }
630+ }
631+ }
632+
633+ // Non-click fallback: try to scrape a direct download link href and fetch it
634+ if ( ! saved ) {
635+ try {
636+ const hrefs = await page . evaluate ( ( ) => {
637+ const links : string [ ] = [ ] ;
638+ const anchors = Array . from ( document . querySelectorAll ( 'a' ) ) as HTMLAnchorElement [ ] ;
639+ for ( const a of anchors ) {
640+ const text = ( a . textContent || '' ) . toLowerCase ( ) ;
641+ const aria = ( a . getAttribute ( 'aria-label' ) || '' ) . toLowerCase ( ) ;
642+ if ( a . hasAttribute ( 'download' ) || text . includes ( 'download' ) || aria . includes ( 'download' ) ) {
643+ if ( a . href ) links . push ( a . href ) ;
644+ }
645+ }
646+ return links . slice ( 0 , 3 ) ;
647+ } ) ;
648+ if ( hrefs && hrefs . length > 0 ) {
649+ for ( const href of hrefs ) {
650+ try {
651+ const ctx = page . context ( ) ;
652+ const cookies = await ctx . cookies ( href ) ;
653+ const cookieHeader = cookies . map ( c => `${ c . name } =${ c . value } ` ) . join ( '; ' ) ;
654+ const api = await request . newContext ( {
655+ extraHTTPHeaders : {
656+ ...( cookieHeader ? { Cookie : cookieHeader } : { } ) ,
657+ Referer : currentUrl ,
658+ 'User-Agent' : 'Mozilla/5.0' ,
659+ Accept : 'application/pdf,*/*'
660+ }
661+ } ) ;
662+ const res = await api . get ( href ) ;
663+ if ( res . ok ( ) ) {
664+ const body = await res . body ( ) ;
665+ const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
666+ const dir = path . dirname ( resolvedPath ) ;
667+ if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
668+ fs . writeFileSync ( resolvedPath , body ) ;
669+ savedPath = resolvedPath ;
670+ console . log ( ` 📄 PDF saved via scraped href to ${ resolvedPath } ` ) ;
671+ await api . dispose ( ) ;
672+ saved = true ;
673+ break ;
674+ }
675+ await api . dispose ( ) ;
676+ } catch { }
677+ }
678+ }
679+ } catch { }
680+ }
681+
682+ if ( ! saved ) {
683+ console . log ( ' 📄 Viewer download fallback failed.' ) ;
684+ }
685+ } else {
686+ const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
687+ const dir = path . dirname ( resolvedPath ) ;
688+ if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
689+ fs . writeFileSync ( resolvedPath , downloadedBuffer ) ;
690+ savedPath = resolvedPath ;
691+ console . log ( ` 📄 PDF saved to ${ resolvedPath } (from ${ pdfUrl } )` ) ;
495692 }
496-
497- // Save the PDF
498- fs . writeFileSync ( savePath , pdfBuffer ) ;
499- savedPath = savePath ;
500- console . log ( ` 📄 PDF saved to ${ savePath } ` ) ;
501693 } catch ( err : any ) {
502- console . log ( ` 📄 PDF save failed: ${ err . message } ` ) ;
503- // Don't throw error, just continue
694+ console . log ( ` 📄 savePDF failed: ${ err . message } ` ) ;
504695 } finally {
505- // Record the file path (or null if not saved) in the collector
506696 collector [ collectorKey ] = savedPath ;
507697 }
508698 break ;
0 commit comments