@@ -402,258 +402,140 @@ export async function executeStep(
402402
403403 const collectorKey = step . key || step . id || 'file' ;
404404 let savedPath : string | null = null ;
405- // After the guard above, we can safely treat step.value as string
406405 const targetPathBase : string = step . value as string ;
406+ const resolvedPath : string = replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ;
407+ const dir = path . dirname ( resolvedPath ) ;
408+ if ( ! fs . existsSync ( dir ) ) {
409+ fs . mkdirSync ( dir , { recursive : true } ) ;
410+ }
411+
412+ let pdfSaved = false ;
413+ const interceptedData : { buffer : Buffer | null } = { buffer : null } ;
407414
408415 try {
409416 // Ensure the page finished initial navigation
410417 try {
411418 await page . waitForLoadState ( 'domcontentloaded' , { timeout : step . wait ?? 600000 } ) ;
412419 } catch { }
413420
414- // Try to resolve the direct PDF URL
415- let pdfUrl : string | null = null ;
416-
417- // 1) If the current URL points to a PDF (anywhere in the URL), use it or extract from query
418- const currentUrl = page . url ( ) ;
419- console . log ( ` 📄 Current URL: ${ currentUrl } ` ) ;
420- try {
421- const u = new URL ( currentUrl ) ;
422- const candidates = [
423- u . searchParams . get ( 'file' ) ,
424- u . searchParams . get ( 'src' ) ,
425- u . searchParams . get ( 'document' ) ,
426- u . searchParams . get ( 'url' )
427- ] . filter ( Boolean ) as string [ ] ;
428- const paramPdf = candidates . find ( v => / \. p d f / i. test ( v ) ) ;
429- if ( paramPdf ) {
430- pdfUrl = new URL ( paramPdf , u . href ) . toString ( ) ;
421+ // Intercept responses to capture PDF even when displayed inline
422+ await page . route ( '**/*' , async route => {
423+ const response = await route . fetch ( ) ;
424+ const contentType = response . headers ( ) [ 'content-type' ] || '' ;
425+ const url = route . request ( ) . url ( ) ;
426+
427+ // Check if this is a PDF response
428+ if ( contentType . includes ( 'application/pdf' ) || url . includes ( '.pdf' ) ) {
429+ const buffer = await response . body ( ) ;
430+ if ( ! pdfSaved && buffer . length > 0 ) {
431+ interceptedData . buffer = buffer ;
432+ // Save immediately when intercepted
433+ try {
434+ fs . writeFileSync ( resolvedPath , buffer ) ;
435+ savedPath = resolvedPath ;
436+ pdfSaved = true ;
437+ console . log (
438+ ` 📄 PDF intercepted and saved (${ ( buffer . length / 1024 ) . toFixed ( 2 ) } KB) to ${ resolvedPath } `
439+ ) ;
440+ } catch ( saveErr : any ) {
441+ console . log ( ` 📄 Failed to save intercepted PDF: ${ saveErr . message } ` ) ;
442+ }
443+ }
431444 }
432- } catch { }
433- if ( ! pdfUrl && / \. p d f / i. test ( currentUrl ) ) {
434- pdfUrl = currentUrl ;
435- }
436445
437- // 2) Otherwise, try to discover PDF source from common viewer elements
438- if ( ! pdfUrl ) {
439- try {
440- pdfUrl = await page . evaluate ( ( ) => {
441- const getAbs = ( src ?: string | null ) => {
442- if ( ! src ) return null ;
443- try {
444- return new URL ( src , window . location . href ) . toString ( ) ;
445- } catch {
446- return src ;
447- }
448- } ;
449-
450- const embed = document . querySelector ( 'embed[type="application/pdf"]' ) as HTMLObjectElement | null ;
451- if ( embed && embed . getAttribute ( 'src' ) ) return getAbs ( embed . getAttribute ( 'src' ) ) ;
452-
453- const objectEl = document . querySelector ( 'object[type="application/pdf"]' ) as HTMLObjectElement | null ;
454- if ( objectEl && objectEl . getAttribute ( 'data' ) ) return getAbs ( objectEl . getAttribute ( 'data' ) ) ;
455-
456- const iframe = Array . from ( document . querySelectorAll ( 'iframe' ) ) . find ( f => {
457- const s = f . getAttribute ( 'src' ) || '' ;
458- return / \. p d f / i. test ( s ) || s . includes ( 'pdf' ) ;
459- } ) as HTMLIFrameElement | undefined ;
460- if ( iframe && iframe . getAttribute ( 'src' ) ) return getAbs ( iframe . getAttribute ( 'src' ) ) ;
461-
462- return null ;
463- } ) ;
464- } catch { }
465- }
466-
467- // 3) Additional wait if requested (helps some viewers populate 'src')
468- if ( ! pdfUrl && step . wait && step . wait > 0 ) {
469- await page . waitForTimeout ( step . wait ) ;
470- try {
471- // Try again once after waiting
472- pdfUrl = await page . evaluate ( ( ) => {
473- const iframe = Array . from ( document . querySelectorAll ( 'iframe' ) ) . find ( f => f . getAttribute ( 'src' ) ) as HTMLIFrameElement | undefined ;
474- return iframe ?. src || null ;
475- } ) ;
476- } catch { }
477- }
446+ // Continue with the normal response
447+ await route . fulfill ( { response } ) ;
448+ } ) ;
478449
479- // If we couldn't find a PDF URL, abort instead of rendering HTML with page.pdf
480- if ( ! pdfUrl ) {
481- console . log ( ' 📄 Direct PDF URL not found. Skipping save (no page.pdf fallback).' ) ;
482- break ;
483- }
450+ const currentUrl = page . url ( ) ;
451+ console . log ( ` 📄 Current URL: ${ currentUrl } ` ) ;
484452
485- // Build candidate URLs and try them until one succeeds
486- const candidates : string [ ] = [ ] ;
487- const isAbsolute = / ^ h t t p s ? : / i. test ( pdfUrl ) ;
488- if ( isAbsolute ) {
489- candidates . push ( pdfUrl ) ;
490- } else {
491- // 1) Same-origin resolution
492- candidates . push ( new URL ( pdfUrl , currentUrl ) . toString ( ) ) ;
453+ // Check if we're already on a PDF URL - wait a bit for interception
454+ const isPdfUrl = currentUrl . includes ( '.pdf' ) || / \. p d f ( \? | $ ) / i. test ( currentUrl ) ;
455+ if ( isPdfUrl && ! pdfSaved ) {
456+ // Wait a moment for route interception to catch the PDF if it's already loading
457+ await page . waitForTimeout ( 1000 ) ;
493458 }
494459
495- // Log URLs for debugging
496- console . log ( ` 📄 Current URL: ${ currentUrl } ` ) ;
497- console . log ( ` 📄 Candidate PDF URLs:` , candidates ) ;
498- // Download the first successful candidate
499- let downloadedBuffer : Buffer | null = null ;
500- for ( const candidateUrl of candidates ) {
501- try {
502- const ctx = page . context ( ) ;
503- const cookies = await ctx . cookies ( candidateUrl ) ;
504- const cookieHeader = cookies . map ( c => `${ c . name } =${ c . value } ` ) . join ( '; ' ) ;
505- const api = await request . newContext ( {
506- extraHTTPHeaders : {
507- ...( cookieHeader ? { Cookie : cookieHeader } : { } ) ,
508- Referer : currentUrl ,
509- 'User-Agent' : 'Mozilla/5.0'
460+ // Try both approaches: wait for download event OR intercept response
461+ try {
462+ // Reload the page to trigger route interception (unless already saved)
463+ const [ response , download ] = await Promise . all ( [
464+ ! pdfSaved ? page . reload ( { waitUntil : 'networkidle' } ) . catch ( ( ) => null ) : Promise . resolve ( null ) ,
465+ page . waitForEvent ( 'download' , { timeout : 5000 } ) . catch ( ( ) => null )
466+ ] ) ;
467+
468+ if ( download ) {
469+ // If download event occurred, save it
470+ await download . saveAs ( resolvedPath ) ;
471+ savedPath = resolvedPath ;
472+ pdfSaved = true ;
473+ console . log ( ` 📄 PDF saved via download event to ${ resolvedPath } ` ) ;
474+ } else if ( response ) {
475+ // Check if the response itself is a PDF
476+ const contentType = response . headers ( ) [ 'content-type' ] || '' ;
477+ if ( contentType . includes ( 'application/pdf' ) && ! pdfSaved ) {
478+ const buffer = await response . body ( ) ;
479+ if ( buffer . length > 0 ) {
480+ fs . writeFileSync ( resolvedPath , buffer ) ;
481+ savedPath = resolvedPath ;
482+ pdfSaved = true ;
483+ console . log (
484+ ` 📄 PDF saved via response body (${ ( buffer . length / 1024 ) . toFixed ( 2 ) } KB) to ${ resolvedPath } `
485+ ) ;
510486 }
511- } ) ;
512- const res = await api . get ( candidateUrl ) ;
513- if ( res . ok ( ) ) {
514- downloadedBuffer = await res . body ( ) ;
515- await api . dispose ( ) ;
516- pdfUrl = candidateUrl ; // final URL used
517- break ;
518487 } else {
519- console . log ( ` 📄 GET ${ candidateUrl } -> ${ res . status ( ) } ${ res . statusText ( ) } ` ) ;
520- await api . dispose ( ) ;
521- }
522- } catch ( e : any ) {
523- console . log ( ` 📄 GET ${ candidateUrl } failed: ${ e . message } ` ) ;
524- }
525- }
526- if ( ! downloadedBuffer ) {
527- console . log ( ' 📄 All candidate PDF URLs failed. Trying viewer download fallback...' ) ;
528-
529- // Strategy 1: Try to extract PDF URL from embed element and fetch directly
530- let saved = false ;
531-
532- try {
533- const embedPdfUrl = await page . evaluate ( ( ) => {
534- const embed = document . querySelector ( 'embed[type="application/x-google-chrome-pdf"]' ) as HTMLEmbedElement ;
535- if ( embed && embed . getAttribute ( 'original-url' ) ) {
536- return embed . getAttribute ( 'original-url' ) ;
537- }
538- return null ;
539- } ) ;
540-
541- if ( embedPdfUrl ) {
542- console . log ( ` 📄 Found PDF URL in embed: ${ embedPdfUrl } ` ) ;
543- try {
544- const ctx = page . context ( ) ;
545- const cookies = await ctx . cookies ( embedPdfUrl ) ;
546- const cookieHeader = cookies . map ( c => `${ c . name } =${ c . value } ` ) . join ( '; ' ) ;
547- const api = await request . newContext ( {
548- extraHTTPHeaders : {
549- ...( cookieHeader ? { Cookie : cookieHeader } : { } ) ,
550- Referer : currentUrl ,
551- 'User-Agent' : 'Mozilla/5.0'
552- }
553- } ) ;
554- const res = await api . get ( embedPdfUrl ) ;
555- if ( res . ok ( ) ) {
556- const body = await res . body ( ) ;
557- const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
558- const dir = path . dirname ( resolvedPath ) ;
559- if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
560- fs . writeFileSync ( resolvedPath , body ) ;
561- savedPath = resolvedPath ;
562- console . log ( ` 📄 PDF saved from embed URL to ${ resolvedPath } ` ) ;
563- await api . dispose ( ) ;
564- saved = true ;
565- }
566- await api . dispose ( ) ;
567- } catch ( e : any ) {
568- console . log ( ` 📄 Failed to fetch from embed URL: ${ e . message } ` ) ;
569- }
570- }
571- } catch ( e : any ) {
572- console . log ( ` 📄 Embed extraction failed: ${ e . message } ` ) ;
573- }
574-
575- // Strategy 2: Wait for PDF viewer and click download button
576- if ( ! saved ) {
577- console . log ( ' 📄 Waiting for PDF viewer to load...' ) ;
578- try {
579- // Wait for pdf-viewer element to appear
580- await page . waitForSelector ( 'pdf-viewer' , { timeout : 10000 } ) . catch ( ( ) => null ) ;
581- await page . waitForTimeout ( 2000 ) ; // Additional wait for shadow DOMs to initialize
582-
583- console . log ( ' 📄 Searching for download button...' ) ;
584- const clicked = await page . evaluate ( ( ) => {
585- // Try direct selector path
586- const pdfViewer = document . querySelector ( 'pdf-viewer' ) ;
587- if ( pdfViewer && pdfViewer . shadowRoot ) {
588- const toolbar = pdfViewer . shadowRoot . querySelector ( 'viewer-toolbar' ) ;
589- if ( toolbar && toolbar . shadowRoot ) {
590- const downloadControls = toolbar . shadowRoot . querySelector ( 'viewer-download-controls' ) ;
591- if ( downloadControls && downloadControls . shadowRoot ) {
592- const saveButton = downloadControls . shadowRoot . querySelector ( '#save' ) as HTMLElement ;
593- if ( saveButton ) {
594- saveButton . click ( ) ;
595- return true ;
596- }
597- }
598- }
599- }
600-
601- // Fallback: recursive search
602- function findAndClickDownload ( root : Element | Document | ShadowRoot ) : boolean {
603- const walker = document . createTreeWalker ( root as Node , NodeFilter . SHOW_ELEMENT ) ;
604- let node : Node | null ;
605- while ( ( node = walker . nextNode ( ) ) ) {
606- const el = node as HTMLElement ;
607- if ( el . id === 'save' || el . id === 'download' ) {
608- el . click ( ) ;
609- return true ;
610- }
611- if ( el . shadowRoot ) {
612- if ( findAndClickDownload ( el . shadowRoot ) ) return true ;
613- }
614- }
615- return false ;
616- }
617- return findAndClickDownload ( document ) ;
618- } ) . catch ( ( ) => false ) ;
619-
620- if ( clicked ) {
621- console . log ( ' 📄 Found and clicked download button! Waiting for download...' ) ;
622- const dl = await page . waitForEvent ( 'download' , { timeout : 10000 } ) . catch ( ( ) => null ) ;
623- if ( dl ) {
624- const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
625- const dir = path . dirname ( resolvedPath ) ;
626- if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
627- await dl . saveAs ( resolvedPath ) ;
628- savedPath = resolvedPath ;
629- console . log ( ` 📄 PDF saved via download button to ${ resolvedPath } ` ) ;
630- saved = true ;
631- } else {
632- console . log ( ' 📄 Download button clicked but no download event received.' ) ;
633- }
634- } else {
635- console . log ( ' 📄 Download button not found.' ) ;
488+ // Wait a bit for route interception to capture it
489+ await page . waitForTimeout ( 2000 ) ;
490+ if ( interceptedData . buffer && ! pdfSaved && interceptedData . buffer . length > 0 ) {
491+ fs . writeFileSync ( resolvedPath , interceptedData . buffer ) ;
492+ savedPath = resolvedPath ;
493+ pdfSaved = true ;
494+ console . log (
495+ ` 📄 PDF saved via intercepted response (${ ( interceptedData . buffer . length / 1024 ) . toFixed ( 2 ) } KB) to ${ resolvedPath } `
496+ ) ;
636497 }
637- } catch ( e : any ) {
638- console . log ( ` 📄 PDF viewer approach failed: ${ e . message } ` ) ;
639498 }
499+ } else if ( interceptedData . buffer && ! pdfSaved && interceptedData . buffer . length > 0 ) {
500+ // Fallback: use intercepted buffer
501+ fs . writeFileSync ( resolvedPath , interceptedData . buffer ) ;
502+ savedPath = resolvedPath ;
503+ pdfSaved = true ;
504+ console . log (
505+ ` 📄 PDF saved via intercepted response (${ ( interceptedData . buffer . length / 1024 ) . toFixed ( 2 ) } KB) to ${ resolvedPath } `
506+ ) ;
640507 }
641-
642- if ( ! saved ) {
643- console . log ( ' 📄 All viewer download fallbacks failed.' ) ;
508+ } catch ( error : any ) {
509+ console . log ( ` 📄 Error during PDF save: ${ error . message } ` ) ;
510+ // Still try to save intercepted buffer if available
511+ if ( interceptedData . buffer && ! pdfSaved && interceptedData . buffer . length > 0 ) {
512+ fs . writeFileSync ( resolvedPath , interceptedData . buffer ) ;
513+ savedPath = resolvedPath ;
514+ pdfSaved = true ;
515+ console . log (
516+ ` 📄 PDF saved via intercepted response (${ ( interceptedData . buffer . length / 1024 ) . toFixed ( 2 ) } KB) to ${ resolvedPath } `
517+ ) ;
644518 }
645- } else {
646- const resolvedPath : string = ( replaceDataPlaceholders ( targetPathBase , collector ) || targetPathBase ) ;
647- const dir = path . dirname ( resolvedPath ) ;
648- if ( ! fs . existsSync ( dir ) ) fs . mkdirSync ( dir , { recursive : true } ) ;
649- fs . writeFileSync ( resolvedPath , downloadedBuffer ) ;
650- savedPath = resolvedPath ;
651- console . log ( ` 📄 PDF saved to ${ resolvedPath } (from ${ pdfUrl } )` ) ;
652519 }
653520 } catch ( err : any ) {
654521 console . log ( ` 📄 savePDF failed: ${ err . message } ` ) ;
655522 } finally {
523+ // Unroute to clean up
524+ try {
525+ await page . unroute ( '**/*' ) ;
526+ } catch { }
527+
528+ // Verify file was saved
529+ if ( ! pdfSaved && savedPath && fs . existsSync ( savedPath ) ) {
530+ pdfSaved = true ;
531+ }
532+
656533 collector [ collectorKey ] = savedPath ;
534+ if ( pdfSaved || savedPath ) {
535+ console . log ( ` ✓ PDF successfully saved to ${ savedPath } ` ) ;
536+ } else {
537+ console . log ( ` ✗ Failed to save PDF` ) ;
538+ }
657539 }
658540 break ;
659541 }
0 commit comments