@@ -23,9 +23,35 @@ type BrowserContext = {
2323 page : Page ,
2424} ;
2525
/**
 * Escapes every regular-expression metacharacter in `value` so the result can
 * be embedded in a `RegExp` source and match the input text literally.
 *
 * @param value - Text that should be matched verbatim.
 * @returns `value` with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escapeRegExp(value: string): string {
  // The character class must contain no whitespace, otherwise spaces would be
  // escaped too. "$&" re-inserts the matched character after the backslash.
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
29+
/**
 * Builds the default link filter: a pattern that matches any string starting
 * with `url`, where `url` is taken literally rather than as regex syntax.
 *
 * @param url - Prefix that matching strings must begin with.
 * @returns A `RegExp` anchored at the start of the input with `url` escaped.
 */
function createDefaultPattern(url: string): RegExp {
  // Keep the template free of whitespace around the interpolation: any space
  // inside the backticks becomes part of the pattern and must then be matched.
  return new RegExp(`^${escapeRegExp(url)}`);
}
33+
/**
 * Converts the optional CLI pattern argument into a `RegExp`.
 *
 * Resolution order:
 * 1. No argument (undefined or empty): the default pattern anchored at `mainURL`.
 * 2. An argument shaped like `/source/flags` that compiles as a regex: that
 *    regex, with the `g` flag removed.
 * 3. Anything else: the argument compiled as a plain pattern string, which
 *    keeps backward compatibility with path-like values such as `/docs/api`.
 *
 * @param patternArg - Raw pattern from the command line, if any.
 * @param mainURL - Start URL used to build the default pattern.
 * @returns The pattern used to select links.
 * @throws SyntaxError when `patternArg` is not a valid pattern under either reading.
 */
export function buildURLPattern(patternArg: string | undefined, mainURL: string): RegExp {
  if (!patternArg) {
    return createDefaultPattern(mainURL);
  }

  const regexLiteralMatch = /^\/(.*)\/([a-z]*)$/i.exec(patternArg);
  if (regexLiteralMatch) {
    const [, patternSource = "", patternFlags = ""] = regexLiteralMatch;
    // A global regex keeps `lastIndex` between `test()` calls, which makes
    // repeated tests against different strings skip matches; drop `g`.
    const safeFlags = patternFlags.replace(/g/g, "");
    try {
      return new RegExp(patternSource, safeFlags);
    } catch (error: unknown) {
      if (!(error instanceof SyntaxError)) {
        throw error;
      }
      // Not a valid regex literal (e.g. "/docs/api" yields the invalid flags
      // "api"); fall through and treat the whole argument as a plain pattern.
    }
  }

  return new RegExp(patternArg);
}
48+
2649async function useBrowserContext ( ) {
2750 const browser = await puppeteer . launch ( {
2851 headless : true ,
52+ // Keep Chrome launch working inside sandboxed environments.
53+ args : [ "--no-sandbox" , "--disable-setuid-sandbox" ] ,
54+ userDataDir : join ( process . cwd ( ) , ".site2pdf-chrome" ) ,
2955 ...( process . env . CHROME_PATH && { executablePath : process . env . CHROME_PATH } ) ,
3056 } ) ;
3157 const page = ( await browser . pages ( ) ) [ 0 ] ;
@@ -39,17 +65,17 @@ export async function generatePDF(
3965 ctx : BrowserContext ,
4066 url : string ,
4167 concurrentLimit : number ,
42- urlPattern : RegExp = new RegExp ( `^ ${ url } ` ) ,
68+ urlPattern : RegExp = createDefaultPattern ( url ) ,
4369) : Promise < Buffer > {
4470 const limit = pLimit ( concurrentLimit ) ;
4571 const page = await ctx . browser . newPage ( ) ;
4672 await page . goto ( url , { waitUntil : 'domcontentloaded' } ) ;
4773
48- const subLinks = await page . evaluate ( ( patternString ) => {
49- const pattern = new RegExp ( patternString ) ;
74+ const subLinks = await page . evaluate ( ( { patternSource , patternFlags } ) => {
75+ const pattern = new RegExp ( patternSource , patternFlags ) ;
5076 const links = Array . from ( document . querySelectorAll ( "a" ) ) ;
5177 return links . map ( ( link ) => link . href ) . filter ( ( href ) => pattern . test ( href ) ) ;
52- } , urlPattern . source ) ;
78+ } , { patternSource : urlPattern . source , patternFlags : urlPattern . flags } ) ;
5379
5480 const subLinksWithoutAnchors = subLinks . map ( ( link ) => normalizeURL ( link ) ) ;
5581 const uniqueSubLinks = Array . from ( new Set ( subLinksWithoutAnchors ) ) ;
@@ -60,13 +86,13 @@ export async function generatePDF(
6086
6187 const pdfDoc = await PDFDocument . create ( ) ;
6288
63- const generatePDFForPage = async ( link : string ) => {
64- console . log ( `loading ${ link } ` ) ;
65- const newPage = await ctx . browser . newPage ( ) ;
66- let pdfBytes : Uint8Array ;
67- try {
68- await newPage . goto ( link , { waitUntil : 'domcontentloaded' } ) ;
69- pdfBytes = await newPage . pdf ( { format : "A4" } ) ;
89+ const generatePDFForPage = async ( link : string ) => {
90+ console . log ( `loading ${ link } ` ) ;
91+ const newPage = await ctx . browser . newPage ( ) ;
92+ let pdfBytes : Buffer ;
93+ try {
94+ await newPage . goto ( link , { waitUntil : 'domcontentloaded' } ) ;
95+ pdfBytes = await newPage . pdf ( { format : "A4" } ) ;
7096 console . log ( `Generated PDF for ${ link } ` ) ;
7197 return Buffer . from ( pdfBytes ) ;
7298 } catch ( error ) {
@@ -123,15 +149,14 @@ export function normalizeURL(url: string): string {
123149
124150export async function main ( ) {
125151 const mainURL = process . argv [ 2 ] ;
126- const urlPattern = process . argv [ 3 ]
127- ? new RegExp ( process . argv [ 3 ] )
128- : new RegExp ( `^${ mainURL } ` ) ;
129152
130153 if ( ! mainURL ) {
131154 showHelp ( ) ;
132155 throw new Error ( "<main_url> is required" ) ;
133156 }
134157
158+ const urlPattern = buildURLPattern ( process . argv [ 3 ] , mainURL ) ;
159+
135160 console . log (
136161 `Generating PDF for ${ mainURL } and sub-links matching ${ urlPattern } ` ,
137162 ) ;
0 commit comments