@@ -8,10 +8,14 @@ import TurndownService from "turndown"
88import PCR from "puppeteer-chromium-resolver"
99import { fileExistsAtPath } from "../../utils/fs"
1010import { serializeError } from "serialize-error"
11+ import { analyzeWebsite } from "./analyzeWebsite"
1112
// Timeout constants
const URL_FETCH_TIMEOUT = 30_000 // 30 seconds
const URL_FETCH_FALLBACK_TIMEOUT = 20_000 // 20 seconds for fallback
// Upper bound on the total number of attempts fetchUrlContent makes (the first try counts as attempt 1)
const MAX_FETCH_RETRIES = 3
// User agent sent with plain fetch requests and passed to Chromium via --user-agent.
// Kept on the same modern Chrome string the browser launch used before it was shared here;
// the product tokens must stay well-formed (AppleWebKit/537.36 ... Safari/537.36).
const USER_AGENT =
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
1519
1620interface PCRStats {
1721 puppeteer : { launch : typeof launch }
@@ -44,6 +48,37 @@ export class UrlContentFetcher {
4448 } )
4549 return stats
4650 }
/**
 * Fetch the raw body of a URL with a plain HTTP request (no browser involved).
 *
 * Makes at most MAX_FETCH_RETRIES attempts, each bounded by URL_FETCH_TIMEOUT.
 * Network failures, timeouts, and HTTP 408/429/5xx responses are treated as
 * transient and retried after a short, linearly increasing pause. Any other
 * non-2xx status is considered permanent and fails immediately.
 *
 * @param url The URL to fetch content from
 * @returns The response body as a string
 * @throws Error right away on a non-retryable HTTP status, or once every attempt has failed
 */
async fetchUrlContent(url: string): Promise<string> {
	// Base pause between attempts; multiplied by the attempt number for a simple linear backoff.
	const retryDelayMs = 500
	let lastErrorMessage = "no fetch attempts were made"

	for (let attempt = 1; attempt <= MAX_FETCH_RETRIES; attempt++) {
		// Assume transient until an HTTP status tells us otherwise (thrown fetch errors are network/timeout).
		let retryable = true
		try {
			const response = await fetch(url, {
				headers: {
					"User-Agent": USER_AGENT,
				},
				signal: AbortSignal.timeout(URL_FETCH_TIMEOUT),
			})
			if (!response.ok) {
				const status = response.status
				// Retrying a 404/403/etc. cannot succeed; only timeouts, throttling and server errors are worth it.
				retryable = status >= 500 || status === 408 || status === 429
				throw new Error(`Failed to fetch URL: ${status} ${response.statusText}`)
			}
			return await response.text()
		} catch (error) {
			const serializedError = serializeError(error)
			// serialize-error may yield no message (e.g. a thrown non-Error value); fall back like urlToMarkdown does.
			lastErrorMessage = serializedError.message || String(error)
			console.error(`Error fetching URL content: ${lastErrorMessage}`)
			if (!retryable) {
				throw error
			}
			if (attempt < MAX_FETCH_RETRIES) {
				await new Promise<void>((resolve) => setTimeout(resolve, retryDelayMs * attempt))
			}
		}
	}

	// Reached when every attempt failed, or when MAX_FETCH_RETRIES is < 1; never hand back an empty page silently.
	throw new Error(`Failed to fetch URL after ${MAX_FETCH_RETRIES} attempts: ${lastErrorMessage}`)
}
4782
4883 async launchBrowser ( ) : Promise < void > {
4984 if ( this . browser ) {
@@ -52,7 +87,7 @@ export class UrlContentFetcher {
5287 const stats = await this . ensureChromiumExists ( )
5388 this . browser = await stats . puppeteer . launch ( {
5489 args : [
55- " --user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36" ,
90+ ` --user-agent=${ USER_AGENT } ` ,
5691 "--disable-dev-shm-usage" ,
5792 "--disable-accelerated-2d-canvas" ,
5893 "--no-first-run" ,
@@ -81,49 +116,53 @@ export class UrlContentFetcher {
81116
82117 // must make sure to call launchBrowser before and closeBrowser after using this
83118 async urlToMarkdown ( url : string ) : Promise < string > {
84- if ( ! this . browser || ! this . page ) {
85- throw new Error ( "Browser not initialized" )
86- }
87- /*
119+ let content = await this . fetchUrlContent ( url )
120+ const analyzedContent = await analyzeWebsite ( content )
121+ if ( analyzedContent . needsJavaScript ) {
122+ if ( ! this . browser || ! this . page ) {
123+ throw new Error ( "Browser not initialized" )
124+ }
125+ /*
88126 - networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
89127 - domcontentloaded is when the basic DOM is loaded
90128 this should be sufficient for most doc sites
91129 */
92- try {
93- await this . page . goto ( url , {
94- timeout : URL_FETCH_TIMEOUT ,
95- waitUntil : [ "domcontentloaded" , "networkidle2" ] ,
96- } )
97- } catch ( error ) {
98- // Use serialize-error to safely extract error information
99- const serializedError = serializeError ( error )
100- const errorMessage = serializedError . message || String ( error )
101- const errorName = serializedError . name
102-
103- // Only retry for timeout or network-related errors
104- const shouldRetry =
105- errorMessage . includes ( "timeout" ) ||
106- errorMessage . includes ( "net::" ) ||
107- errorMessage . includes ( "NetworkError" ) ||
108- errorMessage . includes ( "ERR_" ) ||
109- errorName === "TimeoutError"
110-
111- if ( shouldRetry ) {
112- // If networkidle2 fails due to timeout/network issues, try with just domcontentloaded as fallback
113- console . warn (
114- `Failed to load ${ url } with networkidle2, retrying with domcontentloaded only: ${ errorMessage } ` ,
115- )
130+ try {
116131 await this . page . goto ( url , {
117- timeout : URL_FETCH_FALLBACK_TIMEOUT ,
118- waitUntil : [ "domcontentloaded" ] ,
132+ timeout : URL_FETCH_TIMEOUT ,
133+ waitUntil : [ "domcontentloaded" , "networkidle2" ] ,
119134 } )
120- } else {
121- // For other errors, throw them as-is
122- throw error
135+ } catch ( error ) {
136+ // Use serialize-error to safely extract error information
137+ const serializedError = serializeError ( error )
138+ const errorMessage = serializedError . message || String ( error )
139+ const errorName = serializedError . name
140+
141+ // Only retry for timeout or network-related errors
142+ const shouldRetry =
143+ errorMessage . includes ( "timeout" ) ||
144+ errorMessage . includes ( "net::" ) ||
145+ errorMessage . includes ( "NetworkError" ) ||
146+ errorMessage . includes ( "ERR_" ) ||
147+ errorName === "TimeoutError"
148+
149+ if ( shouldRetry ) {
150+ // If networkidle2 fails due to timeout/network issues, try with just domcontentloaded as fallback
151+ console . warn (
152+ `Failed to load ${ url } with networkidle2, retrying with domcontentloaded only: ${ errorMessage } ` ,
153+ )
154+ await this . page . goto ( url , {
155+ timeout : URL_FETCH_FALLBACK_TIMEOUT ,
156+ waitUntil : [ "domcontentloaded" ] ,
157+ } )
158+ } else {
159+ // For other errors, throw them as-is
160+ throw error
161+ }
123162 }
124- }
125163
126- const content = await this . page . content ( )
164+ content = await this . page . content ( )
165+ }
127166
128167 // use cheerio to parse and clean up the HTML
129168 const $ = cheerio . load ( content )
0 commit comments