Skip to content

Commit 29821b0

Browse files
committed
feat: implement URL content fetching with retry logic and user agent configuration
1 parent d8f0a50 commit 29821b0

File tree

1 file changed

+75
-36
lines changed

1 file changed

+75
-36
lines changed

src/services/browser/UrlContentFetcher.ts

Lines changed: 75 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@ import TurndownService from "turndown"
88
import PCR from "puppeteer-chromium-resolver"
99
import { fileExistsAtPath } from "../../utils/fs"
1010
import { serializeError } from "serialize-error"
11+
import { analyzeWebsite } from "./analyzeWebsite"
1112

1213
// Timeout constants
const URL_FETCH_TIMEOUT = 30_000 // 30 seconds
const URL_FETCH_FALLBACK_TIMEOUT = 20_000 // 20 seconds for fallback
const MAX_FETCH_RETRIES = 3 // Number of retries for transient errors
// Shared user agent for both plain fetch() requests and the Puppeteer
// browser launch, so both code paths present the same identity to servers.
const USER_AGENT =
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
1519

1620
interface PCRStats {
1721
puppeteer: { launch: typeof launch }
@@ -44,6 +48,37 @@ export class UrlContentFetcher {
4448
})
4549
return stats
4650
}
51+
/**
52+
* Fetch the content of a URL
53+
* @param url The URL to fetch content from
54+
* @returns The content of the URL as a string
55+
* @throws Error if fetching fails after retries
56+
*/
57+
async fetchUrlContent(url: string): Promise<string> {
58+
for (let attempt = 1; attempt <= MAX_FETCH_RETRIES; attempt++) {
59+
try {
60+
const response = await fetch(url, {
61+
headers: {
62+
"User-Agent": USER_AGENT,
63+
},
64+
signal: AbortSignal.timeout(URL_FETCH_TIMEOUT),
65+
})
66+
if (!response.ok) {
67+
throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`)
68+
}
69+
return await response.text()
70+
} catch (error) {
71+
const serializedError = serializeError(error)
72+
console.error(`Error fetching URL content: ${serializedError.message}`)
73+
if (attempt === MAX_FETCH_RETRIES) {
74+
throw new Error(
75+
`Failed to fetch URL after ${MAX_FETCH_RETRIES} attempts: ${serializedError.message}`,
76+
)
77+
}
78+
}
79+
}
80+
return ""
81+
}
4782

4883
async launchBrowser(): Promise<void> {
4984
if (this.browser) {
@@ -52,7 +87,7 @@ export class UrlContentFetcher {
5287
const stats = await this.ensureChromiumExists()
5388
this.browser = await stats.puppeteer.launch({
5489
args: [
55-
"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
90+
`--user-agent=${USER_AGENT}`,
5691
"--disable-dev-shm-usage",
5792
"--disable-accelerated-2d-canvas",
5893
"--no-first-run",
@@ -81,49 +116,53 @@ export class UrlContentFetcher {
81116

82117
// must make sure to call launchBrowser before and closeBrowser after using this
83118
async urlToMarkdown(url: string): Promise<string> {
84-
if (!this.browser || !this.page) {
85-
throw new Error("Browser not initialized")
86-
}
87-
/*
119+
let content = await this.fetchUrlContent(url)
120+
const analyzedContent = await analyzeWebsite(content)
121+
if (analyzedContent.needsJavaScript) {
122+
if (!this.browser || !this.page) {
123+
throw new Error("Browser not initialized")
124+
}
125+
/*
88126
- networkidle2 is equivalent to playwright's networkidle where it waits until there are no more than 2 network connections for at least 500 ms.
89127
- domcontentloaded is when the basic DOM is loaded
90128
this should be sufficient for most doc sites
91129
*/
92-
try {
93-
await this.page.goto(url, {
94-
timeout: URL_FETCH_TIMEOUT,
95-
waitUntil: ["domcontentloaded", "networkidle2"],
96-
})
97-
} catch (error) {
98-
// Use serialize-error to safely extract error information
99-
const serializedError = serializeError(error)
100-
const errorMessage = serializedError.message || String(error)
101-
const errorName = serializedError.name
102-
103-
// Only retry for timeout or network-related errors
104-
const shouldRetry =
105-
errorMessage.includes("timeout") ||
106-
errorMessage.includes("net::") ||
107-
errorMessage.includes("NetworkError") ||
108-
errorMessage.includes("ERR_") ||
109-
errorName === "TimeoutError"
110-
111-
if (shouldRetry) {
112-
// If networkidle2 fails due to timeout/network issues, try with just domcontentloaded as fallback
113-
console.warn(
114-
`Failed to load ${url} with networkidle2, retrying with domcontentloaded only: ${errorMessage}`,
115-
)
130+
try {
116131
await this.page.goto(url, {
117-
timeout: URL_FETCH_FALLBACK_TIMEOUT,
118-
waitUntil: ["domcontentloaded"],
132+
timeout: URL_FETCH_TIMEOUT,
133+
waitUntil: ["domcontentloaded", "networkidle2"],
119134
})
120-
} else {
121-
// For other errors, throw them as-is
122-
throw error
135+
} catch (error) {
136+
// Use serialize-error to safely extract error information
137+
const serializedError = serializeError(error)
138+
const errorMessage = serializedError.message || String(error)
139+
const errorName = serializedError.name
140+
141+
// Only retry for timeout or network-related errors
142+
const shouldRetry =
143+
errorMessage.includes("timeout") ||
144+
errorMessage.includes("net::") ||
145+
errorMessage.includes("NetworkError") ||
146+
errorMessage.includes("ERR_") ||
147+
errorName === "TimeoutError"
148+
149+
if (shouldRetry) {
150+
// If networkidle2 fails due to timeout/network issues, try with just domcontentloaded as fallback
151+
console.warn(
152+
`Failed to load ${url} with networkidle2, retrying with domcontentloaded only: ${errorMessage}`,
153+
)
154+
await this.page.goto(url, {
155+
timeout: URL_FETCH_FALLBACK_TIMEOUT,
156+
waitUntil: ["domcontentloaded"],
157+
})
158+
} else {
159+
// For other errors, throw them as-is
160+
throw error
161+
}
123162
}
124-
}
125163

126-
const content = await this.page.content()
164+
content = await this.page.content()
165+
}
127166

128167
// use cheerio to parse and clean up the HTML
129168
const $ = cheerio.load(content)

0 commit comments

Comments
 (0)