Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .husky/pre-commit
Original file line number Diff line number Diff line change
@@ -1 +1 @@
yarn format
yarn lint-staged
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@
"typescript-eslint": "^8.28.0",
"vitest": "^4.0.16"
},
"lint-staged": {
"**/*.{js,ts,mjs,mts,cjs,cts,json,css}": "biome format --write --no-errors-on-unmatched"
},
"packageManager": "yarn@4.10.3",
"volta": {
"node": "24.13.0",
Expand Down
1 change: 1 addition & 0 deletions packages/browser-pool/src/puppeteer/puppeteer-plugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ export class PuppeteerPlugin extends BrowserPlugin<
'version',
'on',
'process',
'pages',
] as const
).reduce((map, method) => {
map[method] = browser[method as 'close']?.bind(browser);
Expand Down
56 changes: 47 additions & 9 deletions packages/utils/src/internals/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,8 @@ export async function* discoverValidSitemaps(
const { proxyUrl } = options;
const { gotScraping } = await import('got-scraping');
const sitemapUrls = new Set<string>();
// Keep each probe bounded so discovery cannot stall indefinitely on a single request.
const DISCOVERY_REQUEST_TIMEOUT_MILLIS = 20_000;

const addSitemapUrl = (url: string): string | undefined => {
const sizeBefore = sitemapUrls.size;
Expand All @@ -472,21 +474,49 @@ export async function* discoverValidSitemaps(
return undefined;
};

const urlExists = (url: string) =>
gotScraping({
proxyUrl,
/**
 * Awaits `promise`, rejecting with an `Error(timeoutMessage)` if it does not
 * settle within `timeoutMillis`. The timer is always cleared in `finally` so
 * it cannot keep the event loop alive after the race settles.
 */
const runWithTimeout = async <T>(
    promise: Promise<T>,
    timeoutMillis: number,
    timeoutMessage: string,
): Promise<T> => {
    let timeout: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
        timeout = setTimeout(() => reject(new Error(timeoutMessage)), timeoutMillis);
    });

    // If the timeout wins the race, `promise` may still reject later with no
    // attached handler, which Node reports as an unhandled promise rejection.
    // Absorb such a late rejection here; this does not change the value or
    // error that the race below produces.
    promise.catch(() => {});

    try {
        return await Promise.race([promise, timeoutPromise]);
    } finally {
        if (timeout !== undefined) {
            clearTimeout(timeout);
        }
    }
};

const urlExists = async (url: string) => {
const response = await gotScraping({
url,
method: 'HEAD',
}).then((response) => response.statusCode >= 200 && response.statusCode < 400);
proxyUrl,
timeout: {
request: DISCOVERY_REQUEST_TIMEOUT_MILLIS,
},
});

return response.statusCode >= 200 && response.statusCode < 400;
};

const discoverSitemapsForDomainUrls = async function* (hostname: string, domainUrls: string[]) {
if (!hostname) {
return;
}

try {
const robotsFile = await RobotsFile.find(domainUrls[0], proxyUrl);

const robotsFile = await runWithTimeout(
RobotsFile.find(domainUrls[0], proxyUrl),
DISCOVERY_REQUEST_TIMEOUT_MILLIS,
`Fetching robots.txt timed out for ${hostname}`,
);
for (const sitemapUrl of robotsFile.getSitemaps()) {
if (addSitemapUrl(sitemapUrl)) {
yield sitemapUrl;
Expand All @@ -507,10 +537,18 @@ export async function* discoverValidSitemaps(
const possibleSitemapPathnames = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml'];
for (const pathname of possibleSitemapPathnames) {
firstUrl.pathname = pathname;
if (await urlExists(firstUrl.toString())) {
if (addSitemapUrl(firstUrl.toString())) {
yield firstUrl.toString();
const candidateSitemapUrl = firstUrl.toString();

try {
if (await urlExists(candidateSitemapUrl)) {
if (addSitemapUrl(candidateSitemapUrl)) {
yield candidateSitemapUrl;
}
}
} catch (err) {
log.debug(`Failed to check sitemap candidate ${candidateSitemapUrl} for ${hostname}`, {
error: err,
});
}
}
}
Expand Down
12 changes: 12 additions & 0 deletions test/browser-pool/browser-plugins/plugins.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,18 @@ describe('Plugins', () => {
}
});

test.concurrent('should allow calling functions on the proxied browser', async () => {
    // `pages` is one of the methods bound onto the browser wrapper, so it must
    // be callable on the object returned by `launch()`.
    const plugin = new PuppeteerPlugin(puppeteer);
    const browser = await plugin.launch(plugin.createLaunchContext());

    try {
        const pages = await browser.pages();
        expect(pages).toBeDefined();
    } finally {
        await browser.close();
    }
});

test.concurrent('should pass launch options to browser', async () => {
const plugin = new PuppeteerPlugin(puppeteer);

Expand Down
Loading