diff --git a/CHANGELOG.md b/CHANGELOG.md index ac73b4e7..4ab0556d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,8 @@ # [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05) - ### Features -* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) +- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15) diff --git a/src/core.ts b/src/core.ts index c996f2bb..05a9f8e3 100644 --- a/src/core.ts +++ b/src/core.ts @@ -104,13 +104,11 @@ export async function crawl(config: Config) { // Uncomment this option to see the browser window. // headless: false, preNavigationHooks: [ - // Abort requests for certain resource types - async ({ request, page, log }) => { - // If there are no resource exclusions, return - const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? []; - if (RESOURCE_EXCLUSTIONS.length === 0) { - return; - } + // Abort requests for certain resource types and add cookies + async (crawlingContext, _gotoOptions) => { + const { request, page, log } = crawlingContext; + // Add cookies to the page + // Because the crawler has not yet navigated to the page, the loadedUrl is always undefined. Use the request url instead. if (config.cookie) { const cookies = ( Array.isArray(config.cookie) ? config.cookie : [config.cookie] @@ -118,11 +116,16 @@ export async function crawl(config: Config) { return { name: cookie.name, value: cookie.value, - url: request.loadedUrl, + url: request.url, }; }); await page.context().addCookies(cookies); } + const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? 
[]; + // If there are no resource exclusions, return + if (RESOURCE_EXCLUSTIONS.length === 0) { + return; + } await page.route( `**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) => route.abort("aborted"),