diff --git a/CHANGELOG.md b/CHANGELOG.md index 346460fa7..1b5f725bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [minor] + +> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs. + +### Changed + +- Improve headless browser context isolation when fetching pages by disabling cache and clearing cookies between requests to prevent session persistence across different URLs and to improve tracking success rate + ## 5.6.1 - 2025-06-30 _Full changeset and discussions: [#1168](https://github.com/OpenTermsArchive/engine/pull/1168)._ diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index e193e9426..bb5d22c45 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -20,6 +20,11 @@ export default async function fetch(url, cssSelectors, config) { await page.setDefaultNavigationTimeout(config.navigationTimeout); await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); + await page.setCacheEnabled(false); // Disable cache to ensure fresh content on each fetch and prevent stale data from previous requests + const client = await page.target().createCDPSession(); + + await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs + response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. if (!response) {