From 6262fdbae3f20ed1aca74640f2b727511beb0f64 Mon Sep 17 00:00:00 2001
From: JJetmar
Date: Mon, 11 Aug 2025 16:01:42 +0200
Subject: [PATCH] fix: #10 - Improve crawling of initially redirected requests

---
 checker-cheerio/Dockerfile | 6 +++---
 checker-cheerio/INPUT_SCHEMA.json | 7 +++++++
 checker-cheerio/src/handlePage.ts | 20 ++++++++++++++++++--
 checker-cheerio/src/typedefs.ts | 1 +
 checker-playwright/Dockerfile | 6 +++---
 checker-playwright/INPUT_SCHEMA.json | 7 +++++++
 checker-playwright/src/handlePage.ts | 19 +++++++++++++++++--
 checker-playwright/src/main.ts | 2 +-
 checker-playwright/src/typedefs.ts | 1 +
 checker-puppeteer/Dockerfile | 4 ++--
 checker-puppeteer/src/handlePage.ts | 19 +++++++++++++++++--
 checker-puppeteer/src/main.ts | 2 +-
 checker-puppeteer/src/typedefs.ts | 3 ++-
 starter/Dockerfile | 4 ++--
 starter/INPUT_SCHEMA.json | 7 +++++++
 starter/src/configs.ts | 5 +++--
 starter/src/main.ts | 2 --
 starter/src/startRunAndPool.ts | 2 +-
 starter/src/typedefs.ts | 4 ++--
 19 files changed, 95 insertions(+), 26 deletions(-)

diff --git a/checker-cheerio/Dockerfile b/checker-cheerio/Dockerfile
index 2354db7..a9e5fcb 100644
--- a/checker-cheerio/Dockerfile
+++ b/checker-cheerio/Dockerfile
@@ -1,7 +1,7 @@
 # Specify the base Docker image. You can read more about
 # the available images at https://crawlee.dev/docs/guides/docker-images
 # You can also use any other image from Docker Hub.
-FROM apify/actor-node:16 AS builder
+FROM apify/actor-node:22 AS builder
 
 # Copy just package.json and package-lock.json
 # to speed up the build using Docker layer cache.
@@ -19,7 +19,7 @@ COPY . ./
 RUN npm run build
 
 # Create final image
-FROM apify/actor-node:16
+FROM apify/actor-node:22
 
 # Copy only built JS files from builder image
 COPY --from=builder /usr/src/app/dist ./dist
@@ -48,4 +48,4 @@ COPY . ./
 
 
 # Run the image.
-CMD npm run start:prod --silent
\ No newline at end of file
+CMD npm run start:prod --silent
diff --git a/checker-cheerio/INPUT_SCHEMA.json b/checker-cheerio/INPUT_SCHEMA.json
index f1dd4b1..0f4c4e5 100644
--- a/checker-cheerio/INPUT_SCHEMA.json
+++ b/checker-cheerio/INPUT_SCHEMA.json
@@ -44,6 +44,13 @@
             "prefill": "a[href]",
             "minLength": 1
         },
+        "allowOnlyLinksFromSameDomain": {
+            "title": "Allow only links from the same domain",
+            "type": "boolean",
+            "description": "Additional check to make sure that only links from the same domain are enqueued.",
+            "editor": "checkbox",
+            "prefill": false
+        },
         "pseudoUrls": {
             "title": "Pseudo-URLs",
             "type": "array",
diff --git a/checker-cheerio/src/handlePage.ts b/checker-cheerio/src/handlePage.ts
index 7028259..d253885 100644
--- a/checker-cheerio/src/handlePage.ts
+++ b/checker-cheerio/src/handlePage.ts
@@ -62,12 +62,16 @@ export async function handlePage(
         wasSuccess,
     });
 
-    const pageOrigin = new URL(request.url).origin;
+    const currentUrl = request.loadedUrl ?? request.url;
+    const pageOrigin = new URL(currentUrl).origin;
 
     if (input.linkSelector && !!$) {
         const info = await requestQueue.getInfo();
         const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
 
+        // Use loadedUrl in case the initial request was redirected.
+        const initialDomain = request.userData.initialDomain ?? new URL(currentUrl).hostname;
+
         if (maxUrlsToEnqueue > 0) {
             const toEnqueue: RequestOptions[] = [];
             $(input.linkSelector).each((_, el) => {
@@ -76,6 +80,15 @@ export async function handlePage(
                     return;
                 }
                 const href = new URL(rawHref, pageOrigin).toString();
+
+                if (!['http:', 'https:'].includes(new URL(href).protocol)) {
+                    return; // Skip unsupported protocols (mailto:, ftp:, etc.)
+                }
+
+                if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
+                    return; // Skip if only links related to the same domain are allowed.
+                }
+
                 for (const pseudoUrlInput of input.pseudoUrls) {
                     if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
                         const newUrl = new URL(href, request.loadedUrl).toString();
@@ -84,7 +97,10 @@ export async function handlePage(
                             headers: pseudoUrlInput.headers,
                             method: pseudoUrlInput.method as 'GET' | 'POST',
                             payload: pseudoUrlInput.payload,
-                            userData: pseudoUrlInput.userData,
+                            userData: {
+                                ...pseudoUrlInput.userData,
+                                initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
+                            },
                         });
                     }
                 }
diff --git a/checker-cheerio/src/typedefs.ts b/checker-cheerio/src/typedefs.ts
index fc5aac8..ead859c 100644
--- a/checker-cheerio/src/typedefs.ts
+++ b/checker-cheerio/src/typedefs.ts
@@ -53,6 +53,7 @@ export interface ActorInputData {
     urlsToCheck: UrlInput[];
     proxyConfiguration: ProxyConfiguration;
     linkSelector?: string;
+    allowOnlyLinksFromSameDomain?: boolean;
     pseudoUrls: PseudoUrlInputCustom[];
     repeatChecksOnProvidedUrls?: number;
     maxNumberOfPagesCheckedPerDomain: number;
diff --git a/checker-playwright/Dockerfile b/checker-playwright/Dockerfile
index 95c0b40..f90b1f8 100644
--- a/checker-playwright/Dockerfile
+++ b/checker-playwright/Dockerfile
@@ -1,7 +1,7 @@
 # Specify the base Docker image. You can read more about
 # the available images at https://crawlee.dev/docs/guides/docker-images
 # You can also use any other image from Docker Hub.
-FROM apify/actor-node-playwright-chrome:16 AS builder
+FROM apify/actor-node-playwright-chrome:22 AS builder
 
 # Copy just package.json and package-lock.json
 # to speed up the build using Docker layer cache.
@@ -19,7 +19,7 @@ COPY --chown=myuser . ./
 RUN npm run build
 
 # Create final image
-FROM apify/actor-node-playwright-chrome:16
+FROM apify/actor-node-playwright-chrome:22
 
 # Copy only built JS files from builder image
 COPY --from=builder --chown=myuser /home/myuser/dist ./dist
@@ -49,4 +49,4 @@ COPY --chown=myuser . ./
 
 # Run the image. If you know you won't need headful browsers,
 # you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
\ No newline at end of file
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
diff --git a/checker-playwright/INPUT_SCHEMA.json b/checker-playwright/INPUT_SCHEMA.json
index b60ec67..1299304 100644
--- a/checker-playwright/INPUT_SCHEMA.json
+++ b/checker-playwright/INPUT_SCHEMA.json
@@ -44,6 +44,13 @@
             "prefill": "a[href]",
             "minLength": 1
         },
+        "allowOnlyLinksFromSameDomain": {
+            "title": "Allow only links from the same domain",
+            "type": "boolean",
+            "description": "Additional check to make sure that only links from the same domain are enqueued.",
+            "editor": "checkbox",
+            "prefill": false
+        },
         "pseudoUrls": {
             "title": "Pseudo-URLs",
             "type": "array",
diff --git a/checker-playwright/src/handlePage.ts b/checker-playwright/src/handlePage.ts
index 73f6e70..ae2662d 100644
--- a/checker-playwright/src/handlePage.ts
+++ b/checker-playwright/src/handlePage.ts
@@ -82,12 +82,15 @@ export async function handlePage(
         wasSuccess,
     });
 
-    const pageOrigin = new URL(request.url).origin;
+    const pageOrigin = new URL(page.url()).origin;
 
     if (input.linkSelector && !!$) {
         const info = await requestQueue.getInfo();
         const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
 
+        // Using page.url() in case the initial link was redirected.
+        const initialDomain = request.userData.initialDomain ?? new URL(page.url()).hostname;
+
         if (maxUrlsToEnqueue > 0) {
             const toEnqueue: RequestOptions[] = [];
             $(input.linkSelector).each((_, el) => {
@@ -96,6 +99,15 @@ export async function handlePage(
                     return;
                 }
                 const href = new URL(rawHref, pageOrigin).toString();
+
+                if (!['http:', 'https:'].includes(new URL(href).protocol)) {
+                    return; // Skip unsupported protocols (mailto:, ftp:, etc.)
+                }
+
+                if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
+                    return; // Skip if only links related to the same domain are allowed.
+                }
+
                 for (const pseudoUrlInput of input.pseudoUrls) {
                     if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
                         const newUrl = new URL(href, request.loadedUrl).toString();
@@ -104,7 +116,10 @@ export async function handlePage(
                             headers: pseudoUrlInput.headers,
                             method: pseudoUrlInput.method as 'GET' | 'POST',
                             payload: pseudoUrlInput.payload,
-                            userData: pseudoUrlInput.userData,
+                            userData: {
+                                ...pseudoUrlInput.userData,
+                                initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
+                            },
                         });
                     }
                 }
diff --git a/checker-playwright/src/main.ts b/checker-playwright/src/main.ts
index fc1f97d..eefce97 100644
--- a/checker-playwright/src/main.ts
+++ b/checker-playwright/src/main.ts
@@ -3,7 +3,7 @@
 import { log, PlaywrightCrawler, RequestOptions } from 'crawlee';
 import { chromium, firefox, webkit } from 'playwright';
 import { inspect } from 'util';
 
-import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs';
+import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs.js';
 import { handleFailedRequest } from './handleFailedRequest.js';
 import { handlePage } from './handlePage.js';
diff --git a/checker-playwright/src/typedefs.ts b/checker-playwright/src/typedefs.ts
index 209fa4a..ace0f07 100644
--- a/checker-playwright/src/typedefs.ts
+++ b/checker-playwright/src/typedefs.ts
@@ -46,6 +46,7 @@ export interface ActorInputData {
     urlsToCheck: UrlInput[];
     proxyConfiguration: ProxyConfiguration;
     linkSelector?: string;
+    allowOnlyLinksFromSameDomain?: boolean;
     pseudoUrls: PseudoUrlInputCustom[];
     repeatChecksOnProvidedUrls?: number;
     maxNumberOfPagesCheckedPerDomain: number;
diff --git a/checker-puppeteer/Dockerfile b/checker-puppeteer/Dockerfile
index 927040c..4307e07 100644
--- a/checker-puppeteer/Dockerfile
+++ b/checker-puppeteer/Dockerfile
@@ -1,7 +1,7 @@
 # Specify the base Docker image. You can read more about
 # the available images at https://crawlee.dev/docs/guides/docker-images
 # You can also use any other image from Docker Hub.
-FROM apify/actor-node-puppeteer-chrome:16 AS builder
+FROM apify/actor-node-puppeteer-chrome:22 AS builder
 
 # Copy just package.json and package-lock.json
 # to speed up the build using Docker layer cache.
@@ -49,4 +49,4 @@ COPY --chown=myuser . ./
 
 # Run the image. If you know you won't need headful browsers,
 # you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
\ No newline at end of file
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
diff --git a/checker-puppeteer/src/handlePage.ts b/checker-puppeteer/src/handlePage.ts
index 4c5566d..37ef1fe 100644
--- a/checker-puppeteer/src/handlePage.ts
+++ b/checker-puppeteer/src/handlePage.ts
@@ -73,12 +73,15 @@ export async function handlePage(
         wasSuccess,
     });
 
-    const pageOrigin = new URL(request.url).origin;
+    const pageOrigin = new URL(page.url()).origin;
 
     if (input.linkSelector && !!$) {
         const info = await requestQueue.getInfo();
         const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
 
+        // Using page.url() in case the initial link was redirected.
+        const initialDomain = request.userData.initialDomain ?? new URL(page.url()).hostname;
+
         if (maxUrlsToEnqueue > 0) {
             const toEnqueue: RequestOptions[] = [];
             $(input.linkSelector).each((_, el) => {
@@ -87,6 +90,15 @@ export async function handlePage(
                     return;
                 }
                 const href = new URL(rawHref, pageOrigin).toString();
+
+                if (!['http:', 'https:'].includes(new URL(href).protocol)) {
+                    return; // Skip unsupported protocols (mailto:, ftp:, etc.)
+                }
+
+                if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
+                    return; // Skip if only links related to the same domain are allowed.
+                }
+
                 for (const pseudoUrlInput of input.pseudoUrls) {
                     if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
                         const newUrl = new URL(href, request.loadedUrl).toString();
@@ -95,7 +107,10 @@ export async function handlePage(
                             headers: pseudoUrlInput.headers,
                             method: pseudoUrlInput.method as 'GET' | 'POST',
                             payload: pseudoUrlInput.payload,
-                            userData: pseudoUrlInput.userData,
+                            userData: {
+                                ...pseudoUrlInput.userData,
+                                initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
+                            },
                         });
                     }
                 }
diff --git a/checker-puppeteer/src/main.ts b/checker-puppeteer/src/main.ts
index b3c156b..2b81e33 100644
--- a/checker-puppeteer/src/main.ts
+++ b/checker-puppeteer/src/main.ts
@@ -1,7 +1,7 @@
 import { Actor } from 'apify';
 import { log, PuppeteerCrawler, RequestOptions } from 'crawlee';
 
-import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs';
+import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';
 import { inspect } from 'util';
 
 import { handleFailedRequest } from './handleFailedRequest.js';
diff --git a/checker-puppeteer/src/typedefs.ts b/checker-puppeteer/src/typedefs.ts
index a81ca67..fd26960 100644
--- a/checker-puppeteer/src/typedefs.ts
+++ b/checker-puppeteer/src/typedefs.ts
@@ -50,6 +50,7 @@ export interface ActorInputData {
     urlsToCheck: UrlInput[];
     proxyConfiguration: ProxyConfiguration;
     linkSelector?: string;
+    allowOnlyLinksFromSameDomain?: boolean;
     pseudoUrls: PseudoUrlInputCustom[];
     repeatChecksOnProvidedUrls?: number;
     maxNumberOfPagesCheckedPerDomain: number;
@@ -136,4 +137,4 @@ export type ActorCheckSimplifiedOutput = {
         : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] }
             ? Record
             : ActorCheckDetailedOutput[K];
-};
\ No newline at end of file
+};
diff --git a/starter/Dockerfile b/starter/Dockerfile
index 2354db7..9fccec2 100644
--- a/starter/Dockerfile
+++ b/starter/Dockerfile
@@ -1,7 +1,7 @@
 # Specify the base Docker image. You can read more about
 # the available images at https://crawlee.dev/docs/guides/docker-images
 # You can also use any other image from Docker Hub.
-FROM apify/actor-node:16 AS builder
+FROM apify/actor-node:22 AS builder
 
 # Copy just package.json and package-lock.json
 # to speed up the build using Docker layer cache.
@@ -48,4 +48,4 @@ COPY . ./
 
 
 # Run the image.
-CMD npm run start:prod --silent
\ No newline at end of file
+CMD npm run start:prod --silent
diff --git a/starter/INPUT_SCHEMA.json b/starter/INPUT_SCHEMA.json
index 36958b3..fd33025 100644
--- a/starter/INPUT_SCHEMA.json
+++ b/starter/INPUT_SCHEMA.json
@@ -84,6 +84,13 @@
             "default": [],
             "editor": "pseudoUrls"
         },
+        "allowOnlyLinksFromSameDomain": {
+            "title": "Allow only links from the same domain",
+            "type": "boolean",
+            "description": "Additional check to make sure that only links from the same domain are enqueued.",
+            "editor": "checkbox",
+            "prefill": false
+        },
         "repeatChecksOnProvidedUrls": {
             "title": "Repeat checks on provided URLs",
             "type": "integer",
diff --git a/starter/src/configs.ts b/starter/src/configs.ts
index e425e16..b542d74 100644
--- a/starter/src/configs.ts
+++ b/starter/src/configs.ts
@@ -49,7 +49,7 @@ export function convertInputToActorConfigs(input: ActorInputData): PreparedActor
 function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwrightBrowser, memory }: CreateActorRunConfig) {
     const proxyGroups = input.proxyConfiguration.apifyProxyGroups?.length
         ? input.proxyConfiguration.apifyProxyGroups
-        : ['auto'];
+        : ['auto'];
     for (const group of proxyGroups) {
         const { url } = urlData;
         const config: PreparedActorConfig = {
@@ -66,8 +66,9 @@ function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwright
             },
             linkSelector: input.enqueueAllOnDomain ? 'a[href]' : input.linkSelector,
             pseudoUrls: input.enqueueAllOnDomain
-                ? [{ purl: `${new URL(url).origin}[.*]` }]
+                ? [{ purl: `[.*]` }]
                 : input.pseudoUrls,
+            allowOnlyLinksFromSameDomain: input.enqueueAllOnDomain,
             repeatChecksOnProvidedUrls: input.repeatChecksOnProvidedUrls,
             maxNumberOfPagesCheckedPerDomain: input.maxNumberOfPagesCheckedPerDomain,
             maxConcurrentPagesCheckedPerDomain: input.maxConcurrentPagesCheckedPerDomain,
diff --git a/starter/src/main.ts b/starter/src/main.ts
index 5567ad7..55394f8 100644
--- a/starter/src/main.ts
+++ b/starter/src/main.ts
@@ -49,8 +49,6 @@ Actor.main(async () => {
         userData: { actorInput },
     }));
 
-
-
     const requestList = await RequestList.open(null, sources);
 
     const runner = new BasicCrawler({
diff --git a/starter/src/startRunAndPool.ts b/starter/src/startRunAndPool.ts
index 36f425c..5194fb8 100644
--- a/starter/src/startRunAndPool.ts
+++ b/starter/src/startRunAndPool.ts
@@ -1,4 +1,4 @@
-import { Actor, ActorRun } from 'apify';
+import { Actor } from 'apify';
 
 import { DEFAULT_COSTS } from './constants.js';
 import type { PreparedActorConfig, ActorCheckSimplifiedOutput, FixedActorRun } from './typedefs.js';
diff --git a/starter/src/typedefs.ts b/starter/src/typedefs.ts
index 1ab1685..047939a 100644
--- a/starter/src/typedefs.ts
+++ b/starter/src/typedefs.ts
@@ -1,4 +1,4 @@
-import { ActorRun } from "apify";
+import { ActorRun } from 'apify';
 
 export interface FrontendActorState {
     totalUrls: number;
@@ -45,6 +45,7 @@ export interface ActorInputData {
     enqueueAllOnDomain?: boolean;
     linkSelector?: string;
     pseudoUrls: PseudoUrlInput[];
+    allowOnlyLinksFromSameDomain?: boolean;
     repeatChecksOnProvidedUrls?: number;
     maxNumberOfPagesCheckedPerDomain: number;
     maxConcurrentPagesCheckedPerDomain: number;
@@ -109,7 +110,6 @@ export interface ActorCheckDetailedOutput {
     estimatedCost: number;
     estimatedCostPerRequest: number;
 
-    // URLs
     url: string;
     simplifiedOutput: string;
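
Reviewer note: a minimal standalone sketch of the link-filtering rule this patch adds to the three handlePage.ts files (resolve the href, drop non-HTTP(S) protocols, and optionally drop links outside the initial domain). The isEnqueueable helper and the example URLs below are hypothetical illustrations, not part of the patch; only the WHATWG URL API is assumed.

// Sketch only: mirrors the checks added in handlePage.ts.
function isEnqueueable(rawHref: string, pageOrigin: string, initialDomain?: string): boolean {
    let href: URL;
    try {
        href = new URL(rawHref, pageOrigin); // Resolve relative links against the loaded page's origin.
    } catch {
        return false; // Malformed href attribute.
    }
    if (!['http:', 'https:'].includes(href.protocol)) {
        return false; // Skip mailto:, ftp:, javascript:, etc.
    }
    if (initialDomain && href.hostname !== initialDomain) {
        return false; // allowOnlyLinksFromSameDomain: drop links pointing to other domains.
    }
    return true;
}

// A start URL redirected to https://www.example.com yields initialDomain === 'www.example.com':
console.log(isEnqueueable('/about', 'https://www.example.com', 'www.example.com')); // true
console.log(isEnqueueable('mailto:info@example.com', 'https://www.example.com', 'www.example.com')); // false
console.log(isEnqueueable('https://other.org/page', 'https://www.example.com', 'www.example.com')); // false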