Skip to content

Commit a50aebd

Browse files
authored
fix: #10 - Improve crawling of initially redirected requests (#11)
1 parent 4282e17 commit a50aebd

File tree

19 files changed

+95
-26
lines changed

19 files changed

+95
-26
lines changed

checker-cheerio/Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Specify the base Docker image. You can read more about
22
# the available images at https://crawlee.dev/docs/guides/docker-images
33
# You can also use any other image from Docker Hub.
4-
FROM apify/actor-node:16 AS builder
4+
FROM apify/actor-node:22 AS builder
55

66
# Copy just package.json and package-lock.json
77
# to speed up the build using Docker layer cache.
@@ -19,7 +19,7 @@ COPY . ./
1919
RUN npm run build
2020

2121
# Create final image
22-
FROM apify/actor-node:16
22+
FROM apify/actor-node:22
2323

2424
# Copy only built JS files from builder image
2525
COPY --from=builder /usr/src/app/dist ./dist
@@ -48,4 +48,4 @@ COPY . ./
4848

4949

5050
# Run the image.
51-
CMD npm run start:prod --silent
51+
CMD npm run start:prod --silent

checker-cheerio/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@
4444
"prefill": "a[href]",
4545
"minLength": 1
4646
},
47+
"allowOnlyLinksFromSameDomain": {
48+
"title": "Allow only links from the same domain",
49+
"type": "boolean",
50+
"description": "Additional check to make sure that only links related to the same domain are enqueued.",
51+
"editor": "checkbox",
52+
"prefill": false
53+
},
4754
"pseudoUrls": {
4855
"title": "Pseudo-URLs",
4956
"type": "array",

checker-cheerio/src/handlePage.ts

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,16 @@ export async function handlePage(
6262
wasSuccess,
6363
});
6464

65-
const pageOrigin = new URL(request.url).origin;
65+
const currentUrl = request.loadedUrl ?? request.url;
66+
const pageOrigin = new URL(currentUrl).origin;
6667

6768
if (input.linkSelector && !!$) {
6869
const info = await requestQueue.getInfo();
6970

7071
const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
72+
// Using the loaded URL (request.loadedUrl, falling back to request.url) in case the initial link was redirected.
73+
const initialDomain = request.userData.initialDomain ?? new URL(currentUrl).hostname;
74+
7175
if (maxUrlsToEnqueue > 0) {
7276
const toEnqueue: RequestOptions[] = [];
7377
$(input.linkSelector).each((_, el) => {
@@ -76,6 +80,15 @@ export async function handlePage(
7680
return;
7781
}
7882
const href = new URL(rawHref, pageOrigin).toString();
83+
84+
if (!['http:', 'https:'].includes(new URL(href).protocol)) {
85+
return; // Skipping invalid protocol (mailto:, ftp: etc...)
86+
}
87+
88+
if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
89+
return; // Skip if only links related to the same domain are allowed.
90+
}
91+
7992
for (const pseudoUrlInput of input.pseudoUrls) {
8093
if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
8194
const newUrl = new URL(href, request.loadedUrl).toString();
@@ -84,7 +97,10 @@ export async function handlePage(
8497
headers: pseudoUrlInput.headers,
8598
method: pseudoUrlInput.method as 'GET' | 'POST',
8699
payload: pseudoUrlInput.payload,
87-
userData: pseudoUrlInput.userData,
100+
userData: {
101+
...pseudoUrlInput.userData,
102+
initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
103+
},
88104
});
89105
}
90106
}

checker-cheerio/src/typedefs.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ export interface ActorInputData {
5353
urlsToCheck: UrlInput[];
5454
proxyConfiguration: ProxyConfiguration;
5555
linkSelector?: string;
56+
allowOnlyLinksFromSameDomain?: boolean;
5657
pseudoUrls: PseudoUrlInputCustom[];
5758
repeatChecksOnProvidedUrls?: number;
5859
maxNumberOfPagesCheckedPerDomain: number;

checker-playwright/Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Specify the base Docker image. You can read more about
22
# the available images at https://crawlee.dev/docs/guides/docker-images
33
# You can also use any other image from Docker Hub.
4-
FROM apify/actor-node-playwright-chrome:16 AS builder
4+
FROM apify/actor-node-playwright-chrome:22 AS builder
55

66
# Copy just package.json and package-lock.json
77
# to speed up the build using Docker layer cache.
@@ -19,7 +19,7 @@ COPY --chown=myuser . ./
1919
RUN npm run build
2020

2121
# Create final image
22-
FROM apify/actor-node-playwright-chrome:16
22+
FROM apify/actor-node-playwright-chrome:22
2323

2424
# Copy only built JS files from builder image
2525
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
@@ -49,4 +49,4 @@ COPY --chown=myuser . ./
4949

5050
# Run the image. If you know you won't need headful browsers,
5151
# you can remove the XVFB start script for a micro perf gain.
52-
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
52+
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

checker-playwright/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@
4444
"prefill": "a[href]",
4545
"minLength": 1
4646
},
47+
"allowOnlyLinksFromSameDomain": {
48+
"title": "Allow only links from the same domain",
49+
"type": "boolean",
50+
"description": "Additional check to make sure that only links related to the same domain are enqueued.",
51+
"editor": "checkbox",
52+
"prefill": false
53+
},
4754
"pseudoUrls": {
4855
"title": "Pseudo-URLs",
4956
"type": "array",

checker-playwright/src/handlePage.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,15 @@ export async function handlePage(
8282
wasSuccess,
8383
});
8484

85-
const pageOrigin = new URL(request.url).origin;
85+
const pageOrigin = new URL(page.url()).origin;
8686

8787
if (input.linkSelector && !!$) {
8888
const info = await requestQueue.getInfo();
8989

9090
const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
91+
// Using page.url() in case the initial link was redirected.
92+
const initialDomain = request.userData.initialDomain ?? new URL(page.url()).hostname;
93+
9194
if (maxUrlsToEnqueue > 0) {
9295
const toEnqueue: RequestOptions[] = [];
9396
$(input.linkSelector).each((_, el) => {
@@ -96,6 +99,15 @@ export async function handlePage(
9699
return;
97100
}
98101
const href = new URL(rawHref, pageOrigin).toString();
102+
103+
if (!['http:', 'https:'].includes(new URL(href).protocol)) {
104+
return; // Skipping invalid protocol (mailto:, ftp: etc...)
105+
}
106+
107+
if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
108+
return; // Skip if only links related to the same domain are allowed.
109+
}
110+
99111
for (const pseudoUrlInput of input.pseudoUrls) {
100112
if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
101113
const newUrl = new URL(href, request.loadedUrl).toString();
@@ -104,7 +116,10 @@ export async function handlePage(
104116
headers: pseudoUrlInput.headers,
105117
method: pseudoUrlInput.method as 'GET' | 'POST',
106118
payload: pseudoUrlInput.payload,
107-
userData: pseudoUrlInput.userData,
119+
userData: {
120+
...pseudoUrlInput.userData,
121+
initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
122+
},
108123
});
109124
}
110125
}

checker-playwright/src/main.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { log, PlaywrightCrawler, RequestOptions } from 'crawlee';
33
import { chromium, firefox, webkit } from 'playwright';
44
import { inspect } from 'util';
55

6-
import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs';
6+
import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs.js';
77

88
import { handleFailedRequest } from './handleFailedRequest.js';
99
import { handlePage } from './handlePage.js';

checker-playwright/src/typedefs.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ export interface ActorInputData {
4646
urlsToCheck: UrlInput[];
4747
proxyConfiguration: ProxyConfiguration;
4848
linkSelector?: string;
49+
allowOnlyLinksFromSameDomain?: boolean;
4950
pseudoUrls: PseudoUrlInputCustom[];
5051
repeatChecksOnProvidedUrls?: number;
5152
maxNumberOfPagesCheckedPerDomain: number;

checker-puppeteer/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Specify the base Docker image. You can read more about
22
# the available images at https://crawlee.dev/docs/guides/docker-images
33
# You can also use any other image from Docker Hub.
4-
FROM apify/actor-node-puppeteer-chrome:16 AS builder
4+
FROM apify/actor-node-puppeteer-chrome:22 AS builder
55

66
# Copy just package.json and package-lock.json
77
# to speed up the build using Docker layer cache.
@@ -49,4 +49,4 @@ COPY --chown=myuser . ./
4949

5050
# Run the image. If you know you won't need headful browsers,
5151
# you can remove the XVFB start script for a micro perf gain.
52-
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
52+
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

0 commit comments

Comments
 (0)