fix: #10 - Improve crawling of initially redirected requests #11

Merged
6 changes: 3 additions & 3 deletions checker-cheerio/Dockerfile
@@ -1,7 +1,7 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
-FROM apify/actor-node:16 AS builder
+FROM apify/actor-node:22 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
@@ -19,7 +19,7 @@ COPY . ./
RUN npm run build

# Create final image
-FROM apify/actor-node:16
+FROM apify/actor-node:22

# Copy only built JS files from builder image
COPY --from=builder /usr/src/app/dist ./dist
@@ -48,4 +48,4 @@ COPY . ./


# Run the image.
-CMD npm run start:prod --silent
\ No newline at end of file
+CMD npm run start:prod --silent
7 changes: 7 additions & 0 deletions checker-cheerio/INPUT_SCHEMA.json
@@ -44,6 +44,13 @@
"prefill": "a[href]",
"minLength": 1
},
"allowOnlyLinksFromSameDomain": {
"title": "Allow only links from the same domain",
"type": "boolean",
"description": "Additional check to make sure that only link related to the same domain are enqueued.",
"editor": "checkbox",
"prefill": false
},
"pseudoUrls": {
"title": "Pseudo-URLs",
"type": "array",
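For context, a minimal cheerio-checker input exercising the new option might look like the sketch below; the URL and the numeric limit are placeholders, while the field names come from the schema above and `src/typedefs.ts`.

```ts
// Hypothetical actor input matching the updated INPUT_SCHEMA.json.
const actorInput = {
    urlsToCheck: [{ url: 'https://example.com' }], // placeholder start URL
    linkSelector: 'a[href]',                       // schema prefill value
    pseudoUrls: [{ purl: '[.*]' }],                // match any URL...
    allowOnlyLinksFromSameDomain: true,            // ...but stay on the domain
    maxNumberOfPagesCheckedPerDomain: 100,         // placeholder limit
};
```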
20 changes: 18 additions & 2 deletions checker-cheerio/src/handlePage.ts
@@ -62,12 +62,16 @@ export async function handlePage(
wasSuccess,
});

-const pageOrigin = new URL(request.url).origin;
+const currentUrl = request.loadedUrl ?? request.url;
+const pageOrigin = new URL(currentUrl).origin;

if (input.linkSelector && !!$) {
const info = await requestQueue.getInfo();

const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
+// Using request.loadedUrl in case the initial link was redirected.
+const initialDomain = request.userData.initialDomain ?? new URL(currentUrl).hostname;
+
if (maxUrlsToEnqueue > 0) {
const toEnqueue: RequestOptions[] = [];
$(input.linkSelector).each((_, el) => {
@@ -76,6 +80,15 @@
return;
}
const href = new URL(rawHref, pageOrigin).toString();
+
+if (!['http:', 'https:'].includes(new URL(href).protocol)) {
+return; // Skipping invalid protocols (mailto:, ftp:, etc.)
+}
+
+if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
+return; // Skip if only links related to the same domain are allowed.
+}
+
for (const pseudoUrlInput of input.pseudoUrls) {
if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
const newUrl = new URL(href, request.loadedUrl).toString();
@@ -84,7 +97,10 @@
headers: pseudoUrlInput.headers,
method: pseudoUrlInput.method as 'GET' | 'POST',
payload: pseudoUrlInput.payload,
-userData: pseudoUrlInput.userData,
+userData: {
+...pseudoUrlInput.userData,
+initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
+},
});
}
}
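Pulled out of the handler, the filter added above boils down to the self-contained sketch below. `shouldEnqueue` is an illustrative name, not part of this PR; the redirect handling comes from resolving links against the loaded (post-redirect) URL and carrying `initialDomain` forward in `userData`.

```ts
import { URL } from 'node:url';

// Illustrative distillation of the checks added in handlePage.ts above.
function shouldEnqueue(
    rawHref: string,          // raw href attribute from the link element
    pageOrigin: string,       // origin of the loaded (post-redirect) URL
    initialDomain: string,    // hostname the crawl actually landed on
    sameDomainOnly: boolean,  // input.allowOnlyLinksFromSameDomain
): boolean {
    const href = new URL(rawHref, pageOrigin);
    // Drop non-web protocols such as mailto: or ftp:.
    if (!['http:', 'https:'].includes(href.protocol)) return false;
    // Optionally drop links that leave the post-redirect domain.
    if (sameDomainOnly && href.hostname !== initialDomain) return false;
    return true;
}

// E.g. a check of https://example.com that redirected to https://www.example.com:
shouldEnqueue('/about', 'https://www.example.com', 'www.example.com', true);                  // true
shouldEnqueue('mailto:hi@example.com', 'https://www.example.com', 'www.example.com', true);   // false
shouldEnqueue('https://other.org/page', 'https://www.example.com', 'www.example.com', true);  // false
```

The playwright and puppeteer checkers below repeat the same checks verbatim, with `page.url()` standing in for `request.loadedUrl`.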
1 change: 1 addition & 0 deletions checker-cheerio/src/typedefs.ts
@@ -53,6 +53,7 @@ export interface ActorInputData {
urlsToCheck: UrlInput[];
proxyConfiguration: ProxyConfiguration;
linkSelector?: string;
+allowOnlyLinksFromSameDomain?: boolean;
pseudoUrls: PseudoUrlInputCustom[];
repeatChecksOnProvidedUrls?: number;
maxNumberOfPagesCheckedPerDomain: number;
6 changes: 3 additions & 3 deletions checker-playwright/Dockerfile
@@ -1,7 +1,7 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
-FROM apify/actor-node-playwright-chrome:16 AS builder
+FROM apify/actor-node-playwright-chrome:22 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
@@ -19,7 +19,7 @@ COPY --chown=myuser . ./
RUN npm run build

# Create final image
-FROM apify/actor-node-playwright-chrome:16
+FROM apify/actor-node-playwright-chrome:22

# Copy only built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist
@@ -49,4 +49,4 @@ COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
\ No newline at end of file
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
7 changes: 7 additions & 0 deletions checker-playwright/INPUT_SCHEMA.json
@@ -44,6 +44,13 @@
"prefill": "a[href]",
"minLength": 1
},
"allowOnlyLinksFromSameDomain": {
"title": "Allow only links from the same domain",
"type": "boolean",
"description": "Additional check to make sure that only link related to the same domain are enqueued.",
"editor": "checkbox",
"prefill": false
},
"pseudoUrls": {
"title": "Pseudo-URLs",
"type": "array",
19 changes: 17 additions & 2 deletions checker-playwright/src/handlePage.ts
@@ -82,12 +82,15 @@ export async function handlePage(
wasSuccess,
});

-const pageOrigin = new URL(request.url).origin;
+const pageOrigin = new URL(page.url()).origin;

if (input.linkSelector && !!$) {
const info = await requestQueue.getInfo();

const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
+// Using page.url() in case the initial link was redirected.
+const initialDomain = request.userData.initialDomain ?? new URL(page.url()).hostname;
+
if (maxUrlsToEnqueue > 0) {
const toEnqueue: RequestOptions[] = [];
$(input.linkSelector).each((_, el) => {
@@ -96,6 +99,15 @@
return;
}
const href = new URL(rawHref, pageOrigin).toString();
+
+if (!['http:', 'https:'].includes(new URL(href).protocol)) {
+return; // Skipping invalid protocols (mailto:, ftp:, etc.)
+}
+
+if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
+return; // Skip if only links related to the same domain are allowed.
+}
+
for (const pseudoUrlInput of input.pseudoUrls) {
if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
const newUrl = new URL(href, request.loadedUrl).toString();
@@ -104,7 +116,10 @@
headers: pseudoUrlInput.headers,
method: pseudoUrlInput.method as 'GET' | 'POST',
payload: pseudoUrlInput.payload,
-userData: pseudoUrlInput.userData,
+userData: {
+...pseudoUrlInput.userData,
+initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
+},
});
}
}
2 changes: 1 addition & 1 deletion checker-playwright/src/main.ts
@@ -3,7 +3,7 @@ import { log, PlaywrightCrawler, RequestOptions } from 'crawlee';
import { chromium, firefox, webkit } from 'playwright';
import { inspect } from 'util';

-import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs';
+import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs.js';

import { handleFailedRequest } from './handleFailedRequest.js';
import { handlePage } from './handlePage.js';
1 change: 1 addition & 0 deletions checker-playwright/src/typedefs.ts
@@ -46,6 +46,7 @@ export interface ActorInputData {
urlsToCheck: UrlInput[];
proxyConfiguration: ProxyConfiguration;
linkSelector?: string;
+allowOnlyLinksFromSameDomain?: boolean;
pseudoUrls: PseudoUrlInputCustom[];
repeatChecksOnProvidedUrls?: number;
maxNumberOfPagesCheckedPerDomain: number;
4 changes: 2 additions & 2 deletions checker-puppeteer/Dockerfile
@@ -1,7 +1,7 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
-FROM apify/actor-node-puppeteer-chrome:16 AS builder
+FROM apify/actor-node-puppeteer-chrome:22 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
@@ -49,4 +49,4 @@ COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
\ No newline at end of file
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
19 changes: 17 additions & 2 deletions checker-puppeteer/src/handlePage.ts
@@ -73,12 +73,15 @@ export async function handlePage(
wasSuccess,
});

-const pageOrigin = new URL(request.url).origin;
+const pageOrigin = new URL(page.url()).origin;

if (input.linkSelector && !!$) {
const info = await requestQueue.getInfo();

const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
+// Using page.url() in case the initial link was redirected.
+const initialDomain = request.userData.initialDomain ?? new URL(page.url()).hostname;
+
if (maxUrlsToEnqueue > 0) {
const toEnqueue: RequestOptions[] = [];
$(input.linkSelector).each((_, el) => {
@@ -87,6 +90,15 @@
return;
}
const href = new URL(rawHref, pageOrigin).toString();
+
+if (!['http:', 'https:'].includes(new URL(href).protocol)) {
+return; // Skipping invalid protocols (mailto:, ftp:, etc.)
+}
+
+if (input.allowOnlyLinksFromSameDomain && new URL(href).hostname !== initialDomain) {
+return; // Skip if only links related to the same domain are allowed.
+}
+
for (const pseudoUrlInput of input.pseudoUrls) {
if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
const newUrl = new URL(href, request.loadedUrl).toString();
@@ -95,7 +107,10 @@
headers: pseudoUrlInput.headers,
method: pseudoUrlInput.method as 'GET' | 'POST',
payload: pseudoUrlInput.payload,
-userData: pseudoUrlInput.userData,
+userData: {
+...pseudoUrlInput.userData,
+initialDomain: input.allowOnlyLinksFromSameDomain ? initialDomain : undefined,
+},
});
}
}
2 changes: 1 addition & 1 deletion checker-puppeteer/src/main.ts
@@ -1,7 +1,7 @@
import { Actor } from 'apify';
import { log, PuppeteerCrawler, RequestOptions } from 'crawlee';

-import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs';
+import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';

import { inspect } from 'util';
import { handleFailedRequest } from './handleFailedRequest.js';
3 changes: 2 additions & 1 deletion checker-puppeteer/src/typedefs.ts
@@ -50,6 +50,7 @@ export interface ActorInputData {
urlsToCheck: UrlInput[];
proxyConfiguration: ProxyConfiguration;
linkSelector?: string;
+allowOnlyLinksFromSameDomain?: boolean;
pseudoUrls: PseudoUrlInputCustom[];
repeatChecksOnProvidedUrls?: number;
maxNumberOfPagesCheckedPerDomain: number;
@@ -136,4 +137,4 @@ export type ActorCheckSimplifiedOutput = {
: ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] }
? Record<number, number>
: ActorCheckDetailedOutput[K];
-};
\ No newline at end of file
+};
4 changes: 2 additions & 2 deletions starter/Dockerfile
@@ -1,7 +1,7 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
-FROM apify/actor-node:16 AS builder
+FROM apify/actor-node:22 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
@@ -48,4 +48,4 @@ COPY . ./


# Run the image.
-CMD npm run start:prod --silent
\ No newline at end of file
+CMD npm run start:prod --silent
7 changes: 7 additions & 0 deletions starter/INPUT_SCHEMA.json
@@ -84,6 +84,13 @@
"default": [],
"editor": "pseudoUrls"
},
"allowOnlyLinksFromSameDomain": {
"title": "Allow only links from the same domain",
"type": "boolean",
"description": "Additional check to make sure that only link related to the same domain are enqueued.",
"editor": "checkbox",
"prefill": false
},
"repeatChecksOnProvidedUrls": {
"title": "Repeat checks on provided URLs",
"type": "integer",
5 changes: 3 additions & 2 deletions starter/src/configs.ts
@@ -49,7 +49,7 @@ export function convertInputToActorConfigs(input: ActorInputData): PreparedActor
function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwrightBrowser, memory }: CreateActorRunConfig) {
const proxyGroups = input.proxyConfiguration.apifyProxyGroups?.length
? input.proxyConfiguration.apifyProxyGroups
-: ['auto'];
+: ['auto'];
for (const group of proxyGroups) {
const { url } = urlData;
const config: PreparedActorConfig = {
@@ -66,8 +66,9 @@ function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwright
},
linkSelector: input.enqueueAllOnDomain ? 'a[href]' : input.linkSelector,
pseudoUrls: input.enqueueAllOnDomain
-? [{ purl: `${new URL(url).origin}[.*]` }]
+? [{ purl: `[.*]` }]
: input.pseudoUrls,
+allowOnlyLinksFromSameDomain: input.enqueueAllOnDomain,
repeatChecksOnProvidedUrls: input.repeatChecksOnProvidedUrls,
maxNumberOfPagesCheckedPerDomain: input.maxNumberOfPagesCheckedPerDomain,
maxConcurrentPagesCheckedPerDomain: input.maxConcurrentPagesCheckedPerDomain,
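This is the heart of the fix on the starter side: previously `enqueueAllOnDomain` baked the start URL's origin into the pseudo-URL, so once the initial request redirected to a different origin, nothing on the loaded page matched. Now the pseudo-URL matches any URL and the domain restriction moves to the checkers' runtime check. A simplified sketch of the relevant fields a checker receives when `enqueueAllOnDomain` is true (other config fields omitted):

```ts
// Sketch only; values mirror the diff above, not a complete config.
const checkerFields = {
    linkSelector: 'a[href]',            // consider every link on the page
    pseudoUrls: [{ purl: '[.*]' }],     // pseudo-URL now matches any URL...
    allowOnlyLinksFromSameDomain: true, // ...and the same-domain check, keyed
                                        // to the post-redirect domain, filters
                                        // links at runtime
};
```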
2 changes: 0 additions & 2 deletions starter/src/main.ts
@@ -49,8 +49,6 @@ Actor.main(async () => {
userData: { actorInput },
}));
-
-

const requestList = await RequestList.open(null, sources);

const runner = new BasicCrawler({
2 changes: 1 addition & 1 deletion starter/src/startRunAndPool.ts
@@ -1,4 +1,4 @@
-import { Actor, ActorRun } from 'apify';
+import { Actor } from 'apify';

import { DEFAULT_COSTS } from './constants.js';
import type { PreparedActorConfig, ActorCheckSimplifiedOutput, FixedActorRun } from './typedefs.js';
4 changes: 2 additions & 2 deletions starter/src/typedefs.ts
@@ -1,4 +1,4 @@
import { ActorRun } from "apify";
import { ActorRun } from 'apify';

export interface FrontendActorState {
totalUrls: number;
@@ -45,6 +45,7 @@ export interface ActorInputData {
enqueueAllOnDomain?: boolean;
linkSelector?: string;
pseudoUrls: PseudoUrlInput[];
+allowOnlyLinksFromSameDomain?: boolean;
repeatChecksOnProvidedUrls?: number;
maxNumberOfPagesCheckedPerDomain: number;
maxConcurrentPagesCheckedPerDomain: number;
@@ -109,7 +110,6 @@ export interface ActorCheckDetailedOutput {
estimatedCost: number;
estimatedCostPerRequest: number;
-

// URLs
url: string;
simplifiedOutput: string;