Commit dbf4244

fix: fix auto enqueueing from the current domain for all checkers
1 parent f9f91f2 commit dbf4244

5 files changed: +70 -42 lines changed

5 files changed

+70
-42
lines changed
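
The change shared by all three checkers is to resolve every scraped href against the origin of the page being checked before matching it, so relative links pointing back to the current domain can be matched and enqueued. A minimal sketch of that resolution step, using a hypothetical page URL and href purely for illustration:

    // Hypothetical values; in the checkers they come from request.url and $(el).attr('href').
    const pageOrigin = new URL('https://example.com/products?page=2').origin; // 'https://example.com'
    const rawHref = '/category/shoes';

    // Relative hrefs get resolved against the current domain; absolute hrefs pass through unchanged.
    const href = new URL(rawHref, pageOrigin).toString(); // 'https://example.com/category/shoes'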

checker-cheerio/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,4 +6,5 @@ dist
 node_modules
 apify_storage
 storage
+storage
 storage

checker-cheerio/src/handlePage.ts

Lines changed: 8 additions & 1 deletion
@@ -62,14 +62,20 @@ export async function handlePage(
         wasSuccess,
     });
 
+    const pageOrigin = new URL(request.url).origin;
+
     if (input.linkSelector && !!$) {
         const info = await requestQueue.getInfo();
 
         const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
         if (maxUrlsToEnqueue > 0) {
             const toEnqueue: RequestOptions[] = [];
             $(input.linkSelector).each((_, el) => {
-                const href = $(el).attr('href');
+                const rawHref = $(el).attr('href');
+                if (!rawHref) {
+                    return;
+                }
+                const href = new URL(rawHref, pageOrigin).toString();
                 for (const pseudoUrlInput of input.pseudoUrls) {
                     if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
                         const newUrl = new URL(href, request.loadedUrl).toString();
@@ -83,6 +89,7 @@ export async function handlePage(
                     }
                 }
             });
+            console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`);
             await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue));
         }
     }
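
For context, the resolved absolute href is what gets tested against crawlee's PseudoUrl (already imported in the Cheerio checker; the browser checkers add the import below). A hedged sketch of the matching behaviour, with a hypothetical purl in the Apify [regex] syntax:

    import { PseudoUrl } from 'crawlee';

    // Hypothetical purl; the real ones come from input.pseudoUrls in the Actor input.
    const purl = new PseudoUrl('https://example.com/[.*]');

    purl.matches('/category/shoes');                    // false - a relative href never matches
    purl.matches('https://example.com/category/shoes'); // true  - hence the resolution against pageOrigin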

checker-playwright/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,4 +6,5 @@ dist
 node_modules
 apify_storage
 storage
+storage
 storage

checker-playwright/src/handlePage.ts

Lines changed: 30 additions & 20 deletions
@@ -1,8 +1,9 @@
 import { Actor } from 'apify';
 import Cheerio from 'cheerio';
 
+import { PseudoUrl } from 'crawlee';
 import type { RequestQueue } from 'apify';
-import type { PlaywrightCrawlingContext, PseudoUrlInput } from 'crawlee';
+import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
 
 import { testHtml } from './checkers.js';
 
@@ -14,8 +15,8 @@ export async function handlePage(
     input: PlaywrightActorInput,
     requestQueue: RequestQueue,
     state: ActorCheckDetailedOutput,
-    { request, response, page, enqueueLinks }: PlaywrightCrawlingContext,
-) {
+    { request, response, page, crawler }: PlaywrightCrawlingContext,
+): Promise<void> {
     let htmlUrl;
     let screenshotUrl;
 
@@ -81,26 +82,35 @@
         wasSuccess,
     });
 
-    if (input.linkSelector) {
+    const pageOrigin = new URL(request.url).origin;
+
+    if (input.linkSelector && !!$) {
         const info = await requestQueue.getInfo();
 
-        // Only queue up more requests in the queue if we should (this should avoid excessive queue writes)
-        if (input.maxNumberOfPagesCheckedPerDomain > info!.totalRequestCount) {
-            await enqueueLinks({
-                selector: input.linkSelector,
-                pseudoUrls: input.pseudoUrls.map(
-                    (req) => ({
-                        purl: req.purl,
-                        url: request.url,
-                        headers: req.headers,
-                        method: req.method,
-                        payload: req.payload,
-                        userData: req.userData,
-                    }) as PseudoUrlInput,
-                ),
-                requestQueue,
-                baseUrl: request.loadedUrl,
+        const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
+        if (maxUrlsToEnqueue > 0) {
+            const toEnqueue: RequestOptions[] = [];
+            $(input.linkSelector).each((_, el) => {
+                const rawHref = $(el).attr('href');
+                if (!rawHref) {
+                    return;
+                }
+                const href = new URL(rawHref, pageOrigin).toString();
+                for (const pseudoUrlInput of input.pseudoUrls) {
+                    if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
+                        const newUrl = new URL(href, request.loadedUrl).toString();
+                        toEnqueue.push({
+                            url: newUrl,
+                            headers: pseudoUrlInput.headers,
+                            method: pseudoUrlInput.method as 'GET' | 'POST',
+                            payload: pseudoUrlInput.payload,
+                            userData: pseudoUrlInput.userData,
+                        });
+                    }
+                }
             });
+            console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`);
+            await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue));
         }
     }
 }
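
As in the Cheerio checker, enqueueLinks is replaced by manually collected RequestOptions so the number of newly added requests can be capped against what is already in the queue. A condensed sketch of that cap, assuming requestQueue, input, toEnqueue, request, and crawler as in the diff above:

    const info = await requestQueue.getInfo();
    // Only enqueue up to the configured per-domain maximum, counting requests already in the queue.
    const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
    if (maxUrlsToEnqueue > 0) {
        console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`);
        await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue));
    }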

checker-puppeteer/src/handlePage.ts

Lines changed: 30 additions & 21 deletions
@@ -1,19 +1,19 @@
 import { Actor } from 'apify';
 import Cheerio from 'cheerio';
 import { testHtml } from './checkers.js';
-import { puppeteerUtils } from 'crawlee';
+import { puppeteerUtils, PseudoUrl } from 'crawlee';
 
 import type { RequestQueue } from 'apify';
-import type { PuppeteerCrawlingContext, PseudoUrlInput } from 'crawlee';
+import type { PuppeteerCrawlingContext, RequestOptions } from 'crawlee';
 
 import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';
 
 export async function handlePage(
     input: PuppeteerActorInput,
     requestQueue: RequestQueue,
     state: ActorCheckDetailedOutput,
-    { request, response, page, enqueueLinks }: PuppeteerCrawlingContext
-) {
+    { request, response, page, crawler }: PuppeteerCrawlingContext
+): Promise<void> {
     let htmlUrl;
     let screenshotUrl;
 
@@ -73,26 +73,35 @@
         wasSuccess,
     });
 
-    if (input.linkSelector) {
+    const pageOrigin = new URL(request.url).origin;
+
+    if (input.linkSelector && !!$) {
         const info = await requestQueue.getInfo();
 
-        // Only queue up more requests in the queue if we should (this should avoid excessive queue writes)
-        if (input.maxNumberOfPagesCheckedPerDomain > info!.totalRequestCount) {
-            await enqueueLinks({
-                selector: input.linkSelector,
-                pseudoUrls: input.pseudoUrls.map(
-                    (req) => ({
-                        purl: req.purl,
-                        url: request.url,
-                        headers: req.headers,
-                        method: req.method,
-                        payload: req.payload,
-                        userData: req.userData,
-                    }) as PseudoUrlInput,
-                ),
-                requestQueue,
-                baseUrl: request.loadedUrl,
+        const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
+        if (maxUrlsToEnqueue > 0) {
+            const toEnqueue: RequestOptions[] = [];
+            $(input.linkSelector).each((_, el) => {
+                const rawHref = $(el).attr('href');
+                if (!rawHref) {
+                    return;
+                }
+                const href = new URL(rawHref, pageOrigin).toString();
+                for (const pseudoUrlInput of input.pseudoUrls) {
+                    if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
+                        const newUrl = new URL(href, request.loadedUrl).toString();
+                        toEnqueue.push({
+                            url: newUrl,
+                            headers: pseudoUrlInput.headers,
+                            method: pseudoUrlInput.method as 'GET' | 'POST',
+                            payload: pseudoUrlInput.payload,
+                            userData: pseudoUrlInput.userData,
+                        });
+                    }
+                }
             });
+            console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`);
+            await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue));
         }
     }
 }
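
The browser-based checkers run the same selector extraction over a Cheerio handle ($) of the rendered page rather than over the live DOM. A hedged sketch of how such a handle could be produced, assuming handlePage loads the rendered HTML into Cheerio outside the changed hunks:

    // Assumes `page` is the Playwright/Puppeteer page from the crawling context.
    const html = await page.content();
    const $ = Cheerio.load(html);

    // The extraction from the diff above then works on the static HTML snapshot.
    $('a[href]').each((_, el) => {
        const rawHref = $(el).attr('href');
        // ...
    });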
