Skip to content

Commit cf72d70

Browse files
janbuchar and B4nan authored
test: Add end-to-end test of AdaptivePlaywrightCrawler (#3003)
- closes #2350

---------

Co-authored-by: Martin Adámek <banan23@gmail.com>
1 parent 5c4726d commit cf72d70

File tree

6 files changed

+140
-0
lines changed

6 files changed

+140
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"actorSpecification": 1,
3+
"name": "test-adaptive-playwright-default",
4+
"version": "0.0",
5+
"buildTag": "latest",
6+
"env": null
7+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
.idea
2+
.DS_Store
3+
node_modules
4+
package-lock.json
5+
apify_storage
6+
crawlee_storage
7+
storage
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
# Stage 1: install the production dependencies (including the local
# `file:./packages/*` workspaces referenced from package.json).
FROM node:22 AS builder

COPY /packages ./packages
COPY /package*.json ./
# `--omit=dev --omit=optional` is the current spelling of the deprecated
# `--only=prod --no-optional` flags.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional --no-audit \
    && npm update

# Stage 2: the runtime image that actually executes the actor.
FROM apify/actor-node-playwright-chrome:22-beta

# Replace the image's node_modules with the tree installed in the builder stage.
RUN rm -r node_modules
COPY --from=builder /node_modules ./node_modules
COPY --from=builder /packages ./packages
COPY --from=builder /package*.json ./
COPY /.actor ./.actor
COPY /main.js ./

# Print the installed packages and tool versions into the build log.
# `npm list` is allowed to fail so that it cannot break the build.
RUN echo "Installed NPM packages:" \
    && (npm list --omit=dev --omit=optional --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import { Actor } from 'apify';
2+
import { AdaptivePlaywrightCrawler } from '@crawlee/playwright';
3+
import { LogLevel } from '@apify/log';
4+
// Use the local storage implementation only when explicitly requested through
// the STORAGE_IMPLEMENTATION environment variable; otherwise pass `undefined`.
let storage;
if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
    const storageLocal = await import('@apify/storage-local');
    storage = new storageLocal.ApifyStorageLocal();
}

await Actor.init({ storage });

/**
 * Result comparator that ignores `requestHandlerMode`.
 *
 * Two results are 'equal' only when each holds exactly one dataset item and
 * those items share the same `url` and `heading`; anything else is 'different'.
 */
const resultComparator = (resultA, resultB) => {
    const itemsA = resultA.datasetItems;
    const itemsB = resultB.datasetItems;

    if (itemsA.length !== 1 || itemsB.length !== 1) {
        return 'different';
    }

    const itemA = itemsA[0].item;
    const itemB = itemsB[0].item;

    return itemA.url === itemB.url && itemA.heading === itemB.heading ? 'equal' : 'different';
};

/**
 * Reports 'browser' when `context.page.title()` resolves and 'httpOnly'
 * when touching the page throws.
 */
const detectRequestHandlerMode = async (context) => {
    try {
        await context.page.title();
    } catch {
        return 'httpOnly';
    }

    return 'browser';
};

/**
 * Stores the page URL, its first `h1` text and the detected handler mode,
 * then enqueues the links matching the examples glob.
 */
const requestHandler = async (context) => {
    const { url } = context.request;

    const headingElement = await context.querySelector('h1');
    const heading = headingElement.text();

    const requestHandlerMode = await detectRequestHandlerMode(context);

    await context.pushData({ url, heading, requestHandlerMode });

    await context.enqueueLinks({
        globs: ['**/next/examples/*'],
    });
};

const crawler = new AdaptivePlaywrightCrawler({ resultComparator, requestHandler });

crawler.log.setLevel(LogLevel.DEBUG);

const startUrls = ['https://crawlee.dev/js/docs/next/examples/accept-user-input'];
await crawler.run(startUrls);

await Actor.exit({ exit: Actor.isAtHome() });
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"name": "test-adaptive-playwright-default",
3+
"version": "0.0.1",
4+
"description": "Adaptive Playwright Test - Default",
5+
"dependencies": {
6+
"apify": "next",
7+
"@apify/storage-local": "^2.1.3",
8+
"@crawlee/basic": "file:./packages/basic-crawler",
9+
"@crawlee/browser": "file:./packages/browser-crawler",
10+
"@crawlee/browser-pool": "file:./packages/browser-pool",
11+
"@crawlee/core": "file:./packages/core",
12+
"@crawlee/memory-storage": "file:./packages/memory-storage",
13+
"@crawlee/playwright": "file:./packages/playwright-crawler",
14+
"@crawlee/types": "file:./packages/types",
15+
"@crawlee/utils": "file:./packages/utils",
16+
"playwright": "*"
17+
},
18+
"overrides": {
19+
"apify": {
20+
"@crawlee/core": "file:./packages/core",
21+
"@crawlee/utils": "file:./packages/utils"
22+
},
23+
"@apify/storage-local": {
24+
"better-sqlite3": "^11.10.0"
25+
}
26+
},
27+
"scripts": {
28+
"start": "node main.js"
29+
},
30+
"type": "module",
31+
"license": "ISC"
32+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import { initialize, getActorTestDir, runActor, expect, validateDataset } from '../tools.mjs';
2+
// Resolve the directory of the actor under test from this file's URL and prepare it.
const actorDir = getActorTestDir(import.meta.url);
await initialize(actorDir);

// NOTE(review): 16384 is presumably the memory limit in MB — confirm against tools.mjs.
const { datasetItems } = await runActor(actorDir, 16384);

/** Counts the dataset items that were produced in the given request handler mode. */
const countItemsInMode = (mode) => datasetItems.filter((item) => item.requestHandlerMode === mode).length;

await expect(datasetItems.length > 15, 'Number of dataset items');

const expectedFields = ['url', 'heading', 'requestHandlerMode'];
await expect(validateDataset(datasetItems, expectedFields), 'Dataset items validation');

await expect(countItemsInMode('browser') >= 1, 'The crawler should handle at least one request in the browser');

await expect(countItemsInMode('httpOnly') >= 5, 'The crawler should handle some requests in http-only mode');

0 commit comments

Comments
 (0)