Skip to content

Commit b3601a5

Browse files
drobnikjfnesveda
andauthored
feat: Added tutorial to scraper website from multiple Actor runs (#918)
* Added tutorial about how to scrape a single website from multiple Actor runs * Added example code under `./examples` I kept it as short as possible while still covering all the necessary problems. Of course, it is not a production-ready solution, but it should give the reader an idea of how to create a scraper running from multiple Actor runs. --------- Co-authored-by: František Nesveda <[email protected]>
1 parent e647387 commit b3601a5

File tree

19 files changed

+637
-4
lines changed

19 files changed

+637
-4
lines changed

.eslintrc.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,6 @@
6666
"@typescript-eslint/promise-function-async": "off"
6767
}
6868
}
69-
]
69+
],
70+
"ignorePatterns": ["examples/"]
7071
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image.
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"actorSpecification": 1,
3+
"name": "ts-parallel-scraping-orchestrator",
4+
"title": "Orchestrator Actor in TypeScript",
5+
"description": "",
6+
"version": "0.0",
7+
"meta": {
8+
"templateId": "ts-crawlee-cheerio"
9+
},
10+
"input": "./input_schema.json",
11+
"dockerfile": "./Dockerfile"
12+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"title": "Orchestrator Actor Input",
3+
"type": "object",
4+
"schemaVersion": 1,
5+
"properties": {
6+
"parallelRunsCount": {
7+
"title": "Parallel Actor runs count",
8+
"type": "integer",
9+
"description": "Number of parallel runs of the Actor.",
10+
"default": 1
11+
},
12+
"targetActorId": {
13+
"title": "Actor ID",
14+
"type": "string",
15+
"editor": "textfield",
16+
"description": "ID of the Actor to run."
17+
},
18+
"targetActorInput": {
19+
"title": "Actor Input",
20+
"type": "object",
21+
"description": "Input of the Actor to run.",
22+
"editor": "json",
23+
"prefill": {}
24+
},
25+
"targetActorRunOptions": {
26+
"title": "Actor Run Options",
27+
"type": "object",
28+
"description": "Options for the Actor run.",
29+
"editor": "json",
30+
"prefill": {}
31+
}
32+
},
33+
"required": ["parallelRunsCount", "targetActorId"]
34+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"name": "ts-parallel-scraping-orchestrator",
3+
"version": "0.0.1",
4+
"type": "module",
5+
"description": "This is a boilerplate of an Apify Actor.",
6+
"engines": {
7+
"node": ">=18.0.0"
8+
},
9+
"dependencies": {
10+
"apify": "^3.1.10",
11+
"crawlee": "^3.5.4"
12+
},
13+
"devDependencies": {
14+
"@apify/eslint-config-ts": "^0.3.0",
15+
"@apify/tsconfig": "^0.1.0",
16+
"@typescript-eslint/eslint-plugin": "^6.7.2",
17+
"@typescript-eslint/parser": "^6.7.2",
18+
"eslint": "^8.50.0",
19+
"tsx": "^4.6.2",
20+
"typescript": "^5.3.3"
21+
},
22+
"scripts": {
23+
"start": "npm run start:dev",
24+
"start:prod": "node dist/main.js",
25+
"start:dev": "tsx src/main.ts",
26+
"build": "tsc",
27+
"lint": "eslint ./src --ext .ts",
28+
"lint:fix": "eslint ./src --ext .ts --fix",
29+
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
30+
},
31+
"author": "It's not you it's me",
32+
"license": "ISC"
33+
}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import { Actor, log } from 'apify';
2+
3+
interface Input {
4+
parallelRunsCount: number;
5+
targetActorId: string;
6+
targetActorInput: Record<string, any>;
7+
targetActorRunOptions: Record<string, any>;
8+
}
9+
10+
interface State {
11+
parallelRunIds: string[];
12+
isInitialized: boolean;
13+
}
14+
15+
await Actor.init();
16+
17+
const {
18+
parallelRunsCount= 1,
19+
targetActorId,
20+
targetActorInput = {},
21+
targetActorRunOptions = {},
22+
} = await Actor.getInput<Input>() ?? {} as Input;
23+
const { apifyClient} = Actor;
24+
25+
if (!targetActorId) throw await Actor.fail('Missing the "targetActorId" input!');
26+
27+
// Get the current run request queue and dataset, we use the default ones.
28+
const requestQueue = await Actor.openRequestQueue();
29+
const dataset = await Actor.openDataset();
30+
31+
// Spawn parallel runs and store their IDs in the state or continue if they are already running.
32+
const state = await Actor.useState<State>('actor-state', { parallelRunIds: [], isInitialized: false });
33+
34+
if (state.isInitialized) {
35+
for (const runId of state.parallelRunIds) {
36+
const runClient = apifyClient.run(runId);
37+
const run = await runClient.get();
38+
39+
// This should happen only if the run was deleted or the state was incorectly saved.
40+
if (!run) throw await Actor.fail(`The run ${runId} from state does not exists.`);
41+
42+
if (run.status === 'RUNNING') {
43+
log.info('Parallel run is already running.', { runId });
44+
} else {
45+
log.info(`Parallel run was in state ${run.status}, resurrecting.`, { runId });
46+
await runClient.resurrect(targetActorRunOptions);
47+
}
48+
}
49+
} else {
50+
for (let i = 0; i < parallelRunsCount; i++) {
51+
const run = await Actor.start(targetActorId, {
52+
...targetActorInput,
53+
datasetId: dataset.id,
54+
requestQueueId: requestQueue.id,
55+
}, targetActorRunOptions);
56+
log.info(`Started parallel run with ID: ${run.id}`, { runId: run.id });
57+
state.parallelRunIds.push(run.id);
58+
}
59+
state.isInitialized = true;
60+
}
61+
62+
const parallelRunPromises = state.parallelRunIds.map((runId) => {
63+
const runClient = apifyClient.run(runId);
64+
return runClient.waitForFinish();
65+
});
66+
67+
// Abort parallel runs if the main run is aborted
68+
Actor.on('aborting', async () => {
69+
for (const runId of state.parallelRunIds) {
70+
log.info('Aborting run', { runId });
71+
await apifyClient.run(runId).abort();
72+
}
73+
});
74+
75+
// Wait for all parallel runs to finish
76+
await Promise.all(parallelRunPromises);
77+
78+
// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
79+
await Actor.exit();
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"extends": "@apify/tsconfig",
3+
"compilerOptions": {
4+
"module": "NodeNext",
5+
"moduleResolution": "NodeNext",
6+
"target": "ES2022",
7+
"outDir": "dist",
8+
"noUnusedLocals": false,
9+
"skipLibCheck": true,
10+
"lib": ["DOM"]
11+
},
12+
"include": [
13+
"./src/**/*"
14+
]
15+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Install all dependencies and build the project.
# Don't audit to speed up the installation.
RUN npm run build

# Create final image
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image.
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"actorSpecification": 1,
3+
"name": "ts-parallel-scraping-scraper",
4+
"title": "Scraper Actor in TypeScript",
5+
"description": "",
6+
"version": "0.0",
7+
"meta": {
8+
"templateId": "ts-crawlee-cheerio"
9+
},
10+
"input": "./input_schema.json",
11+
"dockerfile": "./Dockerfile"
12+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"title": "Scraper Actor Input",
3+
"type": "object",
4+
"schemaVersion": 1,
5+
"properties": {
6+
"requestQueueId": {
7+
"title": "Request Queue ID",
8+
"type": "string",
9+
"editor": "textfield",
10+
"description": "Request queue to use in scraper."
11+
},
12+
"datasetId": {
13+
"title": "Dataset ID",
14+
"type": "string",
15+
"editor": "textfield",
16+
"description": "Dataset to use in scraper."
17+
}
18+
},
19+
"required": ["requestQueueId", "datasetId"]
20+
}

0 commit comments

Comments
 (0)