Skip to content

Commit b6337d3

Browse files
committed
chore: requested changes - Move to BasicCrawler
1 parent 778d416 commit b6337d3

File tree

5 files changed

+66
-95
lines changed

5 files changed

+66
-95
lines changed

starter/src/index.js

Lines changed: 55 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import Apify from 'apify';
22
import { inspect } from 'util';
33
import { convertInputToActorConfigs } from './lib/configs.js';
4-
import { revivePendingConfigs } from './lib/revivePendingConfigs.js';
5-
import { waitForRunToFinish, startRun } from './lib/startRunAndPool.js';
4+
import { waitForRunToFinishAndPushData, startRun } from './lib/startRunAndPool.js';
65

76
const { log, sleep } = Apify.utils;
87
const env = Apify.getEnv();
@@ -11,6 +10,8 @@ Apify.main(async () => {
1110
/** @type {import('../../common/types').ActorInputData} */
1211
// @ts-ignore It's not null
1312
const input = await Apify.getInput();
13+
log.debug('Provided inputs:');
14+
log.debug(inspect(input));
1415

1516
const { maxConcurrentDomainsChecked, urlsToCheck } = input;
1617

@@ -21,70 +22,82 @@ Apify.main(async () => {
2122
/** @type {import('./types').FrontendActorState} */
2223
// @ts-expect-error It's an object
2324
const state = await Apify.getValue('STATE') ?? {
24-
preparedConfigs: [],
25-
pendingConfigs: [],
26-
totalUrls: 0,
25+
runConfigurations: [],
26+
totalUrls: urlsToCheck.length,
27+
checkerFinished: false,
2728
};
2829

2930
Apify.events.on('migrating', async () => {
3031
await Apify.setValue('STATE', state);
3132
});
3233

34+
Apify.events.on('persistState', async () => {
35+
await Apify.setValue('STATE', state);
36+
});
37+
3338
setInterval(async () => {
3439
await Apify.setValue('STATE', state);
3540

3641
log.debug('Internal state:');
3742
log.debug(inspect(state, false, 3));
3843
}, 10_000);
3944

40-
state.preparedConfigs = convertInputToActorConfigs(input);
41-
state.totalUrls = urlsToCheck.length;
42-
43-
log.info(`Preparing to process ${state.totalUrls} URLs...\n`);
44-
45-
// Check for revivals first, in the event the actor crashed, and handle those to the end
46-
await revivePendingConfigs(state);
47-
48-
while (true) {
49-
// Each element of domainsToCheck represents a URL with its own run configurations
50-
const domainsToCheck = state.preparedConfigs.splice(0, maxConcurrentDomainsChecked);
51-
// If we got no more URLs to run, exit the loop
52-
if (domainsToCheck.length === 0) break;
53-
54-
log.info(`Starting a batch of ${domainsToCheck.length} URLs to check`);
45+
// If we haven't initialized the state yet, do it now
46+
if (state.runConfigurations.length === 0 && !state.checkerFinished) {
47+
state.runConfigurations = convertInputToActorConfigs(input);
48+
}
5549

56-
state.pendingConfigs = domainsToCheck;
57-
// Save the state right off the bat, in the event the actor dies right after
58-
await Apify.setValue('STATE', state);
50+
// Sort the run configurations so that already-started runs (those with a runId) come first
51+
state.runConfigurations = state.runConfigurations.sort((_, b) => Number(Boolean(b.runId)));
52+
await Apify.setValue('STATE', state);
5953

60-
const promises = [];
54+
log.info(`Preparing to process ${state.totalUrls} URLs...\n`);
6155

62-
for (const domainRunConfigs of domainsToCheck) {
63-
for (const run of domainRunConfigs) {
64-
const result = await startRun(run);
56+
/** @type {import('apify').RequestOptions[]} */
57+
const sources = state.runConfigurations.map((actorInput, index) => ({
58+
url: 'https://localhost',
59+
uniqueKey: index.toString(),
60+
userData: { actorInput },
61+
}));
62+
63+
const requestList = await Apify.openRequestList(null, sources);
64+
65+
const runner = new Apify.BasicCrawler({
66+
maxConcurrency: maxConcurrentDomainsChecked,
67+
requestList,
68+
handleRequestFunction: async ({ request }) => {
69+
const { uniqueKey, userData } = request;
70+
/** @type {{ actorInput: import('../../common/types').PreparedActorConfig }} */
71+
// @ts-expect-error JS-style casting
72+
const { actorInput } = userData;
73+
74+
if (actorInput.runId) {
75+
log.info(`Found run ${actorInput.runId} with actor ${actorInput.actorId} for URL "${actorInput.url}" - waiting for it to finish.`);
76+
log.info(`You can monitor the status of the run by going to https://console.apify.com/actors/runs/${actorInput.runId}`);
77+
} else {
78+
const result = await startRun(actorInput);
6579
log.info(
66-
`Starting run for "${run.url}" with actor ${run.actorId} and ${
67-
run.input.proxyConfiguration.useApifyProxy ? `proxy ${run.proxyUsed ?? 'auto'}` : 'no proxy'
80+
`Starting run for "${actorInput.url}" with actor ${actorInput.actorId} and ${
81+
actorInput.input.proxyConfiguration.useApifyProxy ? `proxy ${actorInput.proxyUsed ?? 'auto'}` : 'no proxy'
6882
}.`,
6983
);
7084
log.info(`You can monitor the status of the run by going to https://console.apify.com/actors/runs/${result.id}`);
71-
run.runId = result.id;
72-
await Apify.setValue('STATE', state);
73-
74-
// Start polling the run for its results
75-
promises.push(waitForRunToFinish(run, result.id));
76-
77-
// Wait a second to not overload the platform
78-
await sleep(1000);
85+
actorInput.runId = result.id;
86+
// TODO(vladfrangu): remove this once I confirm the value is updated, so we don't restart runs for no reason
87+
console.log(state.runConfigurations[Number(uniqueKey)]);
7988
}
80-
}
8189

82-
// Await all runs to finish before continuing
83-
await Promise.allSettled(promises);
84-
}
90+
// Wait for the run to finish
91+
await waitForRunToFinishAndPushData(actorInput);
92+
},
93+
});
94+
95+
// Run the checker
96+
await runner.run();
8597

8698
// Mark the state as done so that a resurrected run does not repeat requests it doesn't have to do
87-
state.preparedConfigs = [];
99+
state.runConfigurations = [];
100+
state.checkerFinished = true;
88101
await Apify.setValue('STATE', state);
89102

90103
log.info(`\nChecking ${state.totalUrls} URLs completed!`);

starter/src/lib/configs.js

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,43 @@ import { ACTOR_CHEERIO_CHECKER_NAME, ACTOR_PLAYWRIGHT_CHECKER_NAME, ACTOR_PUPPET
22

33
/** @param {import('../../../common/types').ActorInputData} input */
44
export function convertInputToActorConfigs(input) {
5-
/** @type {import('../../../common/types').PreparedActorConfig[][]} */
5+
/** @type {import('../../../common/types').PreparedActorConfig[]} */
66
const configs = [];
77

88
for (const urlData of input.urlsToCheck) {
9-
/** @type {import('../../../common/types').PreparedActorConfig[]} */
10-
const urlConfigs = [];
119
if (input['checkers.cheerio']) {
12-
urlConfigs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_CHEERIO_CHECKER_NAME }));
10+
configs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_CHEERIO_CHECKER_NAME }));
1311
}
1412
if (input['checkers.puppeteer']) {
15-
urlConfigs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_PUPPETEER_CHECKER_NAME }));
13+
configs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_PUPPETEER_CHECKER_NAME }));
1614
}
1715
if (input['checkers.playwright']) {
1816
// Create a run config for each playwright browser
1917
if (input['playwright.chrome']) {
20-
urlConfigs.push(...createActorRunConfigForCrawler({
18+
configs.push(...createActorRunConfigForCrawler({
2119
input,
2220
urlData,
2321
checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME,
2422
playwrightBrowser: 'chrome',
2523
}));
2624
}
2725
if (input['playwright.firefox']) {
28-
urlConfigs.push(...createActorRunConfigForCrawler({
26+
configs.push(...createActorRunConfigForCrawler({
2927
input,
3028
urlData,
3129
checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME,
3230
playwrightBrowser: 'firefox',
3331
}));
3432
}
3533
if (input['playwright.webkit']) {
36-
urlConfigs.push(...createActorRunConfigForCrawler({
34+
configs.push(...createActorRunConfigForCrawler({
3735
input,
3836
urlData,
3937
checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME,
4038
playwrightBrowser: 'webkit',
4139
}));
4240
}
4341
}
44-
45-
configs.push(urlConfigs);
4642
}
4743

4844
return configs;
@@ -91,6 +87,6 @@ function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwright
9187
config.input['playwright.waitFor'] = input['playwright.waitFor'];
9288
}
9389

94-
yield (config);
90+
yield config;
9591
}
9692
}

starter/src/lib/revivePendingConfigs.js

Lines changed: 0 additions & 37 deletions
This file was deleted.

starter/src/lib/startRunAndPool.js

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,9 @@ export async function startRun(run) {
1414

1515
/**
1616
* @param {import('../../../common/types').PreparedActorConfig} runConfig
17-
* @param {string} runId
1817
*/
19-
export async function waitForRunToFinish(runConfig, runId) {
20-
const run = client.run(runId);
18+
export async function waitForRunToFinishAndPushData(runConfig) {
19+
const run = client.run(runConfig.runId);
2120

2221
const finishedRun = await run.waitForFinish();
2322
const { computeUnits } = finishedRun.stats;

starter/src/types.d.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ import { PreparedActorConfig } from '../../common/types';
22

33
export interface FrontendActorState {
44
totalUrls: number;
5-
preparedConfigs: PreparedActorConfig[][];
6-
pendingConfigs: PreparedActorConfig[][];
5+
runConfigurations: PreparedActorConfig[];
6+
checkerFinished: boolean;
77
}

0 commit comments

Comments
 (0)