
Commit 14b14ee

feat: simplify repo (#28)
* fix: simplify repo structure by removing resources
* fix: simplify functions
* fix: update readme
* fix: remove unused property
1 parent 73a8ce5 commit 14b14ee

15 files changed (+128, -244 lines)


README.md

Lines changed: 6 additions & 12 deletions
@@ -13,7 +13,6 @@ Use this node to run the Website Content Crawler, configure its input, and fetch
 - [Credentials](#credentials)
 - [Compatibility](#compatibility)
 - [Usage](#usage)
-- [Resources](#resources)
 - [Release](#releasing-a-new-version)
 - [Version History](#version-history)
 - [Troubleshooting](#troubleshooting)
@@ -54,10 +53,10 @@ In the same shell or Docker environment where n8n runs, export the `WEBHOOK_URL`
 
 ![operations](./docs/operations.png)
 
-This node integrates the Apify Website Content Crawler Actor and supports running the Apify Actor with custom input.
+This node provides a single operation to run the Apify Website Content Crawler Actor with custom input parameters.
 
 ### Run Crawler
-Execute a Website Content Crawler with optional input parameters
+Execute the Website Content Crawler with optional input parameters to crawl websites and extract text content
 
 ## Credentials
 
@@ -88,8 +87,7 @@ You can use this node in various workflows. It is especially useful for extracti
 
 ![workflow](./docs/uc.png)
 
-## Resources
-
+For more information, see:
 - [Apify Website Content Crawler](https://apify.com/apify/website-content-crawler)
 - [n8n Community Nodes Documentation](https://docs.n8n.io/integrations/community-nodes/)
 - [Apify API Documentation](https://docs.apify.com)
@@ -209,14 +207,10 @@ Track changes and updates to the node here.
 
 1. **Authentication errors**
    - Verify your API key is correct
+   - Ensure your Apify account has access to the Website Content Crawler
 
-2. **Resource Not Found**
-   - Verify the resource ID format
-   - Check if the resource exists in your Apify account
-   - Ensure you have access to the resource
-
-3. **Operation failures**
-   - Check the input parameters
+2. **Crawler execution failures**
+   - Check the input parameters (URLs, crawler settings)
    - Verify resource limits (memory, timeout)
    - Review Apify Console for detailed error messages
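
As a point of reference for the single operation described above, here is a minimal sketch of the equivalent direct call to the Apify API v2 "Run Actor" endpoint. This is illustrative only — the node routes the request through its `apiRequest` helper and credential handling, and the input values are placeholders (the field names match the `buildActorInput` helper added in this commit):

// Sketch (TypeScript, Node 18+): start the Website Content Crawler Actor
// (ID aYG0l9s7dbB7j3gbS, per this commit) and return the run metadata.
async function startCrawl(token: string) {
  const resp = await fetch(
    `https://api.apify.com/v2/acts/aYG0l9s7dbB7j3gbS/runs?token=${token}&waitForFinish=0`,
    {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // Illustrative input; these fields mirror what buildActorInput produces:
      body: JSON.stringify({
        startUrls: [{ url: 'https://docs.apify.com/academy/web-scraping-for-beginners', method: 'GET' }],
        crawlerType: 'playwright:adaptive',
        useSitemaps: false,
        maxCrawlDepth: 20,
        maxCrawlPages: 9999,
      }),
    },
  );
  return (await resp.json()).data; // run object; data.id is what gets polled later
}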

nodes/ApifyContentCrawler/ApifyContentCrawler.methods.ts

Lines changed: 0 additions & 3 deletions
This file was deleted.

nodes/ApifyContentCrawler/ApifyContentCrawler.node.ts

Lines changed: 2 additions & 5 deletions
@@ -6,8 +6,7 @@ import {
   NodeConnectionType,
 } from 'n8n-workflow';
 import { properties } from './ApifyContentCrawler.properties';
-import { methods } from './ApifyContentCrawler.methods';
-import { actorsRouter } from './resources/actors/router';
+import { runActor } from './helpers/executeActor';
 
 export const ACTOR_ID = 'aYG0l9s7dbB7j3gbS';
 
@@ -58,15 +57,13 @@ export class ApifyContentCrawler implements INodeType {
     properties,
   };
 
-  methods = methods;
-
   async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
     const items = this.getInputData();
     const returnData: INodeExecutionData[] = [];
 
     for (let i = 0; i < items.length; i++) {
       try {
-        const data = await actorsRouter.call(this, i);
+        const data = await runActor.call(this, i);
 
         const addPairedItem = (item: INodeExecutionData) => ({
           ...item,

nodes/ApifyContentCrawler/ApifyContentCrawler.properties.ts

Lines changed: 110 additions & 3 deletions
@@ -1,5 +1,4 @@
-import { INodeProperties } from 'n8n-workflow';
-import { properties as resources } from './resources';
+import { IExecuteFunctions, INodeProperties } from 'n8n-workflow';
 
 const authenticationProperties: INodeProperties[] = [
   {
@@ -21,4 +20,112 @@ const authenticationProperties: INodeProperties[] = [
   },
 ];
 
-export const properties: INodeProperties[] = [...resources, ...authenticationProperties];
+export const actorProperties: INodeProperties[] = [
+  {
+    displayName: 'Start URLs',
+    name: 'entries',
+    type: 'fixedCollection',
+    typeOptions: { multipleValues: true },
+    default: {
+      entry: [
+        {
+          value: 'https://docs.apify.com/academy/web-scraping-for-beginners',
+        },
+      ],
+    },
+    description:
+      'One or more URLs of pages where the crawler will start. By default, the Actor will also crawl sub-pages of these URLs. For example, for start URL `https://example.com/blog`, it will crawl also `https://example.com/blog/post` or `https://example.com/blog/article`. The **Include URLs (globs)** option overrides this automation behavior.',
+    placeholder: 'Add URL',
+    options: [
+      {
+        name: 'entry',
+        displayName: 'Url',
+        values: [{ displayName: 'Url', name: 'value', type: 'string', default: '' }],
+      },
+    ],
+  },
+  {
+    displayName: 'Consider URLs From Sitemaps',
+    name: 'sitemapUrlsEnabled',
+    type: 'boolean',
+    default: false,
+    description:
+      'Whether the crawler should look for [Sitemaps](https://en.wikipedia.org/wiki/Sitemaps) at the domains of the provided *Start URLs* and enqueue matching URLs similarly to links found on crawled pages. You can also reference a `sitemap.xml` file directly by adding it as another Start URL (e.g., `https://www.example.com/sitemap.xml`). This feature makes the crawling more robust on websites that support Sitemaps, as it includes pages that might not be reachable from Start URLs. Note that if a page is found via a Sitemap, it will have depth 1.',
+  },
+  {
+    displayName: 'Crawler Type',
+    name: 'crawlerType',
+    type: 'options',
+    default: 'playwright:adaptive',
+    options: [
+      {
+        name: 'Adaptive Switching Between Browser and Raw HTTP - Fast and Renders JavaScript Content if Present. This Is the Recommended Option.',
+        value: 'playwright:adaptive',
+      },
+      {
+        name: 'Headless Browser (Firefox+Playwright) - Reliable, Renders JavaScript Content, Best in Avoiding Blocking, but Might Be Slow.',
+        value: 'playwright:firefox',
+      },
+      {
+        name: "Raw HTTP Client (Cheerio) - Fastest, but Doesn't Render JavaScript Content.",
+        value: 'cheerio',
+      },
+    ],
+    description:
+      'Adaptive switching between browser and raw HTTP (default) - Fast and renders JavaScript content if present. This is the recommended option.\nHeadless web browser with Firefox and Playwright - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions.\nRaw HTTP client - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.',
+  },
+  {
+    displayName: 'Max Crawling Depth',
+    name: 'maxDepth',
+    type: 'number',
+    default: 20,
+    typeOptions: {
+      minValue: 0,
+    },
+    description:
+      'The maximum number of links starting from the start URL that the crawler will recursively follow. The start URLs have depth `0`, the pages linked directly from the start URLs have depth `1`, and so on. This setting is useful to prevent accidental crawler runaway. By setting it to `0`, the Actor will only crawl the Start URLs.',
+  },
+  {
+    displayName: 'Max Pages',
+    name: 'maxPages',
+    type: 'number',
+    default: 9999,
+    description:
+      'The maximum number of pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.',
+  },
+];
+
+export const properties: INodeProperties[] = [...actorProperties, ...authenticationProperties];
+
+export function buildActorInput(
+  this: IExecuteFunctions,
+  i: number,
+  defaultInput: Record<string, any>,
+): Record<string, any> {
+  const entries = this.getNodeParameter('entries', i, {}) as {
+    entry?: { value: string }[];
+  };
+  const crawlerType = this.getNodeParameter('crawlerType', i) as string;
+  const sitemapUrlsEnabled = this.getNodeParameter('sitemapUrlsEnabled', i) as boolean;
+  const maxDepth = this.getNodeParameter('maxDepth', i) as number;
+  const maxPages = this.getNodeParameter('maxPages', i) as number;
+
+  const mergedInput: Record<string, any> = {
+    ...defaultInput,
+    crawlerType,
+    useSitemaps: sitemapUrlsEnabled,
+    maxCrawlDepth: maxDepth,
+    maxCrawlPages: maxPages,
+  };
+
+  delete mergedInput.startUrls;
+  if (entries?.entry?.length) {
+    mergedInput.startUrls = entries.entry.map((e) => ({
+      url: e.value,
+      method: 'GET',
+    }));
+  }
+
+  return mergedInput;
+}
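
To make the merge behavior concrete, here is a worked example of what `buildActorInput` returns. The parameter values and the `saveMarkdown` default are hypothetical; only the key renames and the start-URL replacement come from the code above:

// Assume the user set: one start URL, crawlerType 'cheerio', sitemapUrlsEnabled true,
// maxDepth 2, maxPages 50, and the Actor's default build supplied
// defaultInput = { startUrls: [/* Actor defaults */], saveMarkdown: true } (hypothetical field).
const expectedMergedInput: Record<string, any> = {
  saveMarkdown: true,   // unrelated defaults are carried over by the spread of defaultInput
  crawlerType: 'cheerio',
  useSitemaps: true,    // node parameter sitemapUrlsEnabled, renamed for the Actor input
  maxCrawlDepth: 2,     // node parameter maxDepth
  maxCrawlPages: 50,    // node parameter maxPages
  // the default startUrls are always deleted first, then replaced only if the user added entries:
  startUrls: [{ url: 'https://example.com/blog', method: 'GET' }],
};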

nodes/ApifyContentCrawler/resources/executeActor.ts renamed to nodes/ApifyContentCrawler/helpers/executeActor.ts

Lines changed: 10 additions & 6 deletions
@@ -1,5 +1,7 @@
 import { IExecuteFunctions, INodeExecutionData, NodeApiError } from 'n8n-workflow';
 import { apiRequest, getResults, pollRunStatus } from './genericFunctions';
+import { ACTOR_ID } from '../ApifyContentCrawler.node';
+import { buildActorInput } from '../ApifyContentCrawler.properties';
 
 export async function getDefaultBuild(this: IExecuteFunctions, actorId: string) {
   const defaultBuildResp = await apiRequest.call(this, {
@@ -47,12 +49,13 @@ export async function runActorApi(
   });
 }
 
-export async function executeActorRunFlow(
-  this: IExecuteFunctions,
-  actorId: string,
-  mergedInput: Record<string, any>,
-): Promise<INodeExecutionData> {
-  const run = await runActorApi.call(this, actorId, mergedInput, { waitForFinish: 0 });
+export async function runActor(this: IExecuteFunctions, i: number): Promise<INodeExecutionData> {
+  const build = await getDefaultBuild.call(this, ACTOR_ID);
+  const defaultInput = getDefaultInputsFromBuild(build);
+
+  const mergedInput = buildActorInput.call(this, i, defaultInput);
+
+  const run = await runActorApi.call(this, ACTOR_ID, mergedInput, { waitForFinish: 0 });
   if (!run?.data?.id) {
     throw new NodeApiError(this.getNode(), {
       message: `Run ID not found after running the actor`,
@@ -67,3 +70,4 @@ export async function executeActorRunFlow(
 
   return await getResults.call(this, datasetId);
 }
+
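
The renamed `runActor` keeps the existing poll-then-fetch flow (`pollRunStatus`, then `getResults`). As a rough standalone sketch of that pattern against the standard Apify API v2 endpoints — simplified, with a fixed 2-second interval and no timeout or retry handling:

// Sketch: wait for an Actor run to finish, then fetch its dataset items.
async function waitForRunAndFetchItems(runId: string, token: string) {
  let run = { status: 'RUNNING', defaultDatasetId: '' };
  do {
    await new Promise((resolve) => setTimeout(resolve, 2000)); // poll every 2 s
    const resp = await fetch(`https://api.apify.com/v2/actor-runs/${runId}?token=${token}`);
    run = (await resp.json()).data;
  } while (run.status === 'READY' || run.status === 'RUNNING');

  if (run.status !== 'SUCCEEDED') {
    throw new Error(`Actor run finished with status ${run.status}`);
  }
  // The crawler writes one item per crawled page into the run's default dataset.
  const itemsResp = await fetch(
    `https://api.apify.com/v2/datasets/${run.defaultDatasetId}/items?token=${token}`,
  );
  return await itemsResp.json();
}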

nodes/ApifyContentCrawler/resources/genericFunctions.ts renamed to nodes/ApifyContentCrawler/helpers/genericFunctions.ts

File renamed without changes.

Lines changed: 0 additions & 1 deletion
@@ -1,3 +1,2 @@
 export * as hooks from './hooks';
-export * as methods from './methods';
 export * as consts from './consts';

nodes/ApifyContentCrawler/helpers/methods.ts

Lines changed: 0 additions & 17 deletions
This file was deleted.

nodes/ApifyContentCrawler/resources/actors/hooks.ts

Lines changed: 0 additions & 11 deletions
This file was deleted.

nodes/ApifyContentCrawler/resources/actors/index.ts

Lines changed: 0 additions & 28 deletions
This file was deleted.
