
Commit 6a0fe3f

[components] Scrapeless - add new actions (#17086)
* [components] Scrapeless - new actions
  - Introduced a new README.md for Scrapeless, outlining its features and getting-started guide.
  - Implemented multiple Scrapeless actions, including `Crawler`, `Scraping API`, and `Universal Scraping API`.
* fix(scrapeless):
  - Fix lint errors
  - Fix component check error
* Update Scrapeless component to version 0.2.0
  - Use the Node integration server to simplify request logic
* feat(scrapeless): Update descriptions for Scrapeless actions to include documentation links
* feat(scrapeless): Integrate Scrapeless AI SDK and enhance API functionality
  - Added @scrapeless-ai/sdk as a dependency.
  - Updated API endpoints for scraping and crawling functionality.
  - Implemented error handling and job management for scraping tasks.
  - Refactored existing methods to use the new SDK for improved performance and reliability.
* feat(scrapeless): update actions
  - Fix request URLs for `submit-scrape-job` and `get-scrape-result` actions
  - Refactor `submit-scrape-job` input props to align with Scrapeless's official API parameters
  - Fix issue with retrieving additional props asynchronously
* fix(scrapeless): downgrade action versions for consistency
1 parent 43f5a2f commit 6a0fe3f

11 files changed: +2168 -121 lines changed


components/scrapeless/README.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Overview

Scrapeless – your go-to platform for powerful, compliant web data extraction. With tools like the Universal Scraping API, Scrapeless makes it easy to access and gather data from complex sites. Focus on insights while we handle the technical hurdles. Scrapeless – data extraction made simple.

# Example Use Cases

1. **Scraping API**: Endpoints for fresh, structured data from 100+ popular sites.
2. **Universal Scraping API**: Access any website at scale and say goodbye to blocks.
3. **Crawler**: Extract data from single pages or traverse entire domains.

# Getting Started

## Generating an API Key

1. If you are not yet a Scrapeless member, you can sign up for a free account at [Scrapeless](https://app.scrapeless.com/passport/register).
2. Once registered, go to the API Key Management page in the app settings to generate an API Key.
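For orientation, here is a minimal sketch of where that key ends up when using the Scrapeless AI SDK this component depends on. The `Scrapeless` constructor name and `apiKey` option are assumptions about @scrapeless-ai/sdk; check the SDK docs for the exact signature:

```js
// Minimal sketch, assuming @scrapeless-ai/sdk exports a `Scrapeless`
// client class that accepts an `apiKey` option — verify against the SDK docs.
import { Scrapeless } from "@scrapeless-ai/sdk";

const client = new Scrapeless({
  apiKey: process.env.SCRAPELESS_API_KEY, // the key generated in app settings
});
```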
components/scrapeless/actions/crawler/crawler.mjs

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import scrapeless from "../../scrapeless.app.mjs";

export default {
  key: "scrapeless-crawler",
  name: "Crawler",
  description: "Crawl any website at scale and say goodbye to blocks. [See the documentation](https://apidocs.scrapeless.com/api-17509010).",
  version: "0.0.2",
  type: "action",
  props: {
    scrapeless,
    apiServer: {
      type: "string",
      label: "Please select an API server",
      description: "Please select an API server to use",
      default: "crawl",
      options: [
        {
          label: "Crawl",
          value: "crawl",
        },
        {
          label: "Scrape",
          value: "scrape",
        },
      ],
      reloadProps: true,
    },
  },
  async run({ $ }) {
    const {
      scrapeless, apiServer, ...inputProps
    } = this;

    // Browser session options shared by both endpoints
    const browserOptions = {
      "proxy_country": "ANY",
      "session_name": "Crawl",
      "session_recording": true,
      "session_ttl": 900,
    };

    let response;

    if (apiServer === "crawl") {
      response =
        await scrapeless._scrapelessClient().scrapingCrawl.crawl.crawlUrl(inputProps.url, {
          limit: inputProps.limitCrawlPages,
          browserOptions,
        });
    }

    if (apiServer === "scrape") {
      response =
        await scrapeless._scrapelessClient().scrapingCrawl.scrape.scrapeUrl(inputProps.url, {
          browserOptions,
        });
    }

    if (response?.status === "completed" && response?.data) {
      $.export("$summary", `Successfully retrieved crawling results for ${inputProps.url}`);
      return response.data;
    } else {
      throw new Error(response?.error || "Failed to retrieve crawling results");
    }
  },
  // Expose extra input props that depend on the selected API server
  additionalProps() {
    const { apiServer } = this;

    const props = {};

    if (apiServer === "crawl" || apiServer === "scrape") {
      props.url = {
        type: "string",
        label: "URL to Crawl",
        description: "If you want to crawl in batches, please refer to the SDK documentation",
      };
    }

    if (apiServer === "crawl") {
      props.limitCrawlPages = {
        type: "integer",
        label: "Number Of Subpages",
        default: 5,
        description: "Max number of results to return",
      };
    }

    return props;
  },
};
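Outside of Pipedream, the action's "crawl" branch corresponds to a direct SDK call. A sketch under the assumption that @scrapeless-ai/sdk exposes a `Scrapeless` client carrying the same `scrapingCrawl.crawl.crawlUrl` method the action invokes:

```js
// Sketch: the "crawl" branch above as a standalone script. The
// crawlUrl call and browserOptions mirror the action code exactly;
// the standalone `Scrapeless` client construction is an assumption.
import { Scrapeless } from "@scrapeless-ai/sdk";

const client = new Scrapeless({ apiKey: process.env.SCRAPELESS_API_KEY });

const response = await client.scrapingCrawl.crawl.crawlUrl("https://example.com", {
  limit: 5, // the action's default for limitCrawlPages
  browserOptions: {
    proxy_country: "ANY",
    session_name: "Crawl",
    session_recording: true,
    session_ttl: 900,
  },
});

if (response?.status === "completed") {
  console.log(response.data); // the crawl results
} else {
  console.error(response?.error || "Crawl failed");
}
```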

components/scrapeless/actions/get-scrape-result/get-scrape-result.mjs

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ export default {
   key: "scrapeless-get-scrape-result",
   name: "Get Scrape Result",
   description: "Retrieve the result of a completed scraping job. [See the documentation](https://apidocs.scrapeless.com/api-11949853)",
-  version: "0.0.1",
+  version: "0.0.2",
   type: "action",
   props: {
     scrapeless,
components/scrapeless/actions/scraping-api/scraping-api.mjs

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
import scrapeless from "../../scrapeless.app.mjs";
import { log } from "../../common/utils.mjs";
export default {
  key: "scrapeless-scraping-api",
  name: "Scraping API",
  description: "Endpoints for fresh, structured data from 100+ popular sites. [See the documentation](https://apidocs.scrapeless.com/api-12919045).",
  version: "0.0.1",
  type: "action",
  props: {
    scrapeless,
    apiServer: {
      type: "string",
      label: "Please select an API server",
      default: "googleSearch",
      description: "Please select an API server to use",
      options: [
        {
          label: "Google Search",
          value: "googleSearch",
        },
      ],
      reloadProps: true,
    },
  },
  async run({ $ }) {
    const {
      scrapeless, apiServer, ...inputProps
    } = this;

    const MAX_RETRIES = 3;
    // 10 seconds
    const DELAY = 1000 * 10;
    const { run } = $.context;

    let submitData;
    let job;

    // Pre-check whether a job is already carried in the rerun context
    if (run?.context?.job) {
      job = run.context.job;
    }

    if (apiServer === "googleSearch") {
      submitData = {
        actor: "scraper.google.search",
        input: {
          q: inputProps.q,
          hl: inputProps.hl,
          gl: inputProps.gl,
        },
      };
    }

    if (!submitData) {
      throw new Error("No actor found");
    }
    // 1. Create a new scraping job
    if (!job) {
      job = await scrapeless._scrapelessClient().deepserp.createTask({
        actor: submitData.actor,
        input: submitData.input,
      });

      if (job.status === 200) {
        $.export("$summary", "Successfully retrieved scraping results");
        return job.data;
      }

      log("task in progress");
    }

    // 2. Wait for the job to complete
    if (run.runs === 1) {
      $.flow.rerun(DELAY, {
        job,
      }, MAX_RETRIES);
    } else if (run.runs > MAX_RETRIES) {
      throw new Error("Max retries reached");
    } else if (job?.data?.taskId) {
      const result = await scrapeless._scrapelessClient().deepserp.getTaskResult(job.data.taskId);
      if (result.status === 200) {
        $.export("$summary", "Successfully retrieved scraping results");
        return result.data;
      } else {
        $.flow.rerun(DELAY, {
          job,
        }, MAX_RETRIES);
      }
    } else {
      throw new Error("No job found");
    }
  },
  additionalProps() {
    const { apiServer } = this;

    const props = {};

    if (apiServer === "googleSearch") {
      props.q = {
        type: "string",
        label: "Search Query",
        description: "Parameter defines the query you want to search. You can use anything that you would use in a regular Google search, e.g. `inurl:`, `site:`, `intitle:`. Advanced search query parameters such as `as_dt` and `as_eq` are also supported.",
        default: "coffee",
      };

      props.hl = {
        type: "string",
        label: "Language",
        description: "Parameter defines the language to use for the Google search. It's a two-letter language code (e.g., `en` for English, `es` for Spanish, or `fr` for French).",
        default: "en",
      };

      props.gl = {
        type: "string",
        label: "Country",
        description: "Parameter defines the country to use for the Google search. It's a two-letter country code (e.g., `us` for the United States, `uk` for the United Kingdom, or `fr` for France).",
        default: "us",
      };
    }

    return props;
  },
};
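The run() method above polls with Pipedream's `$.flow.rerun(delay, context, maxRetries)`: the step suspends, re-invokes itself after `delay` ms, and gets its carried state back under `$.context.run.context`. A stripped-down sketch of that pattern, where `checkTask` is a hypothetical stand-in for `deepserp.getTaskResult`:

```js
// Stripped-down poll-and-rerun sketch. `checkTask` is a hypothetical
// status probe; $.context.run and $.flow.rerun follow the same Pipedream
// step API used by the action above.
async function checkTask(job) {
  // Hypothetical: report whether a previously submitted job has finished
  return { done: false, data: null, job };
}

export default {
  key: "polling-example", // hypothetical action key
  name: "Polling Example",
  version: "0.0.1",
  type: "action",
  async run({ $ }) {
    const MAX_RETRIES = 3;
    const DELAY = 10 * 1000; // wait 10 seconds between polls
    const { run } = $.context;

    if (run.runs > MAX_RETRIES) {
      throw new Error("Max retries reached");
    }

    // State passed to rerun() comes back on the next invocation
    const job = run.context?.job;
    const result = await checkTask(job);
    if (result.done) {
      return result.data;
    }

    // Not finished: suspend and re-run this step in DELAY ms
    return $.flow.rerun(DELAY, { job: result.job }, MAX_RETRIES);
  },
};
```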
components/scrapeless/actions/submit-scrape-job/submit-scrape-job.mjs

Lines changed: 20 additions & 28 deletions
@@ -1,76 +1,68 @@
-import { ConfigurationError } from "@pipedream/platform";
-import { ACTOR_OPTIONS } from "../../common/constants.mjs";
-import { parseObject } from "../../common/utils.mjs";
+import { COUNTRY_OPTIONS } from "../../common/constants.mjs";
+import { log } from "../../common/utils.mjs";
 import scrapeless from "../../scrapeless.app.mjs";
 
 export default {
   key: "scrapeless-submit-scrape-job",
   name: "Submit Scrape Job",
   description: "Submit a new web scraping job with specified target URL and extraction rules. [See the documentation](https://apidocs.scrapeless.com/api-11949852)",
-  version: "0.0.1",
+  version: "0.0.2",
   type: "action",
   props: {
     scrapeless,
     actor: {
       type: "string",
       label: "Actor",
+      default: "scraper.shopee",
       description: "The actor to use for the scrape job. This can be a specific user or a system account.",
-      options: ACTOR_OPTIONS,
     },
     inputUrl: {
       type: "string",
       label: "Input URL",
       description: "Target URL to scrape. This is the URL of the web page you want to extract data from.",
-      optional: true,
     },
     proxyCountry: {
       type: "string",
       label: "Proxy Country",
       description: "The country to route the request through. This can help in bypassing geo-restrictions.",
-      optional: true,
-    },
-    additionalInput: {
-      type: "object",
-      label: "Additional Input",
-      description: "Additional input parameters if you need to pass a specific configuration based on the actor. [See the documentation](https://apidocs.scrapeless.com/) for further details.",
-      optional: true,
+      default: "ANY",
+      options: COUNTRY_OPTIONS.map((country) => ({
+        label: country.label,
+        value: country.value,
+      })),
     },
     asyncMode: {
       type: "boolean",
       label: "Async Mode",
+      default: true,
       description: "Whether to run the scrape job in asynchronous mode. If set to true, the job will be processed in the background.",
     },
   },
   async run({ $ }) {
     try {
       const data = {
         actor: this.actor,
-        input: parseObject(this.additionalInput),
-      };
-
-      if (this.asyncMode) {
-        data.async = this.asyncMode;
-      }
-      if (this.inputUrl) {
-        data.input.url = this.inputUrl;
-      }
-      if (this.proxyCountry) {
-        data.proxy = {
+        input: {
+          url: this.inputUrl,
+        },
+        proxy: {
           country: this.proxyCountry,
-        };
-      }
+        },
+        async: this.asyncMode,
+      };
 
       const response = await this.scrapeless.submitScrapeJob({
         $,
         data,
       });
+      log(response);
 
       $.export("$summary", this.asyncMode
         ? `Successfully submitted scrape job with ID: ${response.taskId}`
         : "Successfully scraped the target configuration.");
       return response;
-    } catch ({ response }) {
-      throw new ConfigurationError(response.data.message);
+    } catch (error) {
+      throw new Error(error.message);
     }
   },
 };
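The net effect of this refactor is that run() now always submits one fixed-shape payload instead of conditionally assembled fields, which keeps the request logic branch-free. An illustrative instance of that `data` object — the field names come from the diff above, while the target URL is a hypothetical example value:

```js
// Illustrative payload mirroring the refactored `data` object above.
// The shape (actor / input.url / proxy.country / async) comes from the
// diff; the URL is a hypothetical example, the rest are the prop defaults.
const data = {
  actor: "scraper.shopee", // default Actor prop
  input: {
    url: "https://example.com/product/123", // hypothetical Input URL
  },
  proxy: {
    country: "ANY", // default Proxy Country
  },
  async: true, // default Async Mode
};
```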