Skip to content

Commit 711e202

Browse files
authored
New Components - scrapeless (#16712)
* scrapeless init * [Components] scrapeless #16673 Actions - Submit Scrape Job - Get Scrape Result * pnpm update * some adjusts * some adjusts
1 parent cb039e0 commit 711e202

File tree

7 files changed

+312
-7
lines changed

7 files changed

+312
-7
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import scrapeless from "../../scrapeless.app.mjs";
2+
3+
export default {
  key: "scrapeless-get-scrape-result",
  name: "Get Scrape Result",
  description: "Retrieve the result of a completed scraping job. [See the documentation](https://apidocs.scrapeless.com/api-11949853)",
  version: "0.0.1",
  type: "action",
  props: {
    scrapeless,
    scrapeJobId: {
      type: "string",
      label: "Scrape Job ID",
      description: " The ID of the scrape job you want to retrieve results for. This ID is provided when you submit a scrape job.",
    },
  },
  /**
   * Fetches the result for `scrapeJobId` through the app's `getScrapeResult`.
   *
   * @param {object} ctx
   * @param {object} ctx.$ - Step context, used for the request and for `$summary`.
   * @returns {Promise<object>} The API response on success, or
   *   `{ success: false, ...errorBody }` when the API answered with an HTTP error.
   * @throws Rethrows any error that carries no HTTP response (e.g. a network
   *   failure), so the real cause is not masked.
   */
  async run({ $ }) {
    try {
      const response = await this.scrapeless.getScrapeResult({
        $,
        scrapeJobId: this.scrapeJobId,
      });

      $.export("$summary", `Successfully retrieved scrape results for job ID ${this.scrapeJobId}`);
      return response;
    } catch (error) {
      // Only HTTP errors expose `response`; destructuring it blindly would
      // turn every other failure into an unrelated TypeError.
      if (!error || !error.response) {
        throw error;
      }

      $.export("$summary", `Successfully retrieved scrape result with error for job ID ${this.scrapeJobId}`);
      return {
        success: false,
        ...error.response.data,
      };
    }
  },
};
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import { ConfigurationError } from "@pipedream/platform";
2+
import { ACTOR_OPTIONS } from "../../common/constants.mjs";
3+
import { parseObject } from "../../common/utils.mjs";
4+
import scrapeless from "../../scrapeless.app.mjs";
5+
6+
export default {
  key: "scrapeless-submit-scrape-job",
  name: "Submit Scrape Job",
  description: "Submit a new web scraping job with specified target URL and extraction rules. [See the documentation](https://apidocs.scrapeless.com/api-11949852)",
  version: "0.0.1",
  type: "action",
  props: {
    scrapeless,
    actor: {
      type: "string",
      label: "Actor",
      description: "The actor to use for the scrape job. This can be a specific user or a system account.",
      options: ACTOR_OPTIONS,
    },
    inputUrl: {
      type: "string",
      label: "Input URL",
      description: "Target URL to scrape. This is the URL of the web page you want to extract data from.",
      optional: true,
    },
    proxyCountry: {
      type: "string",
      label: "Proxy Country",
      description: "The country to route the request through. This can help in bypassing geo-restrictions.",
      optional: true,
    },
    additionalInput: {
      type: "object",
      label: "Additional Input",
      description: "Additional input parameters if you need to pass a specific configuration based on the actor. [See the documentation](https://apidocs.scrapeless.com/) for further details.",
      optional: true,
    },
    asyncMode: {
      type: "boolean",
      label: "Async Mode",
      description: "Whether to run the scrape job in asynchronous mode. If set to true, the job will be processed in the background.",
    },
  },
  /**
   * Builds the request payload from the props and submits it through the
   * app's `submitScrapeJob`.
   *
   * @param {object} ctx
   * @param {object} ctx.$ - Step context, used for the request and for `$summary`.
   * @returns {Promise<object>} The API response.
   * @throws {ConfigurationError} When the API error body carries a `message`,
   *   or when Additional Input is not an object but an Input URL must be merged in.
   * @throws Rethrows any other error unchanged.
   */
  async run({ $ }) {
    try {
      const input = parseObject(this.additionalInput);
      const data = {
        actor: this.actor,
        input,
      };

      if (this.asyncMode) {
        data.async = this.asyncMode;
      }
      if (this.inputUrl) {
        if (input !== undefined && typeof input !== "object") {
          throw new ConfigurationError("Additional Input must be a JSON object when Input URL is provided.");
        }
        // `input` is undefined when Additional Input is empty, so build a new
        // object instead of assigning onto it; this also avoids mutating the
        // prop value itself.
        data.input = {
          ...input,
          url: this.inputUrl,
        };
      }
      if (this.proxyCountry) {
        data.proxy = {
          country: this.proxyCountry,
        };
      }

      const response = await this.scrapeless.submitScrapeJob({
        $,
        data,
      });

      $.export("$summary", this.asyncMode
        ? `Successfully submitted scrape job with ID: ${response.taskId}`
        : "Successfully scraped the target configuration.");
      return response;
    } catch (error) {
      // Errors without an HTTP response body (network failures, the
      // ConfigurationError raised above) are rethrown as they are.
      const message = error
        && error.response
        && error.response.data
        && error.response.data.message;
      if (!message) {
        throw error;
      }
      throw new ConfigurationError(message);
    }
  },
};
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/**
 * Selectable actors for the "Actor" prop: `label` is the text shown to the
 * user, `value` is the actor identifier placed in the request payload.
 */
export const ACTOR_OPTIONS = [
  {
    label: "Shopee",
    value: "scraper.shopee",
  },
  {
    label: "BR Sites - Solucoes cnpjreva",
    value: "scraper.solucoes",
  },
  {
    label: "BR Sites - Solucoes certidaointernet",
    value: "scraper.solucoes.certidaointernet",
  },
  {
    label: "BR Sites - Servicos receita",
    value: "scraper.servicos.receita",
  },
  {
    label: "BR Sites - Consopt",
    value: "scraper.consopt",
  },
  {
    label: "Avnet",
    value: "scraper.avnet",
  },
  {
    label: "Arrow",
    value: "scraper.arrow",
  },
  {
    label: "Airline Iberia",
    value: "scraper.iberia",
  },
  {
    label: "Airline Expedia",
    value: "scraper.expedia",
  },
  {
    label: "Airline Kayak",
    value: "scraper.kayak",
  },
  {
    label: "Amazon Product",
    value: "scraper.amazon.product",
  },
  {
    label: "Amazon Seller",
    value: "scraper.amazon.seller",
  },
  {
    label: "Amazon Keywords",
    value: "scraper.amazon.keywords",
  },
  {
    label: "Temu",
    value: "scraper.temu.mobile.detail",
  },
  {
    label: "Google Search",
    value: "scraper.google.search",
  },
  {
    label: "Google Trends",
    value: "scraper.google.trends",
  },
  {
    label: "Google Flights",
    value: "scraper.google.flights",
  },
  {
    label: "Google Flights Chart",
    value: "scraper.google.flights.chart",
  },
  {
    label: "Google Maps",
    value: "scraper.google.maps",
  },
  {
    label: "Google Scholar",
    value: "scraper.google.scholar",
  },
  {
    label: "Google Jobs",
    value: "scraper.google.jobs",
  },
  {
    label: "Google Shopping",
    value: "scraper.google.shopping",
  },
  {
    label: "Google Hotels",
    value: "scraper.google.hotels",
  },
  {
    label: "Google News",
    value: "scraper.google.news",
  },
  {
    label: "Google Lens",
    value: "scraper.google.lens",
  },
  {
    label: "Google Finance",
    value: "scraper.google.finance",
  },
  {
    label: "Google Product",
    value: "scraper.google.product",
  },
  {
    label: "Google Play Games",
    value: "scraper.google.play.games",
  },
  {
    label: "Google Play Books",
    value: "scraper.google.play.books",
  },
  {
    label: "Google Play Movies",
    value: "scraper.google.play.movies",
  },
  {
    label: "Google Play Product",
    value: "scraper.google.play.product",
  },
  {
    label: "Google Play Apps",
    value: "scraper.google.play",
  },
  {
    label: "Google Ads",
    value: "scraper.google.ads",
  },
  {
    label: "Mouser",
    value: "scraper.mouser",
  },
];
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
 * Parses `value` as JSON when it is a string; any non-string value, or a
 * string that is not valid JSON, is returned unchanged.
 */
const parseJsonOrKeep = (value) => {
  if (typeof value !== "string") {
    return value;
  }
  try {
    return JSON.parse(value);
  } catch (e) {
    // Not JSON: keep the raw string rather than failing.
    return value;
  }
};

/**
 * Normalizes a prop value that may arrive as JSON text.
 *
 * - Falsy input yields `undefined`.
 * - An array is mapped element by element: string elements are JSON-parsed
 *   when possible, everything else is kept as is (one level only).
 * - A string is JSON-parsed when possible, otherwise returned unchanged.
 * - Any other value is returned unchanged (same reference).
 *
 * @param {*} obj - Value to normalize.
 * @returns {*} The parsed value, or `undefined` for falsy input.
 */
export const parseObject = (obj) => {
  if (!obj) return undefined;

  if (Array.isArray(obj)) {
    return obj.map((item) => parseJsonOrKeep(item));
  }
  return parseJsonOrKeep(obj);
};

components/scrapeless/package.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@pipedream/scrapeless",
3-
"version": "0.0.1",
3+
"version": "0.1.0",
44
"description": "Pipedream Scrapeless Components",
55
"main": "scrapeless.app.mjs",
66
"keywords": [
@@ -11,5 +11,8 @@
1111
"author": "Pipedream <support@pipedream.com> (https://pipedream.com/)",
1212
"publishConfig": {
1313
"access": "public"
14+
},
15+
"dependencies": {
16+
"@pipedream/platform": "^3.0.3"
1417
}
15-
}
18+
}
Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,37 @@
1+
import { axios } from "@pipedream/platform";
2+
13
export default {
  type: "app",
  app: "scrapeless",
  methods: {
    /** Base URL that every request path is appended to. */
    _baseUrl() {
      return "https://api.scrapeless.com/api/v1";
    },
    /** Request headers carrying the connected account's API key. */
    _headers() {
      return {
        "x-api-token": `${this.$auth.api_key}`,
      };
    },
    /**
     * Sends a request through the platform `axios` helper.
     *
     * @param {object} args
     * @param {object} [args.$=this] - Context passed as the first argument to `axios`.
     * @param {string} args.path - Path appended to the base URL.
     * @param {...*} args.opts - Remaining options, spread into the axios config
     *   after `url` and `headers` (so they take precedence).
     */
    _makeRequest({
      $ = this, path, ...opts
    }) {
      return axios($, {
        url: this._baseUrl() + path,
        headers: this._headers(),
        ...opts,
      });
    },
    /** POSTs to `/scraper/request`; `opts` (e.g. `$`, `data`) are forwarded. */
    submitScrapeJob(opts = {}) {
      return this._makeRequest({
        method: "POST",
        path: "/scraper/request",
        ...opts,
      });
    },
    /**
     * Requests `/scraper/result/{scrapeJobId}`.
     *
     * Remaining options are forwarded so a caller-supplied `$` reaches
     * `_makeRequest` instead of being dropped.
     */
    getScrapeResult({
      scrapeJobId, ...opts
    }) {
      return this._makeRequest({
        path: `/scraper/result/${scrapeJobId}`,
        ...opts,
      });
    },
  },
};

pnpm-lock.yaml

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)