Skip to content

Commit f52d5e7

Browse files
authored
New Components - webscraping_ai (#15526)
* webscraping_ai init * new components * pnpm-lock.yaml * add additional optional props
1 parent 053faf6 commit f52d5e7

File tree

9 files changed

+563
-20
lines changed

9 files changed

+563
-20
lines changed

components/webscraping_ai/.gitignore

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import webscrapingAI from "../../webscraping_ai.app.mjs";
2+
import utils from "../../common/utils.mjs";
3+
4+
// Builds a prop that reuses the definition called `name` from the webscrapingAI app.
const appProp = (name) => ({
  propDefinition: [
    webscrapingAI,
    name,
  ],
});

/**
 * Action component: sends a question about a webpage to the webscrapingAI
 * app's `getAnswerToQuestion` method and returns whatever that call resolves to.
 */
export default {
  key: "webscraping_ai-ask-question",
  name: "Ask Question about Webpage",
  description: "Gets an answer to a question about a given webpage. [See the documentation](https://webscraping.ai/docs#tag/AI/operation/getQuestion)",
  version: "0.0.1",
  type: "action",
  props: {
    webscrapingAI,
    targetUrl: appProp("targetUrl"),
    // The only prop declared locally; every other prop comes from the app file.
    question: {
      type: "string",
      label: "Question",
      description: "The question to ask about the given webpage. E.g. `What is the summary of this page content?`",
    },
    headers: appProp("headers"),
    timeout: appProp("timeout"),
    js: appProp("js"),
    jsTimeout: appProp("jsTimeout"),
    waitFor: appProp("waitFor"),
    proxy: appProp("proxy"),
    country: appProp("country"),
    customProxy: appProp("customProxy"),
    device: appProp("device"),
    errorOn404: appProp("errorOn404"),
    errorOnRedirect: appProp("errorOnRedirect"),
    jsScript: appProp("jsScript"),
    format: appProp("format"),
  },
  /**
   * Maps the camelCase props onto the snake_case request params, performs the
   * request, exports a `$summary` message and returns the response.
   */
  async run({ $ }) {
    const {
      targetUrl,
      question,
      headers,
      timeout,
      js,
      jsTimeout,
      waitFor,
      proxy,
      country,
      customProxy,
      device,
      errorOn404,
      errorOnRedirect,
      jsScript,
      format,
    } = this;

    const params = {
      url: targetUrl,
      question,
      // NOTE(review): presumably turns the headers object into a string — confirm in common/utils.mjs
      headers: utils.stringifyHeaders(headers),
      timeout,
      js,
      js_timeout: jsTimeout,
      wait_for: waitFor,
      proxy,
      country,
      custom_proxy: customProxy,
      device,
      error_on_404: errorOn404,
      error_on_redirect: errorOnRedirect,
      js_script: jsScript,
      format,
    };

    const response = await this.webscrapingAI.getAnswerToQuestion({
      $,
      params,
    });
    $.export("$summary", "Successfully retrieved answer to question");
    return response;
  },
};
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import webscrapingAI from "../../webscraping_ai.app.mjs";
2+
import utils from "../../common/utils.mjs";
3+
4+
/**
 * Action component: fetches the HTML of a webpage through the webscrapingAI
 * app's `pageHtmlByUrl` method and returns whatever that call resolves to.
 */
export default {
  key: "webscraping_ai-scrape-website-html",
  name: "Scrape Website HTML",
  // The stray trailing ":" after the documentation link was removed.
  description: "Returns the full HTML content of a webpage specified by the URL. [See the documentation](https://webscraping.ai/docs#tag/HTML/operation/getHTML)",
  version: "0.0.1",
  type: "action",
  props: {
    webscrapingAI,
    // The props below reuse definitions declared in the webscrapingAI app file.
    targetUrl: {
      propDefinition: [
        webscrapingAI,
        "targetUrl",
      ],
    },
    headers: {
      propDefinition: [
        webscrapingAI,
        "headers",
      ],
    },
    timeout: {
      propDefinition: [
        webscrapingAI,
        "timeout",
      ],
    },
    js: {
      propDefinition: [
        webscrapingAI,
        "js",
      ],
    },
    jsTimeout: {
      propDefinition: [
        webscrapingAI,
        "jsTimeout",
      ],
    },
    waitFor: {
      propDefinition: [
        webscrapingAI,
        "waitFor",
      ],
    },
    proxy: {
      propDefinition: [
        webscrapingAI,
        "proxy",
      ],
    },
    country: {
      propDefinition: [
        webscrapingAI,
        "country",
      ],
    },
    customProxy: {
      propDefinition: [
        webscrapingAI,
        "customProxy",
      ],
    },
    device: {
      propDefinition: [
        webscrapingAI,
        "device",
      ],
    },
    errorOn404: {
      propDefinition: [
        webscrapingAI,
        "errorOn404",
      ],
    },
    errorOnRedirect: {
      propDefinition: [
        webscrapingAI,
        "errorOnRedirect",
      ],
    },
    jsScript: {
      propDefinition: [
        webscrapingAI,
        "jsScript",
      ],
    },
    format: {
      propDefinition: [
        webscrapingAI,
        "format",
      ],
    },
    // Declared locally rather than taken from the app file.
    returnScriptResult: {
      type: "boolean",
      label: "Return Script Result",
      description: "Return result of the custom JavaScript code (`js_script` parameter) execution on the target page (`false` by default, page HTML will be returned).",
      optional: true,
    },
  },
  /**
   * Maps the camelCase props onto the snake_case request params, performs the
   * request, exports a `$summary` naming the target URL and returns the response.
   */
  async run({ $ }) {
    const response = await this.webscrapingAI.pageHtmlByUrl({
      $,
      params: {
        url: this.targetUrl,
        // NOTE(review): presumably turns the headers object into a string — confirm in common/utils.mjs
        headers: utils.stringifyHeaders(this.headers),
        timeout: this.timeout,
        js: this.js,
        js_timeout: this.jsTimeout,
        wait_for: this.waitFor,
        proxy: this.proxy,
        country: this.country,
        custom_proxy: this.customProxy,
        device: this.device,
        error_on_404: this.errorOn404,
        error_on_redirect: this.errorOnRedirect,
        js_script: this.jsScript,
        format: this.format,
        return_script_result: this.returnScriptResult,
      },
    });
    $.export("$summary", `Successfully scraped HTML of URL ${this.targetUrl}`);
    return response;
  },
};
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import webscrapingAI from "../../webscraping_ai.app.mjs";
2+
import utils from "../../common/utils.mjs";
3+
4+
// Builds a prop that reuses the definition called `name` from the webscrapingAI app.
const sharedProp = (name) => ({
  propDefinition: [
    webscrapingAI,
    name,
  ],
});

/**
 * Action component: fetches the visible text of a webpage through the
 * webscrapingAI app's `pageTextByUrl` method and returns whatever that call
 * resolves to.
 */
export default {
  key: "webscraping_ai-scrape-website-text",
  name: "Scrape Website Text",
  description: "Returns the visible text content of a webpage specified by the URL. [See the documentation](https://webscraping.ai/docs#tag/Text/operation/getText).",
  version: "0.0.1",
  type: "action",
  props: {
    webscrapingAI,
    targetUrl: sharedProp("targetUrl"),
    headers: sharedProp("headers"),
    timeout: sharedProp("timeout"),
    js: sharedProp("js"),
    jsTimeout: sharedProp("jsTimeout"),
    waitFor: sharedProp("waitFor"),
    proxy: sharedProp("proxy"),
    country: sharedProp("country"),
    customProxy: sharedProp("customProxy"),
    device: sharedProp("device"),
    errorOn404: sharedProp("errorOn404"),
    errorOnRedirect: sharedProp("errorOnRedirect"),
    jsScript: sharedProp("jsScript"),
    // The two props below are declared locally rather than taken from the app file.
    textFormat: {
      type: "string",
      label: "Text Format",
      description: "The format of the returned text content. Default: `json`",
      options: [
        "plain",
        "xml",
        "json",
      ],
      default: "json",
      optional: true,
    },
    returnLinks: {
      type: "boolean",
      label: "Return Links",
      description: "Whether to include links in the returned text content. Works only when Text Format is `json`.",
      optional: true,
    },
  },
  /**
   * Maps the camelCase props onto the snake_case request params, performs the
   * request, exports a `$summary` naming the target URL and returns the response.
   */
  async run({ $ }) {
    const {
      targetUrl,
      headers,
      timeout,
      js,
      jsTimeout,
      waitFor,
      proxy,
      country,
      customProxy,
      device,
      errorOn404,
      errorOnRedirect,
      jsScript,
      textFormat,
      returnLinks,
    } = this;

    const params = {
      url: targetUrl,
      // NOTE(review): presumably turns the headers object into a string — confirm in common/utils.mjs
      headers: utils.stringifyHeaders(headers),
      timeout,
      js,
      js_timeout: jsTimeout,
      wait_for: waitFor,
      proxy,
      country,
      custom_proxy: customProxy,
      device,
      error_on_404: errorOn404,
      error_on_redirect: errorOnRedirect,
      js_script: jsScript,
      text_format: textFormat,
      return_links: returnLinks,
    };

    const response = await this.webscrapingAI.pageTextByUrl({
      $,
      params,
    });
    $.export("$summary", `Successfully scraped text from ${targetUrl}`);
    return response;
  },
};

0 commit comments

Comments
 (0)