From ea500e670124b44f63c0ac4376202511d896f606 Mon Sep 17 00:00:00 2001 From: michelle0927 Date: Fri, 7 Feb 2025 15:20:16 -0500 Subject: [PATCH 1/4] webscraping_ai init --- .../actions/ask-question/ask-question.mjs | 30 +++++ .../scrape-website-html.mjs | 45 +++++++ .../scrape-website-text.mjs | 41 ++++++ .../webscraping_ai/webscraping_ai.app.mjs | 117 ++++++++++++++++++ 4 files changed, 233 insertions(+) create mode 100644 components/webscraping_ai/actions/ask-question/ask-question.mjs create mode 100644 components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs create mode 100644 components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs create mode 100644 components/webscraping_ai/webscraping_ai.app.mjs diff --git a/components/webscraping_ai/actions/ask-question/ask-question.mjs b/components/webscraping_ai/actions/ask-question/ask-question.mjs new file mode 100644 index 0000000000000..150bd764b2238 --- /dev/null +++ b/components/webscraping_ai/actions/ask-question/ask-question.mjs @@ -0,0 +1,30 @@ +import webscraping_ai from "../../webscraping_ai.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "webscraping_ai-ask-question", + name: "Ask Question about Webpage", + description: "Gets an answer to a question about a given webpage. [See the documentation](https://webscraping.ai/docs)", + version: "0.0.{{ts}}", + type: "action", + props: { + webscraping_ai, + targetUrl: { + propDefinition: [ + webscraping_ai, + "targetUrl", + ], + }, + question: { + propDefinition: [ + webscraping_ai, + "question", + ], + }, + }, + async run({ $ }) { + const response = await this.webscraping_ai.getAnswerToQuestion(); + $.export("$summary", `Answer: ${response.answer}`); + return response; + }, +}; diff --git a/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs new file mode 100644 index 0000000000000..85198507b1302 --- /dev/null +++ b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs @@ -0,0 +1,45 @@ +import webscraping_ai from "../../webscraping_ai.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "webscraping_ai-scrape-website-html", + name: "Scrape Website HTML", + description: "Starts a new web scraping job with specified configurations. [See the documentation]():", + version: "0.0.{{ts}}", + type: "action", + props: { + webscraping_ai, + targetUrl: { + propDefinition: [ + "webscraping_ai", + "targetUrl", + ], + }, + selectors: { + propDefinition: [ + "webscraping_ai", + "selectors", + ], + optional: true, + }, + renderingMode: { + propDefinition: [ + "webscraping_ai", + "renderingMode", + ], + optional: true, + }, + headers: { + propDefinition: [ + "webscraping_ai", + "headers", + ], + optional: true, + }, + }, + async run({ $ }) { + const response = await this.webscraping_ai.startScrapingJob(); + $.export("$summary", `Started scraping job for URL ${this.targetUrl}`); + return response; + }, +}; diff --git a/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs new file mode 100644 index 0000000000000..3ef2722d04a0f --- /dev/null +++ b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs @@ -0,0 +1,41 @@ +import webscraping_ai from "../../webscraping_ai.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "webscraping_ai-scrape-website-text", + name: "Scrape Website Text", + description: "Returns the visible text content of a webpage specified by the URL. [See the documentation]().", + version: "0.0.{{ts}}", + type: "action", + props: { + webscraping_ai: { + type: "app", + app: "webscraping_ai", + }, + targetUrl: { + propDefinition: [ + webscraping_ai, + "targetUrl", + ], + }, + textFormat: { + propDefinition: [ + webscraping_ai, + "textFormat", + ], + optional: true, + }, + returnLinks: { + propDefinition: [ + webscraping_ai, + "returnLinks", + ], + optional: true, + }, + }, + async run({ $ }) { + const response = await this.webscraping_ai.getVisibleTextContent(); + $.export("$summary", `Successfully scraped text from ${this.targetUrl}`); + return response; + }, +}; diff --git a/components/webscraping_ai/webscraping_ai.app.mjs b/components/webscraping_ai/webscraping_ai.app.mjs new file mode 100644 index 0000000000000..d264954fea62c --- /dev/null +++ b/components/webscraping_ai/webscraping_ai.app.mjs @@ -0,0 +1,117 @@ +import { axios } from "@pipedream/platform"; + +export default { + type: "app", + app: "webscraping_ai", + version: "0.0.{{ts}}", + propDefinitions: { + targetUrl: { + type: "string", + label: "Target URL", + description: "The URL of the webpage to scrape.", + }, + selectors: { + type: "string[]", + label: "Selectors", + description: "Optional CSS selectors to target specific elements on the page.", + optional: true, + }, + renderingMode: { + type: "string", + label: "Rendering Mode", + description: "The mode to render the page (e.g., 'light', 'dark').", + optional: true, + }, + headers: { + type: "string[]", + label: "Headers", + description: "Optional HTTP headers to include in the request, as JSON strings.", + optional: true, + }, + textFormat: { + type: "string", + label: "Text Format", + description: "The format of the returned text content (e.g., 'plain', 'html').", + optional: true, + }, + returnLinks: { + type: "boolean", + label: "Return Links", + description: "Whether to include links in the returned text content.", + optional: true, + }, + question: { + type: "string", + label: "Question", + description: "The question to ask about the given webpage.", + }, + }, + methods: { + _baseUrl() { + return "https://api.webscraping.ai"; + }, + async _makeRequest(opts = {}) { + const { + $, method = "GET", path = "/", headers = {}, ...otherOpts + } = opts; + return axios($, { + method, + url: `${this._baseUrl()}${path}`, + headers: { + ...headers, + "User-Agent": "@PipedreamHQ/pipedream v0.1", + "Authorization": `Bearer ${this.$auth.api_key}`, + }, + ...otherOpts, + }); + }, + async startScrapingJob() { + const data = { + url: this.targetUrl, + }; + if (this.selectors) data.selectors = this.selectors; + if (this.renderingMode) data.rendering_mode = this.renderingMode; + if (this.headers) { + data.headers = this.headers.reduce((acc, headerStr) => { + try { + const header = JSON.parse(headerStr); + return { + ...acc, + ...header, + }; + } catch (e) { + return acc; + } + }, {}); + } + return this._makeRequest({ + method: "POST", + path: "/scraping-jobs", + data, + }); + }, + async getVisibleTextContent() { + const params = { + url: this.targetUrl, + }; + if (this.textFormat) params.text_format = this.textFormat; + if (this.returnLinks !== undefined) params.return_links = this.returnLinks; + return this._makeRequest({ + method: "GET", + path: "/text-content", + params, + }); + }, + async getAnswerToQuestion() { + const data = { + url: this.targetUrl, + question: this.question, + }; + return this._makeRequest({ + method: "POST", + path: "/answer", + data, + }); + }, + }, +}; From 4402b450d186a458e651f6a2f405e21c41e3e38d Mon Sep 17 00:00:00 2001 From: michelle0927 Date: Fri, 7 Feb 2025 15:53:49 -0500 Subject: [PATCH 2/4] new components --- components/webscraping_ai/.gitignore | 3 - .../actions/ask-question/ask-question.mjs | 28 ++--- .../scrape-website-html.mjs | 42 +++----- .../scrape-website-text.mjs | 41 ++++--- .../webscraping_ai/app/webscraping_ai.app.ts | 13 --- components/webscraping_ai/package.json | 8 +- .../webscraping_ai/webscraping_ai.app.mjs | 102 ++++-------------- 7 files changed, 76 insertions(+), 161 deletions(-) delete mode 100644 components/webscraping_ai/.gitignore delete mode 100644 components/webscraping_ai/app/webscraping_ai.app.ts diff --git a/components/webscraping_ai/.gitignore b/components/webscraping_ai/.gitignore deleted file mode 100644 index ec761ccab7595..0000000000000 --- a/components/webscraping_ai/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.js -*.mjs -dist \ No newline at end of file diff --git a/components/webscraping_ai/actions/ask-question/ask-question.mjs b/components/webscraping_ai/actions/ask-question/ask-question.mjs index 150bd764b2238..b73ebed808565 100644 --- a/components/webscraping_ai/actions/ask-question/ask-question.mjs +++ b/components/webscraping_ai/actions/ask-question/ask-question.mjs @@ -1,30 +1,34 @@ -import webscraping_ai from "../../webscraping_ai.app.mjs"; -import { axios } from "@pipedream/platform"; +import webscrapingAI from "../../webscraping_ai.app.mjs"; export default { key: "webscraping_ai-ask-question", name: "Ask Question about Webpage", - description: "Gets an answer to a question about a given webpage. [See the documentation](https://webscraping.ai/docs)", - version: "0.0.{{ts}}", + description: "Gets an answer to a question about a given webpage. [See the documentation](https://webscraping.ai/docs#tag/AI/operation/getQuestion)", + version: "0.0.1", type: "action", props: { - webscraping_ai, + webscrapingAI, targetUrl: { propDefinition: [ - webscraping_ai, + webscrapingAI, "targetUrl", ], }, question: { - propDefinition: [ - webscraping_ai, - "question", - ], + type: "string", + label: "Question", + description: "The question to ask about the given webpage. E.g. `What is the summary of this page content?`", }, }, async run({ $ }) { - const response = await this.webscraping_ai.getAnswerToQuestion(); - $.export("$summary", `Answer: ${response.answer}`); + const response = await this.webscrapingAI.getAnswerToQuestion({ + $, + params: { + url: this.targetUrl, + question: this.question, + }, + }); + $.export("$summary", "Successfully retrieved answer to question"); return response; }, }; diff --git a/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs index 85198507b1302..84fa895d2f6a1 100644 --- a/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs +++ b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs @@ -1,45 +1,29 @@ -import webscraping_ai from "../../webscraping_ai.app.mjs"; -import { axios } from "@pipedream/platform"; +import webscrapingAI from "../../webscraping_ai.app.mjs"; export default { key: "webscraping_ai-scrape-website-html", name: "Scrape Website HTML", - description: "Starts a new web scraping job with specified configurations. [See the documentation]():", - version: "0.0.{{ts}}", + description: "Returns the full HTML content of a webpage specified by the URL. [See the documentation](https://webscraping.ai/docs#tag/HTML/operation/getHTML):", + version: "0.0.1", type: "action", props: { - webscraping_ai, + webscrapingAI, targetUrl: { propDefinition: [ - "webscraping_ai", + webscrapingAI, "targetUrl", ], }, - selectors: { - propDefinition: [ - "webscraping_ai", - "selectors", - ], - optional: true, - }, - renderingMode: { - propDefinition: [ - "webscraping_ai", - "renderingMode", - ], - optional: true, - }, - headers: { - propDefinition: [ - "webscraping_ai", - "headers", - ], - optional: true, - }, }, async run({ $ }) { - const response = await this.webscraping_ai.startScrapingJob(); - $.export("$summary", `Started scraping job for URL ${this.targetUrl}`); + const response = await this.webscrapingAI.pageHtmlByUrl({ + $, + params: { + url: this.targetUrl, + format: "json", + }, + }); + $.export("$summary", `Successfully scraped HTML of URL ${this.targetUrl}`); return response; }, }; diff --git a/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs index 3ef2722d04a0f..a4d66ed67cea0 100644 --- a/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs +++ b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs @@ -1,40 +1,47 @@ -import webscraping_ai from "../../webscraping_ai.app.mjs"; -import { axios } from "@pipedream/platform"; +import webscrapingAI from "../../webscraping_ai.app.mjs"; export default { key: "webscraping_ai-scrape-website-text", name: "Scrape Website Text", - description: "Returns the visible text content of a webpage specified by the URL. [See the documentation]().", - version: "0.0.{{ts}}", + description: "Returns the visible text content of a webpage specified by the URL. [See the documentation](https://webscraping.ai/docs#tag/Text/operation/getText).", + version: "0.0.1", type: "action", props: { - webscraping_ai: { - type: "app", - app: "webscraping_ai", - }, + webscrapingAI, targetUrl: { propDefinition: [ - webscraping_ai, + webscrapingAI, "targetUrl", ], }, textFormat: { - propDefinition: [ - webscraping_ai, - "textFormat", + type: "string", + label: "Text Format", + description: "The format of the returned text content. Default: `json`", + options: [ + "plain", + "xml", + "json", ], + default: "json", optional: true, }, returnLinks: { - propDefinition: [ - webscraping_ai, - "returnLinks", - ], + type: "boolean", + label: "Return Links", + description: "Whether to include links in the returned text content. Works only when Text Format is `json`.", optional: true, }, }, async run({ $ }) { - const response = await this.webscraping_ai.getVisibleTextContent(); + const response = await this.webscrapingAI.pageTextByUrl({ + $, + params: { + url: this.targetUrl, + text_format: this.textFormat, + return_links: this.returnLinks, + }, + }); $.export("$summary", `Successfully scraped text from ${this.targetUrl}`); return response; }, diff --git a/components/webscraping_ai/app/webscraping_ai.app.ts b/components/webscraping_ai/app/webscraping_ai.app.ts deleted file mode 100644 index ef05d2cd90d7f..0000000000000 --- a/components/webscraping_ai/app/webscraping_ai.app.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { defineApp } from "@pipedream/types"; - -export default defineApp({ - type: "app", - app: "webscraping_ai", - propDefinitions: {}, - methods: { - // this.$auth contains connected account data - authKeys() { - console.log(Object.keys(this.$auth)); - }, - }, -}); diff --git a/components/webscraping_ai/package.json b/components/webscraping_ai/package.json index 6cc412b15c6c8..26e6a2e4a629c 100644 --- a/components/webscraping_ai/package.json +++ b/components/webscraping_ai/package.json @@ -1,16 +1,18 @@ { "name": "@pipedream/webscraping_ai", - "version": "0.0.3", + "version": "0.1.0", "description": "Pipedream WebScraping.AI Components", - "main": "dist/app/webscraping_ai.app.mjs", + "main": "webscraping_ai.app.mjs", "keywords": [ "pipedream", "webscraping_ai" ], - "files": ["dist"], "homepage": "https://pipedream.com/apps/webscraping_ai", "author": "Pipedream (https://pipedream.com/)", "publishConfig": { "access": "public" + }, + "dependencies": { + "@pipedream/platform": "^3.0.3" } } diff --git a/components/webscraping_ai/webscraping_ai.app.mjs b/components/webscraping_ai/webscraping_ai.app.mjs index d264954fea62c..ab619e052af69 100644 --- a/components/webscraping_ai/webscraping_ai.app.mjs +++ b/components/webscraping_ai/webscraping_ai.app.mjs @@ -3,114 +3,48 @@ import { axios } from "@pipedream/platform"; export default { type: "app", app: "webscraping_ai", - version: "0.0.{{ts}}", propDefinitions: { targetUrl: { type: "string", label: "Target URL", description: "The URL of the webpage to scrape.", }, - selectors: { - type: "string[]", - label: "Selectors", - description: "Optional CSS selectors to target specific elements on the page.", - optional: true, - }, - renderingMode: { - type: "string", - label: "Rendering Mode", - description: "The mode to render the page (e.g., 'light', 'dark').", - optional: true, - }, - headers: { - type: "string[]", - label: "Headers", - description: "Optional HTTP headers to include in the request, as JSON strings.", - optional: true, - }, - textFormat: { - type: "string", - label: "Text Format", - description: "The format of the returned text content (e.g., 'plain', 'html').", - optional: true, - }, - returnLinks: { - type: "boolean", - label: "Return Links", - description: "Whether to include links in the returned text content.", - optional: true, - }, - question: { - type: "string", - label: "Question", - description: "The question to ask about the given webpage.", - }, }, methods: { _baseUrl() { return "https://api.webscraping.ai"; }, - async _makeRequest(opts = {}) { - const { - $, method = "GET", path = "/", headers = {}, ...otherOpts - } = opts; + _makeRequest({ + $ = this, + path, + params, + ...otherOpts + }) { return axios($, { - method, url: `${this._baseUrl()}${path}`, - headers: { - ...headers, - "User-Agent": "@PipedreamHQ/pipedream v0.1", - "Authorization": `Bearer ${this.$auth.api_key}`, + params: { + ...params, + api_key: this.$auth.api_key, }, ...otherOpts, }); }, - async startScrapingJob() { - const data = { - url: this.targetUrl, - }; - if (this.selectors) data.selectors = this.selectors; - if (this.renderingMode) data.rendering_mode = this.renderingMode; - if (this.headers) { - data.headers = this.headers.reduce((acc, headerStr) => { - try { - const header = JSON.parse(headerStr); - return { - ...acc, - ...header, - }; - } catch (e) { - return acc; - } - }, {}); - } + pageHtmlByUrl(opts = {}) { return this._makeRequest({ - method: "POST", - path: "/scraping-jobs", - data, + path: "/html", + ...opts, }); }, - async getVisibleTextContent() { - const params = { - url: this.targetUrl, - }; - if (this.textFormat) params.text_format = this.textFormat; - if (this.returnLinks !== undefined) params.return_links = this.returnLinks; + pageTextByUrl(opts = {}) { return this._makeRequest({ - method: "GET", - path: "/text-content", - params, + path: "/text", + ...opts, }); }, - async getAnswerToQuestion() { - const data = { - url: this.targetUrl, - question: this.question, - }; + getAnswerToQuestion(opts = {}) { return this._makeRequest({ - method: "POST", - path: "/answer", - data, + path: "/ai/question", + ...opts, }); }, }, From 85a1e778f65fd57d417c784c8c5c6d46541250d4 Mon Sep 17 00:00:00 2001 From: michelle0927 Date: Fri, 7 Feb 2025 15:55:35 -0500 Subject: [PATCH 3/4] pnpm-lock.yaml --- pnpm-lock.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c923106a30908..3e9fd6d7e8c98 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1891,11 +1891,9 @@ importers: components/clarify: {} - components/claris_filemaker_server_admin_api: - specifiers: {} + components/claris_filemaker_server_admin_api: {} - components/claris_filemaker_server_data_api: - specifiers: {} + components/claris_filemaker_server_data_api: {} components/claris_filemaker_server_odata_api: {} @@ -6810,8 +6808,7 @@ importers: specifier: ^1.5.1 version: 1.6.6 - components/morgen: - specifiers: {} + components/morgen: {} components/morningmate: dependencies: @@ -11862,7 +11859,11 @@ importers: specifier: ^1.4.1 version: 1.6.6 - components/webscraping_ai: {} + components/webscraping_ai: + dependencies: + '@pipedream/platform': + specifier: ^3.0.3 + version: 3.0.3 components/webvizio: dependencies: From c0b1155de398c78c713033dee4f3c8f9358520e4 Mon Sep 17 00:00:00 2001 From: michelle0927 Date: Tue, 11 Feb 2025 12:50:11 -0500 Subject: [PATCH 4/4] add additional optional props --- .../actions/ask-question/ask-question.mjs | 92 ++++++++++++++++ .../scrape-website-html.mjs | 100 ++++++++++++++++- .../scrape-website-text.mjs | 85 ++++++++++++++ components/webscraping_ai/common/utils.mjs | 12 ++ .../webscraping_ai/webscraping_ai.app.mjs | 104 ++++++++++++++++++ 5 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 components/webscraping_ai/common/utils.mjs diff --git a/components/webscraping_ai/actions/ask-question/ask-question.mjs b/components/webscraping_ai/actions/ask-question/ask-question.mjs index b73ebed808565..8edd1ca4bac69 100644 --- a/components/webscraping_ai/actions/ask-question/ask-question.mjs +++ b/components/webscraping_ai/actions/ask-question/ask-question.mjs @@ -1,4 +1,5 @@ import webscrapingAI from "../../webscraping_ai.app.mjs"; +import utils from "../../common/utils.mjs"; export default { key: "webscraping_ai-ask-question", @@ -19,6 +20,84 @@ export default { label: "Question", description: "The question to ask about the given webpage. E.g. `What is the summary of this page content?`", }, + headers: { + propDefinition: [ + webscrapingAI, + "headers", + ], + }, + timeout: { + propDefinition: [ + webscrapingAI, + "timeout", + ], + }, + js: { + propDefinition: [ + webscrapingAI, + "js", + ], + }, + jsTimeout: { + propDefinition: [ + webscrapingAI, + "jsTimeout", + ], + }, + waitFor: { + propDefinition: [ + webscrapingAI, + "waitFor", + ], + }, + proxy: { + propDefinition: [ + webscrapingAI, + "proxy", + ], + }, + country: { + propDefinition: [ + webscrapingAI, + "country", + ], + }, + customProxy: { + propDefinition: [ + webscrapingAI, + "customProxy", + ], + }, + device: { + propDefinition: [ + webscrapingAI, + "device", + ], + }, + errorOn404: { + propDefinition: [ + webscrapingAI, + "errorOn404", + ], + }, + errorOnRedirect: { + propDefinition: [ + webscrapingAI, + "errorOnRedirect", + ], + }, + jsScript: { + propDefinition: [ + webscrapingAI, + "jsScript", + ], + }, + format: { + propDefinition: [ + webscrapingAI, + "format", + ], + }, }, async run({ $ }) { const response = await this.webscrapingAI.getAnswerToQuestion({ @@ -26,6 +105,19 @@ export default { params: { url: this.targetUrl, question: this.question, + headers: utils.stringifyHeaders(this.headers), + timeout: this.timeout, + js: this.js, + js_timeout: this.jsTimeout, + wait_for: this.waitFor, + proxy: this.proxy, + country: this.country, + custom_proxy: this.customProxy, + device: this.device, + error_on_404: this.errorOn404, + error_on_redirect: this.errorOnRedirect, + js_script: this.jsScript, + format: this.format, }, }); $.export("$summary", "Successfully retrieved answer to question"); diff --git a/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs index 84fa895d2f6a1..491062855566e 100644 --- a/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs +++ b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs @@ -1,4 +1,5 @@ import webscrapingAI from "../../webscraping_ai.app.mjs"; +import utils from "../../common/utils.mjs"; export default { key: "webscraping_ai-scrape-website-html", @@ -14,13 +15,110 @@ export default { "targetUrl", ], }, + headers: { + propDefinition: [ + webscrapingAI, + "headers", + ], + }, + timeout: { + propDefinition: [ + webscrapingAI, + "timeout", + ], + }, + js: { + propDefinition: [ + webscrapingAI, + "js", + ], + }, + jsTimeout: { + propDefinition: [ + webscrapingAI, + "jsTimeout", + ], + }, + waitFor: { + propDefinition: [ + webscrapingAI, + "waitFor", + ], + }, + proxy: { + propDefinition: [ + webscrapingAI, + "proxy", + ], + }, + country: { + propDefinition: [ + webscrapingAI, + "country", + ], + }, + customProxy: { + propDefinition: [ + webscrapingAI, + "customProxy", + ], + }, + device: { + propDefinition: [ + webscrapingAI, + "device", + ], + }, + errorOn404: { + propDefinition: [ + webscrapingAI, + "errorOn404", + ], + }, + errorOnRedirect: { + propDefinition: [ + webscrapingAI, + "errorOnRedirect", + ], + }, + jsScript: { + propDefinition: [ + webscrapingAI, + "jsScript", + ], + }, + format: { + propDefinition: [ + webscrapingAI, + "format", + ], + }, + returnScriptResult: { + type: "boolean", + label: "Return Script Result", + description: "Return result of the custom JavaScript code (`js_script` parameter) execution on the target page (`false` by default, page HTML will be returned).", + optional: true, + }, }, async run({ $ }) { const response = await this.webscrapingAI.pageHtmlByUrl({ $, params: { url: this.targetUrl, - format: "json", + headers: utils.stringifyHeaders(this.headers), + timeout: this.timeout, + js: this.js, + js_timeout: this.jsTimeout, + wait_for: this.waitFor, + proxy: this.proxy, + country: this.country, + custom_proxy: this.customProxy, + device: this.device, + error_on_404: this.errorOn404, + error_on_redirect: this.errorOnRedirect, + js_script: this.jsScript, + format: this.format, + return_script_result: this.returnScriptResult, }, }); $.export("$summary", `Successfully scraped HTML of URL ${this.targetUrl}`); diff --git a/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs index a4d66ed67cea0..c5fd18a86a61f 100644 --- a/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs +++ b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs @@ -1,4 +1,5 @@ import webscrapingAI from "../../webscraping_ai.app.mjs"; +import utils from "../../common/utils.mjs"; export default { key: "webscraping_ai-scrape-website-text", @@ -14,6 +15,78 @@ export default { "targetUrl", ], }, + headers: { + propDefinition: [ + webscrapingAI, + "headers", + ], + }, + timeout: { + propDefinition: [ + webscrapingAI, + "timeout", + ], + }, + js: { + propDefinition: [ + webscrapingAI, + "js", + ], + }, + jsTimeout: { + propDefinition: [ + webscrapingAI, + "jsTimeout", + ], + }, + waitFor: { + propDefinition: [ + webscrapingAI, + "waitFor", + ], + }, + proxy: { + propDefinition: [ + webscrapingAI, + "proxy", + ], + }, + country: { + propDefinition: [ + webscrapingAI, + "country", + ], + }, + customProxy: { + propDefinition: [ + webscrapingAI, + "customProxy", + ], + }, + device: { + propDefinition: [ + webscrapingAI, + "device", + ], + }, + errorOn404: { + propDefinition: [ + webscrapingAI, + "errorOn404", + ], + }, + errorOnRedirect: { + propDefinition: [ + webscrapingAI, + "errorOnRedirect", + ], + }, + jsScript: { + propDefinition: [ + webscrapingAI, + "jsScript", + ], + }, textFormat: { type: "string", label: "Text Format", @@ -38,6 +111,18 @@ export default { $, params: { url: this.targetUrl, + headers: utils.stringifyHeaders(this.headers), + timeout: this.timeout, + js: this.js, + js_timeout: this.jsTimeout, + wait_for: this.waitFor, + proxy: this.proxy, + country: this.country, + custom_proxy: this.customProxy, + device: this.device, + error_on_404: this.errorOn404, + error_on_redirect: this.errorOnRedirect, + js_script: this.jsScript, text_format: this.textFormat, return_links: this.returnLinks, }, diff --git a/components/webscraping_ai/common/utils.mjs b/components/webscraping_ai/common/utils.mjs new file mode 100644 index 0000000000000..0093b6517ff63 --- /dev/null +++ b/components/webscraping_ai/common/utils.mjs @@ -0,0 +1,12 @@ +function stringifyHeaders(headers) { + if (!headers) { + return undefined; + } + return typeof headers === "string" + ? headers + : JSON.stringify(headers); +} + +export default { + stringifyHeaders, +}; diff --git a/components/webscraping_ai/webscraping_ai.app.mjs b/components/webscraping_ai/webscraping_ai.app.mjs index ab619e052af69..1bfb8b639bdad 100644 --- a/components/webscraping_ai/webscraping_ai.app.mjs +++ b/components/webscraping_ai/webscraping_ai.app.mjs @@ -9,6 +9,110 @@ export default { label: "Target URL", description: "The URL of the webpage to scrape.", }, + headers: { + type: "object", + label: "Headers", + description: "HTTP headers to pass to the target page", + optional: true, + }, + timeout: { + type: "integer", + label: "Timeout", + description: "Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).", + optional: true, + }, + js: { + type: "boolean", + label: "JS", + description: "Execute on-page JavaScript using a headless browser (`true` by default)", + optional: true, + }, + jsTimeout: { + type: "integer", + label: "JS Timeout", + description: "Maximum JavaScript rendering time in ms. Default: `2000`", + optional: true, + }, + waitFor: { + type: "string", + label: "Wait For", + description: "CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.", + optional: true, + }, + proxy: { + type: "string", + label: "Proxy", + description: "Type of proxy, use residential proxies if your site restricts traffic from datacenters (`datacenter` by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.", + options: [ + "datacenter", + "residential", + ], + optional: true, + }, + country: { + type: "string", + label: "Country", + description: "Country of the proxy to use (`us` by default)", + options: [ + "us", + "gb", + "de", + "it", + "fr", + "ca", + "es", + "ru", + "jp", + "kr", + "in", + ], + optional: true, + }, + customProxy: { + type: "string", + label: "Custom Proxy", + description: "Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format ([Smartproxy](https://webscraping.ai/proxies/smartproxy) for example).", + optional: true, + }, + device: { + type: "string", + label: "Device", + description: "Type of device emulation. Default is `desktop`", + options: [ + "desktop", + "mobile", + "tablet", + ], + optional: true, + }, + errorOn404: { + type: "boolean", + label: "Error on 404", + description: "Return error on 404 HTTP status on the target page (`false` by default)", + optional: true, + }, + errorOnRedirect: { + type: "boolean", + label: "Error on Redirect", + description: "Return error on redirect on the target page (`false` by default)", + optional: true, + }, + jsScript: { + type: "string", + label: "JS Script", + description: "Custom JavaScript code to execute on the target page. Example: `document.querySelector('button').click();`", + optional: true, + }, + format: { + type: "string", + label: "Format", + description: "Format of the response (`text` by default). `json` will return a JSON object with the response, `text` will return a plain text/HTML response.", + options: [ + "json", + "text", + ], + optional: true, + }, }, methods: { _baseUrl() {