diff --git a/components/webscraping_ai/.gitignore b/components/webscraping_ai/.gitignore deleted file mode 100644 index ec761ccab7595..0000000000000 --- a/components/webscraping_ai/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.js -*.mjs -dist \ No newline at end of file diff --git a/components/webscraping_ai/actions/ask-question/ask-question.mjs b/components/webscraping_ai/actions/ask-question/ask-question.mjs new file mode 100644 index 0000000000000..8edd1ca4bac69 --- /dev/null +++ b/components/webscraping_ai/actions/ask-question/ask-question.mjs @@ -0,0 +1,126 @@ +import webscrapingAI from "../../webscraping_ai.app.mjs"; +import utils from "../../common/utils.mjs"; + +export default { + key: "webscraping_ai-ask-question", + name: "Ask Question about Webpage", + description: "Gets an answer to a question about a given webpage. [See the documentation](https://webscraping.ai/docs#tag/AI/operation/getQuestion)", + version: "0.0.1", + type: "action", + props: { + webscrapingAI, + targetUrl: { + propDefinition: [ + webscrapingAI, + "targetUrl", + ], + }, + question: { + type: "string", + label: "Question", + description: "The question to ask about the given webpage. E.g. 
`What is the summary of this page content?`", + }, + headers: { + propDefinition: [ + webscrapingAI, + "headers", + ], + }, + timeout: { + propDefinition: [ + webscrapingAI, + "timeout", + ], + }, + js: { + propDefinition: [ + webscrapingAI, + "js", + ], + }, + jsTimeout: { + propDefinition: [ + webscrapingAI, + "jsTimeout", + ], + }, + waitFor: { + propDefinition: [ + webscrapingAI, + "waitFor", + ], + }, + proxy: { + propDefinition: [ + webscrapingAI, + "proxy", + ], + }, + country: { + propDefinition: [ + webscrapingAI, + "country", + ], + }, + customProxy: { + propDefinition: [ + webscrapingAI, + "customProxy", + ], + }, + device: { + propDefinition: [ + webscrapingAI, + "device", + ], + }, + errorOn404: { + propDefinition: [ + webscrapingAI, + "errorOn404", + ], + }, + errorOnRedirect: { + propDefinition: [ + webscrapingAI, + "errorOnRedirect", + ], + }, + jsScript: { + propDefinition: [ + webscrapingAI, + "jsScript", + ], + }, + format: { + propDefinition: [ + webscrapingAI, + "format", + ], + }, + }, + async run({ $ }) { + const response = await this.webscrapingAI.getAnswerToQuestion({ + $, + params: { + url: this.targetUrl, + question: this.question, + headers: utils.stringifyHeaders(this.headers), + timeout: this.timeout, + js: this.js, + js_timeout: this.jsTimeout, + wait_for: this.waitFor, + proxy: this.proxy, + country: this.country, + custom_proxy: this.customProxy, + device: this.device, + error_on_404: this.errorOn404, + error_on_redirect: this.errorOnRedirect, + js_script: this.jsScript, + format: this.format, + }, + }); + $.export("$summary", "Successfully retrieved answer to question"); + return response; + }, +}; diff --git a/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs new file mode 100644 index 0000000000000..491062855566e --- /dev/null +++ b/components/webscraping_ai/actions/scrape-website-html/scrape-website-html.mjs @@ -0,0 +1,127 
@@ +import webscrapingAI from "../../webscraping_ai.app.mjs"; +import utils from "../../common/utils.mjs"; + +export default { + key: "webscraping_ai-scrape-website-html", + name: "Scrape Website HTML", + description: "Returns the full HTML content of a webpage specified by the URL. [See the documentation](https://webscraping.ai/docs#tag/HTML/operation/getHTML)", + version: "0.0.1", + type: "action", + props: { + webscrapingAI, + targetUrl: { + propDefinition: [ + webscrapingAI, + "targetUrl", + ], + }, + headers: { + propDefinition: [ + webscrapingAI, + "headers", + ], + }, + timeout: { + propDefinition: [ + webscrapingAI, + "timeout", + ], + }, + js: { + propDefinition: [ + webscrapingAI, + "js", + ], + }, + jsTimeout: { + propDefinition: [ + webscrapingAI, + "jsTimeout", + ], + }, + waitFor: { + propDefinition: [ + webscrapingAI, + "waitFor", + ], + }, + proxy: { + propDefinition: [ + webscrapingAI, + "proxy", + ], + }, + country: { + propDefinition: [ + webscrapingAI, + "country", + ], + }, + customProxy: { + propDefinition: [ + webscrapingAI, + "customProxy", + ], + }, + device: { + propDefinition: [ + webscrapingAI, + "device", + ], + }, + errorOn404: { + propDefinition: [ + webscrapingAI, + "errorOn404", + ], + }, + errorOnRedirect: { + propDefinition: [ + webscrapingAI, + "errorOnRedirect", + ], + }, + jsScript: { + propDefinition: [ + webscrapingAI, + "jsScript", + ], + }, + format: { + propDefinition: [ + webscrapingAI, + "format", + ], + }, + returnScriptResult: { + type: "boolean", + label: "Return Script Result", + description: "Return result of the custom JavaScript code (`js_script` parameter) execution on the target page (`false` by default, page HTML will be returned).", + optional: true, + }, + }, + async run({ $ }) { + const response = await this.webscrapingAI.pageHtmlByUrl({ + $, + params: { + url: this.targetUrl, + headers: utils.stringifyHeaders(this.headers), + timeout: this.timeout, + js: this.js, + js_timeout: this.jsTimeout, + wait_for: 
this.waitFor, + proxy: this.proxy, + country: this.country, + custom_proxy: this.customProxy, + device: this.device, + error_on_404: this.errorOn404, + error_on_redirect: this.errorOnRedirect, + js_script: this.jsScript, + format: this.format, + return_script_result: this.returnScriptResult, + }, + }); + $.export("$summary", `Successfully scraped HTML of URL ${this.targetUrl}`); + return response; + }, +}; diff --git a/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs new file mode 100644 index 0000000000000..c5fd18a86a61f --- /dev/null +++ b/components/webscraping_ai/actions/scrape-website-text/scrape-website-text.mjs @@ -0,0 +1,133 @@ +import webscrapingAI from "../../webscraping_ai.app.mjs"; +import utils from "../../common/utils.mjs"; + +export default { + key: "webscraping_ai-scrape-website-text", + name: "Scrape Website Text", + description: "Returns the visible text content of a webpage specified by the URL. 
[See the documentation](https://webscraping.ai/docs#tag/Text/operation/getText).", + version: "0.0.1", + type: "action", + props: { + webscrapingAI, + targetUrl: { + propDefinition: [ + webscrapingAI, + "targetUrl", + ], + }, + headers: { + propDefinition: [ + webscrapingAI, + "headers", + ], + }, + timeout: { + propDefinition: [ + webscrapingAI, + "timeout", + ], + }, + js: { + propDefinition: [ + webscrapingAI, + "js", + ], + }, + jsTimeout: { + propDefinition: [ + webscrapingAI, + "jsTimeout", + ], + }, + waitFor: { + propDefinition: [ + webscrapingAI, + "waitFor", + ], + }, + proxy: { + propDefinition: [ + webscrapingAI, + "proxy", + ], + }, + country: { + propDefinition: [ + webscrapingAI, + "country", + ], + }, + customProxy: { + propDefinition: [ + webscrapingAI, + "customProxy", + ], + }, + device: { + propDefinition: [ + webscrapingAI, + "device", + ], + }, + errorOn404: { + propDefinition: [ + webscrapingAI, + "errorOn404", + ], + }, + errorOnRedirect: { + propDefinition: [ + webscrapingAI, + "errorOnRedirect", + ], + }, + jsScript: { + propDefinition: [ + webscrapingAI, + "jsScript", + ], + }, + textFormat: { + type: "string", + label: "Text Format", + description: "The format of the returned text content. Default: `json`", + options: [ + "plain", + "xml", + "json", + ], + default: "json", + optional: true, + }, + returnLinks: { + type: "boolean", + label: "Return Links", + description: "Whether to include links in the returned text content. 
Works only when Text Format is `json`.", + optional: true, + }, + }, + async run({ $ }) { + const response = await this.webscrapingAI.pageTextByUrl({ + $, + params: { + url: this.targetUrl, + headers: utils.stringifyHeaders(this.headers), + timeout: this.timeout, + js: this.js, + js_timeout: this.jsTimeout, + wait_for: this.waitFor, + proxy: this.proxy, + country: this.country, + custom_proxy: this.customProxy, + device: this.device, + error_on_404: this.errorOn404, + error_on_redirect: this.errorOnRedirect, + js_script: this.jsScript, + text_format: this.textFormat, + return_links: this.returnLinks, + }, + }); + $.export("$summary", `Successfully scraped text from ${this.targetUrl}`); + return response; + }, +}; diff --git a/components/webscraping_ai/app/webscraping_ai.app.ts b/components/webscraping_ai/app/webscraping_ai.app.ts deleted file mode 100644 index ef05d2cd90d7f..0000000000000 --- a/components/webscraping_ai/app/webscraping_ai.app.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { defineApp } from "@pipedream/types"; - -export default defineApp({ - type: "app", - app: "webscraping_ai", - propDefinitions: {}, - methods: { - // this.$auth contains connected account data - authKeys() { - console.log(Object.keys(this.$auth)); - }, - }, -}); diff --git a/components/webscraping_ai/common/utils.mjs b/components/webscraping_ai/common/utils.mjs new file mode 100644 index 0000000000000..0093b6517ff63 --- /dev/null +++ b/components/webscraping_ai/common/utils.mjs @@ -0,0 +1,12 @@ +function stringifyHeaders(headers) { + if (!headers) { + return undefined; + } + return typeof headers === "string" + ? 
headers + : JSON.stringify(headers); +} + +export default { + stringifyHeaders, +}; diff --git a/components/webscraping_ai/package.json b/components/webscraping_ai/package.json index 6cc412b15c6c8..26e6a2e4a629c 100644 --- a/components/webscraping_ai/package.json +++ b/components/webscraping_ai/package.json @@ -1,16 +1,18 @@ { "name": "@pipedream/webscraping_ai", - "version": "0.0.3", + "version": "0.1.0", "description": "Pipedream WebScraping.AI Components", - "main": "dist/app/webscraping_ai.app.mjs", + "main": "webscraping_ai.app.mjs", "keywords": [ "pipedream", "webscraping_ai" ], - "files": ["dist"], "homepage": "https://pipedream.com/apps/webscraping_ai", "author": "Pipedream (https://pipedream.com/)", "publishConfig": { "access": "public" + }, + "dependencies": { + "@pipedream/platform": "^3.0.3" } } diff --git a/components/webscraping_ai/webscraping_ai.app.mjs b/components/webscraping_ai/webscraping_ai.app.mjs new file mode 100644 index 0000000000000..1bfb8b639bdad --- /dev/null +++ b/components/webscraping_ai/webscraping_ai.app.mjs @@ -0,0 +1,155 @@ +import { axios } from "@pipedream/platform"; + +export default { + type: "app", + app: "webscraping_ai", + propDefinitions: { + targetUrl: { + type: "string", + label: "Target URL", + description: "The URL of the webpage to scrape.", + }, + headers: { + type: "object", + label: "Headers", + description: "HTTP headers to pass to the target page", + optional: true, + }, + timeout: { + type: "integer", + label: "Timeout", + description: "Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).", + optional: true, + }, + js: { + type: "boolean", + label: "JS", + description: "Execute on-page JavaScript using a headless browser (`true` by default)", + optional: true, + }, + jsTimeout: { + type: "integer", + label: "JS Timeout", + description: "Maximum JavaScript rendering time in ms. 
Default: `2000`", + optional: true, + }, + waitFor: { + type: "string", + label: "Wait For", + description: "CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.", + optional: true, + }, + proxy: { + type: "string", + label: "Proxy", + description: "Type of proxy, use residential proxies if your site restricts traffic from datacenters (`datacenter` by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.", + options: [ + "datacenter", + "residential", + ], + optional: true, + }, + country: { + type: "string", + label: "Country", + description: "Country of the proxy to use (`us` by default)", + options: [ + "us", + "gb", + "de", + "it", + "fr", + "ca", + "es", + "ru", + "jp", + "kr", + "in", + ], + optional: true, + }, + customProxy: { + type: "string", + label: "Custom Proxy", + description: "Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format ([Smartproxy](https://webscraping.ai/proxies/smartproxy) for example).", + optional: true, + }, + device: { + type: "string", + label: "Device", + description: "Type of device emulation. Default is `desktop`", + options: [ + "desktop", + "mobile", + "tablet", + ], + optional: true, + }, + errorOn404: { + type: "boolean", + label: "Error on 404", + description: "Return error on 404 HTTP status on the target page (`false` by default)", + optional: true, + }, + errorOnRedirect: { + type: "boolean", + label: "Error on Redirect", + description: "Return error on redirect on the target page (`false` by default)", + optional: true, + }, + jsScript: { + type: "string", + label: "JS Script", + description: "Custom JavaScript code to execute on the target page. Example: `document.querySelector('button').click();`", + optional: true, + }, + format: { + type: "string", + label: "Format", + description: "Format of the response (`text` by default). 
`json` will return a JSON object with the response, `text` will return a plain text/HTML response.", + options: [ + "json", + "text", + ], + optional: true, + }, + }, + methods: { + _baseUrl() { + return "https://api.webscraping.ai"; + }, + _makeRequest({ + $ = this, + path, + params, + ...otherOpts + }) { + return axios($, { + url: `${this._baseUrl()}${path}`, + params: { + ...params, + api_key: this.$auth.api_key, + }, + ...otherOpts, + }); + }, + pageHtmlByUrl(opts = {}) { + return this._makeRequest({ + path: "/html", + ...opts, + }); + }, + pageTextByUrl(opts = {}) { + return this._makeRequest({ + path: "/text", + ...opts, + }); + }, + getAnswerToQuestion(opts = {}) { + return this._makeRequest({ + path: "/ai/question", + ...opts, + }); + }, + }, +}; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c923106a30908..3e9fd6d7e8c98 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1891,11 +1891,9 @@ importers: components/clarify: {} - components/claris_filemaker_server_admin_api: - specifiers: {} + components/claris_filemaker_server_admin_api: {} - components/claris_filemaker_server_data_api: - specifiers: {} + components/claris_filemaker_server_data_api: {} components/claris_filemaker_server_odata_api: {} @@ -6810,8 +6808,7 @@ importers: specifier: ^1.5.1 version: 1.6.6 - components/morgen: - specifiers: {} + components/morgen: {} components/morningmate: dependencies: @@ -11862,7 +11859,11 @@ importers: specifier: ^1.4.1 version: 1.6.6 - components/webscraping_ai: {} + components/webscraping_ai: + dependencies: + '@pipedream/platform': + specifier: ^3.0.3 + version: 3.0.3 components/webvizio: dependencies: