From baa56aba4009185ea1ceb898c733d5f22160404b Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Wed, 26 Feb 2025 15:45:58 -0300 Subject: [PATCH 1/7] scrapeninja init --- .../non-js-scraping/non-js-scraping.mjs | 90 +++++++ .../scraping-with-js-rendering.mjs | 215 ++++++++++++++++ components/scrapeninja/app/scrapeninja.app.ts | 2 +- components/scrapeninja/scrapeninja.app.mjs | 242 ++++++++++++++++++ 4 files changed, 548 insertions(+), 1 deletion(-) create mode 100644 components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs create mode 100644 components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs create mode 100644 components/scrapeninja/scrapeninja.app.mjs diff --git a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs b/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs new file mode 100644 index 0000000000000..c7773a99ccf45 --- /dev/null +++ b/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs @@ -0,0 +1,90 @@ +import scrapeninja from "../../scrapeninja.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "scrapeninja-non-js-scraping", + name: "ScrapeNinja Non-JS Scraping", + description: "Use ScrapeNinja's high-performance non-JS scraping endpoint. [See the documentation]()", + version: "0.0.{{ts}}", + type: "action", + props: { + scrapeninja: { + type: "app", + app: "scrapeninja", + }, + url: { + propDefinition: [ + scrapeninja, + "url", + ], + }, + headers: { + propDefinition: [ + scrapeninja, + "headers", + ], + optional: true, + }, + retrynum: { + propDefinition: [ + scrapeninja, + "retrynum", + ], + optional: true, + }, + geo: { + propDefinition: [ + scrapeninja, + "geo", + ], + optional: true, + }, + proxy: { + propDefinition: [ + scrapeninja, + "proxy", + ], + optional: true, + }, + followredirects: { + propDefinition: [ + scrapeninja, + "followredirects", + ], + optional: true, + }, + timeout: { + propDefinition: [ + scrapeninja, + "timeout", + ], + optional: true, + }, + textnotexpected: { + propDefinition: [ + scrapeninja, + "textnotexpected", + ], + optional: true, + }, + statusnotexpected: { + propDefinition: [ + scrapeninja, + "statusnotexpected", + ], + optional: true, + }, + extractor: { + propDefinition: [ + scrapeninja, + "extractor", + ], + optional: true, + }, + }, + async run({ $ }) { + const response = await this.scrapeninja.scrapeNonJs(); + $.export("$summary", "Successfully scraped the URL"); + return response; + }, +}; diff --git a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs new file mode 100644 index 0000000000000..fc4ca630671eb --- /dev/null +++ b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs @@ -0,0 +1,215 @@ +import scrapeninja from "../../scrapeninja.app.mjs"; +import { axios } from "@pipedream/platform"; + +export default { + key: "scrapeninja-scraping-with-js-rendering", + name: "ScrapeNinja Scraping with JS Rendering", + description: "Uses the ScrapeNinja real Chrome browser engine to scrape pages that require JS rendering. 
[See the documentation](https://scrapeninja.net/docs/api-reference/scrape-js/)", + version: "0.0.{{ts}}", + type: "action", + props: { + scrapeninja, + url: { + propDefinition: [ + scrapeninja, + "url", + ], + }, + waitforselector: { + propDefinition: [ + scrapeninja, + "waitforselector", + ], + optional: true, + }, + postwaittime: { + propDefinition: [ + scrapeninja, + "postwaittime", + ], + optional: true, + }, + dumpiframe: { + propDefinition: [ + scrapeninja, + "dumpiframe", + ], + optional: true, + }, + waitforselectoriframe: { + propDefinition: [ + scrapeninja, + "waitforselectoriframe", + ], + optional: true, + }, + extractortargetiframe: { + propDefinition: [ + scrapeninja, + "extractortargetiframe", + ], + optional: true, + }, + headers: { + propDefinition: [ + scrapeninja, + "headers", + ], + optional: true, + }, + retrynum: { + propDefinition: [ + scrapeninja, + "retrynum", + ], + optional: true, + }, + geo: { + propDefinition: [ + scrapeninja, + "geo", + ], + optional: true, + }, + proxy: { + propDefinition: [ + scrapeninja, + "proxy", + ], + optional: true, + }, + timeout: { + propDefinition: [ + scrapeninja, + "timeout", + ], + optional: true, + }, + textnotexpected: { + propDefinition: [ + scrapeninja, + "textnotexpected", + ], + optional: true, + }, + statusnotexpected: { + propDefinition: [ + scrapeninja, + "statusnotexpected", + ], + optional: true, + }, + blockimages: { + propDefinition: [ + scrapeninja, + "blockimages", + ], + optional: true, + }, + blockmedia: { + propDefinition: [ + scrapeninja, + "blockmedia", + ], + optional: true, + }, + screenshot: { + propDefinition: [ + scrapeninja, + "screenshot", + ], + optional: true, + }, + catchajaxheadersurlmask: { + propDefinition: [ + scrapeninja, + "catchajaxheadersurlmask", + ], + optional: true, + }, + viewportWidth: { + propDefinition: [ + scrapeninja, + "viewportwitdh", + ], + optional: true, + }, + viewportHeight: { + propDefinition: [ + scrapeninja, + "viewportheight", + ], + optional: true, + }, + viewportDeviceScaleFactor: { + propDefinition: [ + scrapeninja, + "viewportdevicescalefactor", + ], + optional: true, + }, + viewportHasTouch: { + propDefinition: [ + scrapeninja, + "viewporthastouch", + ], + optional: true, + }, + viewportIsMobile: { + propDefinition: [ + scrapeninja, + "viewportismobile", + ], + optional: true, + }, + viewportIsLandscape: { + propDefinition: [ + scrapeninja, + "viewportislandscape", + ], + optional: true, + }, + extractor: { + propDefinition: [ + scrapeninja, + "extractor", + ], + optional: true, + }, + }, + async run({ $ }) { + const viewport = { + width: this.viewportWidth, + height: this.viewportHeight, + deviceScaleFactor: this.viewportDeviceScaleFactor, + hasTouch: this.viewportHasTouch, + isMobile: this.viewportIsMobile, + isLandscape: this.viewportIsLandscape, + }; + + const response = await this.scrapeninja.scrapeJs({ + url: this.url, + waitForSelector: this.waitforselector, + postWaitTime: this.postwaittime, + dumpIframe: this.dumpiframe, + waitForSelectorIframe: this.waitforselectoriframe, + extractorTargetIframe: this.extractortargetiframe, + headers: this.headers, + retryNum: this.retrynum, + geo: this.geo, + proxy: this.proxy, + timeout: this.timeout, + textNotExpected: this.textnotexpected, + statusNotExpected: this.statusnotexpected, + blockImages: this.blockimages, + blockMedia: this.blockmedia, + screenshot: this.screenshot, + catchAjaxHeadersUrlMask: this.catchajaxheadersurlmask, + viewport, + extractor: this.extractor, + }); + + $.export("$summary", `Successfully 
scraped ${this.url} with JS rendering`); + return response; + }, +}; diff --git a/components/scrapeninja/app/scrapeninja.app.ts b/components/scrapeninja/app/scrapeninja.app.ts index 5f790be0ca094..37f81a49ecdf5 100644 --- a/components/scrapeninja/app/scrapeninja.app.ts +++ b/components/scrapeninja/app/scrapeninja.app.ts @@ -10,4 +10,4 @@ export default defineApp({ console.log(Object.keys(this.$auth)); }, }, -}); \ No newline at end of file +}); diff --git a/components/scrapeninja/scrapeninja.app.mjs b/components/scrapeninja/scrapeninja.app.mjs new file mode 100644 index 0000000000000..149c839320222 --- /dev/null +++ b/components/scrapeninja/scrapeninja.app.mjs @@ -0,0 +1,242 @@ +import { axios } from "@pipedream/platform"; + +export default { + type: "app", + app: "scrapeninja", + propDefinitions: { + url: { + type: "string", + label: "URL", + description: "The URL to scrape.", + }, + headers: { + type: "string[]", + label: "Headers", + description: "Custom headers to send with the request.", + optional: true, + }, + retrynum: { + type: "integer", + label: "Retry Number", + description: "Number of retry attempts.", + optional: true, + }, + geo: { + type: "string", + label: "Geo", + description: "Geo location for proxy pools (default: us).", + optional: true, + default: "us", + }, + proxy: { + type: "string", + label: "Proxy", + description: "Premium or custom proxy URL.", + optional: true, + }, + followredirects: { + type: "integer", + label: "Follow Redirects", + description: "Whether to follow redirects (default: 1).", + optional: true, + default: 1, + }, + timeout: { + type: "integer", + label: "Timeout", + description: "Timeout per attempt in seconds.", + optional: true, + }, + textnotexpected: { + type: "string[]", + label: "Text Not Expected", + description: "Text that triggers a retry from another proxy.", + optional: true, + }, + statusnotexpected: { + type: "integer[]", + label: "Status Not Expected", + description: "HTTP statuses that trigger a retry from another proxy (default: [403, 502]).", + optional: true, + default: [ + 403, + 502, + ], + }, + extractor: { + type: "string", + label: "Extractor", + description: "Custom JS function to extract JSON values from scraped HTML.", + optional: true, + }, + waitforselector: { + type: "string", + label: "Wait For Selector", + description: "CSS selector to wait for before considering the page loaded.", + optional: true, + }, + postwaittime: { + type: "integer", + label: "Post Wait Time", + description: "Time to wait after page load in seconds.", + optional: true, + }, + dumpiframe: { + type: "string", + label: "Dump Iframe", + description: "Name of the iframe to dump.", + optional: true, + }, + waitforselectoriframe: { + type: "string", + label: "Wait For Selector Iframe", + description: "CSS selector to wait for inside the iframe.", + optional: true, + }, + extractortargetiframe: { + type: "boolean", + label: "Extractor Target Iframe", + description: "Run extractor function against iframe HTML.", + optional: true, + }, + blockimages: { + type: "boolean", + label: "Block Images", + description: "Block images from loading to speed up the request.", + optional: true, + }, + blockmedia: { + type: "boolean", + label: "Block Media", + description: "Block media resources like CSS and fonts from loading.", + optional: true, + }, + screenshot: { + type: "boolean", + label: "Screenshot", + description: "Take a screenshot of the page.", + optional: true, + }, + catchajaxheadersurlmask: { + type: "string", + label: "Catch Ajax Headers URL Mask", 
+ description: "URL mask to catch specific AJAX responses.", + optional: true, + }, + viewportwitdh: { + type: "integer", + label: "Viewport Width", + description: "Width of the viewport.", + optional: true, + }, + viewportheight: { + type: "integer", + label: "Viewport Height", + description: "Height of the viewport.", + optional: true, + }, + viewportdevicescalefactor: { + type: "integer", + label: "Viewport Device Scale Factor", + description: "Device scale factor for the viewport.", + optional: true, + }, + viewporthastouch: { + type: "boolean", + label: "Viewport Has Touch", + description: "Whether the viewport has touch capabilities.", + optional: true, + }, + viewportismobile: { + type: "boolean", + label: "Viewport Is Mobile", + description: "Whether the viewport is mobile.", + optional: true, + }, + viewportislandscape: { + type: "boolean", + label: "Viewport Is Landscape", + description: "Whether the viewport is in landscape mode.", + optional: true, + }, + }, + methods: { + _baseUrl() { + return "https://scrapeninja.p.rapidapi.com"; + }, + async _makeRequest(opts = {}) { + const { + $ = this, + method = "POST", + path, + headers = {}, + ...otherOpts + } = opts; + return axios($, { + method, + url: this._baseUrl() + path, + headers: { + ...headers, + "X-RapidAPI-Key": this.$auth.api_key, + "Content-Type": "application/json", + "Accept": "application/json", + }, + ...otherOpts, + }); + }, + async scrapeNonJs(opts = {}) { + const data = { + url: this.url, + headers: this.headers, + retryNum: this.retrynum, + geo: this.geo, + proxy: this.proxy, + followRedirects: this.followredirects, + timeout: this.timeout, + textNotExpected: this.textnotexpected, + statusNotExpected: this.statusnotexpected, + extractor: this.extractor, + }; + return this._makeRequest({ + path: "/scrape", + data, + }); + }, + async scrapeJs(opts = {}) { + const viewport = { + width: this.viewportwitdh, + height: this.viewportheight, + deviceScaleFactor: this.viewportdevicescalefactor, + hasTouch: this.viewporthastouch, + isMobile: this.viewportismobile, + isLandscape: this.viewportislandscape, + }; + const data = { + url: this.url, + waitForSelector: this.waitforselector, + postWaitTime: this.postwaittime, + dumpIframe: this.dumpiframe, + waitForSelectorIframe: this.waitforselectoriframe, + extractorTargetIframe: this.extractortargetiframe, + headers: this.headers, + retryNum: this.retrynum, + geo: this.geo, + proxy: this.proxy, + timeout: this.timeout, + textNotExpected: this.textnotexpected, + statusNotExpected: this.statusnotexpected, + blockImages: this.blockimages, + blockMedia: this.blockmedia, + screenshot: this.screenshot, + catchAjaxHeadersUrlMask: this.catchajaxheadersurlmask, + viewport, + extractor: this.extractor, + }; + return this._makeRequest({ + path: "/scrape-js", + data, + }); + }, + }, + version: "0.0.{{ts}}", +}; From 3f31cb333d91f6029d6ff05a2841e309acaf5b85 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Thu, 27 Feb 2025 12:04:09 -0300 Subject: [PATCH 2/7] [Components] scrapeninja #15137 Actions - Non JS Scraping - Scraping With JS Rendering --- components/scrapeninja/.gitignore | 3 - .../non-js-scraping/non-js-scraping.mjs | 54 +++--- .../scraping-with-js-rendering.mjs | 140 ++++++++------- components/scrapeninja/app/scrapeninja.app.ts | 13 -- components/scrapeninja/common/utils.mjs | 48 ++++++ components/scrapeninja/package.json | 8 +- components/scrapeninja/scrapeninja.app.mjs | 159 +++++++----------- 7 files changed, 223 insertions(+), 202 deletions(-) delete mode 100644 
components/scrapeninja/.gitignore delete mode 100644 components/scrapeninja/app/scrapeninja.app.ts create mode 100644 components/scrapeninja/common/utils.mjs diff --git a/components/scrapeninja/.gitignore b/components/scrapeninja/.gitignore deleted file mode 100644 index ec761ccab7595..0000000000000 --- a/components/scrapeninja/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.js -*.mjs -dist \ No newline at end of file diff --git a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs b/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs index c7773a99ccf45..917ef7ebbd5ce 100644 --- a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs +++ b/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs @@ -1,17 +1,15 @@ +import { ConfigurationError } from "@pipedream/platform"; +import { parseObject } from "../../common/utils.mjs"; import scrapeninja from "../../scrapeninja.app.mjs"; -import { axios } from "@pipedream/platform"; export default { key: "scrapeninja-non-js-scraping", - name: "ScrapeNinja Non-JS Scraping", - description: "Use ScrapeNinja's high-performance non-JS scraping endpoint. [See the documentation]()", - version: "0.0.{{ts}}", + name: "Non-JS Scraping", + description: "Use high-performance web scraping endpoint with Chrome browser TLS fingerprint, but without JavaScript execution and real browser overhead. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape/)", + version: "0.0.1", type: "action", props: { - scrapeninja: { - type: "app", - app: "scrapeninja", - }, + scrapeninja, url: { propDefinition: [ scrapeninja, @@ -25,10 +23,10 @@ export default { ], optional: true, }, - retrynum: { + retryNum: { propDefinition: [ scrapeninja, - "retrynum", + "retryNum", ], optional: true, }, @@ -46,10 +44,10 @@ export default { ], optional: true, }, - followredirects: { + followRedirects: { propDefinition: [ scrapeninja, - "followredirects", + "followRedirects", ], optional: true, }, @@ -60,17 +58,17 @@ export default { ], optional: true, }, - textnotexpected: { + textNotExpected: { propDefinition: [ scrapeninja, - "textnotexpected", + "textNotExpected", ], optional: true, }, - statusnotexpected: { + statusNotExpected: { propDefinition: [ scrapeninja, - "statusnotexpected", + "statusNotExpected", ], optional: true, }, @@ -83,8 +81,26 @@ export default { }, }, async run({ $ }) { - const response = await this.scrapeninja.scrapeNonJs(); - $.export("$summary", "Successfully scraped the URL"); - return response; + try { + const response = await this.scrapeninja.scrapeNonJs({ + $, + data: { + url: this.url, + headers: parseObject(this.headers), + retryNum: this.retryNum, + geo: this.geo, + proxy: this.proxy, + followRedirects: this.followRedirects, + timeout: this.timeout, + textNotExpected: parseObject(this.textNotExpected), + statusNotExpected: parseObject(this.statusNotExpected), + extractor: this.extractor, + }, + }); + $.export("$summary", "Successfully scraped the URL"); + return response; + } catch ({ response: { data } }) { + throw new ConfigurationError(data.message || data.stderr); + } }, }; diff --git a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs index fc4ca630671eb..c24aaacbaa613 100644 --- a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs +++ b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs @@ 
-1,11 +1,15 @@ +import { ConfigurationError } from "@pipedream/platform"; +import { + clearObj, + parseError, parseObject, +} from "../../common/utils.mjs"; import scrapeninja from "../../scrapeninja.app.mjs"; -import { axios } from "@pipedream/platform"; export default { key: "scrapeninja-scraping-with-js-rendering", - name: "ScrapeNinja Scraping with JS Rendering", + name: "Scraping with JS Rendering", description: "Uses the ScrapeNinja real Chrome browser engine to scrape pages that require JS rendering. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape-js/)", - version: "0.0.{{ts}}", + version: "0.0.1", type: "action", props: { scrapeninja, @@ -15,38 +19,38 @@ export default { "url", ], }, - waitforselector: { + waitForSelector: { propDefinition: [ scrapeninja, - "waitforselector", + "waitForSelector", ], optional: true, }, - postwaittime: { + postWaitTime: { propDefinition: [ scrapeninja, - "postwaittime", + "postWaitTime", ], optional: true, }, - dumpiframe: { + dumpIframe: { propDefinition: [ scrapeninja, - "dumpiframe", + "dumpIframe", ], optional: true, }, - waitforselectoriframe: { + waitForSelectorIframe: { propDefinition: [ scrapeninja, - "waitforselectoriframe", + "waitForSelectorIframe", ], optional: true, }, - extractortargetiframe: { + extractorTargetIframe: { propDefinition: [ scrapeninja, - "extractortargetiframe", + "extractorTargetIframe", ], optional: true, }, @@ -57,10 +61,10 @@ export default { ], optional: true, }, - retrynum: { + retryNum: { propDefinition: [ scrapeninja, - "retrynum", + "retryNum", ], optional: true, }, @@ -85,31 +89,31 @@ export default { ], optional: true, }, - textnotexpected: { + textNotExpected: { propDefinition: [ scrapeninja, - "textnotexpected", + "textNotExpected", ], optional: true, }, - statusnotexpected: { + statusNotExpected: { propDefinition: [ scrapeninja, - "statusnotexpected", + "statusNotExpected", ], optional: true, }, - blockimages: { + blockImages: { propDefinition: [ scrapeninja, - "blockimages", + "blockImages", ], optional: true, }, - blockmedia: { + blockMedia: { propDefinition: [ scrapeninja, - "blockmedia", + "blockMedia", ], optional: true, }, @@ -120,52 +124,52 @@ export default { ], optional: true, }, - catchajaxheadersurlmask: { + catchAjaxHeadersUrlMask: { propDefinition: [ scrapeninja, - "catchajaxheadersurlmask", + "catchAjaxHeadersUrlMask", ], optional: true, }, viewportWidth: { propDefinition: [ scrapeninja, - "viewportwitdh", + "viewportWitdh", ], optional: true, }, viewportHeight: { propDefinition: [ scrapeninja, - "viewportheight", + "viewportHeight", ], optional: true, }, viewportDeviceScaleFactor: { propDefinition: [ scrapeninja, - "viewportdevicescalefactor", + "viewportDeviceScaleFactor", ], optional: true, }, viewportHasTouch: { propDefinition: [ scrapeninja, - "viewporthastouch", + "viewportHasTouch", ], optional: true, }, viewportIsMobile: { propDefinition: [ scrapeninja, - "viewportismobile", + "viewportIsMobile", ], optional: true, }, viewportIsLandscape: { propDefinition: [ scrapeninja, - "viewportislandscape", + "viewportIsLandscape", ], optional: true, }, @@ -178,38 +182,50 @@ export default { }, }, async run({ $ }) { - const viewport = { - width: this.viewportWidth, - height: this.viewportHeight, - deviceScaleFactor: this.viewportDeviceScaleFactor, - hasTouch: this.viewportHasTouch, - isMobile: this.viewportIsMobile, - isLandscape: this.viewportIsLandscape, - }; + try { + const viewport = clearObj({ + width: this.viewportWidth, + height: this.viewportHeight, + 
deviceScaleFactor: this.viewportDeviceScaleFactor, + hasTouch: this.viewportHasTouch, + isMobile: this.viewportIsMobile, + isLandscape: this.viewportIsLandscape, + }); - const response = await this.scrapeninja.scrapeJs({ - url: this.url, - waitForSelector: this.waitforselector, - postWaitTime: this.postwaittime, - dumpIframe: this.dumpiframe, - waitForSelectorIframe: this.waitforselectoriframe, - extractorTargetIframe: this.extractortargetiframe, - headers: this.headers, - retryNum: this.retrynum, - geo: this.geo, - proxy: this.proxy, - timeout: this.timeout, - textNotExpected: this.textnotexpected, - statusNotExpected: this.statusnotexpected, - blockImages: this.blockimages, - blockMedia: this.blockmedia, - screenshot: this.screenshot, - catchAjaxHeadersUrlMask: this.catchajaxheadersurlmask, - viewport, - extractor: this.extractor, - }); + const data = clearObj({ + url: this.url, + waitForSelector: this.waitForSelector, + postWaitTime: this.postWaitTime, + dumpIframe: this.dumpIframe, + waitForSelectorIframe: this.waitForSelectorIframe, + extractorTargetIframe: this.extractorTargetIframe, + headers: parseObject(this.headers), + retryNum: this.retryNum, + geo: this.geo, + proxy: this.proxy, + timeout: this.timeout, + textNotExpected: parseObject(this.textNotExpected), + statusNotExpected: parseObject(this.statusNotExpected), + blockImages: this.blockImages, + blockMedia: this.blockMedia, + screenshot: this.screenshot, + catchAjaxHeadersUrlMask: this.catchAjaxHeadersUrlMask, + extractor: this.extractor, + }); - $.export("$summary", `Successfully scraped ${this.url} with JS rendering`); - return response; + if (Object.entries(viewport).length) { + data.viewport = viewport; + } + + const response = await this.scrapeninja.scrapeJs({ + $, + data, + }); + + $.export("$summary", `Successfully scraped ${this.url} with JS rendering`); + return response; + } catch ({ response: { data } }) { + throw new ConfigurationError(parseError(data)); + } }, }; diff --git a/components/scrapeninja/app/scrapeninja.app.ts b/components/scrapeninja/app/scrapeninja.app.ts deleted file mode 100644 index 37f81a49ecdf5..0000000000000 --- a/components/scrapeninja/app/scrapeninja.app.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { defineApp } from "@pipedream/types"; - -export default defineApp({ - type: "app", - app: "scrapeninja", - propDefinitions: {}, - methods: { - // this.$auth contains connected account data - authKeys() { - console.log(Object.keys(this.$auth)); - }, - }, -}); diff --git a/components/scrapeninja/common/utils.mjs b/components/scrapeninja/common/utils.mjs new file mode 100644 index 0000000000000..7f4e3a5ddd760 --- /dev/null +++ b/components/scrapeninja/common/utils.mjs @@ -0,0 +1,48 @@ +export const parseObject = (obj) => { + if (!obj) return undefined; + + if (Array.isArray(obj)) { + return obj.map((item) => { + if (typeof item === "string") { + try { + return JSON.parse(item); + } catch (e) { + return item; + } + } + return item; + }); + } + if (typeof obj === "string") { + try { + return JSON.parse(obj); + } catch (e) { + return obj; + } + } + return obj; +}; + +export const parseError = (data) => { + if (data.message) return data.message; + if (data.stderr) return data.stderr; + if (data.errors) return Object.entries(data.errors[0])[0][1]; +}; + +export const clearObj = (obj) => { + return Object.entries(obj) + .filter(([ + _, + v, + ]) => (v != null && v != "" && _ != "$emit")) + .reduce((acc, [ + k, + v, + ]) => ({ + ...acc, + [k]: (!Array.isArray(v) && v === Object(v)) + ? 
clearObj(v) + : v, + }), {}); +}; + diff --git a/components/scrapeninja/package.json b/components/scrapeninja/package.json index c03a3ee9f6be1..076db2ae6dccb 100644 --- a/components/scrapeninja/package.json +++ b/components/scrapeninja/package.json @@ -1,16 +1,18 @@ { "name": "@pipedream/scrapeninja", - "version": "0.0.2", + "version": "0.1.0", "description": "Pipedream ScrapeNinja Components", - "main": "dist/app/scrapeninja.app.mjs", + "main": "scrapeninja.app.mjs", "keywords": [ "pipedream", "scrapeninja" ], - "files": ["dist"], "homepage": "https://pipedream.com/apps/scrapeninja", "author": "Pipedream (https://pipedream.com/)", "publishConfig": { "access": "public" + }, + "dependencies": { + "@pipedream/platform": "^3.0.3" } } diff --git a/components/scrapeninja/scrapeninja.app.mjs b/components/scrapeninja/scrapeninja.app.mjs index 149c839320222..0d441d8866b47 100644 --- a/components/scrapeninja/scrapeninja.app.mjs +++ b/components/scrapeninja/scrapeninja.app.mjs @@ -12,148 +12,147 @@ export default { headers: { type: "string[]", label: "Headers", - description: "Custom headers to send with the request.", + description: "Custom headers to send with the request. By default, regular Chrome browser headers are sent to the target URL.", optional: true, }, - retrynum: { + retryNum: { type: "integer", label: "Retry Number", - description: "Number of retry attempts.", + description: "Number of attempts.", optional: true, }, geo: { type: "string", label: "Geo", - description: "Geo location for proxy pools (default: us).", + description: "Geo location for basic proxy pools (you can purchase premium ScrapeNinja proxies for wider country selection and higher proxy quality). [Read more about ScrapeNinja proxy setup](https://scrapeninja.net/docs/proxy-setup/)", optional: true, - default: "us", }, proxy: { type: "string", label: "Proxy", - description: "Premium or custom proxy URL.", + description: "Premium or your own proxy URL (overrides geo field). [Read more about ScrapeNinja proxy setup](https://scrapeninja.net/docs/proxy-setup/).", optional: true, }, - followredirects: { + followRedirects: { type: "integer", label: "Follow Redirects", - description: "Whether to follow redirects (default: 1).", + description: "Whether to follow redirects.", optional: true, default: 1, }, timeout: { type: "integer", label: "Timeout", - description: "Timeout per attempt in seconds.", + description: "Timeout per attempt, in seconds. Each retry attempt will take up to [timeout] seconds.", + min: 4, + max: 30, optional: true, }, - textnotexpected: { + textNotExpected: { type: "string[]", label: "Text Not Expected", - description: "Text that triggers a retry from another proxy.", + description: "Text which will trigger a retry from another proxy address.", optional: true, }, - statusnotexpected: { + statusNotExpected: { type: "integer[]", label: "Status Not Expected", - description: "HTTP statuses that trigger a retry from another proxy (default: [403, 502]).", + description: "HTTP response statuses which will trigger a retry from another proxy address.", optional: true, - default: [ - 403, - 502, - ], }, extractor: { type: "string", label: "Extractor", - description: "Custom JS function to extract JSON values from scraped HTML.", + description: "Custom JS function to extract JSON values from scraped HTML. Write & test your own extractor on [https://scrapeninja.net/cheerio-sandbox/](https://scrapeninja.net/cheerio-sandbox/)", optional: true, }, - waitforselector: { + waitForSelector: { type: "string", label: "Wait For Selector", description: "CSS selector to wait for before considering the page loaded.", optional: true, }, - postwaittime: { + postWaitTime: { type: "integer", label: "Post Wait Time", - description: "Time to wait after page load in seconds.", + description: "Wait the specified number of seconds after page load (from 1 to 12s). Use this only if ScrapeNinja failed to wait for required page elements automatically.", + min: 1, + max: 12, optional: true, }, - dumpiframe: { + dumpIframe: { type: "string", label: "Dump Iframe", - description: "Name of the iframe to dump.", + description: "If some particular iframe needs to be dumped, specify its `name` HTML attribute value in this argument. The ScrapeNinja JS renderer will wait for the iframe's DOM elements to appear.", optional: true, }, - waitforselectoriframe: { + waitForSelectorIframe: { type: "string", label: "Wait For Selector Iframe", - description: "CSS selector to wait for inside the iframe.", + description: "If `Dump Iframe` is activated, this property allows waiting for a CSS selector inside that iframe.", optional: true, }, - extractortargetiframe: { + extractorTargetIframe: { type: "boolean", label: "Extractor Target Iframe", - description: "Run extractor function against iframe HTML.", + description: "If `Dump Iframe` is activated, this property runs the JS extractor function against the iframe HTML instead of the base body.", optional: true, }, - blockimages: { + blockImages: { type: "boolean", label: "Block Images", - description: "Block images from loading to speed up the request.", + description: "Block images from loading. This will speed up page loading and reduce bandwidth usage.", optional: true, }, - blockmedia: { + blockMedia: { type: "boolean", label: "Block Media", - description: "Block media resources like CSS and fonts from loading.", + description: "Block media resources (CSS, fonts) from loading. This will speed up page loading and reduce bandwidth usage.", optional: true, }, screenshot: { type: "boolean", label: "Screenshot", - description: "Take a screenshot of the page.", + description: "Take a screenshot of the page. Pass \"false\" to increase the speed of the request.", optional: true, }, - catchajaxheadersurlmask: { + catchAjaxHeadersUrlMask: { type: "string", label: "Catch Ajax Headers URL Mask", - description: "URL mask to catch specific AJAX responses.", + description: "Useful to dump some XHR response. Pass URL mask here. For example, if you need to catch all requests to https://example.com/api/data.json, pass \"api/data.json\" here. 
In response, you will get new property `.info.catchedAjax` with the XHR response data - { url, method, headers[], body , status, responseHeaders{} }", optional: true, }, - viewportwitdh: { + viewportWitdh: { type: "integer", label: "Viewport Width", description: "Width of the viewport.", optional: true, }, - viewportheight: { + viewportHeight: { type: "integer", label: "Viewport Height", description: "Height of the viewport.", optional: true, }, - viewportdevicescalefactor: { + viewportDeviceScaleFactor: { type: "integer", label: "Viewport Device Scale Factor", description: "Device scale factor for the viewport.", optional: true, }, - viewporthastouch: { + viewportHasTouch: { type: "boolean", label: "Viewport Has Touch", description: "Whether the viewport has touch capabilities.", optional: true, }, - viewportismobile: { + viewportIsMobile: { type: "boolean", label: "Viewport Is Mobile", description: "Whether the viewport is mobile.", optional: true, }, - viewportislandscape: { + viewportIsLandscape: { type: "boolean", label: "Viewport Is Landscape", description: "Whether the viewport is in landscape mode.", @@ -164,79 +163,35 @@ export default { _baseUrl() { return "https://scrapeninja.p.rapidapi.com"; }, - async _makeRequest(opts = {}) { - const { - $ = this, - method = "POST", - path, - headers = {}, - ...otherOpts - } = opts; + _headers() { + return { + "content-type": "application/json", + "X-RapidAPI-Key": this.$auth.rapid_api_key, + "X-RapidAPI-Host": "scrapeninja.p.rapidapi.com", + }; + }, + _makeRequest({ + $ = this, path, ...opts + }) { return axios($, { - method, url: this._baseUrl() + path, - headers: { - ...headers, - "X-RapidAPI-Key": this.$auth.api_key, - "Content-Type": "application/json", - "Accept": "application/json", - }, - ...otherOpts, + headers: this._headers(), + ...opts, }); }, - async scrapeNonJs(opts = {}) { - const data = { - url: this.url, - headers: this.headers, - retryNum: this.retrynum, - geo: this.geo, - proxy: this.proxy, - followRedirects: this.followredirects, - timeout: this.timeout, - textNotExpected: this.textnotexpected, - statusNotExpected: this.statusnotexpected, - extractor: this.extractor, - }; + scrapeNonJs(opts = {}) { return this._makeRequest({ + method: "POST", path: "/scrape", - data, + ...opts, }); }, - async scrapeJs(opts = {}) { - const viewport = { - width: this.viewportwitdh, - height: this.viewportheight, - deviceScaleFactor: this.viewportdevicescalefactor, - hasTouch: this.viewporthastouch, - isMobile: this.viewportismobile, - isLandscape: this.viewportislandscape, - }; - const data = { - url: this.url, - waitForSelector: this.waitforselector, - postWaitTime: this.postwaittime, - dumpIframe: this.dumpiframe, - waitForSelectorIframe: this.waitforselectoriframe, - extractorTargetIframe: this.extractortargetiframe, - headers: this.headers, - retryNum: this.retrynum, - geo: this.geo, - proxy: this.proxy, - timeout: this.timeout, - textNotExpected: this.textnotexpected, - statusNotExpected: this.statusnotexpected, - blockImages: this.blockimages, - blockMedia: this.blockmedia, - screenshot: this.screenshot, - catchAjaxHeadersUrlMask: this.catchajaxheadersurlmask, - viewport, - extractor: this.extractor, - }; + scrapeJs(opts = {}) { return this._makeRequest({ + method: "POST", path: "/scrape-js", - data, + ...opts, }); }, }, - version: "0.0.{{ts}}", }; From eac6a0cc7540baa2f1e8dde94f413380740c53d4 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Thu, 27 Feb 2025 12:05:44 -0300 Subject: [PATCH 3/7] pnpm update --- pnpm-lock.yaml | 17 
++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a7dec408ca081..9acfe2f4c3f46 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -3592,8 +3592,7 @@ importers: components/dokan: {} - components/domain_group: - specifiers: {} + components/domain_group: {} components/domo: {} @@ -4838,8 +4837,7 @@ importers: components/geckoboard: {} - components/gem: - specifiers: {} + components/gem: {} components/gemini_public: dependencies: @@ -4916,8 +4914,7 @@ importers: components/getswift: {} - components/getty_images: - specifiers: {} + components/getty_images: {} components/ghost_org_admin_api: dependencies: @@ -10878,7 +10875,11 @@ importers: components/scrapein_: {} - components/scrapeninja: {} + components/scrapeninja: + dependencies: + '@pipedream/platform': + specifier: ^3.0.3 + version: 3.0.3 components/scrapfly: dependencies: @@ -34035,6 +34036,8 @@ snapshots: '@putout/operator-filesystem': 5.0.0(putout@36.13.1(eslint@8.57.1)(typescript@5.6.3)) '@putout/operator-json': 2.2.0 putout: 36.13.1(eslint@8.57.1)(typescript@5.6.3) + transitivePeerDependencies: + - supports-color '@putout/operator-regexp@1.0.0(putout@36.13.1(eslint@8.57.1)(typescript@5.6.3))': dependencies: From cdc2434965bc152a73bfe47098175dcd8b1ef5a9 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Fri, 28 Feb 2025 15:34:18 -0300 Subject: [PATCH 4/7] some adjusts --- .../scrapeninja/actions/non-js-scraping/non-js-scraping.mjs | 2 +- .../scraping-with-js-rendering/scraping-with-js-rendering.mjs | 2 +- components/scrapeninja/scrapeninja.app.mjs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs b/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs index 917ef7ebbd5ce..7e46850cb6c89 100644 --- a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs +++ b/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs @@ -4,7 +4,7 @@ import scrapeninja from "../../scrapeninja.app.mjs"; export default { key: "scrapeninja-non-js-scraping", - name: "Non-JS Scraping", + name: "Scrape without JS", description: "Use high-performance web scraping endpoint with Chrome browser TLS fingerprint, but without JavaScript execution and real browser overhead. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape/)", version: "0.0.1", type: "action", diff --git a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs index c24aaacbaa613..124be6fb9d235 100644 --- a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs +++ b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs @@ -7,7 +7,7 @@ import scrapeninja from "../../scrapeninja.app.mjs"; export default { key: "scrapeninja-scraping-with-js-rendering", - name: "Scraping with JS Rendering", + name: "Scrape with JS Rendering", description: "Uses the ScrapeNinja real Chrome browser engine to scrape pages that require JS rendering. 
[See the documentation](https://scrapeninja.net/docs/api-reference/scrape-js/)", version: "0.0.1", type: "action", diff --git a/components/scrapeninja/scrapeninja.app.mjs b/components/scrapeninja/scrapeninja.app.mjs index 0d441d8866b47..dbb1b895138af 100644 --- a/components/scrapeninja/scrapeninja.app.mjs +++ b/components/scrapeninja/scrapeninja.app.mjs @@ -30,7 +30,7 @@ export default { proxy: { type: "string", label: "Proxy", - description: "Premium or your own proxy URL (overrides geo field). [Read more about ScrapeNinja proxy setup](https://scrapeninja.net/docs/proxy-setup/).", + description: "Premium or your own proxy URL (overrides `Geo` prop). [Read more about ScrapeNinja proxy setup](https://scrapeninja.net/docs/proxy-setup/).", optional: true, }, followRedirects: { From 35beabafa880ae2d17dffef9bd1b09c1ab8e6413 Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Fri, 28 Feb 2025 17:46:25 -0300 Subject: [PATCH 5/7] Update components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../scraping-with-js-rendering/scraping-with-js-rendering.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs index 124be6fb9d235..b55093842f169 100644 --- a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs +++ b/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs @@ -134,7 +134,7 @@ export default { viewportWidth: { propDefinition: [ scrapeninja, - "viewportWitdh", + "viewportWidth", ], optional: true, }, From 113f341f0ff1cefeeaf5462f2b3ac918016d83be Mon Sep 17 00:00:00 2001 From: Luan Cazarine Date: Fri, 28 Feb 2025 17:47:29 -0300 Subject: [PATCH 6/7] Update components/scrapeninja/scrapeninja.app.mjs Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- components/scrapeninja/scrapeninja.app.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/scrapeninja/scrapeninja.app.mjs b/components/scrapeninja/scrapeninja.app.mjs index dbb1b895138af..bef0c87167e1f 100644 --- a/components/scrapeninja/scrapeninja.app.mjs +++ b/components/scrapeninja/scrapeninja.app.mjs @@ -122,7 +122,7 @@ export default { description: "Useful to dump some XHR response. Pass URL mask here. For example, if you need to catch all requests to https://example.com/api/data.json, pass \"api/data.json\" here. 
In response, you will get new property `.info.catchedAjax` with the XHR response data - { url, method, headers[], body , status, responseHeaders{} }", optional: true, }, - viewportWitdh: { + viewportWidth: { type: "integer", label: "Viewport Width", description: "Width of the viewport.", From 49a3b4647261255eb1bd4b984af2ad496989312c Mon Sep 17 00:00:00 2001 From: GTFalcao Date: Mon, 3 Mar 2025 20:45:25 -0300 Subject: [PATCH 7/7] Renaming action files to match their names --- .../scrape-with-js-rendering.mjs} | 2 +- .../scrape-without-js.mjs} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename components/scrapeninja/actions/{scraping-with-js-rendering/scraping-with-js-rendering.mjs => scrape-with-js-rendering/scrape-with-js-rendering.mjs} (99%) rename components/scrapeninja/actions/{non-js-scraping/non-js-scraping.mjs => scrape-without-js/scrape-without-js.mjs} (98%) diff --git a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs b/components/scrapeninja/actions/scrape-with-js-rendering/scrape-with-js-rendering.mjs similarity index 99% rename from components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs rename to components/scrapeninja/actions/scrape-with-js-rendering/scrape-with-js-rendering.mjs index b55093842f169..ab619dd0cd23b 100644 --- a/components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs +++ b/components/scrapeninja/actions/scrape-with-js-rendering/scrape-with-js-rendering.mjs @@ -6,7 +6,7 @@ import { import scrapeninja from "../../scrapeninja.app.mjs"; export default { - key: "scrapeninja-scraping-with-js-rendering", + key: "scrapeninja-scrape-with-js-rendering", name: "Scrape with JS Rendering", description: "Uses the ScrapeNinja real Chrome browser engine to scrape pages that require JS rendering. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape-js/)", version: "0.0.1", diff --git a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs b/components/scrapeninja/actions/scrape-without-js/scrape-without-js.mjs similarity index 98% rename from components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs rename to components/scrapeninja/actions/scrape-without-js/scrape-without-js.mjs index 7e46850cb6c89..adab5a23e6d3d 100644 --- a/components/scrapeninja/actions/non-js-scraping/non-js-scraping.mjs +++ b/components/scrapeninja/actions/scrape-without-js/scrape-without-js.mjs @@ -3,7 +3,7 @@ import { parseObject } from "../../common/utils.mjs"; import scrapeninja from "../../scrapeninja.app.mjs"; export default { - key: "scrapeninja-non-js-scraping", + key: "scrapeninja-scrape-without-js", name: "Scrape without JS", description: "Use high-performance web scraping endpoint with Chrome browser TLS fingerprint, but without JavaScript execution and real browser overhead. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape/)", version: "0.0.1",
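
For anyone who wants to exercise the endpoints these actions wrap without deploying the component, below is a minimal standalone sketch of the request shape that _makeRequest() in scrapeninja.app.mjs produces. It mirrors the base URL, paths, and RapidAPI headers from the patches above; the RAPIDAPI_KEY environment variable, the example target URL, and the plain axios dependency are illustrative assumptions, not part of the component.

// verify-scrape.mjs: a sketch only; assumes Node 18+ and `npm install axios`.
import axios from "axios";

const BASE_URL = "https://scrapeninja.p.rapidapi.com";

async function scrapeNonJs(data) {
  // Same header set that _headers() builds in scrapeninja.app.mjs.
  const { data: result } = await axios.post(`${BASE_URL}/scrape`, data, {
    headers: {
      "content-type": "application/json",
      "X-RapidAPI-Key": process.env.RAPIDAPI_KEY, // hypothetical env var
      "X-RapidAPI-Host": "scrapeninja.p.rapidapi.com",
    },
  });
  return result;
}

// Payload fields match the "Scrape without JS" action's props:
// retry up to 2 times, rotate proxies on 403/502 responses.
const response = await scrapeNonJs({
  url: "https://example.com", // placeholder target
  retryNum: 2,
  statusNotExpected: [403, 502],
});
console.log(response);

The "Scrape with JS Rendering" action follows the same wire format with path /scrape-js and the extra fields (waitForSelector, viewport, etc.) shown in scrape-with-js-rendering.mjs.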
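
Similarly, the extractor prop expects a JS function passed as a string. The sketch below follows the two-argument (input, cheerio) shape used in the ScrapeNinja cheerio sandbox linked from the prop description; the exact signature is an assumption here and should be verified in the sandbox.

// Hypothetical extractor: receives the raw HTML and a cheerio instance,
// and returns the JSON object the action should emit.
function extract(input, cheerio) {
  const $ = cheerio.load(input);
  return {
    title: $("title").text().trim(),
    links: $("a[href]")
      .map((i, el) => $(el).attr("href"))
      .get(),
  };
}

In a Pipedream code step it could be supplied as `extractor: extract.toString()`, or the function source can be pasted directly into the prop field.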