diff --git a/components/scrapeninja/.gitignore b/components/scrapeninja/.gitignore
deleted file mode 100644
index ec761ccab7595..0000000000000
--- a/components/scrapeninja/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*.js
-*.mjs
-dist
\ No newline at end of file
diff --git a/components/scrapeninja/actions/scrape-with-js-rendering/scrape-with-js-rendering.mjs b/components/scrapeninja/actions/scrape-with-js-rendering/scrape-with-js-rendering.mjs
new file mode 100644
index 0000000000000..ab619dd0cd23b
--- /dev/null
+++ b/components/scrapeninja/actions/scrape-with-js-rendering/scrape-with-js-rendering.mjs
@@ -0,0 +1,231 @@
+import { ConfigurationError } from "@pipedream/platform";
+import {
+  clearObj,
+  parseError, parseObject,
+} from "../../common/utils.mjs";
+import scrapeninja from "../../scrapeninja.app.mjs";
+
+export default {
+  key: "scrapeninja-scrape-with-js-rendering",
+  name: "Scrape with JS Rendering",
+  description: "Uses the ScrapeNinja real Chrome browser engine to scrape pages that require JS rendering. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape-js/)",
+  version: "0.0.1",
+  type: "action",
+  props: {
+    scrapeninja,
+    url: {
+      propDefinition: [
+        scrapeninja,
+        "url",
+      ],
+    },
+    waitForSelector: {
+      propDefinition: [
+        scrapeninja,
+        "waitForSelector",
+      ],
+      optional: true,
+    },
+    postWaitTime: {
+      propDefinition: [
+        scrapeninja,
+        "postWaitTime",
+      ],
+      optional: true,
+    },
+    dumpIframe: {
+      propDefinition: [
+        scrapeninja,
+        "dumpIframe",
+      ],
+      optional: true,
+    },
+    waitForSelectorIframe: {
+      propDefinition: [
+        scrapeninja,
+        "waitForSelectorIframe",
+      ],
+      optional: true,
+    },
+    extractorTargetIframe: {
+      propDefinition: [
+        scrapeninja,
+        "extractorTargetIframe",
+      ],
+      optional: true,
+    },
+    headers: {
+      propDefinition: [
+        scrapeninja,
+        "headers",
+      ],
+      optional: true,
+    },
+    retryNum: {
+      propDefinition: [
+        scrapeninja,
+        "retryNum",
+      ],
+      optional: true,
+    },
+    geo: {
+      propDefinition: [
+        scrapeninja,
+        "geo",
+      ],
+      optional: true,
+    },
+    proxy: {
+      propDefinition: [
+        scrapeninja,
+        "proxy",
+      ],
+      optional: true,
+    },
+    timeout: {
+      propDefinition: [
+        scrapeninja,
+        "timeout",
+      ],
+      optional: true,
+    },
+    textNotExpected: {
+      propDefinition: [
+        scrapeninja,
+        "textNotExpected",
+      ],
+      optional: true,
+    },
+    statusNotExpected: {
+      propDefinition: [
+        scrapeninja,
+        "statusNotExpected",
+      ],
+      optional: true,
+    },
+    blockImages: {
+      propDefinition: [
+        scrapeninja,
+        "blockImages",
+      ],
+      optional: true,
+    },
+    blockMedia: {
+      propDefinition: [
+        scrapeninja,
+        "blockMedia",
+      ],
+      optional: true,
+    },
+    screenshot: {
+      propDefinition: [
+        scrapeninja,
+        "screenshot",
+      ],
+      optional: true,
+    },
+    catchAjaxHeadersUrlMask: {
+      propDefinition: [
+        scrapeninja,
+        "catchAjaxHeadersUrlMask",
+      ],
+      optional: true,
+    },
+    viewportWidth: {
+      propDefinition: [
+        scrapeninja,
+        "viewportWidth",
+      ],
+      optional: true,
+    },
+    viewportHeight: {
+      propDefinition: [
+        scrapeninja,
+        "viewportHeight",
+      ],
+      optional: true,
+    },
+    viewportDeviceScaleFactor: {
+      propDefinition: [
+        scrapeninja,
+        "viewportDeviceScaleFactor",
+      ],
+      optional: true,
+    },
+    viewportHasTouch: {
+      propDefinition: [
+        scrapeninja,
+        "viewportHasTouch",
+      ],
+      optional: true,
+    },
+    viewportIsMobile: {
+      propDefinition: [
+        scrapeninja,
+        "viewportIsMobile",
+      ],
+      optional: true,
+    },
+    viewportIsLandscape: {
+      propDefinition: [
+        scrapeninja,
+        "viewportIsLandscape",
+      ],
+      optional: true,
+    },
+    extractor: {
+      propDefinition: [
+        scrapeninja,
+        "extractor",
+      ],
+      optional: true,
+    },
+  },
+  async run({ $ }) {
+    try {
+      const viewport = clearObj({
+        width: this.viewportWidth,
+        height: this.viewportHeight,
+        deviceScaleFactor: this.viewportDeviceScaleFactor,
+        hasTouch: this.viewportHasTouch,
+        isMobile: this.viewportIsMobile,
+        isLandscape: this.viewportIsLandscape,
+      });
+
+      const data = clearObj({
+        url: this.url,
+        waitForSelector: this.waitForSelector,
+        postWaitTime: this.postWaitTime,
+        dumpIframe: this.dumpIframe,
+        waitForSelectorIframe: this.waitForSelectorIframe,
+        extractorTargetIframe: this.extractorTargetIframe,
+        headers: parseObject(this.headers),
+        retryNum: this.retryNum,
+        geo: this.geo,
+        proxy: this.proxy,
+        timeout: this.timeout,
+        textNotExpected: parseObject(this.textNotExpected),
+        statusNotExpected: parseObject(this.statusNotExpected),
+        blockImages: this.blockImages,
+        blockMedia: this.blockMedia,
+        screenshot: this.screenshot,
+        catchAjaxHeadersUrlMask: this.catchAjaxHeadersUrlMask,
+        extractor: this.extractor,
+      });
+
+      if (Object.entries(viewport).length) {
+        data.viewport = viewport;
+      }
+
+      const response = await this.scrapeninja.scrapeJs({
+        $,
+        data,
+      });
+
+      $.export("$summary", `Successfully scraped ${this.url} with JS rendering`);
+      return response;
+    } catch (error) {
+      throw new ConfigurationError(parseError(error.response?.data || error));
+    }
+  },
+};
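// Illustrative sketch (values are hypothetical): the JSON body the action above
// assembles for POST /scrape-js, after clearObj() has pruned unset props. Note
// that `viewport` is only attached when at least one viewport prop is set.
const exampleScrapeJsBody = {
  url: "https://example.com/products", // required target URL
  waitForSelector: ".product-card",    // wait for this selector before dumping HTML
  retryNum: 2,                         // retry up to 2 times
  geo: "us",                           // basic proxy pool location
  viewport: {
    width: 1366,
    height: 768,
    isMobile: false,                   // preserved by clearObj's strict "" check
  },
};
console.log(JSON.stringify(exampleScrapeJsBody, null, 2));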
diff --git a/components/scrapeninja/actions/scrape-without-js/scrape-without-js.mjs b/components/scrapeninja/actions/scrape-without-js/scrape-without-js.mjs
new file mode 100644
index 0000000000000..adab5a23e6d3d
--- /dev/null
+++ b/components/scrapeninja/actions/scrape-without-js/scrape-without-js.mjs
@@ -0,0 +1,106 @@
+import { ConfigurationError } from "@pipedream/platform";
+import { parseError, parseObject } from "../../common/utils.mjs";
+import scrapeninja from "../../scrapeninja.app.mjs";
+
+export default {
+  key: "scrapeninja-scrape-without-js",
+  name: "Scrape without JS",
+  description: "Uses the high-performance web scraping endpoint with a Chrome browser TLS fingerprint, but without JavaScript execution or real-browser overhead. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape/)",
+  version: "0.0.1",
+  type: "action",
+  props: {
+    scrapeninja,
+    url: {
+      propDefinition: [
+        scrapeninja,
+        "url",
+      ],
+    },
+    headers: {
+      propDefinition: [
+        scrapeninja,
+        "headers",
+      ],
+      optional: true,
+    },
+    retryNum: {
+      propDefinition: [
+        scrapeninja,
+        "retryNum",
+      ],
+      optional: true,
+    },
+    geo: {
+      propDefinition: [
+        scrapeninja,
+        "geo",
+      ],
+      optional: true,
+    },
+    proxy: {
+      propDefinition: [
+        scrapeninja,
+        "proxy",
+      ],
+      optional: true,
+    },
+    followRedirects: {
+      propDefinition: [
+        scrapeninja,
+        "followRedirects",
+      ],
+      optional: true,
+    },
+    timeout: {
+      propDefinition: [
+        scrapeninja,
+        "timeout",
+      ],
+      optional: true,
+    },
+    textNotExpected: {
+      propDefinition: [
+        scrapeninja,
+        "textNotExpected",
+      ],
+      optional: true,
+    },
+    statusNotExpected: {
+      propDefinition: [
+        scrapeninja,
+        "statusNotExpected",
+      ],
+      optional: true,
+    },
+    extractor: {
+      propDefinition: [
+        scrapeninja,
+        "extractor",
+      ],
+      optional: true,
+    },
+  },
+  async run({ $ }) {
+    try {
+      const response = await this.scrapeninja.scrapeNonJs({
+        $,
+        data: {
+          url: this.url,
+          headers: parseObject(this.headers),
+          retryNum: this.retryNum,
+          geo: this.geo,
+          proxy: this.proxy,
+          followRedirects: this.followRedirects,
+          timeout: this.timeout,
+          textNotExpected: parseObject(this.textNotExpected),
+          statusNotExpected: parseObject(this.statusNotExpected),
+          extractor: this.extractor,
+        },
+      });
+      $.export("$summary", "Successfully scraped the URL");
+      return response;
+    } catch (error) {
+      throw new ConfigurationError(parseError(error.response?.data || error));
+    }
+  },
+};
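// Illustrative sketch of a custom `extractor` for either action. The extractor
// is passed as a string; the (input, cheerio) signature below follows the
// ScrapeNinja cheerio sandbox (https://scrapeninja.net/cheerio-sandbox/), and
// the selectors here are hypothetical.
const extractor = `
function extract(input, cheerio) {
  const $ = cheerio.load(input);
  return {
    title: $("title").text().trim(),
    links: $("a").map((i, el) => $(el).attr("href")).get(),
  };
}
`; // pass this string as the \`extractor\` prop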
diff --git a/components/scrapeninja/app/scrapeninja.app.ts b/components/scrapeninja/app/scrapeninja.app.ts
deleted file mode 100644
index 5f790be0ca094..0000000000000
--- a/components/scrapeninja/app/scrapeninja.app.ts
+++ /dev/null
@@ -1,13 +0,0 @@
-import { defineApp } from "@pipedream/types";
-
-export default defineApp({
-  type: "app",
-  app: "scrapeninja",
-  propDefinitions: {},
-  methods: {
-    // this.$auth contains connected account data
-    authKeys() {
-      console.log(Object.keys(this.$auth));
-    },
-  },
-});
\ No newline at end of file
diff --git a/components/scrapeninja/common/utils.mjs b/components/scrapeninja/common/utils.mjs
new file mode 100644
index 0000000000000..7f4e3a5ddd760
--- /dev/null
+++ b/components/scrapeninja/common/utils.mjs
@@ -0,0 +1,48 @@
+export const parseObject = (obj) => {
+  if (!obj) return undefined;
+
+  if (Array.isArray(obj)) {
+    return obj.map((item) => {
+      if (typeof item === "string") {
+        try {
+          return JSON.parse(item);
+        } catch (e) {
+          return item;
+        }
+      }
+      return item;
+    });
+  }
+  if (typeof obj === "string") {
+    try {
+      return JSON.parse(obj);
+    } catch (e) {
+      return obj;
+    }
+  }
+  return obj;
+};
+
+export const parseError = (data) => {
+  if (data.message) return data.message;
+  if (data.stderr) return data.stderr;
+  if (data.errors) return Object.values(data.errors[0])[0];
+};
+
+export const clearObj = (obj) => {
+  return Object.entries(obj)
+    .filter(([
+      k,
+      v,
+    ]) => (v != null && v !== "" && k !== "$emit"))
+    .reduce((acc, [
+      k,
+      v,
+    ]) => ({
+      ...acc,
+      [k]: (!Array.isArray(v) && v === Object(v))
+        ? clearObj(v)
+        : v,
+    }), {});
+};
+
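// Quick usage sketch for the helpers above. With the strict !== "" comparison,
// clearObj keeps meaningful false/0 values (the loose != "" it replaces would
// have dropped them, since 0 == "" and false == "" in JS) while still pruning
// null, undefined, and empty strings; parseObject JSON-decodes strings, or
// arrays of strings, when it can, and passes everything else through.
import { clearObj, parseObject } from "./utils.mjs";

console.log(clearObj({
  url: "https://example.com",
  screenshot: false, // kept: false is meaningful
  retryNum: 0,       // kept
  proxy: "",         // dropped
  geo: null,         // dropped
}));
// => { url: "https://example.com", screenshot: false, retryNum: 0 }

console.log(parseObject([
  "{\"User-Agent\":\"test\"}",
  "plain text",
]));
// => [ { "User-Agent": "test" }, "plain text" ]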
diff --git a/components/scrapeninja/package.json b/components/scrapeninja/package.json
index c03a3ee9f6be1..076db2ae6dccb 100644
--- a/components/scrapeninja/package.json
+++ b/components/scrapeninja/package.json
@@ -1,16 +1,18 @@
 {
   "name": "@pipedream/scrapeninja",
-  "version": "0.0.2",
+  "version": "0.1.0",
   "description": "Pipedream ScrapeNinja Components",
-  "main": "dist/app/scrapeninja.app.mjs",
+  "main": "scrapeninja.app.mjs",
   "keywords": [
     "pipedream",
     "scrapeninja"
   ],
-  "files": ["dist"],
   "homepage": "https://pipedream.com/apps/scrapeninja",
   "author": "Pipedream <support@pipedream.com> (https://pipedream.com/)",
   "publishConfig": {
     "access": "public"
+  },
+  "dependencies": {
+    "@pipedream/platform": "^3.0.3"
   }
 }
diff --git a/components/scrapeninja/scrapeninja.app.mjs b/components/scrapeninja/scrapeninja.app.mjs
new file mode 100644
index 0000000000000..bef0c87167e1f
--- /dev/null
+++ b/components/scrapeninja/scrapeninja.app.mjs
@@ -0,0 +1,197 @@
+import { axios } from "@pipedream/platform";
+
+export default {
+  type: "app",
+  app: "scrapeninja",
+  propDefinitions: {
+    url: {
+      type: "string",
+      label: "URL",
+      description: "The URL to scrape.",
+    },
+    headers: {
+      type: "string[]",
+      label: "Headers",
+      description: "Custom headers to send with the request. By default, regular Chrome browser headers are sent to the target URL.",
+      optional: true,
+    },
+    retryNum: {
+      type: "integer",
+      label: "Retry Number",
+      description: "Number of retry attempts.",
+      optional: true,
+    },
+    geo: {
+      type: "string",
+      label: "Geo",
+      description: "Geo location for basic proxy pools (you can purchase premium ScrapeNinja proxies for wider country selection and higher proxy quality). [Read more about ScrapeNinja proxy setup](https://scrapeninja.net/docs/proxy-setup/)",
+      optional: true,
+    },
+    proxy: {
+      type: "string",
+      label: "Proxy",
+      description: "Premium or your own proxy URL (overrides `Geo` prop). [Read more about ScrapeNinja proxy setup](https://scrapeninja.net/docs/proxy-setup/).",
+      optional: true,
+    },
+    followRedirects: {
+      type: "integer",
+      label: "Follow Redirects",
+      description: "Whether to follow redirects.",
+      optional: true,
+      default: 1,
+    },
+    timeout: {
+      type: "integer",
+      label: "Timeout",
+      description: "Timeout per attempt, in seconds. Each retry attempt can take up to this many seconds.",
+      min: 4,
+      max: 30,
+      optional: true,
+    },
+    textNotExpected: {
+      type: "string[]",
+      label: "Text Not Expected",
+      description: "Text which will trigger a retry from another proxy address.",
+      optional: true,
+    },
+    statusNotExpected: {
+      type: "integer[]",
+      label: "Status Not Expected",
+      description: "HTTP response statuses which will trigger a retry from another proxy address.",
+      optional: true,
+    },
+    extractor: {
+      type: "string",
+      label: "Extractor",
+      description: "Custom JS function to extract JSON values from the scraped HTML. Write and test your own extractor at [https://scrapeninja.net/cheerio-sandbox/](https://scrapeninja.net/cheerio-sandbox/)",
+      optional: true,
+    },
+    waitForSelector: {
+      type: "string",
+      label: "Wait For Selector",
+      description: "CSS selector to wait for before considering the page loaded.",
+      optional: true,
+    },
+    postWaitTime: {
+      type: "integer",
+      label: "Post Wait Time",
+      description: "Wait for the specified number of seconds after page load (from 1 to 12). Use this only if ScrapeNinja fails to wait for the required page elements automatically.",
+      min: 1,
+      max: 12,
+      optional: true,
+    },
+    dumpIframe: {
+      type: "string",
+      label: "Dump Iframe",
+      description: "If a particular iframe needs to be dumped, specify the value of its `name` HTML attribute in this argument. The ScrapeNinja JS renderer will wait for the iframe's DOM element to appear; use `Wait For Selector Iframe` to also wait for a CSS selector inside it.",
+      optional: true,
+    },
+    waitForSelectorIframe: {
+      type: "string",
+      label: "Wait For Selector Iframe",
+      description: "If `Dump Iframe` is activated, this property allows waiting for a CSS selector inside that iframe.",
+      optional: true,
+    },
+    extractorTargetIframe: {
+      type: "boolean",
+      label: "Extractor Target Iframe",
+      description: "If `Dump Iframe` is activated, this property runs the JS extractor function against the iframe HTML instead of the base body.",
+      optional: true,
+    },
+    blockImages: {
+      type: "boolean",
+      label: "Block Images",
+      description: "Block images from loading. This will speed up page loading and reduce bandwidth usage.",
+      optional: true,
+    },
+    blockMedia: {
+      type: "boolean",
+      label: "Block Media",
+      description: "Block media (CSS, fonts) from loading. This will speed up page loading and reduce bandwidth usage.",
+      optional: true,
+    },
+    screenshot: {
+      type: "boolean",
+      label: "Screenshot",
+      description: "Take a screenshot of the page. Pass \"false\" to increase the speed of the request.",
+      optional: true,
+    },
+    catchAjaxHeadersUrlMask: {
+      type: "string",
+      label: "Catch Ajax Headers URL Mask",
+      description: "Useful for dumping a particular XHR response. Pass a URL mask here. For example, to catch all requests to https://example.com/api/data.json, pass \"api/data.json\". The response will include a new property `.info.catchedAjax` with the XHR response data: { url, method, headers[], body, status, responseHeaders{} }",
+      optional: true,
+    },
+    viewportWidth: {
+      type: "integer",
+      label: "Viewport Width",
+      description: "Width of the viewport.",
+      optional: true,
+    },
+    viewportHeight: {
+      type: "integer",
+      label: "Viewport Height",
+      description: "Height of the viewport.",
+      optional: true,
+    },
+    viewportDeviceScaleFactor: {
+      type: "integer",
+      label: "Viewport Device Scale Factor",
+      description: "Device scale factor for the viewport.",
+      optional: true,
+    },
+    viewportHasTouch: {
+      type: "boolean",
+      label: "Viewport Has Touch",
+      description: "Whether the viewport has touch capabilities.",
+      optional: true,
+    },
+    viewportIsMobile: {
+      type: "boolean",
+      label: "Viewport Is Mobile",
+      description: "Whether the viewport is mobile.",
+      optional: true,
+    },
+    viewportIsLandscape: {
+      type: "boolean",
+      label: "Viewport Is Landscape",
+      description: "Whether the viewport is in landscape mode.",
+      optional: true,
+    },
+  },
+  methods: {
+    _baseUrl() {
+      return "https://scrapeninja.p.rapidapi.com";
+    },
+    _headers() {
+      return {
+        "content-type": "application/json",
+        "X-RapidAPI-Key": this.$auth.rapid_api_key,
+        "X-RapidAPI-Host": "scrapeninja.p.rapidapi.com",
+      };
+    },
+    _makeRequest({
+      $ = this, path, ...opts
+    }) {
+      return axios($, {
+        url: this._baseUrl() + path,
+        headers: this._headers(),
+        ...opts,
+      });
+    },
+    scrapeNonJs(opts = {}) {
+      return this._makeRequest({
+        method: "POST",
+        path: "/scrape",
+        ...opts,
+      });
+    },
+    scrapeJs(opts = {}) {
+      return this._makeRequest({
+        method: "POST",
+        path: "/scrape-js",
+        ...opts,
+      });
+    },
+  },
+};
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index a7dec408ca081..9acfe2f4c3f46 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -3592,8 +3592,7 @@ importers:
 
   components/dokan: {}
 
-  components/domain_group:
-    specifiers: {}
+  components/domain_group: {}
 
   components/domo: {}
 
@@ -4838,8 +4837,7 @@ importers:
 
   components/geckoboard: {}
 
-  components/gem:
-    specifiers: {}
+  components/gem: {}
 
   components/gemini_public:
     dependencies:
@@ -4916,8 +4914,7 @@ importers:
 
   components/getswift: {}
 
-  components/getty_images:
-    specifiers: {}
+  components/getty_images: {}
 
   components/ghost_org_admin_api:
     dependencies:
@@ -10878,7 +10875,11 @@ importers:
 
   components/scrapein_: {}
 
-  components/scrapeninja: {}
+  components/scrapeninja:
+    dependencies:
+      '@pipedream/platform':
+        specifier: ^3.0.3
+        version: 3.0.3
 
   components/scrapfly:
     dependencies:
@@ -34035,6 +34036,8 @@ snapshots:
       '@putout/operator-filesystem': 5.0.0(putout@36.13.1(eslint@8.57.1)(typescript@5.6.3))
       '@putout/operator-json': 2.2.0
      putout: 36.13.1(eslint@8.57.1)(typescript@5.6.3)
+    transitivePeerDependencies:
+      - supports-color
 
   '@putout/operator-regexp@1.0.0(putout@36.13.1(eslint@8.57.1)(typescript@5.6.3))':
     dependencies:
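// For testing outside Pipedream: a minimal sketch of the same request that
// _makeRequest()/scrapeNonJs() above issue, using plain axios against the
// RapidAPI endpoint. The env var name and target URL are placeholders; the
// endpoint, headers, and body shape mirror the app file.
import axios from "axios";

const { data } = await axios({
  method: "POST",
  url: "https://scrapeninja.p.rapidapi.com/scrape",
  headers: {
    "content-type": "application/json",
    "X-RapidAPI-Key": process.env.RAPID_API_KEY, // your RapidAPI key
    "X-RapidAPI-Host": "scrapeninja.p.rapidapi.com",
  },
  data: {
    url: "https://example.com", // hypothetical target
    retryNum: 1,
  },
});
console.log(data); // see the ScrapeNinja docs for the response shape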