4 changes: 2 additions & 2 deletions components/scrapeless/actions/crawler/crawler.mjs
Collaborator @jcortes commented:

Hi @joy-chanboop, I've just tried with this modification and it worked just fine, although I ran out of credits with the account that Leo shared: "[Scrapeless]: insufficient balance, please recharge first."

import scrapeless from "../../scrapeless.app.mjs";

export default {
  key: "scrapeless-crawler",
  name: "Crawler",
  description: "Crawl any website at scale and say goodbye to blocks. [See the documentation](https://apidocs.scrapeless.com/api-17509010).",
  version: "0.0.9",
  type: "action",
  props: {
    scrapeless,
    apiServer: {
      type: "string",
      label: "Please select a API server",
      description: "Please select a API server to use",
      default: "crawl",
      options: [
        {
          label: "Crawl",
          value: "crawl",
        },
        {
          label: "Scrape",
          value: "scrape",
        },
      ],
      reloadProps: true,
    },
  },
  additionalProps() {
    const { apiServer } = this;

    const props = {};

    if (apiServer === "crawl" || apiServer === "scrape") {
      props.url = {
        type: "string",
        label: "URL to Crawl",
        description: "If you want to crawl in batches, please refer to the SDK of the document",
      };
    }

    if (apiServer === "crawl") {
      props.limitCrawlPages = {
        type: "integer",
        label: "Number Of Subpages",
        default: 5,
        description: "Max number of results to return",
      };
    }

    return props;
  },
  async run({ $ }) {
    const {
      scrapeless,
      apiServer,
      url,
      limitCrawlPages,
    } = this;

    console.log("url", url);
    console.log("limitCrawlPages", limitCrawlPages);
    console.log("apiServer", apiServer);

    const browserOptions = {
      "proxy_country": "ANY",
      "session_name": "Crawl",
      "session_recording": true,
      "session_ttl": 900,
    };

    let response;

    if (apiServer === "crawl") {
      response =
        await scrapeless._scrapelessClient().scrapingCrawl.crawl.crawlUrl(url, {
          limit: limitCrawlPages,
          browserOptions,
        });
    }

    if (apiServer === "scrape") {
      response =
        await scrapeless._scrapelessClient().scrapingCrawl.scrape.scrapeUrl(url, {
          browserOptions,
        });
    }

    if (response?.status === "completed" && response?.data) {
      $.export("$summary", `Successfully retrieved crawling results for ${url}`);
      return response;
    } else {
      throw new Error(response?.error || "Failed to retrieve crawling results");
    }
  },
};

Contributor Author @joy-chanboop commented on Jul 2, 2025:

Hi @jcortes,

First, regarding the error you encountered: it's caused by your Scrapeless API key having no remaining balance. Could you please share your email address? We'll send you a dedicated test API key so you can continue testing without interruption.

Also, I'm not sure whether this is influenced by running in Pipedream's production environment, but I found that when executing the scraping-api action, inputProps previously didn't contain the form field values returned by the additionalProps function. After reviewing the code, I realized that adding an explicit await let me retrieve the props values correctly.
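For clarity, the entire change is just awaiting this before destructuring in run(); it's the same one-line edit shown in the file diffs at the bottom of this page:

  async run({ $ }) {
    // Without the await, inputProps came back empty in my tests;
    // with it, the values from additionalProps come through as expected.
    const {
      scrapeless, apiServer, ...inputProps
    } = await this;
    // ... rest of the action unchanged
  }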

Let me know if you have any thoughts on this, or if there’s more you’d like me to check.

Thanks a lot for your time!

You can use the following code for testing in the online environment.

import scrapeless from "../../scrapeless.app.mjs";
import { log } from "../../common/utils.mjs";
export default {
  key: "scrapeless-scraping-api",
  name: "Scraping API",
  description: "Endpoints for fresh, structured data from 100+ popular sites. [See the documentation](https://apidocs.scrapeless.com/api-12919045).",
  version: "0.0.1",
  type: "action",
  props: {
    scrapeless,
    apiServer: {
      type: "string",
      label: "Please select a API server",
      default: "googleSearch",
      description: "Please select a API server to use",
      options: [
        {
          label: "Google Search",
          value: "googleSearch",
        },
      ],
      reloadProps: true,
    },
  },
  async run({ $ }) {
    const {
      scrapeless, apiServer, ...inputProps
    } = this;

    const MAX_RETRIES = 3;
    // 10 seconds
    const DELAY = 1000 * 10;
    const { run } = $.context;

    let submitData;
    let job;

    // pre check if the job is already in the context
    if (run?.context?.job) {
      job = run.context.job;
    }

    if (apiServer === "googleSearch") {
      submitData = {
        actor: "scraper.google.search",
        input: {
          q: inputProps.q,
          hl: inputProps.hl,
          gl: inputProps.gl,
        },
      };
    }

    if (!submitData) {
      throw new Error("No actor found");
    }
    // 1. Create a new scraping job
    if (!job) {
      job = await scrapeless._scrapelessClient().deepserp.createTask({
        actor: submitData.actor,
        input: submitData.input,
      });

      if (job.status === 200) {
        $.export("$summary", "Successfully retrieved scraping results");
        return job.data;
      }

      log("task in progress");
    }

    // 2. Wait for the job to complete
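    // Note: $.flow.rerun(DELAY, { job }, MAX_RETRIES) asks Pipedream to re-run
    // this step after DELAY ms; run.runs counts the invocations and
    // run.context carries { job } between them, so the taskId survives polls.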
    if (run.runs === 1) {
      $.flow.rerun(DELAY, {
        job,
      }, MAX_RETRIES);
    } else if (run.runs > MAX_RETRIES) {
      throw new Error("Max retries reached");
    } else if (job && job?.data?.taskId) {
      const result = await scrapeless._scrapelessClient().deepserp.getTaskResult(job.data.taskId);
      if (result.status === 200) {
        $.export("$summary", "Successfully retrieved scraping results");
        return result.data;
      } else {
        $.flow.rerun(DELAY, {
          job,
        }, MAX_RETRIES);
      }
    } else {
      throw new Error("No job found");
    }

  },
  additionalProps() {
    const { apiServer } = this;

    const props = {};

    if (apiServer === "googleSearch") {
      props.q = {
        type: "string",
        label: "Search Query",
        description: "Parameter defines the query you want to search. You can use anything that you would use in a regular Google search. e.g. inurl:, site:, intitle:. We also support advanced search query parameters such as as_dt and as_eq.",
        default: "coffee",
      };

      props.hl = {
        type: "string",
        label: "Language",
        description: "Parameter defines the language to use for the Google search. It's a two-letter language code. (e.g., en for English, es for Spanish, or fr for French).",
        default: "en",
      };

      props.gl = {
        type: "string",
        label: "Country",
        description: "Parameter defines the country to use for the Google search. It's a two-letter country code. (e.g., us for the United States, uk for United Kingdom, or fr for France).",
        default: "us",
      };
    }

    return props;
  },
};

Collaborator @jcortes replied:

Hi @joy-chanboop, this is my email: [email protected]. The way you are deferring the values with await seems odd to me, because the additionalProps method doesn't have an async signature, so the await shouldn't be needed here. That said, in my test I can see the logs with the prop values whenever I run the action, so I'm wondering whether you can see them too.
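For reference, awaiting a plain, non-thenable object is a no-op in JavaScript, so in theory await this should return the very same object:

  const obj = { a: 1 };
  const same = await obj; // same === obj; no promise involved

Unless this is somehow thenable in Pipedream's runtime, the await shouldn't change anything.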

Contributor Author @joy-chanboop replied:

Hi @jcortes,

I've just sent the test API key to your email; please check your inbox and let me know if you didn't receive it.

Additionally, I found that only the scraping-api.mjs action requires adding await to retrieve the props values properly, which is quite odd: other actions such as crawler.mjs work fine without await and still receive the props correctly. This inconsistency puzzles me as well.

Could you help by running a quick test on the current scraping-api.mjs action in the master branch of the pipedream repo? In my local testing, without adding await, it consistently fails to retrieve the props, which led me to apply this fix.
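A quick way to compare both reads inside run({ $ }) would be something like this (hypothetical debug snippet, not part of the PR):

  console.log("sync keys:", Object.keys(this));
  console.log("awaited keys:", Object.keys(await this));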

Thanks a lot for taking a look!

Diff for components/scrapeless/actions/crawler/crawler.mjs:

@@ -4,7 +4,7 @@ export default {
   key: "scrapeless-crawler",
   name: "Crawler",
   description: "Crawl any website at scale and say goodbye to blocks. [See the documentation](https://apidocs.scrapeless.com/api-17509010).",
-  version: "0.0.2",
+  version: "0.0.3",
   type: "action",
   props: {
     scrapeless,
@@ -29,7 +29,7 @@
   async run({ $ }) {
     const {
       scrapeless, apiServer, ...inputProps
-    } = this;
+    } = await this;

     const browserOptions = {
       "proxy_country": "ANY",

4 changes: 2 additions & 2 deletions components/scrapeless/actions/scraping-api/scraping-api.mjs
@@ -4,7 +4,7 @@ export default {
   key: "scrapeless-scraping-api",
   name: "Scraping API",
   description: "Endpoints for fresh, structured data from 100+ popular sites. [See the documentation](https://apidocs.scrapeless.com/api-12919045).",
-  version: "0.0.1",
+  version: "0.0.2",
   type: "action",
   props: {
     scrapeless,
@@ -25,7 +25,7 @@
   async run({ $ }) {
     const {
       scrapeless, apiServer, ...inputProps
-    } = this;
+    } = await this;

     const MAX_RETRIES = 3;
     // 10 seconds

Diff for universal-scraping-api.mjs:

@@ -5,7 +5,7 @@ export default {
   key: "scrapeless-universal-scraping-api",
   name: "Universal Scraping API",
   description: "Access any website at scale and say goodbye to blocks. [See the documentation](https://apidocs.scrapeless.com/api-11949854).",
-  version: "0.0.1",
+  version: "0.0.2",
   type: "action",
   props: {
     scrapeless,
@@ -27,7 +27,7 @@
     const {
       scrapeless,
       apiServer, ...inputProps
-    } = this;
+    } = await this;

     if (apiServer === "webUnlocker") {
       const response = await scrapeless._scrapelessClient().universal.scrape({

2 changes: 1 addition & 1 deletion components/scrapeless/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@pipedream/scrapeless",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "description": "Pipedream Scrapeless Components",
   "main": "scrapeless.app.mjs",
   "keywords": [

8 changes: 4 additions & 4 deletions pnpm-lock.yaml

(Generated file; diff not rendered.)