44 changes: 6 additions & 38 deletions components/apify/actions/scrape-single-url/scrape-single-url.mjs
@@ -1,57 +1,25 @@
import apify from "../../apify.app.mjs";
import { ACTOR_ID } from "../../common/constants.mjs";
import { gotScraping } from "got-scraping";

export default {
key: "apify-scrape-single-url",
name: "Scrape Single URL",
description: "Executes a scraper on a specific website and returns its content as text. This action is perfect for extracting content from a single page.",
version: "0.0.4",
description: "Executes a scraper on a specific website and returns its content as HTML. This action is perfect for extracting content from a single page. [See the documentation](https://docs.apify.com/sdk/js/docs/examples/crawl-single-url)",
version: "0.1.0",
type: "action",
props: {
apify,
url: {
type: "string",
label: "URL",
description: "The URL of the web page to scrape.",
optional: false,
},
crawlerType: {
type: "string",
label: "Crawler Type",
description: "Select the crawling engine:\n- **Headless web browser** - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Stealthy web browser** (default) - Another headless web browser with anti-blocking measures enabled. Try this if you encounter bot protection while scraping. For best performance, use with Apify Proxy residential IPs. \n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.",
options: [
{
label: "Headless browser (stealthy Firefox+Playwright) - Very reliable, best in avoiding blocking, but might be slow",
value: "playwright:firefox",
},
{
label: "Headless browser (Chrome+Playwright) - Reliable, but might be slow",
value: "playwright:chrome",
},
{
label: "Raw HTTP client (Cheerio) - Extremely fast, but cannot handle dynamic content",
value: "cheerio",
},
],
},
},
async run({ $ }) {
const response = await this.apify.runActor({
$,
actorId: ACTOR_ID,
data: {
crawlerType: this.crawlerType,
maxCrawlDepth: 0,
maxCrawlPages: 1,
maxResults: 1,
startUrls: [
{
url: this.url,
},
],
},
const { body } = await gotScraping({
url: this.url,
});
$.export("$summary", `Successfully scraped content from ${this.url}`);
return response;
return body;
},
};
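
For readability, here is a sketch of how the action plausibly reads after this change, reassembled from the added and unchanged lines above. It assumes the `apify` app import and prop are kept so the action stays associated with the Apify app; only the pieces shown in the diff are certain.

```javascript
import apify from "../../apify.app.mjs"; // assumed kept so the action remains tied to the Apify app
import { gotScraping } from "got-scraping";

export default {
  key: "apify-scrape-single-url",
  name: "Scrape Single URL",
  description: "Executes a scraper on a specific website and returns its content as HTML. This action is perfect for extracting content from a single page. [See the documentation](https://docs.apify.com/sdk/js/docs/examples/crawl-single-url)",
  version: "0.1.0",
  type: "action",
  props: {
    apify,
    url: {
      type: "string",
      label: "URL",
      description: "The URL of the web page to scrape.",
      optional: false,
    },
  },
  async run({ $ }) {
    // Fetch the page directly with got-scraping instead of running the Apify actor
    const { body } = await gotScraping({
      url: this.url,
    });
    $.export("$summary", `Successfully scraped content from ${this.url}`);
    // Return the raw HTML body rather than the actor run response
    return body;
  },
};
```

The net effect of the diff: the `ACTOR_ID`/`runActor` call and the `crawlerType` prop are dropped, and the page is fetched with a single `gotScraping` request whose HTML body is returned directly.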
5 changes: 3 additions & 2 deletions components/apify/package.json
@@ -1,6 +1,6 @@
{
"name": "@pipedream/apify",
"version": "0.2.2",
"version": "0.3.0",
"description": "Pipedream Apify Components",
"main": "apify.app.mjs",
"keywords": [
@@ -14,6 +14,7 @@
},
"dependencies": {
"@apify/consts": "^2.41.0",
"@pipedream/platform": "^3.0.3"
"@pipedream/platform": "^3.0.3",
"got-scraping": "^4.1.2"
}
}
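
As a quick illustration of the newly added `got-scraping` dependency (the URL and logging below are illustrative, not part of the PR):

```javascript
import { gotScraping } from "got-scraping";

// got-scraping wraps got with browser-like headers and fingerprints
// to reduce the chance of simple bot blocking.
const { statusCode, body } = await gotScraping({
  url: "https://example.com",
});

console.log(statusCode, body.slice(0, 200));
```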
98 changes: 94 additions & 4 deletions pnpm-lock.yaml

Some generated files are not rendered by default.