@@ -22,9 +22,26 @@ This demo showcases how to use Trigger.dev with Python to build a web crawler th
 - Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script
 - [Crawl4AI](https://github.com/unclecode/crawl4ai), an open-source, LLM-friendly web crawler
 - A custom [Playwright extension](https://playwright.dev/) to create a headless Chromium browser
+- Proxy support
+
+## Using proxies
+
 <ScrapingWarning />
 
+Some popular proxy services are:
+
+- [Smartproxy](https://smartproxy.com/)
+- [Bright Data](https://brightdata.com/)
+- [Browserbase](https://browserbase.com/)
+- [Oxylabs](https://oxylabs.io/)
+- [ScrapingBee](https://scrapingbee.com/)
+
+Once you have chosen a proxy service, set the following environment variables in your `.env` file, and add them to your project in the Trigger.dev dashboard:
+
+- `PROXY_URL`: The URL of your proxy server (e.g., `http://proxy.example.com:8080`)
+- `PROXY_USERNAME`: Username for authenticated proxies (optional)
+- `PROXY_PASSWORD`: Password for authenticated proxies (optional)
+
 ## GitHub repo
 
 <Card
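As a concrete example, a local `.env` for an authenticated proxy might look like the following. All values are placeholders; drop the username and password lines if your proxy does not require authentication:

```env
PROXY_URL=http://proxy.example.com:8080
PROXY_USERNAME=your-proxy-username
PROXY_PASSWORD=your-proxy-password
```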
@@ -113,7 +130,14 @@ export const convertUrlToMarkdown = schemaTask({
     url: z.string().url(),
   }),
   run: async (payload) => {
-    const result = await python.runScript("./src/python/crawl-url.py", [payload.url]);
+    // Pass through any proxy environment variables
+    const env = {
+      PROXY_URL: process.env.PROXY_URL,
+      PROXY_USERNAME: process.env.PROXY_USERNAME,
+      PROXY_PASSWORD: process.env.PROXY_PASSWORD,
+    };
+
+    const result = await python.runScript("./src/python/crawl-url.py", [payload.url], { env });
 
     logger.debug("convert-url-to-markdown", {
       url: payload.url,
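One nice property of this design: the proxy settings are read in the TypeScript task and handed to the script as plain environment variables, so `crawl-url.py` stays an ordinary Python program with no Trigger.dev-specific configuration and can be run and tested on its own (see the local test sketch after the Python script below).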
@@ -142,10 +166,34 @@ The Python script is a simple script using Crawl4AI that takes a URL and returns
 ```python src/python/crawl-url.py
 import asyncio
 import sys
+import os
 from crawl4ai import *
+from crawl4ai.async_configs import BrowserConfig
 
 async def main(url: str):
-    async with AsyncWebCrawler() as crawler:
+    # Get the proxy configuration from environment variables
+    proxy_url = os.environ.get("PROXY_URL")
+    proxy_username = os.environ.get("PROXY_USERNAME")
+    proxy_password = os.environ.get("PROXY_PASSWORD")
+
+    # Configure the proxy
+    browser_config = None
+    if proxy_url:
+        if proxy_username and proxy_password:
+            # Use an authenticated proxy
+            proxy_config = {
+                "server": proxy_url,
+                "username": proxy_username,
+                "password": proxy_password,
+            }
+            browser_config = BrowserConfig(proxy_config=proxy_config)
+        else:
+            # Use a simple, unauthenticated proxy
+            browser_config = BrowserConfig(proxy=proxy_url)
+    else:
+        browser_config = BrowserConfig()
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
         result = await crawler.arun(
             url=url,
         )
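If you want to verify the proxy wiring before running anything on Trigger.dev, you can invoke the script directly with the proxy variables set. This is a minimal local sketch, assuming `crawl4ai` (and its Playwright browsers) are installed in your environment; the proxy URL is a placeholder:

```python
# smoke_test.py -- hedged local test, not part of the demo repo
import os
import subprocess

# Placeholder proxy endpoint; substitute your provider's URL.
env = {**os.environ, "PROXY_URL": "http://proxy.example.com:8080"}

# Run the crawler script exactly as the Trigger.dev task does:
# the URL to crawl is the first command-line argument.
subprocess.run(
    ["python", "src/python/crawl-url.py", "https://example.com"],
    env=env,
    check=True,
)
```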