
Commit f457792

docs: add deploy guide for google functions (#1078)
### Description

- add deploy guide for google functions

### Issues

- Closes: #707
1 parent 26b6c00 commit f457792

File tree

4 files changed: +115, -8 lines

docs/deployment/code_examples/google/google_example.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# mypy: disable-error-code="misc"
import asyncio
import json
from datetime import timedelta

import functions_framework
from flask import Request, Response

from crawlee import service_locator
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end


async def main() -> str:
    crawler = BeautifulSoupCrawler(
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }

        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # highlight-start
    # Extract data saved in `Dataset`
    data = await crawler.get_data()
    # Serialize to json string and return
    return json.dumps(data.items)
    # highlight-end


@functions_framework.http
def crawlee_run(request: Request) -> Response:
    # You can pass data to your crawler using `request`
    function_id = request.headers['Function-Execution-Id']
    response_str = asyncio.run(main())

    # Return a response with the crawling results
    return Response(response=response_str, status=200)
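
The entry point above only reads the `Function-Execution-Id` header, but as the comment notes, the incoming `request` can also carry input for the crawler. Below is a minimal sketch of that idea, assuming a hypothetical `start_url` query parameter and a `main` coroutine modified to accept it; neither is part of the committed example.

```python
import asyncio

import functions_framework
from flask import Request, Response


async def main(start_url: str) -> str:
    # Stand-in for the `main` coroutine from the example above,
    # modified to accept the start URL as a parameter.
    return f'crawled {start_url}'


@functions_framework.http
def crawlee_run(request: Request) -> Response:
    # Read an optional query parameter, e.g. ?start_url=https://example.com,
    # falling back to a default when it is missing.
    start_url = request.args.get('start_url', 'https://crawlee.dev')
    response_str = asyncio.run(main(start_url))
    return Response(response=response_str, status=200)
```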

docs/deployment/google_cloud.mdx

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
---
id: gcp-functions
title: Deploy to GCP Cloud Functions
description: Prepare your crawler to run in Cloud functions on Google Cloud Platform
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py';

## Updating the project

For the project foundation, use <ApiLink to="class/BeautifulSoupCrawler">BeautifulSoupCrawler</ApiLink> as described in this [example](../examples/beautifulsoup-crawler).

Add [functions-framework](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`.

Update the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project.

<CodeBlock className="language-python">
    {GoogleFunctions.replace(/^.*?\n/, '')}
</CodeBlock>

You can test your project locally. Start the server by running:

```bash
functions-framework --target=crawlee_run
```

Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser.

## Deploying to Google Cloud Platform

In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout.

When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard.

Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies.

Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`.

After the Function deploys, you can test it by clicking the "Test" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.
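
The guide above mentions exporting dependencies to `requirements.txt` when using `poetry` or `uv` and then testing the function locally. One possible command sequence is sketched below, assuming one of those tools manages the project; the exact export flags vary between tool versions.

```bash
# Export pinned dependencies to requirements.txt (pick the tool you use).
poetry export --format requirements.txt --output requirements.txt --without-hashes
# or:
uv export --format requirements-txt > requirements.txt

# Start the local development server provided by functions-framework.
functions-framework --target=crawlee_run

# In another terminal, call the function and inspect the JSON results.
curl http://127.0.0.1:8080/
```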
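
The committed guide deploys through the inline editor in the Google Cloud Console. As an alternative sketch that is not part of this commit, a local checkout containing `main.py` and `requirements.txt` could be deployed with the `gcloud` CLI; the function name, runtime, region, and resource limits below are placeholder values to adjust for your project.

```bash
# Deploy an HTTP-triggered 2nd-gen Cloud Function from the current directory.
gcloud functions deploy crawlee-run \
    --gen2 \
    --runtime=python312 \
    --region=europe-west1 \
    --source=. \
    --entry-point=crawlee_run \
    --trigger-http \
    --memory=512Mi \
    --timeout=120s \
    --allow-unauthenticated
```

The `--entry-point` value must match the function decorated with `@functions_framework.http`, which is `crawlee_run` in the example above.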

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -227,6 +227,8 @@ module = [
     "apify", # Example code shows integration of apify and crawlee.
     "apify_fingerprint_datapoints", # Untyped and stubs not available
     "camoufox", # Example code shows integration of camoufox and crawlee.
+    "flask", # Example code shows deploy on Google Cloud.
+    "functions_framework", # Example code shows deploy on Google Cloud.
     "jaro", # Untyped and stubs not available
     "loguru", # Example code shows integration of loguru and crawlee for JSON logging.
     "sklearn.linear_model", # Untyped and stubs not available

website/sidebars.js

Lines changed: 8 additions & 8 deletions
@@ -60,14 +60,14 @@ module.exports = {
       //     'deployment/aws-browsers',
       //   ],
       // },
-      // {
-      //   type: 'category',
-      //   label: 'Deploy to Google Cloud',
-      //   items: [
-      //     'deployment/gcp-cheerio',
-      //     'deployment/gcp-browsers',
-      //   ],
-      // },
+      {
+        type: 'category',
+        label: 'Deploy to Google Cloud',
+        items: [
+          'deployment/gcp-functions',
+          // 'deployment/gcp-browsers',
+        ],
+      },
     ],
   },
   {
