
Commit f457792

docs: add deploy guide for google functions (#1078)
### Description

- add deploy guide for google functions

### Issues

- Closes: #707
1 parent 26b6c00 commit f457792

File tree

4 files changed: +115, -8 lines

docs/deployment/code_examples/google/google_example.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# mypy: disable-error-code="misc"
import asyncio
import json
from datetime import timedelta

import functions_framework
from flask import Request, Response

from crawlee import service_locator
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end


async def main() -> str:
    crawler = BeautifulSoupCrawler(
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }

        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # highlight-start
    # Extract data saved in `Dataset`
    data = await crawler.get_data()
    # Serialize to json string and return
    return json.dumps(data.items)
    # highlight-end


@functions_framework.http
def crawlee_run(request: Request) -> Response:
    # You can pass data to your crawler using `request`
    function_id = request.headers['Function-Execution-Id']
    response_str = asyncio.run(main())

    # Return a response with the crawling results
    return Response(response=response_str, status=200)
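
The entry point above only reads the `Function-Execution-Id` header, but as the comment notes, the incoming `request` can also carry input for the crawler. Below is a minimal sketch of that idea, assuming a hypothetical `start_url` query parameter and a `main` coroutine modified to accept it; neither is part of the committed example.

```python
import asyncio

import functions_framework
from flask import Request, Response


async def main(start_url: str) -> str:
    # Stand-in for the `main` coroutine from the example above,
    # modified to accept the start URL as a parameter.
    return f'crawled {start_url}'


@functions_framework.http
def crawlee_run(request: Request) -> Response:
    # Read an optional query parameter, e.g. ?start_url=https://example.com,
    # falling back to a default when it is missing.
    start_url = request.args.get('start_url', 'https://crawlee.dev')
    response_str = asyncio.run(main(start_url))
    return Response(response=response_str, status=200)
```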

docs/deployment/google_cloud.mdx

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
---
id: gcp-functions
title: Deploy to GCP Cloud Functions
description: Prepare your crawler to run in Cloud functions on Google Cloud Platform
---

import ApiLink from '@site/src/components/ApiLink';

import CodeBlock from '@theme/CodeBlock';

import GoogleFunctions from '!!raw-loader!./code_examples/google/google_example.py';

## Updating the project

For the project foundation, use <ApiLink to="class/BeautifulSoupCrawler">BeautifulSoupCrawler</ApiLink> as described in this [example](../examples/beautifulsoup-crawler).

Add [functions-framework](https://pypi.org/project/functions-framework/) to your dependencies file `requirements.txt`. If you're using a project manager like `poetry` or `uv`, export your dependencies to `requirements.txt`.

Update the project code to make it compatible with Cloud Functions and return data in JSON format. Also add an entry point that Cloud Functions will use to run the project.

<CodeBlock className="language-python">
    {GoogleFunctions.replace(/^.*?\n/, '')}
</CodeBlock>

You can test your project locally. Start the server by running:

```bash
functions-framework --target=crawlee_run
```

Then make a GET request to `http://127.0.0.1:8080/`, for example in your browser.

## Deploying to Google Cloud Platform

In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout.

When deploying, select **"Use an inline editor to create a function"**. This allows you to configure the project using only the Google Cloud Console dashboard.

Using the `inline editor`, update the function files according to your project. **Make sure** to update the `requirements.txt` file to match your project's dependencies.

Also, make sure to set the **Function entry point** to the name of the function decorated with `@functions_framework.http`, which in our case is `crawlee_run`.

After the Function deploys, you can test it by clicking the "Test" button. This button opens a popup with a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.
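
The guide above mentions exporting dependencies to `requirements.txt` when using `poetry` or `uv` and then testing the function locally. One possible command sequence is sketched below, assuming one of those tools manages the project; the exact export flags vary between tool versions.

```bash
# Export pinned dependencies to requirements.txt (pick the tool you use).
poetry export --format requirements.txt --output requirements.txt --without-hashes
# or:
uv export --format requirements-txt > requirements.txt

# Start the local development server provided by functions-framework.
functions-framework --target=crawlee_run

# In another terminal, call the function and inspect the JSON results.
curl http://127.0.0.1:8080/
```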
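
The committed guide deploys through the inline editor in the Google Cloud Console. As an alternative sketch that is not part of this commit, a local checkout containing `main.py` and `requirements.txt` could be deployed with the `gcloud` CLI; the function name, runtime, region, and resource limits below are placeholder values to adjust for your project.

```bash
# Deploy an HTTP-triggered 2nd-gen Cloud Function from the current directory.
gcloud functions deploy crawlee-run \
    --gen2 \
    --runtime=python312 \
    --region=europe-west1 \
    --source=. \
    --entry-point=crawlee_run \
    --trigger-http \
    --memory=512Mi \
    --timeout=120s \
    --allow-unauthenticated
```

The `--entry-point` value must match the function decorated with `@functions_framework.http`, which is `crawlee_run` in the example above.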

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -227,6 +227,8 @@ module = [
     "apify", # Example code shows integration of apify and crawlee.
     "apify_fingerprint_datapoints", # Untyped and stubs not available
     "camoufox", # Example code shows integration of camoufox and crawlee.
+    "flask", # Example code shows deploy on Google Cloud.
+    "functions_framework", # Example code shows deploy on Google Cloud.
     "jaro", # Untyped and stubs not available
     "loguru", # Example code shows integration of loguru and crawlee for JSON logging.
     "sklearn.linear_model", # Untyped and stubs not available

website/sidebars.js

Lines changed: 8 additions & 8 deletions
@@ -60,14 +60,14 @@ module.exports = {
       //     'deployment/aws-browsers',
       //   ],
       // },
-      // {
-      //   type: 'category',
-      //   label: 'Deploy to Google Cloud',
-      //   items: [
-      //     'deployment/gcp-cheerio',
-      //     'deployment/gcp-browsers',
-      //   ],
-      // },
+      {
+        type: 'category',
+        label: 'Deploy to Google Cloud',
+        items: [
+          'deployment/gcp-functions',
+          // 'deployment/gcp-browsers',
+        ],
+      },
     ],
   },
   {
