
Commit 675b8d3

fix: Fix Python templates to work with Crawlee v0.5 (#308)
- Pin Crawlee & Scrapy versions
- Fix the Scrapy template
1 parent f18cded commit 675b8d3

14 files changed (+67 lines, −57 lines)

templates/manifest.json

Lines changed: 30 additions & 30 deletions
@@ -2,36 +2,6 @@
     "consoleReadmeSuffixUrl": "https://raw.githubusercontent.com/apify/actor-templates/master/templates/console_readme_suffix.md",
     "localReadmeSuffixUrl": "https://raw.githubusercontent.com/apify/actor-templates/master/templates/local_readme_suffix.md",
     "templates": [
-        {
-            "id": "python-scrapy",
-            "name": "python-scrapy",
-            "label": "Scrapy",
-            "category": "python",
-            "technologies": [
-                "scrapy"
-            ],
-            "description": "This example Scrapy spider scrapes page titles from URLs defined in input parameter. It shows how to use Apify SDK for Python and Scrapy pipelines to save results.",
-            "messages": {
-                "postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
-            },
-            "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-scrapy.zip?raw=true",
-            "defaultRunOptions": {
-                "build": "latest",
-                "memoryMbytes": 4096,
-                "timeoutSecs": 3600
-            },
-            "showcaseFiles": [
-                "src/main.py",
-                "src/spiders/title.py",
-                "src/__main__.py",
-                "src/items.py",
-                "src/pipelines.py",
-                "src/settings.py"
-            ],
-            "useCases": [
-                "WEB_SCRAPING"
-            ]
-        },
         {
             "id": "python-start",
             "name": "python-start",
@@ -185,6 +155,36 @@
                 "STARTER"
             ]
         },
+        {
+            "id": "python-scrapy",
+            "name": "python-scrapy",
+            "label": "Scrapy",
+            "category": "python",
+            "technologies": [
+                "scrapy"
+            ],
+            "description": "This example Scrapy spider scrapes page titles from URLs defined in input parameter. It shows how to use Apify SDK for Python and Scrapy pipelines to save results.",
+            "messages": {
+                "postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
+            },
+            "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-scrapy.zip?raw=true",
+            "defaultRunOptions": {
+                "build": "latest",
+                "memoryMbytes": 4096,
+                "timeoutSecs": 3600
+            },
+            "showcaseFiles": [
+                "src/main.py",
+                "src/spiders/title.py",
+                "src/__main__.py",
+                "src/items.py",
+                "src/pipelines.py",
+                "src/settings.py"
+            ],
+            "useCases": [
+                "WEB_SCRAPING"
+            ]
+        },
         {
             "id": "python-crawlee-beautifulsoup",
             "name": "python-crawlee-beautifulsoup",

templates/python-crawlee-beautifulsoup/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/

 apify < 3.0
-crawlee[beautifulsoup]
+crawlee[beautifulsoup] < 0.6
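
The new upper bound keeps the template on the Crawlee 0.5 series this commit targets; presumably it is there so a future 0.6 release cannot break the template again before it is updated.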

templates/python-crawlee-beautifulsoup/src/main.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 """

 from apify import Actor, Request
-from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


 async def main() -> None:
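
As a reference for the renamed module, a minimal sketch of a crawler using the new crawlee.crawlers import path (illustrative only; the handler logic and start URL below are not part of this commit):

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # Store the page title of every visited URL in the default dataset.
        title = context.soup.title.string if context.soup.title else None
        await context.push_data({'url': context.request.url, 'title': title})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())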

templates/python-crawlee-playwright/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/

 apify < 3.0
-crawlee[playwright]
+crawlee[playwright] < 0.6

templates/python-crawlee-playwright/src/main.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@
 """

 from apify import Actor, Request
-from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


 async def main() -> None:
@@ -32,7 +32,7 @@ async def main() -> None:
             # Limit the crawl to max requests. Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
             headless=True,
-            browser_options={
+            browser_launch_options={
                 'args': ['--disable-gpu'],
             }
         )
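
For context, a minimal sketch combining both renamed pieces, the crawlee.crawlers import and the browser_launch_options argument (illustrative; the handler and start URL are not part of this commit):

import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=50,
        headless=True,
        # Renamed from browser_options in Crawlee v0.5; passed to Playwright's browser launch.
        browser_launch_options={
            'args': ['--disable-gpu'],
        },
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # Store the title of each rendered page.
        await context.push_data({'url': context.request.url, 'title': await context.page.title()})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())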

templates/python-scrapy/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -2,5 +2,5 @@
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/

 apify[scrapy] < 3.0
-nest-asyncio
-scrapy
+nest-asyncio ~= 1.6
+scrapy ~= 2.12
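
For reference, ~= is the PEP 440 compatible-release operator: nest-asyncio ~= 1.6 is equivalent to nest-asyncio >= 1.6, < 2.0, and scrapy ~= 2.12 to scrapy >= 2.12, < 3.0, so compatible minor and patch updates are still allowed while the next major release is excluded.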

templates/python-scrapy/src/__main__.py

Lines changed: 5 additions & 0 deletions
@@ -13,10 +13,13 @@
 # We need to configure the logging first before we import anything else, so that nothing else imports
 # `scrapy.utils.log` before we patch it.
 from __future__ import annotations
+
 from logging import StreamHandler, getLogger
 from typing import Any
+
 from scrapy.utils import log as scrapy_logging
 from scrapy.utils.project import get_project_settings
+
 from apify.log import ActorLogFormatter

 # Define names of the loggers.
@@ -94,8 +97,10 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 # Now we can do the rest of the setup.
 import asyncio
 import os
+
 import nest_asyncio
 from scrapy.utils.reactor import install_reactor
+
 from .main import main

 # For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
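
The two hunks above only insert blank lines between import groups. For orientation, the compatibility setup this file performs looks roughly like the condensed sketch below; it is an assumption based on the imports visible in the diff, and the 'src.settings' module name is illustrative:

import asyncio
import os

import nest_asyncio
from scrapy.utils.reactor import install_reactor

from .main import main

# Install Twisted's asyncio-backed reactor so Scrapy and the Apify SDK can share one event loop,
# and patch asyncio with nest_asyncio so the running loop can be re-entered.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Point Scrapy at the project settings and run the Actor's async entry point.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
asyncio.run(main())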

templates/python-scrapy/src/items.py

Lines changed: 1 addition & 3 deletions
@@ -11,9 +11,7 @@


 class TitleItem(Item):
-    """
-    Represents a title item scraped from a web page.
-    """
+    """Represents a title item scraped from a web page."""

     url = Field()
     title = Field()

templates/python-scrapy/src/middlewares.py

Lines changed: 8 additions & 5 deletions
@@ -10,14 +10,17 @@
 """

 from __future__ import annotations
-from typing import Generator, Iterable

+from typing import TYPE_CHECKING
+
+# Useful for handling different item types with a single interface
 from scrapy import Request, Spider, signals
-from scrapy.crawler import Crawler
-from scrapy.http import Response

-# useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
+if TYPE_CHECKING:
+    from collections.abc import Generator, Iterable
+
+    from scrapy.crawler import Crawler
+    from scrapy.http import Response


 class TitleSpiderMiddleware:
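
Moving the typing-only imports behind TYPE_CHECKING keeps them available to static type checkers while skipping them at runtime, which is safe here because from __future__ import annotations defers annotation evaluation. A minimal sketch of the pattern (the names are illustrative, not from the template):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; never imported at runtime.
    from collections.abc import Iterable


def first_three(urls: Iterable[str]) -> list[str]:
    # The annotation stays a plain string at runtime, so Iterable is not needed then.
    return list(urls)[:3]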

templates/python-scrapy/src/pipelines.py

Lines changed: 1 addition & 3 deletions
@@ -13,9 +13,7 @@


 class TitleItemPipeline:
-    """
-    This item pipeline defines processing steps for TitleItem objects scraped by spiders.
-    """
+    """This item pipeline defines processing steps for TitleItem objects scraped by spiders."""

     def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
         # Do something with the item here, such as cleaning it or persisting it to a database
