
Commit 675b8d3

fix: Fix Python templates to work with Crawlee v0.5 (#308)
- Pin Crawlee & Scrapy versions
- Fix the Scrapy template
1 parent f18cded commit 675b8d3

14 files changed (+67 lines, −57 lines)

templates/manifest.json

Lines changed: 30 additions & 30 deletions
@@ -2,36 +2,6 @@
     "consoleReadmeSuffixUrl": "https://raw.githubusercontent.com/apify/actor-templates/master/templates/console_readme_suffix.md",
     "localReadmeSuffixUrl": "https://raw.githubusercontent.com/apify/actor-templates/master/templates/local_readme_suffix.md",
     "templates": [
-        {
-            "id": "python-scrapy",
-            "name": "python-scrapy",
-            "label": "Scrapy",
-            "category": "python",
-            "technologies": [
-                "scrapy"
-            ],
-            "description": "This example Scrapy spider scrapes page titles from URLs defined in input parameter. It shows how to use Apify SDK for Python and Scrapy pipelines to save results.",
-            "messages": {
-                "postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
-            },
-            "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-scrapy.zip?raw=true",
-            "defaultRunOptions": {
-                "build": "latest",
-                "memoryMbytes": 4096,
-                "timeoutSecs": 3600
-            },
-            "showcaseFiles": [
-                "src/main.py",
-                "src/spiders/title.py",
-                "src/__main__.py",
-                "src/items.py",
-                "src/pipelines.py",
-                "src/settings.py"
-            ],
-            "useCases": [
-                "WEB_SCRAPING"
-            ]
-        },
         {
             "id": "python-start",
             "name": "python-start",
@@ -185,6 +155,36 @@
                 "STARTER"
             ]
         },
+        {
+            "id": "python-scrapy",
+            "name": "python-scrapy",
+            "label": "Scrapy",
+            "category": "python",
+            "technologies": [
+                "scrapy"
+            ],
+            "description": "This example Scrapy spider scrapes page titles from URLs defined in input parameter. It shows how to use Apify SDK for Python and Scrapy pipelines to save results.",
+            "messages": {
+                "postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
+            },
+            "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-scrapy.zip?raw=true",
+            "defaultRunOptions": {
+                "build": "latest",
+                "memoryMbytes": 4096,
+                "timeoutSecs": 3600
+            },
+            "showcaseFiles": [
+                "src/main.py",
+                "src/spiders/title.py",
+                "src/__main__.py",
+                "src/items.py",
+                "src/pipelines.py",
+                "src/settings.py"
+            ],
+            "useCases": [
+                "WEB_SCRAPING"
+            ]
+        },
         {
             "id": "python-crawlee-beautifulsoup",
             "name": "python-crawlee-beautifulsoup",

templates/python-crawlee-beautifulsoup/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/

 apify < 3.0
-crawlee[beautifulsoup]
+crawlee[beautifulsoup] < 0.6
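
The new upper bound keeps the template on the Crawlee 0.5 series this commit targets; presumably it is there so a future 0.6 release cannot break the template again before it is updated.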

templates/python-crawlee-beautifulsoup/src/main.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 """

 from apify import Actor, Request
-from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


 async def main() -> None:
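
As a reference for the renamed module, a minimal sketch of a crawler using the new crawlee.crawlers import path (illustrative only; the handler logic and start URL below are not part of this commit):

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # Store the page title of every visited URL in the default dataset.
        title = context.soup.title.string if context.soup.title else None
        await context.push_data({'url': context.request.url, 'title': title})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())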

templates/python-crawlee-playwright/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/

 apify < 3.0
-crawlee[playwright]
+crawlee[playwright] < 0.6

templates/python-crawlee-playwright/src/main.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@
 """

 from apify import Actor, Request
-from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


 async def main() -> None:
@@ -32,7 +32,7 @@ async def main() -> None:
             # Limit the crawl to max requests. Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
             headless=True,
-            browser_options={
+            browser_launch_options={
                 'args': ['--disable-gpu'],
             }
         )
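
For context, a minimal sketch combining both renamed pieces, the crawlee.crawlers import and the browser_launch_options argument (illustrative; the handler and start URL are not part of this commit):

import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=50,
        headless=True,
        # Renamed from browser_options in Crawlee v0.5; passed to Playwright's browser launch.
        browser_launch_options={
            'args': ['--disable-gpu'],
        },
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # Store the title of each rendered page.
        await context.push_data({'url': context.request.url, 'title': await context.page.title()})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())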

templates/python-scrapy/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -2,5 +2,5 @@
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/

 apify[scrapy] < 3.0
-nest-asyncio
-scrapy
+nest-asyncio ~= 1.6
+scrapy ~= 2.12
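
For reference, ~= is the PEP 440 compatible-release operator: nest-asyncio ~= 1.6 is equivalent to nest-asyncio >= 1.6, < 2.0, and scrapy ~= 2.12 to scrapy >= 2.12, < 3.0, so compatible minor and patch updates are still allowed while the next major release is excluded.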

templates/python-scrapy/src/__main__.py

Lines changed: 5 additions & 0 deletions
@@ -13,10 +13,13 @@
 # We need to configure the logging first before we import anything else, so that nothing else imports
 # `scrapy.utils.log` before we patch it.
 from __future__ import annotations
+
 from logging import StreamHandler, getLogger
 from typing import Any
+
 from scrapy.utils import log as scrapy_logging
 from scrapy.utils.project import get_project_settings
+
 from apify.log import ActorLogFormatter

 # Define names of the loggers.
@@ -94,8 +97,10 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 # Now we can do the rest of the setup.
 import asyncio
 import os
+
 import nest_asyncio
 from scrapy.utils.reactor import install_reactor
+
 from .main import main

 # For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
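
The two hunks above only insert blank lines between import groups. For orientation, the compatibility setup this file performs looks roughly like the condensed sketch below; it is an assumption based on the imports visible in the diff, and the 'src.settings' module name is illustrative:

import asyncio
import os

import nest_asyncio
from scrapy.utils.reactor import install_reactor

from .main import main

# Install Twisted's asyncio-backed reactor so Scrapy and the Apify SDK can share one event loop,
# and patch asyncio with nest_asyncio so the running loop can be re-entered.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Point Scrapy at the project settings and run the Actor's async entry point.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
asyncio.run(main())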

templates/python-scrapy/src/items.py

Lines changed: 1 addition & 3 deletions
@@ -11,9 +11,7 @@


 class TitleItem(Item):
-    """
-    Represents a title item scraped from a web page.
-    """
+    """Represents a title item scraped from a web page."""

     url = Field()
     title = Field()

templates/python-scrapy/src/middlewares.py

Lines changed: 8 additions & 5 deletions
@@ -10,14 +10,17 @@
 """

 from __future__ import annotations
-from typing import Generator, Iterable

+from typing import TYPE_CHECKING
+
+# Useful for handling different item types with a single interface
 from scrapy import Request, Spider, signals
-from scrapy.crawler import Crawler
-from scrapy.http import Response

-# useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
+if TYPE_CHECKING:
+    from collections.abc import Generator, Iterable
+
+    from scrapy.crawler import Crawler
+    from scrapy.http import Response


 class TitleSpiderMiddleware:
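
Moving the typing-only imports behind TYPE_CHECKING keeps them available to static type checkers while skipping them at runtime, which is safe here because from __future__ import annotations defers annotation evaluation. A minimal sketch of the pattern (the names are illustrative, not from the template):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; never imported at runtime.
    from collections.abc import Iterable


def first_three(urls: Iterable[str]) -> list[str]:
    # The annotation stays a plain string at runtime, so Iterable is not needed then.
    return list(urls)[:3]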

templates/python-scrapy/src/pipelines.py

Lines changed: 1 addition & 3 deletions
@@ -13,9 +13,7 @@


 class TitleItemPipeline:
-    """
-    This item pipeline defines processing steps for TitleItem objects scraped by spiders.
-    """
+    """This item pipeline defines processing steps for TitleItem objects scraped by spiders."""

     def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
         # Do something with the item here, such as cleaning it or persisting it to a database
