
Commit b96ef88

fix: fix Scrapy integration
1 parent 56a9a93 · commit b96ef88

31 files changed (+1085 -243 lines)

docs/02_guides/05_scrapy.mdx

Lines changed: 6 additions & 7 deletions
@@ -7,11 +7,11 @@ import CodeBlock from '@theme/CodeBlock';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-import UnderscoreMainExample from '!!raw-loader!./code/scrapy_src/__main__.py';
-import MainExample from '!!raw-loader!./code/scrapy_src/main.py';
-import ItemsExample from '!!raw-loader!./code/scrapy_src/items.py';
-import SettingsExample from '!!raw-loader!./code/scrapy_src/settings.py';
-import TitleSpiderExample from '!!raw-loader!./code/scrapy_src/spiders/title.py';
+import UnderscoreMainExample from '!!raw-loader!./code/_scrapy_project/src/__main__.py';
+import MainExample from '!!raw-loader!./code/_scrapy_project/src/main.py';
+import ItemsExample from '!!raw-loader!./code/_scrapy_project/src/items.py';
+import SettingsExample from '!!raw-loader!./code/_scrapy_project/src/settings.py';
+import TitleSpiderExample from '!!raw-loader!./code/_scrapy_project/src/spiders/title.py';

 [Scrapy](https://scrapy.org/) is an open-source web scraping framework written in Python. It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites, handle pagination and navigation.

@@ -92,5 +92,4 @@ Here is an example of a Scrapy Actor that scrapes the titles of web pages and en

 ## Conclusion

-In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects
-using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
+In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects using Scrapy and the Apify SDK, and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
Dockerfile (new file)

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+FROM apify/actor-python:3.12
+
+COPY pyproject.toml ./
+
+RUN echo "Python version:" \
+    && python --version \
+    && echo "Pip version:" \
+    && pip --version \
+    && echo "Installing Poetry:" \
+    && pip install --no-cache-dir poetry~=1.8.0 \
+    && echo "Installing dependencies:" \
+    && poetry config virtualenvs.create false \
+    && poetry install --only main --no-interaction --no-ansi \
+    && rm -rf /tmp/.poetry-cache \
+    && echo "All installed Python packages:" \
+    && pip freeze
+
+COPY . ./
+
+RUN python3 -m compileall -q .
+
+CMD ["python3", "-m", "src"]
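The CMD runs the project as a module, so Python executes src/__main__.py — the same file the guide imports above as UnderscoreMainExample. A minimal sketch of such an entrypoint, assuming src/main.py exposes an async main() coroutine (the real template file wires in additional Scrapy/asyncio plumbing):

```python
import asyncio

from .main import main  # assumes src/main.py defines `async def main()`

# `python3 -m src` runs this module, which just drives the async entrypoint.
asyncio.run(main())
```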
actor.json (new file)

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+    "actorSpecification": 1,
+    "name": "getting-started-python-scrapy",
+    "title": "Getting started with Python and Scrapy",
+    "description": "Scrapes titles of websites using Scrapy.",
+    "version": "0.0",
+    "buildTag": "latest",
+    "meta": {
+        "templateId": "python-scrapy"
+    },
+    "input": "./input_schema.json",
+    "dockerfile": "./Dockerfile"
+}
input_schema.json (new file)

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+{
+    "title": "Python Scrapy Scraper",
+    "type": "object",
+    "schemaVersion": 1,
+    "properties": {
+        "startUrls": {
+            "title": "Start URLs",
+            "type": "array",
+            "description": "URLs to start with",
+            "editor": "requestListSources",
+            "prefill": [{ "url": "https://crawlee.dev/" }],
+            "default": [{ "url": "https://crawlee.dev/" }]
+        },
+        "allowedDomains": {
+            "title": "Allowed domains",
+            "type": "array",
+            "description": "Domains that the scraper is allowed to crawl.",
+            "editor": "json",
+            "prefill": ["crawlee.dev"],
+            "default": ["crawlee.dev"]
+        },
+        "proxyConfiguration": {
+            "sectionCaption": "Proxy and HTTP configuration",
+            "title": "Proxy configuration",
+            "type": "object",
+            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
+            "editor": "proxy",
+            "prefill": { "useApifyProxy": false },
+            "default": { "useApifyProxy": false }
+        }
+    },
+    "required": ["startUrls"]
+}
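At run time, the platform passes the Actor an input object conforming to this schema. A hedged sketch of reading it with the Apify SDK for Python — the variable names and logging are illustrative, not the template's actual main.py:

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Fall back to an empty dict if no input was provided.
        actor_input = await Actor.get_input() or {}
        start_urls = [item['url'] for item in actor_input.get('startUrls', [])]
        allowed_domains = actor_input.get('allowedDomains', [])
        proxy_configuration = actor_input.get('proxyConfiguration')
        Actor.log.info('Crawling %s (allowed domains: %s)', start_urls, allowed_domains)
```

Only startUrls is listed in `required`; the other two fields fall back to their `default` values when omitted.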
.dockerignore (new file)

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
+.git
+.mise.toml
+.nvim.lua
+storage
+
+# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+.python-version
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
