
Commit b96ef88

fix: fix Scrapy integration
1 parent 56a9a93 · commit b96ef88

31 files changed (+1085 -243 lines)

docs/02_guides/05_scrapy.mdx

Lines changed: 6 additions & 7 deletions
@@ -7,11 +7,11 @@ import CodeBlock from '@theme/CodeBlock';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-import UnderscoreMainExample from '!!raw-loader!./code/scrapy_src/__main__.py';
-import MainExample from '!!raw-loader!./code/scrapy_src/main.py';
-import ItemsExample from '!!raw-loader!./code/scrapy_src/items.py';
-import SettingsExample from '!!raw-loader!./code/scrapy_src/settings.py';
-import TitleSpiderExample from '!!raw-loader!./code/scrapy_src/spiders/title.py';
+import UnderscoreMainExample from '!!raw-loader!./code/_scrapy_project/src/__main__.py';
+import MainExample from '!!raw-loader!./code/_scrapy_project/src/main.py';
+import ItemsExample from '!!raw-loader!./code/_scrapy_project/src/items.py';
+import SettingsExample from '!!raw-loader!./code/_scrapy_project/src/settings.py';
+import TitleSpiderExample from '!!raw-loader!./code/_scrapy_project/src/spiders/title.py';

 [Scrapy](https://scrapy.org/) is an open-source web scraping framework written in Python. It provides a complete set of tools for web scraping, including the ability to define how to extract data from websites, handle pagination and navigation.

@@ -92,5 +92,4 @@ Here is an example of a Scrapy Actor that scrapes the titles of web pages and en

 ## Conclusion

-In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects
-using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
+In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects using Scrapy and the Apify SDK, and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
Dockerfile (new file)

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+FROM apify/actor-python:3.12
+
+COPY pyproject.toml ./
+
+RUN echo "Python version:" \
+    && python --version \
+    && echo "Pip version:" \
+    && pip --version \
+    && echo "Installing Poetry:" \
+    && pip install --no-cache-dir poetry~=1.8.0 \
+    && echo "Installing dependencies:" \
+    && poetry config virtualenvs.create false \
+    && poetry install --only main --no-interaction --no-ansi \
+    && rm -rf /tmp/.poetry-cache \
+    && echo "All installed Python packages:" \
+    && pip freeze
+
+COPY . ./
+
+RUN python3 -m compileall -q .
+
+CMD ["python3", "-m", "src"]
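The CMD runs the project as a module, so Python executes src/__main__.py — the same file the guide imports above as UnderscoreMainExample. A minimal sketch of such an entrypoint, assuming src/main.py exposes an async main() coroutine (the real template file wires in additional Scrapy/asyncio plumbing):

```python
import asyncio

from .main import main  # assumes src/main.py defines `async def main()`

# `python3 -m src` runs this module, which just drives the async entrypoint.
asyncio.run(main())
```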
actor.json (new file)

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+    "actorSpecification": 1,
+    "name": "getting-started-python-scrapy",
+    "title": "Getting started with Python and Scrapy",
+    "description": "Scrapes titles of websites using Scrapy.",
+    "version": "0.0",
+    "buildTag": "latest",
+    "meta": {
+        "templateId": "python-scrapy"
+    },
+    "input": "./input_schema.json",
+    "dockerfile": "./Dockerfile"
+}
input_schema.json (new file)

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+{
+    "title": "Python Scrapy Scraper",
+    "type": "object",
+    "schemaVersion": 1,
+    "properties": {
+        "startUrls": {
+            "title": "Start URLs",
+            "type": "array",
+            "description": "URLs to start with",
+            "editor": "requestListSources",
+            "prefill": [{ "url": "https://crawlee.dev/" }],
+            "default": [{ "url": "https://crawlee.dev/" }]
+        },
+        "allowedDomains": {
+            "title": "Allowed domains",
+            "type": "array",
+            "description": "Domains that the scraper is allowed to crawl.",
+            "editor": "json",
+            "prefill": ["crawlee.dev"],
+            "default": ["crawlee.dev"]
+        },
+        "proxyConfiguration": {
+            "sectionCaption": "Proxy and HTTP configuration",
+            "title": "Proxy configuration",
+            "type": "object",
+            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
+            "editor": "proxy",
+            "prefill": { "useApifyProxy": false },
+            "default": { "useApifyProxy": false }
+        }
+    },
+    "required": ["startUrls"]
+}
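At run time, the platform passes the Actor an input object conforming to this schema. A hedged sketch of reading it with the Apify SDK for Python — the variable names and logging are illustrative, not the template's actual main.py:

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Fall back to an empty dict if no input was provided.
        actor_input = await Actor.get_input() or {}
        start_urls = [item['url'] for item in actor_input.get('startUrls', [])]
        allowed_domains = actor_input.get('allowedDomains', [])
        proxy_configuration = actor_input.get('proxyConfiguration')
        Actor.log.info('Crawling %s (allowed domains: %s)', start_urls, allowed_domains)
```

Only startUrls is listed in `required`; the other two fields fall back to their `default` values when omitted.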
.dockerignore (new file)

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
+.git
+.mise.toml
+.nvim.lua
+storage
+
+# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+.python-version
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
