
Commit f1a7fd7

address feedback

1 parent 1321f82

15 files changed: +53, -143 lines

.github/workflows/run_code_checks.yaml

Lines changed: 3 additions & 3 deletions

@@ -19,9 +19,9 @@ jobs:
     name: Unit tests
     uses: apify/workflows/.github/workflows/python_unit_tests.yaml@main
 
-  # docs_check:
-  #   name: Docs check
-  #   uses: apify/workflows/.github/workflows/python_docs_check.yaml@main
+  docs_check:
+    name: Docs check
+    uses: apify/workflows/.github/workflows/python_docs_check.yaml@main
 
   integration_tests:
     name: Integration tests

docs/02_guides/05_scrapy.mdx

Lines changed: 4 additions & 4 deletions

@@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
 import UnderscoreMainExample from '!!raw-loader!./code/scrapy_project/src/__main__.py';
 import MainExample from '!!raw-loader!./code/scrapy_project/src/main.py';
 import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py';
-import TitleSpiderExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py';
+import SpidersExample from '!!raw-loader!./code/scrapy_project/src/spiders.py';
 import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py';
 
 [Scrapy](https://scrapy.org/) is an open-source web scraping framework for Python. It provides tools for defining scrapers, extracting data from web pages, following links, and handling pagination. With the Apify SDK, Scrapy projects can be converted into Apify [Actors](https://docs.apify.com/platform/actors), integrated with Apify [storages](https://docs.apify.com/platform/storage), and executed on the Apify [platform](https://docs.apify.com/platform).
@@ -48,7 +48,7 @@ Additional helper functions in the [`apify.scrapy`](https://github.com/apify/api
 
 ## Create a new Apify-Scrapy project
 
-The simplest way to start using Scrapy in Apify Actors is to use the [Scrapy Actor template](https://apify.com/templates/categories/python). The template provides a pre-configured project structure and setup that includes all necessary components to run Scrapy spiders as Actors and store their output in Apify datasets. If you prefer manual setup, refer to the example Actor section below for configuration details.
+The simplest way to start using Scrapy in Apify Actors is to use the [Scrapy Actor template](https://apify.com/templates/python-scrapy). The template provides a pre-configured project structure and setup that includes all necessary components to run Scrapy spiders as Actors and store their output in Apify datasets. If you prefer manual setup, refer to the example Actor section below for configuration details.
 
 ## Wrapping an existing Scrapy project
 
@@ -80,9 +80,9 @@ The following example demonstrates a Scrapy Actor that scrapes page titles and e
         {ItemsExample}
     </CodeBlock>
 </TabItem>
-<TabItem value="spiders/title.py" label="spiders/title.py">
+<TabItem value="spiders.py" label="spiders.py">
     <CodeBlock className="language-python">
-        {TitleSpiderExample}
+        {SpidersExample}
     </CodeBlock>
 </TabItem>
 <TabItem value="settings.py" label="settings.py">

docs/02_guides/code/scrapy_project/src/__main__.py

Lines changed: 4 additions & 2 deletions

@@ -1,13 +1,15 @@
-# ruff: noqa: E402, I001
-
 from __future__ import annotations
+
 from twisted.internet import asyncioreactor
 
 # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
 asyncioreactor.install()  # type: ignore[no-untyped-call]
 
 import os
+
 from apify.scrapy import initialize_logging, run_scrapy_actor
+
+# Import your main Actor coroutine here.
 from .main import main
 
 # Ensure the location to the Scrapy settings module is defined.
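For orientation, here is a minimal sketch of the whole src/__main__.py after this change. The environment-variable assignment value and the __main__ guard are assumptions inferred from the surrounding context; only the lines shown in the diff above are confirmed by this commit.

from __future__ import annotations

from twisted.internet import asyncioreactor

# Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
asyncioreactor.install()  # type: ignore[no-untyped-call]

import os

from apify.scrapy import initialize_logging, run_scrapy_actor

# Import your main Actor coroutine here.
from .main import main

# Ensure the location to the Scrapy settings module is defined.
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'src.settings')  # value is an assumption

if __name__ == '__main__':
    # Route Scrapy/Twisted logging through the Apify logger, then run the Actor coroutine.
    initialize_logging()
    run_scrapy_actor(main())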

docs/02_guides/code/scrapy_project/src/items.py

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from scrapy import Field, Item
 
 
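A short sketch of what src/items.py likely contains after this change. The TitleItem class name comes from the spider code elsewhere in this commit; the url and title fields are assumptions, as the diff shows only the imports.

from __future__ import annotations

from scrapy import Field, Item


class TitleItem(Item):
    """A single scraped page, represented by its URL and page title (fields are assumed)."""

    url = Field()
    title = Field()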
docs/02_guides/code/scrapy_project/src/main.py

Lines changed: 1 addition & 3 deletions

@@ -1,5 +1,3 @@
-# ruff: noqa: I001
-
 from __future__ import annotations
 
 from scrapy.crawler import CrawlerRunner
@@ -9,7 +7,7 @@
 from apify.scrapy import apply_apify_settings
 
 # Import your Scrapy spider here.
-from .spiders.title import TitleSpider as Spider
+from .spiders import TitleSpider as Spider
 
 
 async def main() -> None:
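A hedged sketch of how such a main() coroutine is typically wired up, assuming the usual apply_apify_settings plus CrawlerRunner pattern: the Actor context manager and the deferred_to_future bridge are assumptions, not shown in this diff.

from __future__ import annotations

from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

from apify import Actor
from apify.scrapy import apply_apify_settings

# Import your Scrapy spider here.
from .spiders import TitleSpider as Spider


async def main() -> None:
    async with Actor:
        # Overlay Apify-specific settings (storages, logging) on top of the project settings.
        settings = apply_apify_settings()

        # Run the spider on Twisted's asyncio reactor and wait for it to finish.
        runner = CrawlerRunner(settings)
        await deferred_to_future(runner.crawl(Spider))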

docs/02_guides/code/scrapy_project/src/settings.py

Lines changed: 0 additions & 10 deletions

@@ -6,13 +6,3 @@
 SPIDER_MODULES = ['src.spiders']
 TELNETCONSOLE_ENABLED = False
 TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
-
-ITEM_PIPELINES = {
-    'src.pipelines.TitleItemPipeline': 123,
-}
-SPIDER_MIDDLEWARES = {
-    'src.middlewares.TitleSpiderMiddleware': 543,
-}
-DOWNLOADER_MIDDLEWARES = {
-    'src.middlewares.TitleDownloaderMiddleware': 543,
-}
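After this deletion, only the project-level options survive; the tail of src/settings.py then reads roughly as follows (lines 1-5 of the file are not shown in the diff).

# ... options from lines 1-5 of the file (not shown in this diff) ...

SPIDER_MODULES = ['src.spiders']
TELNETCONSOLE_ENABLED = False
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'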

docs/02_guides/code/scrapy_project/src/spiders/title.py renamed to docs/02_guides/code/scrapy_project/src/spiders.py

Lines changed: 2 additions & 4 deletions

@@ -1,13 +1,11 @@
-# ruff: noqa: TID252, RUF012
-
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 from urllib.parse import urljoin
 
 from scrapy import Request, Spider
 
-from ..items import TitleItem
+from .items import TitleItem
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -53,7 +51,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None
             response: The web page response.
 
         Yields:
-            Yields scraped TitleItem and Requests for links.
+            Yields scraped `TitleItem` and new `Request` objects for links.
         """
         self.logger.info('TitleSpider is parsing %s...', response)
 
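For orientation, a hedged sketch of the renamed src/spiders.py. Only the imports, the docstring fragment, and the logging line appear in this diff; the spider name, start_urls, and the CSS selectors below are assumptions.

from __future__ import annotations

from typing import TYPE_CHECKING
from urllib.parse import urljoin

from scrapy import Request, Spider

from .items import TitleItem

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http import Response


class TitleSpider(Spider):
    """A spider that scrapes page titles and follows links (name and start_urls are assumed)."""

    name = 'title_spider'
    start_urls = ['https://example.com']  # placeholder

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        """Parse the web page response.

        Args:
            response: The web page response.

        Yields:
            Yields scraped `TitleItem` and new `Request` objects for links.
        """
        self.logger.info('TitleSpider is parsing %s...', response)

        # Extract the page title and yield it as an item (selector is an assumption).
        title = response.css('title::text').get()
        yield TitleItem(url=response.url, title=title)

        # Follow every absolute HTTP(S) link found on the page (filtering is an assumption).
        for href in response.css('a::attr(href)').getall():
            link_url = urljoin(response.url, href)
            if link_url.startswith(('http://', 'https://')):
                yield Request(link_url)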
docs/02_guides/code/scrapy_project/src/spiders/__init__.py

Whitespace-only changes.

docs/02_guides/code/scrapy_project/src/spiders/py.typed

Whitespace-only changes.

pyproject.toml

Lines changed: 10 additions & 0 deletions

@@ -136,6 +136,16 @@ indent-style = "space"
     "TRY301", # Abstract `raise` to an inner function
     "PLW0603", # Using the global statement to update `{name}` is discouraged
 ]
+"**/docs/**/scrapy_project/**/__main__.py" = [
+    # Because of asyncioreactor.install() call.
+    "E402", # Module level import not at top of file
+]
+"**/docs/**/scrapy_project/**" = [
+    # Local imports are mixed up with the Apify SDK.
+    "I001", # Import block is un-sorted or un-formatted
+    # Class variables are common in Scrapy projects.
+    "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar`
+]
 
 [tool.ruff.lint.flake8-quotes]
 docstring-quotes = "double"
