
Commit 1e37e47

Scrapy integration test is working
1 parent 5fc57b5 commit 1e37e47

5 files changed: 16 additions, 12 deletions


docs/02_guides/code/_scrapy_project/src/__main__.py

Lines changed: 1 addition & 2 deletions
@@ -12,11 +12,10 @@
 # ruff: noqa: E402, I001
 
 from __future__ import annotations
-import asyncio
 from twisted.internet import asyncioreactor
 
 # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
-asyncioreactor.install(asyncio.get_event_loop())  # type: ignore[no-untyped-call]
+asyncioreactor.install()  # type: ignore[no-untyped-call]
 
 import os
 from apify.scrapy import initialize_logging, run_scrapy_actor
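
For context (not part of the commit): `asyncioreactor.install()` works without an explicit event loop because Twisted's `AsyncioSelectorReactor` obtains or creates one itself; the part that matters is that the call happens before anything else imports the global reactor. A minimal illustrative sketch of that ordering, using only Twisted:

    from twisted.internet import asyncioreactor

    # Install the asyncio-backed reactor before any other import touches
    # twisted.internet.reactor; once the default reactor has been imported
    # it can no longer be replaced.
    asyncioreactor.install()

    # Only now is it safe to import modules that use the global reactor.
    from twisted.internet import reactor
    from twisted.internet.asyncioreactor import AsyncioSelectorReactor

    assert isinstance(reactor, AsyncioSelectorReactor)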

docs/02_guides/code/_scrapy_project/src/spiders/title.py

Lines changed: 3 additions & 0 deletions
@@ -24,6 +24,9 @@ class TitleSpider(Spider):
 
     name = 'title_spider'
 
+    # Limit the number of pages to scrape.
+    custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}
+
     def __init__(
         self,
         start_urls: list[str],
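
For reference, `CLOSESPIDER_PAGECOUNT` is handled by Scrapy's built-in CloseSpider extension: the crawl is closed once roughly that many responses have been processed, which keeps the integration test bounded. A minimal, self-contained spider using the same cap (the spider name and start URL below are illustrative, not taken from the project):

    import scrapy


    class CappedSpider(scrapy.Spider):
        name = 'capped_spider'
        start_urls = ['https://example.com']

        # Handled by the CloseSpider extension: stop the crawl after
        # roughly this many responses.
        custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}

        def parse(self, response):
            yield {'url': response.url, 'title': response.css('title::text').get()}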

tests/integration/README.md

Lines changed: 0 additions & 1 deletion
@@ -94,7 +94,6 @@ async def test_something(
     output_record = await actor.last_run().key_value_store().get_record('OUTPUT')
     assert output_record is not None
     assert output_record['value'] == expected_output
-
 ```
 
 Or you can pass multiple source files with the `source_files` argument, if you need something really complex:
Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 # The test fixture will put the Apify SDK wheel path on the next line
 APIFY_SDK_WHEEL_PLACEHOLDER
+scrapy~=2.12.0

tests/integration/test_actor_scrapy.py

Lines changed: 11 additions & 9 deletions
@@ -9,14 +9,11 @@
 
 
 @pytest.mark.only
-async def test_actor_scrapy_title_spider(
+async def test_actor_scrapy_title_spider_v2(
     make_actor: MakeActorFunction,
     run_actor: RunActorFunction,
 ) -> None:
     actor_source_files = {
-        'requirements.txt': """
-            scrapy ~= 2.12
-        """,
         'src/spiders/title.py': """
            from __future__ import annotations
            from typing import TYPE_CHECKING, Any
@@ -32,6 +29,9 @@ async def test_actor_scrapy_title_spider(
            class TitleSpider(Spider):
                name = 'title_spider'
 
+               # Limit the number of pages to scrape.
+               custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}
+
                def __init__(
                    self,
                    start_urls: list[str],
@@ -61,7 +61,7 @@ def parse(self, response: Response) -> Generator[TitleItem | Request, None, None
            import scrapy
 
            class TitleItem(scrapy.Item):
-               url = scrapy.Field
+               url = scrapy.Field()
                title = scrapy.Field()
        """,
         'src/settings.py': """
@@ -107,11 +107,10 @@ async def main() -> None:
        """,
         'src/__main__.py': """
            from __future__ import annotations
-           import asyncio
            from twisted.internet import asyncioreactor
 
            # Install Twisted's asyncio reactor before importing any other Twisted or Scrapy components.
-           asyncioreactor.install(asyncio.get_event_loop())
+           asyncioreactor.install()
 
            import os
            from apify.scrapy import initialize_logging, run_scrapy_actor
@@ -133,5 +132,8 @@ async def main() -> None:
 
     items = await actor.last_run().dataset().list_items()
 
-    assert items.count == 48
-    assert items.items == {'blah'}
+    assert items.count >= 10
+
+    for item in items.items:
+        assert 'url' in item
+        assert 'title' in item
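
One detail worth calling out from this diff is the `url = scrapy.Field()` fix. Scrapy's `Item` metaclass only registers class attributes that are `Field` instances, so assigning the `Field` class itself silently drops the field and later assignments to `item['url']` fail. A small illustrative snippet (the item names here are made up, not from the commit):

    import scrapy


    class BrokenItem(scrapy.Item):
        url = scrapy.Field      # assigns the Field class itself, so 'url' is not registered
        title = scrapy.Field()


    class FixedItem(scrapy.Item):
        url = scrapy.Field()
        title = scrapy.Field()


    print(sorted(BrokenItem.fields))  # ['title']  ('url' is missing)
    print(sorted(FixedItem.fields))   # ['title', 'url']

That registration behavior is also why the updated assertions can expect every scraped item to carry both 'url' and 'title' keys.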
