55import concurrent
66import json
77import logging
8- import multiprocessing
98import os
109import sys
1110import time
@@ -1650,7 +1649,7 @@ async def handler(context: BasicCrawlingContext) -> None:
16501649async def _run_crawler (requests : list [str ], storage_dir : str ) -> StatisticsState :
16511650 """Run crawler and return its statistics state.
16521651
1653- Must be defined like this to be picklable for ProcessPoolExecutor."""
1652+ Must be defined like this to be picklable for ProcessPoolExecutor."""
16541653 service_locator .set_configuration (
16551654 Configuration (
16561655 crawlee_storage_dir = storage_dir , # type: ignore[call-arg]
@@ -1679,7 +1678,7 @@ async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
16791678
16801679 This test simulates starting the crawler process twice, and checks that the statistics include first run."""
16811680
1682- with concurrent .futures .ProcessPoolExecutor (mp_context = multiprocessing . get_context ( 'fork' ) ) as executor :
1681+ with concurrent .futures .ProcessPoolExecutor () as executor :
16831682 # Crawl 2 requests in the first run and automatically persist the state.
16841683 first_run_state = executor .submit (
16851684 _process_run_crawler ,
@@ -1688,6 +1687,8 @@ async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
16881687 ).result ()
16891688 assert first_run_state .requests_finished == 2
16901689
1690+ # Do not reuse the executor to simulate a fresh process to avoid modified class attributes.
1691+ with concurrent .futures .ProcessPoolExecutor () as executor :
16911692 # Crawl 1 additional requests in the second run, but use previously automatically persisted state.
16921693 second_run_state = executor .submit (
16931694 _process_run_crawler , requests = ['https://c.placeholder.com' ], storage_dir = str (tmp_path )
0 commit comments