from __future__ import annotations

import asyncio
+import concurrent.futures
import json
import logging
import os
@@ -1643,3 +1644,64 @@ async def handler(context: BasicCrawlingContext) -> None:

    # Crawler should not fall back to the default storage after the purge
    assert await unrelated_rq.fetch_next_request() == unrelated_request
+
+
+async def _run_crawler(requests: list[str], storage_dir: str) -> StatisticsState:
+    """Run the crawler and return its statistics state.
+
+    Must be defined at module level so it is picklable for ProcessPoolExecutor."""
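+    # Reuse the given storage directory without purging, so that statistics
+    # persisted by a previous run are restored on the next one.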
+    service_locator.set_configuration(
+        Configuration(
+            crawlee_storage_dir=storage_dir,  # type: ignore[call-arg]
+            purge_on_start=False,
+        )
+    )
+
+    async def request_handler(context: BasicCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+    crawler = BasicCrawler(
+        request_handler=request_handler,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1, desired_concurrency=1),
+    )
+
+    await crawler.run(requests)
+    return crawler.statistics.state
+
+
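+# Sync wrapper: ProcessPoolExecutor needs a picklable, top-level callable to run in a worker process.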
+def _process_run_crawler(requests: list[str], storage_dir: str) -> StatisticsState:
+    return asyncio.run(_run_crawler(requests=requests, storage_dir=storage_dir))
+
+
+async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
+    """Test that crawler statistics persist and are loaded correctly.
+
+    This test simulates starting the crawler process twice and checks that the statistics include the first run."""
+
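+    # Each submit() call runs the crawler in a separate process, simulating a crawler restart.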
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        # Crawl 2 requests in the first run and automatically persist the state.
+        first_run_state = executor.submit(
+            _process_run_crawler,
+            requests=['https://a.placeholder.com', 'https://b.placeholder.com'],
+            storage_dir=str(tmp_path),
+        ).result()
+        assert first_run_state.requests_finished == 2
+
+        # Crawl 1 additional request in the second run, reusing the automatically persisted state.
+        second_run_state = executor.submit(
+            _process_run_crawler, requests=['https://c.placeholder.com'], storage_dir=str(tmp_path)
+        ).result()
+        assert second_run_state.requests_finished == 3
+
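+        # The restored state means both runs share the same start time, while the
+        # finish time and total runtime keep increasing across runs.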
+        assert first_run_state.crawler_started_at == second_run_state.crawler_started_at
+        assert first_run_state.crawler_finished_at
+        assert second_run_state.crawler_finished_at
+
+        assert first_run_state.crawler_finished_at < second_run_state.crawler_finished_at
+        assert first_run_state.crawler_runtime < second_run_state.crawler_runtime