Commit 3241785

feat: Persist the SitemapRequestLoader state (apify#1347)
### Description

- Persist the `SitemapRequestLoader` state

### Issues

- Closes: apify#1269
1 parent 3f0bf8a commit 3241785

7 files changed: +414 / -49 lines changed
docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
import asyncio
import logging

from crawlee import service_locator
from crawlee.request_loaders import RequestList

logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')
logger = logging.getLogger(__name__)


# Disable clearing the `KeyValueStore` on each run.
# This is necessary so that the state keys are not cleared at startup.
# The recommended way to achieve this behavior is setting the environment variable
# `CRAWLEE_PURGE_ON_START=0`.
configuration = service_locator.get_configuration()
configuration.purge_on_start = False


async def main() -> None:
    # Open the request list; if it does not exist, it will be created.
    # Leave the name empty to use the default request list.
    request_list = RequestList(
        name='my-request-list',
        requests=[
            'https://apify.com/',
            'https://crawlee.dev/',
            'https://crawlee.dev/python/',
        ],
        # Enable persistence.
        persist_state_key='my-persist-state',
        persist_requests_key='my-persist-requests',
    )

    # We receive only one request.
    # Each run of the script yields a new request, until the `RequestList` is exhausted.
    request = await request_list.fetch_next_request()
    if request:
        logger.info(f'Processing request: {request.url}')
        # Do something with it...

        # And mark it as handled.
        await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
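
The example above flips `purge_on_start` in code, but its own comment notes that the recommended approach is the `CRAWLEE_PURGE_ON_START=0` environment variable. Below is a minimal sketch of that alternative; it is not part of this commit and assumes the variable is read when Crawlee first builds its configuration.

```python
import os

# Must be set before Crawlee creates its configuration, otherwise it has no effect.
# In practice you would usually export it in the shell instead:
#   CRAWLEE_PURGE_ON_START=0 python my_script.py
os.environ['CRAWLEE_PURGE_ON_START'] = '0'

from crawlee import service_locator  # noqa: E402  # imported after setting the variable

configuration = service_locator.get_configuration()
print(configuration.purge_on_start)  # expected: False
```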

docs/guides/code_examples/request_loaders/sitemap_basic_example.py

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ async def main() -> None:
         max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
     )

+    # We work with the loader until we process all relevant links from the sitemap.
     while request := await sitemap_loader.fetch_next_request():
         # Do something with it...
         print(f'Processing {request.url}')
docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
import asyncio
import logging

from crawlee import service_locator
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader

logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')
logger = logging.getLogger(__name__)


# Disable clearing the `KeyValueStore` on each run.
# This is necessary so that the state keys are not cleared at startup.
# The recommended way to achieve this behavior is setting the environment variable
# `CRAWLEE_PURGE_ON_START=0`.
configuration = service_locator.get_configuration()
configuration.purge_on_start = False


async def main() -> None:
    # Create an HTTP client for fetching sitemaps.
    # Use the context manager for `SitemapRequestLoader` to correctly save the state when
    # the work is completed.
    async with (
        ImpitHttpClient() as http_client,
        SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Enable persistence.
            persist_state_key='my-persist-state',
        ) as sitemap_loader,
    ):
        # We receive only one request.
        # Each run of the script yields a new request, until the sitemap is exhausted.
        request = await sitemap_loader.fetch_next_request()
        if request:
            logger.info(f'Processing request: {request.url}')
            # Do something with it...

            # And mark it as handled.
            await sitemap_loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())

docs/guides/request_loaders.mdx

Lines changed: 22 additions & 0 deletions
@@ -15,6 +15,8 @@ import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loa
 import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';
 import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';
 import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py';
+import RlBasicPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example_with_persist.py';
+import SitemapPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example_with_persist.py';

 The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together.

@@ -116,6 +118,16 @@ Here is a basic example of working with the <ApiLink to="class/RequestList">`Req
 {RlBasicExample}
 </RunnableCodeBlock>

+### Request list with persistence
+
+The <ApiLink to="class/RequestList">`RequestList`</ApiLink> supports state persistence, allowing it to resume from where it left off after interruption. This is particularly useful for long-running crawls or when you need to pause and resume crawling later.
+
+To enable persistence, provide `persist_state_key` and optionally `persist_requests_key` parameters, and disable automatic cleanup by setting `purge_on_start = False` in the configuration. The `persist_state_key` saves the loader's progress, while `persist_requests_key` ensures that the request data doesn't change between runs. For more details on resuming interrupted crawls, see the [Resuming a paused crawl](../examples/resuming-paused-crawl) example.
+
+<RunnableCodeBlock className="language-python" language="python">
+{RlBasicPersistExample}
+</RunnableCodeBlock>
+
 ### Sitemap request loader

 The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.
@@ -124,6 +136,16 @@ The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is
 {SitemapExample}
 </RunnableCodeBlock>

+### Sitemap request loader with persistence
+
+Similarly, the <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> supports state persistence to resume processing from where it left off. This is especially valuable when processing large sitemaps that may take considerable time to complete.
+
+<RunnableCodeBlock className="language-python" language="python">
+{SitemapPersistExample}
+</RunnableCodeBlock>
+
+When using persistence with `SitemapRequestLoader`, make sure to use the context manager (`async with`) to properly save the state when the work is completed.
+
 ## Request managers

 The <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> API reference.

src/crawlee/_utils/sitemap.py

Lines changed: 3 additions & 1 deletion
@@ -9,6 +9,7 @@
 from hashlib import sha256
 from logging import getLogger
 from typing import TYPE_CHECKING, Literal, TypedDict
+from xml.sax import SAXParseException
 from xml.sax.expatreader import ExpatParser
 from xml.sax.handler import ContentHandler

@@ -192,7 +193,8 @@ async def flush(self) -> AsyncGenerator[_SitemapItem, None]:

     def close(self) -> None:
         """Clean up resources."""
-        self._parser.close()
+        with suppress(SAXParseException):
+            self._parser.close()


 def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
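
For context on the `close()` change above (illustration only, not part of the commit): closing an expat-based SAX parser that was fed an incomplete document raises `SAXParseException`, which is exactly what the new `suppress` guard absorbs. A minimal reproduction:

```python
from contextlib import suppress
from xml.sax import SAXParseException
from xml.sax.expatreader import ExpatParser

parser = ExpatParser()
# Feed a truncated sitemap; the document is never completed.
parser.feed('<urlset><url><loc>https://example.com/</loc>')

# Without the guard, close() raises SAXParseException ("no element found").
with suppress(SAXParseException):
    parser.close()
```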
