Skip to content

Commit ccc9b68

Browse files
committed
move requests from in_progress to queue after restore
1 parent b34ad2b commit ccc9b68

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class SitemapRequestLoaderState(BaseModel):
3838
"""Queue of URLs extracted from sitemaps and ready for processing."""
3939

4040
in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
41-
"""Set of request IDs currently being processed."""
41+
"""Set of request URLs currently being processed."""
4242

4343
pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
4444
"""Queue of sitemap URLs that need to be fetched and processed."""
@@ -135,6 +135,10 @@ async def _get_state(self) -> SitemapRequestLoaderState:
135135
if not has_sitemap_for_processing and not self._state.current_value.completed:
136136
self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
137137

138+
if self._state.current_value.in_progress:
139+
self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
140+
self._state.current_value.in_progress.clear()
141+
138142
if (
139143
self._state.current_value.url_queue
140144
and len(self._state.current_value.url_queue) >= self._max_buffer_size
@@ -271,7 +275,7 @@ async def fetch_next_request(self) -> Request | None:
271275
url = state.url_queue.popleft()
272276

273277
request = Request.from_url(url)
274-
state.in_progress.add(request.id)
278+
state.in_progress.add(request.url)
275279
if len(state.url_queue) < self._max_buffer_size:
276280
self._queue_has_capacity.set()
277281

@@ -281,8 +285,8 @@ async def fetch_next_request(self) -> Request | None:
281285
async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
282286
"""Mark a request as successfully handled."""
283287
state = await self._get_state()
284-
if request.id in state.in_progress:
285-
state.in_progress.remove(request.id)
288+
if request.url in state.in_progress:
289+
state.in_progress.remove(request.url)
286290
state.handled_count += 1
287291
return None
288292

0 commit comments

Comments
 (0)