@@ -38,7 +38,7 @@ class SitemapRequestLoaderState(BaseModel):
3838 """Queue of URLs extracted from sitemaps and ready for processing."""
3939
4040 in_progress : Annotated [set [str ], Field (alias = 'inProgress' )] = set ()
41- """Set of request IDs currently being processed."""
41+ """Set of request URLs currently being processed."""
4242
4343 pending_sitemap_urls : Annotated [deque [str ], Field (alias = 'pendingSitemapUrls' )]
4444 """Queue of sitemap URLs that need to be fetched and processed."""
@@ -135,6 +135,10 @@ async def _get_state(self) -> SitemapRequestLoaderState:
135135 if not has_sitemap_for_processing and not self ._state .current_value .completed :
136136 self ._state .current_value .pending_sitemap_urls .extend (self ._sitemap_urls )
137137
138+ if self ._state .current_value .in_progress :
139+ self ._state .current_value .url_queue .extendleft (self ._state .current_value .in_progress )
140+ self ._state .current_value .in_progress .clear ()
141+
138142 if (
139143 self ._state .current_value .url_queue
140144 and len (self ._state .current_value .url_queue ) >= self ._max_buffer_size
@@ -271,7 +275,7 @@ async def fetch_next_request(self) -> Request | None:
271275 url = state .url_queue .popleft ()
272276
273277 request = Request .from_url (url )
274- state .in_progress .add (request .id )
278+ state .in_progress .add (request .url )
275279 if len (state .url_queue ) < self ._max_buffer_size :
276280 self ._queue_has_capacity .set ()
277281
@@ -281,8 +285,8 @@ async def fetch_next_request(self) -> Request | None:
281285 async def mark_request_as_handled (self , request : Request ) -> ProcessedRequest | None :
282286 """Mark a request as successfully handled."""
283287 state = await self ._get_state ()
284- if request .id in state .in_progress :
285- state .in_progress .remove (request .id )
288+ if request .url in state .in_progress :
289+ state .in_progress .remove (request .url )
286290 state .handled_count += 1
287291 return None
288292
0 commit comments