Skip to content

Commit afe6b2e

Browse files
committed
feat(spiders): Make the scraped-item hook process items and add an `items_dropped` stat
1 parent 41c7a7e commit afe6b2e

File tree

3 files changed

+16
-9
lines changed

3 files changed

+16
-9
lines changed

scrapling/spiders/engine.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,17 @@ async def _process_request(self, request: Request) -> None:
102102
self.stats.offsite_requests_count += 1
103103
log.debug(f"Filtered offsite request to: {result.url}")
104104
elif isinstance(result, dict):
105-
self.stats.items_scraped += 1
106-
self._items.append(result)
107-
if self._item_stream:
108-
await self._item_stream.send(result)
109-
await self.spider.on_scraped_item(result)
110-
log.debug(f"Scraped from {str(response)}\n{result}")
105+
processed_result = await self.spider.on_scraped_item(result)
106+
if processed_result:
107+
self.stats.items_scraped += 1
108+
log.debug(f"Scraped from {str(response)}\n{processed_result}")
109+
if self._item_stream:
110+
await self._item_stream.send(processed_result)
111+
else:
112+
self._items.append(processed_result)
113+
else:
114+
self.stats.items_dropped += 1
115+
log.warning(f"Dropped from {str(response)}\n{processed_result}")
111116
elif result is not None:
112117
log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
113118
except Exception as e:

scrapling/spiders/result.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class CrawlStats:
5050
offsite_requests_count: int = 0
5151
response_bytes: int = 0
5252
items_scraped: int = 0
53+
items_dropped: int = 0
5354
start_time: float = 0.0
5455
end_time: float = 0.0
5556
download_delay: float = 0.0
@@ -85,6 +86,7 @@ def increment_requests_count(self, sid: str) -> None:
8586
def to_dict(self) -> dict[str, Any]:
8687
return {
8788
"items_scraped": self.items_scraped,
89+
"items_dropped": self.items_dropped,
8890
"elapsed_seconds": round(self.elapsed_seconds, 2),
8991
"download_delay": round(self.download_delay, 2),
9092
"concurrent_requests": self.concurrent_requests,

scrapling/spiders/spider.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,9 @@ async def on_error(self, request: Request, error: Exception) -> None:
160160
"""
161161
self.logger.error(error, exc_info=error)
162162

163-
async def on_scraped_item(self, item: dict[str, Any]) -> None:
164-
"""Handle a scraped item. Override or extend for item pipelines."""
165-
pass
163+
async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
164+
"""A hook to be overridden by users to process scraped items. Return the (possibly modified) item to keep it, or `None` to drop it; dropped items are counted in the `items_dropped` stat and logged as a warning."""
165+
return item
166166

167167
async def is_blocked(self, response: "Response") -> bool:
168168
"""Check if the response is blocked. Users should override this for custom detection logic."""

0 commit comments

Comments
 (0)