Skip to content

Commit f2ececf

Browse files
herohemantedivm
authored and committed
fix(history): ensure history key always exists in items
- Use `get` method to safely access history in hubs and items
- Prevent potential KeyError by ensuring history key is present
1 parent 0b328fa commit f2ececf

File tree

2 files changed

+11
-8
lines changed

2 files changed

+11
-8
lines changed

scp_crawler/postprocessing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def get_wiki_source(page_id, domain, attempts=5):
112112
hub_list,
113113
):
114114
# Convert history dict to list and sort by date.
115-
hub["history"] = process_history(hub["history"])
115+
hub["history"] = process_history(hub.get("history"))
116116

117117
if len(hub["history"]) > 0:
118118
hub["created_at"] = hub["history"][0]["date"]
@@ -161,7 +161,7 @@ def run_postproc_items():
161161
item["hubs"] = get_hubs(item["link"])
162162

163163
# Convert history dict to list and sort by date.
164-
item["history"] = process_history(item["history"])
164+
item["history"] = process_history(item.get("history"))
165165

166166
if len(item["history"]) > 0:
167167
item["created_at"] = item["history"][0]["date"]

scp_crawler/spiders/scp.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ def parse_history(self, response, item, history_page=1):
2929
self.logger.info(f"Reviewing Page {item['page_id']} history")
3030

3131
page_id = item["page_id"]
32-
changes = item["history"] if "history" in item else {}
32+
changes = item.get("history", {})
33+
item["history"] = changes # Ensure history key always exists
3334

3435
try:
3536
response_text = getattr(response, "text", "") or ""
@@ -116,11 +117,13 @@ def parse_history(self, response, item, history_page=1):
116117
self.logger.exception("Could not process row.")
117118
self.logger.error(row)
118119

119-
item["history"] = changes
120-
# The "0" change is the first revision, and the last one that shows up.
121-
# If we have it then we're done.
122-
if "0" in changes:
123-
return self.get_page_source_request(page_id, item)
120+
# Update item history after processing all rows
121+
item["history"] = changes
122+
123+
# The "0" change is the first revision, and the last one that shows up.
124+
# If we have it then we're done.
125+
if "0" in changes:
126+
return self.get_page_source_request(page_id, item)
124127

125128
next_page = history_page + 1
126129
if next_page > MAX_HISTORY_PAGES:

0 commit comments

Comments
 (0)