34 changes: 26 additions & 8 deletions scp_crawler/postprocessing.py
@@ -45,11 +45,29 @@ def get_images(html):


def process_history(history):
-    history = [v for v in history.values()]
-    for revision in history:
-        revision["date"] = datetime.strptime(revision["date"], "%d %b %Y %H:%M")
-    history.sort(key=lambda x: x["date"])
-    return history
+    if not history:
+        return []
+
+    if isinstance(history, dict):
+        revisions = list(history.values())
+    elif isinstance(history, list):
+        revisions = history
+    else:
+        return []
+
+    for revision in revisions:
+        if not isinstance(revision, dict):
+            continue
+        revision_date = revision.get("date")
+        if isinstance(revision_date, str):
+            try:
+                revision["date"] = datetime.strptime(revision_date, "%d %b %Y %H:%M")
+            except Exception:
+                # Keep original value if parsing fails.
+                pass
+
+    revisions.sort(key=lambda x: x["date"] if isinstance(x.get("date"), datetime) else datetime.min)
+    return revisions


def get_wiki_source(page_id, domain, attempts=5):
@@ -94,7 +112,7 @@ def get_wiki_source(page_id, domain, attempts=5):
        hub_list,
    ):
        # Convert history dict to list and sort by date.
-        hub["history"] = process_history(hub["history"])
+        hub["history"] = process_history(hub.get("history"))

        if len(hub["history"]) > 0:
            hub["created_at"] = hub["history"][0]["date"]
@@ -143,7 +161,7 @@ def run_postproc_items():
item["hubs"] = get_hubs(item["link"])

# Convert history dict to list and sort by date.
item["history"] = process_history(item["history"])
item["history"] = process_history(item.get("history"))

if len(item["history"]) > 0:
item["created_at"] = item["history"][0]["date"]
@@ -200,7 +218,7 @@ def run_postproc_tales():
tale["raw_source"] = get_wiki_source(tale["page_id"], tale["domain"])

# Convert history dict to list and sort by date.
tale["history"] = process_history(tale["history"])
tale["history"] = process_history(tale.get("history"))

if len(tale["history"]) > 0:
tale["created_at"] = tale["history"][0]["date"]
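The hardened process_history accepts either the raw Wikidot-style dict of revisions or an already-listified history, and tolerates dates it cannot parse. A minimal usage sketch of the intended behaviour follows; it is illustrative only: the revision payloads (including the "comment" field) are invented, and it assumes scp_crawler is importable and that unparsed dates sort first via datetime.min as in the hunk above.

# Illustrative sketch only: exercises process_history with the input shapes
# the new code accepts. The sample revisions are invented for this example.
from scp_crawler.postprocessing import process_history

# The usual crawler shape: a dict keyed by revision number.
history_as_dict = {
    "1": {"date": "02 Jan 2020 10:30", "comment": "typo fix"},
    "0": {"date": "01 Jan 2020 09:00", "comment": "initial upload"},
}

# An already-listified history, including a date that will not parse.
history_as_list = [
    {"date": "unknown", "comment": "unparseable date is kept as a string"},
    {"date": "03 Jan 2020 12:00", "comment": "later edit"},
]

print(process_history(history_as_dict))  # oldest first, "date" values become datetimes
print(process_history(history_as_list))  # the string date survives unchanged and sorts first
print(process_history(None))             # missing history -> []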
75 changes: 66 additions & 9 deletions scp_crawler/spiders/scp.py
@@ -1,3 +1,4 @@
+import json
import re
import sys
from pprint import pprint
@@ -28,13 +29,67 @@ def parse_history(self, response, item, history_page=1):
self.logger.info(f"Reviewing Page {item['page_id']} history")

page_id = item["page_id"]
changes = item["history"] if "history" in item else {}
changes = item.get("history", {})
item["history"] = changes # Ensure history key always exists

try:
response_text = getattr(response, "text", "") or ""
if not response_text.strip():
self.logger.error(
"Empty response when fetching history for %s (status=%s, page=%s)",
item.get("url"),
getattr(response, "status", None),
history_page,
)
return self.get_page_source_request(page_id, item)

history = response.json()
soup = BeautifulSoup(history["body"], "lxml")
if not isinstance(history, dict) or "body" not in history:
self.logger.error(
"Missing 'body' in history lookup for %s (status=%s, page=%s). Keys=%s",
item.get("url"),
getattr(response, "status", None),
history_page,
list(history.keys()) if isinstance(history, dict) else type(history),
)
return self.get_page_source_request(page_id, item)

body = history.get("body")
if not body:
self.logger.error(
"Empty 'body' in history lookup for %s (status=%s, page=%s)",
item.get("url"),
getattr(response, "status", None),
history_page,
)
return self.get_page_source_request(page_id, item)

soup = BeautifulSoup(body, "lxml")
if soup.table is None:
self.logger.error(
"Missing <table> in history HTML for %s (status=%s, page=%s)",
item.get("url"),
getattr(response, "status", None),
history_page,
)
return self.get_page_source_request(page_id, item)
rows = soup.table.find_all("tr")
except:
self.logger.exception(f"Unable to parse history lookup. {item['url']}")

except (json.JSONDecodeError, ValueError):
self.logger.error(
"JSON decode error in history lookup for %s (status=%s, page=%s)",
item.get("url"),
getattr(response, "status", None),
history_page,
)
return self.get_page_source_request(page_id, item)
except Exception:
self.logger.exception(
"Unable to parse history lookup for %s (status=%s, page=%s)",
item.get("url"),
getattr(response, "status", None),
history_page,
)
return self.get_page_source_request(page_id, item)
for row in rows:
try:
@@ -62,11 +117,13 @@ def parse_history(self, response, item, history_page=1):
self.logger.exception("Could not process row.")
self.logger.error(row)

item["history"] = changes
# The "0" change is the first revision, and the last one that shows up.
# If we have it then we're done.
if "0" in changes:
return self.get_page_source_request(page_id, item)
# Update item history after processing all rows
item["history"] = changes

# The "0" change is the first revision, and the last one that shows up.
# If we have it then we're done.
if "0" in changes:
return self.get_page_source_request(page_id, item)

next_page = history_page + 1
if next_page > MAX_HISTORY_PAGES:
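The guard sequence added to parse_history boils down to one decision chain: empty response text, non-JSON, a missing or empty "body" field, and a body with no history <table> all trigger the same fallback. Below is a standalone sketch of that chain; the helper name extract_history_rows and the sample payload are assumptions made for illustration, while the spider itself works on a Scrapy response and falls back to get_page_source_request.

# Standalone sketch of the guard chain used in parse_history; not spider code.
import json

from bs4 import BeautifulSoup


def extract_history_rows(raw_text):
    """Return the history table rows, or None when the caller should fall back."""
    if not (raw_text or "").strip():
        return None  # empty response body
    try:
        payload = json.loads(raw_text)
    except ValueError:  # includes json.JSONDecodeError
        return None  # response was not JSON
    if not isinstance(payload, dict) or not payload.get("body"):
        return None  # JSON, but no usable "body" fragment
    soup = BeautifulSoup(payload["body"], "lxml")
    if soup.table is None:
        return None  # HTML fragment without the revision table
    return soup.table.find_all("tr")


sample = json.dumps({"status": "ok", "body": "<table><tr><td>rev 0</td></tr></table>"})
rows = extract_history_rows(sample)
print(len(rows) if rows is not None else "fall back to page source request")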