
Commit 1bd7177

New: add wiki recovery script for historical data restoration
- Parse wiki markdown to extract modlog entries
- Insert recovered entries into database
- Skip duplicates automatically
- Successfully recovered 254 entries for opensignups, 299 for usenet
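For reference, the parser expects the wiki page to use date headers followed by pipe tables, roughly in this shape (a hypothetical sample reconstructed from the regexes in parse_wiki_content, not copied from a real page):

## 2024-01-15

| Time | Action | ID | Moderator | Content | Reason |
|---|---|---|---|---|---|
| 12:34:56 UTC | removecomment | abc123 | some_mod | [comment](https://reddit.com/x) by u/some_user | spam |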
1 parent 665d73e commit 1bd7177

File tree

1 file changed: +195 -0


wiki_recovery.py

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Recovery script to rebuild the database from historical wiki revisions.
This recovers modlog data that was lost when the database was accidentally deleted.
"""

import json
import re
import sqlite3
import sys
from datetime import datetime, timezone
from typing import Any, List, Tuple

import praw


def parse_wiki_content(content: str, subreddit_name: str) -> List[Tuple[Any, ...]]:
    """Parse wiki markdown content and extract modlog entries."""
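    # Shape of the result, with hypothetical values for illustration: a row
    #   | 12:34:56 UTC | removecomment | abc123 | some_mod | [comment](https://reddit.com/x) by u/some_user | spam |
    # under a "## 2024-01-15" header yields
    #   ("abc123", 1705322096, "removecomment", "some_mod", "abc123", "comment",
    #    "abc123", "https://reddit.com/x", "spam", subreddit_name, "some_user")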
    entries: List[Tuple[Any, ...]] = []
    current_date = None

    lines = content.split("\n")

    for line in lines:
        # Check for a date header
        date_match = re.match(r"^## (\d{4}-\d{2}-\d{2})$", line.strip())
        if date_match:
            current_date = date_match.group(1)
            continue

        # Skip table headers and dividers
        if line.startswith("|---") or line.startswith("| Time | Action"):
            continue

        # Parse table rows
        if line.startswith("|") and current_date and "|" in line[1:]:
            parts = [p.strip() for p in line.split("|")[1:-1]]  # Drop the empty first/last fields
            if len(parts) < 6:
                continue

            # Name the cell "content_cell" so it does not shadow the content parameter
            time_str, action, entry_id, moderator, content_cell, reason = parts[:6]

            # Skip empty or header rows
            if not time_str or time_str == "Time":
                continue

            # Parse the timestamp; strptime's %Z matches a zone name (e.g. "UTC")
            # but leaves the datetime naive, so assume the wiki writes UTC and
            # attach it explicitly before converting to epoch seconds.
            try:
                timestamp_str = f"{current_date} {time_str}"
                dt = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S %Z")
                created_at = int(dt.replace(tzinfo=timezone.utc).timestamp())
            except ValueError:
                print(f"Warning: Could not parse timestamp: {timestamp_str}", file=sys.stderr)
                continue

            # Extract target info from the content markdown
            target_author = None
            target_permalink = None
            target_type = None

            # Extract permalink
            permalink_match = re.search(r"\[.*?\]\((https://[^)]+)\)", content_cell)
            if permalink_match:
                target_permalink = permalink_match.group(1)

            # Extract author
            author_match = re.search(r"u/([A-Za-z0-9_-]+)", content_cell)
            if author_match:
                target_author = author_match.group(1)

            # Determine target type from the action name
            if "comment" in action.lower():
                target_type = "comment"
            elif "link" in action.lower() or "post" in action.lower():
                target_type = "submission"
            else:
                target_type = "unknown"

            # Clean up the action type (remove the filter- prefix if present)
            action_clean = action.replace("filter-", "")

            # Create an entry tuple matching the database schema
            entry = (
                entry_id,  # action_id
                created_at,  # created_at
                action_clean,  # action_type
                moderator,  # moderator
                entry_id,  # target_id (same as action_id for display)
                target_type,  # target_type
                entry_id,  # display_id
                target_permalink,  # target_permalink
                reason if reason and reason != "-" else None,  # removal_reason
                subreddit_name,  # subreddit
                target_author,  # target_author
            )
            entries.append(entry)

    return entries


def insert_entries(db_path: str, entries: List[Tuple[Any, ...]]) -> Tuple[int, int]:
    """Insert entries into database, returning (inserted, skipped) counts."""
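    # The INSERT below assumes a processed_actions table with a UNIQUE
    # constraint on action_id, so re-runs surface duplicates as
    # sqlite3.IntegrityError. A sketch of such a schema (column types are
    # assumptions, not taken from this repo):
    #
    #   CREATE TABLE IF NOT EXISTS processed_actions (
    #       action_id TEXT UNIQUE,
    #       created_at INTEGER,
    #       action_type TEXT,
    #       moderator TEXT,
    #       target_id TEXT,
    #       target_type TEXT,
    #       display_id TEXT,
    #       target_permalink TEXT,
    #       removal_reason TEXT,
    #       subreddit TEXT,
    #       target_author TEXT
    #   );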
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    inserted = 0
    skipped = 0

    for entry in entries:
        try:
            cursor.execute(
                """
                INSERT INTO processed_actions
                    (action_id, created_at, action_type, moderator, target_id,
                     target_type, display_id, target_permalink, removal_reason,
                     subreddit, target_author)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                entry,
            )
            inserted += 1
        except sqlite3.IntegrityError:
            # Already exists (UNIQUE constraint on action_id)
            skipped += 1

    conn.commit()
    conn.close()

    return inserted, skipped


def main():
    if len(sys.argv) < 2:
        print("Usage: python3 wiki_recovery.py <config_path> [revision_offset]")
        print("  revision_offset: How many revisions back from latest (default: 10)")
        sys.exit(1)

    config_path = sys.argv[1]
    revision_offset = int(sys.argv[2]) if len(sys.argv) > 2 else 10

    # Load config
    with open(config_path) as f:
        config = json.load(f)

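    # Expected config layout, reconstructed from the keys read below
    # (values are placeholders, not real credentials):
    #
    #   {
    #     "reddit": {
    #       "client_id": "...",
    #       "client_secret": "...",
    #       "username": "...",
    #       "password": "..."
    #     },
    #     "source_subreddit": "example_sub",
    #     "wiki_page": "modlog",
    #     "database_path": "/config/data/modlog.db"
    #   }
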
    print(f"Connecting to Reddit as {config['reddit']['username']}...")
    reddit = praw.Reddit(
        client_id=config["reddit"]["client_id"],
        client_secret=config["reddit"]["client_secret"],
        username=config["reddit"]["username"],
        password=config["reddit"]["password"],
        user_agent="RedditModLog Wiki Recovery/1.0",
    )

    subreddit_name = config["source_subreddit"]
    wiki_page_name = config.get("wiki_page", "modlog")
    db_path = config.get("database_path", "/config/data/modlog.db")

    print(f"Fetching wiki revisions for /r/{subreddit_name}/wiki/{wiki_page_name}...")
    subreddit = reddit.subreddit(subreddit_name)
    wiki_page = subreddit.wiki[wiki_page_name]

    revisions_list = list(wiki_page.revisions(limit=100))
    print(f"Found {len(revisions_list)} revisions")

    if not revisions_list:
        print("No revisions found for this wiki page!")
        sys.exit(1)

    if len(revisions_list) < revision_offset:
        print(f"Warning: Only {len(revisions_list)} revisions available, using oldest")
        revision_offset = len(revisions_list)

    target_rev = revisions_list[-revision_offset]
    print(f"Recovering from revision -{revision_offset} (timestamp: {target_rev['timestamp']})")

    old_page = subreddit.wiki[wiki_page_name].revision(target_rev["id"])
    old_content = old_page.content_md

    print(f"Wiki content: {len(old_content)} chars, {old_content.count(chr(10))} lines")

    print("Parsing wiki content...")
    entries = parse_wiki_content(old_content, subreddit_name)
    print(f"Parsed {len(entries)} entries from wiki")

    if not entries:
        print("No entries found! Check wiki format.")
        sys.exit(1)

    print(f"Inserting into database: {db_path}")
    inserted, skipped = insert_entries(db_path, entries)

    print("\nRecovery complete!")
    print(f"  Inserted: {inserted}")
    print(f"  Skipped (already exist): {skipped}")
    print(f"  Total processed: {inserted + skipped}")


if __name__ == "__main__":
    main()
