#!/usr/bin/env python3
"""
Recovery script to rebuild database from historical wiki revisions.
This recovers modlog data that was lost when the database was accidentally deleted.
"""

import json
import re
import sqlite3
import sys
from datetime import datetime, timezone
from typing import Any, List, Tuple

import praw


def parse_wiki_content(content: str, subreddit_name: str) -> List[Tuple[Any, ...]]:
    """Parse wiki markdown content and extract modlog entries."""
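    # Expected wiki layout (inferred from the parsing rules below; header text
    # beyond "| Time | Action" is an assumption):
    #
    #   ## 2024-01-15
    #   | Time | Action | ID | Moderator | Content | Reason |
    #   |---|---|---|---|---|---|
    #   | 12:34:56 UTC | removecomment | abc123 | SomeMod | [comment](https://...) by u/someone | Spam |
    #
    # Rows are read positionally: time, action, id, moderator, content, reason.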
    entries: List[Tuple[Any, ...]] = []
    current_date = None

    lines = content.split("\n")

    for line in lines:
        # Check for date header
        date_match = re.match(r"^## (\d{4}-\d{2}-\d{2})$", line.strip())
        if date_match:
            current_date = date_match.group(1)
            continue

        # Skip table headers and dividers
        if line.startswith("|---") or line.startswith("| Time | Action"):
            continue

        # Parse table rows
        if line.startswith("|") and current_date and "|" in line[1:]:
            parts = [p.strip() for p in line.split("|")[1:-1]]  # Remove empty first/last
            if len(parts) < 6:
                continue

            # Use a distinct name so the row's content cell does not shadow the
            # `content` argument (the full wiki page text).
            time_str, action, entry_id, moderator, content_cell, reason = parts[:6]

            # Skip empty or header rows
            if not time_str or time_str == "Time":
                continue

            # Parse timestamp. %Z consumes the trailing timezone abbreviation but
            # strptime still returns a naive datetime, so we attach UTC explicitly
            # (assuming the wiki logs times in UTC) before converting to epoch seconds.
            try:
                timestamp_str = f"{current_date} {time_str}"
                dt = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S %Z")
                created_at = int(dt.replace(tzinfo=timezone.utc).timestamp())
            except ValueError:
                print(f"Warning: Could not parse timestamp: {timestamp_str}", file=sys.stderr)
                continue

            # Extract target info from the content cell's markdown
            target_author = None
            target_permalink = None
            target_type = None

            # Extract permalink
            permalink_match = re.search(r"\[.*?\]\((https://[^)]+)\)", content_cell)
            if permalink_match:
                target_permalink = permalink_match.group(1)

            # Extract author
            author_match = re.search(r"u/([A-Za-z0-9_-]+)", content_cell)
            if author_match:
                target_author = author_match.group(1)

            # Determine target type from action
            if "comment" in action.lower():
                target_type = "comment"
            elif "link" in action.lower() or "post" in action.lower():
                target_type = "submission"
            else:
                target_type = "unknown"

            # Clean up action type (remove filter- prefix if present)
            action_clean = action.replace("filter-", "")

            # Create entry tuple matching database schema
            entry = (
                entry_id,  # action_id
                created_at,  # created_at
                action_clean,  # action_type
                moderator,  # moderator
                entry_id,  # target_id (same as action_id for display)
                target_type,  # target_type
                entry_id,  # display_id
                target_permalink,  # target_permalink
                reason if reason and reason != "-" else None,  # removal_reason
                subreddit_name,  # subreddit
                target_author,  # target_author
            )
            entries.append(entry)

    return entries


def insert_entries(db_path: str, entries: List[Tuple[Any, ...]]) -> Tuple[int, int]:
    """Insert entries into database, returning (inserted, skipped) counts."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    inserted = 0
    skipped = 0

    for entry in entries:
        try:
            cursor.execute(
                """
                INSERT INTO processed_actions
                (action_id, created_at, action_type, moderator, target_id,
                 target_type, display_id, target_permalink, removal_reason,
                 subreddit, target_author)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                entry,
            )
            inserted += 1
        except sqlite3.IntegrityError:
            # Already exists (UNIQUE constraint on action_id)
            skipped += 1

    conn.commit()
    conn.close()

    return inserted, skipped


def main():
    if len(sys.argv) < 2:
        print("Usage: python3 wiki_recovery.py <config_path> [revision_offset]")
        print("  revision_offset: How many revisions back from latest (default: 10)")
        sys.exit(1)

    config_path = sys.argv[1]
    revision_offset = int(sys.argv[2]) if len(sys.argv) > 2 else 10

    # Load config
    with open(config_path) as f:
        config = json.load(f)
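
    # The config is assumed to look roughly like this (only the keys read below
    # matter; "wiki_page" and "database_path" fall back to defaults):
    #
    #   {
    #     "reddit": {
    #       "client_id": "...", "client_secret": "...",
    #       "username": "...", "password": "..."
    #     },
    #     "source_subreddit": "example",
    #     "wiki_page": "modlog",
    #     "database_path": "/config/data/modlog.db"
    #   }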

    print(f"Connecting to Reddit as {config['reddit']['username']}...")
    reddit = praw.Reddit(
        client_id=config["reddit"]["client_id"],
        client_secret=config["reddit"]["client_secret"],
        username=config["reddit"]["username"],
        password=config["reddit"]["password"],
        user_agent="RedditModLog Wiki Recovery/1.0",
    )

    subreddit_name = config["source_subreddit"]
    wiki_page_name = config.get("wiki_page", "modlog")
    db_path = config.get("database_path", "/config/data/modlog.db")

    print(f"Fetching wiki revisions for /r/{subreddit_name}/wiki/{wiki_page_name}...")
    subreddit = reddit.subreddit(subreddit_name)
    wiki_page = subreddit.wiki[wiki_page_name]

    revisions_list = list(wiki_page.revisions(limit=100))
    print(f"Found {len(revisions_list)} revisions")

    if len(revisions_list) < revision_offset:
        print(f"Warning: Only {len(revisions_list)} revisions available, using oldest")
        revision_offset = len(revisions_list)

    target_rev = revisions_list[-revision_offset]
    print(f"Recovering from revision -{revision_offset} (timestamp: {target_rev['timestamp']})")

    old_page = subreddit.wiki[wiki_page_name].revision(target_rev["id"])
    old_content = old_page.content_md

    print(f"Wiki content: {len(old_content)} chars, {old_content.count(chr(10))} lines")

    print("Parsing wiki content...")
    entries = parse_wiki_content(old_content, subreddit_name)
    print(f"Parsed {len(entries)} entries from wiki")

    if not entries:
        print("No entries found! Check wiki format.")
        sys.exit(1)

    print(f"Inserting into database: {db_path}")
    inserted, skipped = insert_entries(db_path, entries)

    print("\nRecovery complete!")
    print(f"  Inserted: {inserted}")
    print(f"  Skipped (already exist): {skipped}")
    print(f"  Total processed: {inserted + skipped}")


if __name__ == "__main__":
    main()