|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Analyze cloudfuse size tracker logs to track file sizes and detect discrepancies. |
| 4 | +
|
| 5 | +This script processes log files to: |
| 6 | +1. Track the size of each file in the filesystem |
| 7 | +2. Detect when files are created, modified, renamed, or deleted |
| 8 | +3. Identify discrepancies in size calculations |
| 9 | +4. Monitor sync operations and epoch changes |
| 10 | +""" |
| 11 | + |
| 12 | +import re |
| 13 | +import sys |
| 14 | +from typing import Dict, Optional |
| 15 | +from dataclasses import dataclass |
| 16 | +from datetime import datetime |
| 17 | + |
| 18 | + |
| 19 | +@dataclass |
| 20 | +class FileInfo: |
| 21 | + """Information about a file in the filesystem.""" |
| 22 | + size: Optional[int] # None means unknown size |
| 23 | + last_updated: str # Timestamp of last update |
| 24 | + |
| 25 | + def __repr__(self): |
| 26 | + size_str = str(self.size) if self.size is not None else "UNKNOWN" |
| 27 | + return f"FileInfo(size={size_str}, last_updated={self.last_updated})" |
| 28 | + |
| 29 | + |
| 30 | +class SizeTrackerAnalyzer: |
| 31 | + """Analyzer for size tracker logs.""" |
| 32 | + |
| 33 | + def __init__(self): |
| 34 | + self.files: Dict[str, FileInfo] = {} |
| 35 | + self.total_delta = 0 |
| 36 | + self.last_sync_total = None |
| 37 | + self.discrepancies = [] |
| 38 | + self.line_number = 0 |
| 39 | + self.timestamp = "" |
| 40 | + self.epochChanged = False |
| 41 | + self.firstSync = True |
| 42 | + |
| 43 | + def parse_timestamp(self, line: str) -> Optional[str]: |
| 44 | + """Extract timestamp from log line.""" |
| 45 | + match = re.match(r'^(\w+ \w+ \d+ \d+:\d+:\d+\.\d+ \w+ \d+)', line) |
| 46 | + return match.group(1) if match else None |
| 47 | + |
| 48 | + def handle_add(self, line: str): |
| 49 | + """Handle SizeTracker::Add log entries.""" |
| 50 | + match = re.search(r'SizeTracker::Add : (-?\d+)', line) |
| 51 | + if match: |
| 52 | + delta = int(match.group(1)) |
| 53 | + self.total_delta += delta |
| 54 | + print(f"[{self.line_number}] [{self.timestamp}] Add delta: {delta:+d}, cumulative delta: {self.total_delta:+d}") |
| 55 | + |
| 56 | + def handle_copy_from_file(self, line: str): |
| 57 | + """Handle SizeTracker::CopyFromFile log entries.""" |
| 58 | + debug_match = re.search(r'SizeTracker::CopyFromFile : (.+?) Add\((.+?)\)', line) |
| 59 | + |
| 60 | + if debug_match: |
| 61 | + filepath = debug_match.group(1).strip() |
| 62 | + size_info = debug_match.group(2) |
| 63 | + |
| 64 | + # Parse size_info which |
| 65 | + # e.g. "84535181-4096" (new size - old size) |
| 66 | + if size_info.find("-") < 1: |
| 67 | + delta = int(size_info) |
| 68 | + # Update file info |
| 69 | + if filepath in self.files: |
| 70 | + self.files[filepath].size += delta |
| 71 | + self.files[filepath].last_updated = self.timestamp |
| 72 | + print(f"[{self.line_number}] [{self.timestamp}] CopyFromFile (OLD STYLE): '{filepath}' -> delta={delta:+d}") |
| 73 | + return |
| 74 | + parts = size_info.split('-') |
| 75 | + new_size = int(parts[0]) |
| 76 | + old_size = int(parts[1]) |
| 77 | + delta = new_size - old_size |
| 78 | + |
| 79 | + # Check if file exists and if the old size matches |
| 80 | + if filepath in self.files: |
| 81 | + tracked_size = self.files[filepath].size |
| 82 | + if tracked_size is not None and tracked_size != old_size: |
| 83 | + discrepancy = f"[{self.line_number}] [{self.timestamp}] CopyFromFile size mismatch for '{filepath}': expected old_size={tracked_size}, got old_size={old_size}" |
| 84 | + print(f" ⚠️ {discrepancy}") |
| 85 | + self.discrepancies.append(discrepancy) |
| 86 | + |
| 87 | + # Update file info |
| 88 | + self.files[filepath] = FileInfo(size=new_size, last_updated=self.timestamp) |
| 89 | + print(f"[{self.line_number}] [{self.timestamp}] CopyFromFile: '{filepath}' -> size={new_size} (delta={delta:+d})") |
| 90 | + |
| 91 | + def handle_delete_file(self, line: str): |
| 92 | + """Handle SizeTracker::DeleteFile log entries.""" |
| 93 | + debug_match = re.search(r'SizeTracker::DeleteFile : (.+?) Add\((-\d+)\)', line) |
| 94 | + |
| 95 | + if debug_match: |
| 96 | + filepath = debug_match.group(1).strip() |
| 97 | + delta = int(debug_match.group(2)) |
| 98 | + |
| 99 | + # Check if file exists |
| 100 | + if filepath in self.files: |
| 101 | + tracked_size = self.files[filepath].size |
| 102 | + if tracked_size is not None and delta != -tracked_size: |
| 103 | + discrepancy = f"[{self.line_number}] [{self.timestamp}] DeleteFile size mismatch for '{filepath}': expected delta={-tracked_size}, got delta={delta}" |
| 104 | + print(f" ⚠️ {discrepancy}") |
| 105 | + self.discrepancies.append(discrepancy) |
| 106 | + |
| 107 | + # Remove from tracking |
| 108 | + del self.files[filepath] |
| 109 | + print(f"[{self.line_number}] [{self.timestamp}] DeleteFile: '{filepath}' (delta={delta:+d})") |
| 110 | + else: |
| 111 | + print(f"[{self.line_number}] [{self.timestamp}] DeleteFile: '{filepath}' (delta={delta:+d}) [file not tracked]") |
| 112 | + |
| 113 | + def handle_rename_file(self, line: str): |
| 114 | + """Handle SizeTracker::RenameFile log entries.""" |
| 115 | + match = re.search(r'SizeTracker::RenameFile : (.+?)->(.+)$', line) |
| 116 | + if match: |
| 117 | + src = match.group(1).strip() |
| 118 | + dst = match.group(2).strip() |
| 119 | + |
| 120 | + # Transfer file info from src to dst |
| 121 | + if src in self.files: |
| 122 | + self.files[dst] = self.files[src] |
| 123 | + del self.files[src] |
| 124 | + print(f"[{self.line_number}] [{self.timestamp}] RenameFile: '{src}' -> '{dst}'") |
| 125 | + else: |
| 126 | + # File not tracked, but still record the rename |
| 127 | + self.files[dst] = FileInfo(size=None, last_updated=self.timestamp) |
| 128 | + print(f"[{self.line_number}] [{self.timestamp}] RenameFile: '{src}' -> '{dst}' [src not tracked]") |
| 129 | + |
| 130 | + def handle_sync(self, line: str): |
| 131 | + """Handle SizeTracker::sync log entries.""" |
| 132 | + # Check for epoch change |
| 133 | + epoch_match = re.search(r'epoch changed \(local=(\d+) -> file=(\d+)\) — discarding delta (-?\d+)', line) |
| 134 | + if epoch_match: |
| 135 | + local_epoch = int(epoch_match.group(1)) |
| 136 | + file_epoch = int(epoch_match.group(2)) |
| 137 | + discarded_delta = int(epoch_match.group(3)) |
| 138 | + print(f"\n{'='*80}") |
| 139 | + print(f"[{self.line_number}] [{self.timestamp}] 🔄 EPOCH CHANGE: local={local_epoch} -> file={file_epoch}, discarding delta={discarded_delta:+d}") |
| 140 | + if self.total_delta != discarded_delta: |
| 141 | + discrepancy = f"[{self.line_number}] [{self.timestamp}] Delta mismatch: expected delta={self.total_delta}. Discarded delta is off by {discarded_delta-self.total_delta}!" |
| 142 | + print(f" ⚠️ {discrepancy}") |
| 143 | + self.discrepancies.append(discrepancy) |
| 144 | + print(f"{'='*80}\n") |
| 145 | + self.total_delta = 0 # Reset delta after epoch change |
| 146 | + self.epochChanged = True |
| 147 | + self.firstSync = False |
| 148 | + return |
| 149 | + |
| 150 | + # Normal sync pattern: "old_total + delta = new_total" |
| 151 | + sync_match = re.search(r'SizeTracker::sync : (\d+) \+ (-?\d+) = (\d+)', line) |
| 152 | + if sync_match: |
| 153 | + old_total = int(sync_match.group(1)) |
| 154 | + delta = int(sync_match.group(2)) |
| 155 | + new_total = int(sync_match.group(3)) |
| 156 | + |
| 157 | + print(f"\n{'='*80}") |
| 158 | + print(f"[{self.line_number}] [{self.timestamp}] 📊 SYNC: {old_total} + {delta:+d} = {new_total}") |
| 159 | + |
| 160 | + # Verify the old total is correct |
| 161 | + if self.last_sync_total is not None and old_total != self.last_sync_total: |
| 162 | + epochDisclaimer = "" |
| 163 | + if self.epochChanged: |
| 164 | + epochDisclaimer = "(updated by audit)" |
| 165 | + self.epochChanged = False |
| 166 | + discrepancy = f"[{self.line_number}] [{self.timestamp}] Sync size mismatch: {old_total} != {self.last_sync_total} (off by {old_total - self.last_sync_total})! {epochDisclaimer}" |
| 167 | + print(f" ⚠️ {discrepancy}") |
| 168 | + self.discrepancies.append(discrepancy) |
| 169 | + |
| 170 | + # Compare with tracked delta |
| 171 | + if not self.firstSync and self.total_delta != delta: |
| 172 | + discrepancy = f"[{self.line_number}] [{self.timestamp}] Sync delta mismatch: tracked={self.total_delta:+d}, sync={delta:+d}, difference={self.total_delta - delta:+d}" |
| 173 | + print(f" ⚠️ {discrepancy}") |
| 174 | + self.discrepancies.append(discrepancy) |
| 175 | + |
| 176 | + print(f"{'='*80}\n") |
| 177 | + |
| 178 | + # Reset for next sync period |
| 179 | + self.last_sync_total = new_total |
| 180 | + self.total_delta = 0 |
| 181 | + self.firstSync = False |
| 182 | + |
| 183 | + def process_line(self, line: str): |
| 184 | + """Process a single log line.""" |
| 185 | + self.line_number += 1 |
| 186 | + |
| 187 | + self.timestamp = self.parse_timestamp(line) |
| 188 | + if not self.timestamp: |
| 189 | + return |
| 190 | + |
| 191 | + # Route to appropriate handler based on log message |
| 192 | + if 'SizeTracker::Add :' in line and 'journal.go' in line: |
| 193 | + self.handle_add(line) |
| 194 | + elif 'SizeTracker::CopyFromFile' in line: |
| 195 | + self.handle_copy_from_file(line) |
| 196 | + elif 'SizeTracker::DeleteFile' in line: |
| 197 | + self.handle_delete_file(line) |
| 198 | + elif 'SizeTracker::RenameFile' in line: |
| 199 | + self.handle_rename_file(line) |
| 200 | + elif 'SizeTracker::sync' in line: |
| 201 | + self.handle_sync(line) |
| 202 | + |
| 203 | + def print_summary(self): |
| 204 | + """Print summary of analysis.""" |
| 205 | + print("\n" + "="*80) |
| 206 | + print("ANALYSIS SUMMARY") |
| 207 | + print("="*80) |
| 208 | + print(f"Total lines processed: {self.line_number}") |
| 209 | + print(f"Total files tracked: {len(self.files)}") |
| 210 | + print(f"Pending delta (since last sync): {self.total_delta:+d}") |
| 211 | + |
| 212 | + if self.discrepancies: |
| 213 | + print(f"\n⚠️ Found {len(self.discrepancies)} discrepancies:") |
| 214 | + for i, discrepancy in enumerate(self.discrepancies, 1): |
| 215 | + print(f" {i}. {discrepancy}") |
| 216 | + else: |
| 217 | + print("\n✅ No discrepancies found!") |
| 218 | + |
| 219 | + # Show some file statistics |
| 220 | + known_size_files = [f for f in self.files.values() if f.size is not None] |
| 221 | + unknown_size_files = [f for f in self.files.values() if f.size is None] |
| 222 | + |
| 223 | + print(f"\nFiles with known size: {len(known_size_files)}") |
| 224 | + print(f"Files with unknown size: {len(unknown_size_files)}") |
| 225 | + |
| 226 | + if known_size_files: |
| 227 | + total_size = sum(f.size for f in known_size_files) |
| 228 | + print(f"Total size of tracked files: {total_size:,} bytes ({total_size / (1024**3):.2f} GB)") |
| 229 | + |
| 230 | + |
| 231 | +def main(): |
| 232 | + """Main entry point.""" |
| 233 | + if len(sys.argv) < 2: |
| 234 | + print(f"Usage: {sys.argv[0]} <log_file>") |
| 235 | + print(f"Example: {sys.argv[0]} combined.log") |
| 236 | + sys.exit(1) |
| 237 | + |
| 238 | + log_file = sys.argv[1] |
| 239 | + |
| 240 | + print(f"Analyzing log file: {log_file}") |
| 241 | + print("="*80 + "\n") |
| 242 | + |
| 243 | + analyzer = SizeTrackerAnalyzer() |
| 244 | + |
| 245 | + try: |
| 246 | + with open(log_file, 'r', encoding='utf-8', errors='ignore') as f: |
| 247 | + for line in f: |
| 248 | + analyzer.process_line(line.rstrip('\n')) |
| 249 | + except FileNotFoundError: |
| 250 | + print(f"Error: File '{log_file}' not found") |
| 251 | + sys.exit(1) |
| 252 | + except Exception as e: |
| 253 | + print(f"Error processing log file: {e}") |
| 254 | + sys.exit(1) |
| 255 | + |
| 256 | + analyzer.print_summary() |
| 257 | + |
| 258 | + |
| 259 | +if __name__ == "__main__": |
| 260 | + main() |
0 commit comments