|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON. |
| 4 | +
|
| 5 | +Source: "TheAirTraffic Database - Aircraft 2.csv" |
| 6 | +Output: community/YYYY-MM-DD/theairtraffic_<date>.json |
| 7 | +
|
| 8 | +Categories in the spreadsheet columns (paired: name, registrations, separator): |
| 9 | + Col 1-3: Business |
| 10 | + Col 4-6: Government |
| 11 | + Col 7-9: People |
| 12 | + Col 10-12: Sports |
| 13 | + Col 13-15: Celebrity |
| 14 | + Col 16-18: State Govt./Law |
| 15 | + Col 19-21: Other |
| 16 | + Col 22-24: Test Aircraft |
| 17 | + Col 25-27: YouTubers |
| 18 | + Col 28-30: Formula 1 VIP's |
| 19 | + Col 31-33: Active GII's and GIII's (test/demo aircraft) |
| 20 | + Col 34-37: Russia & Ukraine (extra col for old/new) |
| 21 | + Col 38-40: Helicopters & Blimps |
| 22 | + Col 41-43: Unique Reg's |
| 23 | + Col 44-46: Saudi & UAE |
| 24 | + Col 47-49: Schools |
| 25 | + Col 50-52: Special Charter |
| 26 | + Col 53-55: Unknown Owners |
| 27 | + Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours) |
| 28 | +""" |
| 29 | + |
| 30 | +import csv |
| 31 | +import json |
| 32 | +import hashlib |
| 33 | +import re |
| 34 | +import sys |
| 35 | +import uuid |
| 36 | +from datetime import datetime, timezone |
| 37 | +from pathlib import Path |
| 38 | + |
# ── Category mapping ────────────────────────────────────────────────────────
# One entry per spreadsheet section: (name_col, reg_col, extra_tags).
#   name_col   – 0-based CSV column holding the owner / operator name
#   reg_col    – 0-based CSV column holding the registration list for that owner
#   extra_tags – tag key/value pairs merged into every submission produced from
#                that section, in addition to the "owner" and citation tags
# Section names in the trailing comments come from the module docstring.
CATEGORY_COLUMNS = [
    # (name_col, reg_col, {tag_key: tag_value, ...})
    (1, 2, {"owner_category_0": "business"}),  # Business
    (4, 5, {"owner_category_0": "government"}),  # Government
    (7, 8, {"owner_category_0": "celebrity"}),  # People
    (10, 11, {"owner_category_0": "sports"}),  # Sports
    (13, 14, {"owner_category_0": "celebrity"}),  # Celebrity
    (16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),  # State Govt./Law
    (19, 20, {"owner_category_0": "other"}),  # Other
    (22, 23, {"owner_category_0": "test_aircraft"}),  # Test Aircraft
    (25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),  # YouTubers
    (28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),  # Formula 1 VIP's
    (31, 32, {"owner_category_0": "test_aircraft"}),  # Active GII's and GIII's
    # Russia & Ukraine: col 34=name, col 35 or 36 may have reg.
    # Only col 35 is listed here; main() falls back to col 36 when col 35
    # yields no registrations.
    (34, 35, {"owner_category_0": "russia_ukraine"}),
    (38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),  # Helicopters & Blimps
    (41, 42, {"owner_category_0": "other"}),  # Unique Reg's
    (44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),  # Saudi & UAE
    (47, 48, {"owner_category_0": "education"}),  # Schools
    (50, 51, {"owner_category_0": "charter"}),  # Special Charter
    (53, 54, {"owner_category_0": "unknown"}),  # Unknown Owners
    (56, 57, {"owner_category_0": "celebrity"}),  # Frequent Flyers name col, aircraft col
]

# First data row index (0-based) in the CSV; rows before it are skipped by
# main() (presumably title/header rows — confirm against the spreadsheet).
DATA_START_ROW = 4

# ── Contributor info ────────────────────────────────────────────────────────
# NOTE(review): neither contributor constant is referenced by the code visible
# in this file.
CONTRIBUTOR_NAME = "TheAirTraffic"
# Deterministic UUID v5 derived from the contributor's site URL (URL namespace),
# not from CONTRIBUTOR_NAME.
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))

# Citation URL stored under the "citation_0" tag of every submission
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"


| 76 | + |
| 77 | + |
# Compiled once at import time; used with fullmatch() so the whole string must
# conform.  Alternatives:
#   \d{2}-\d{4}  – dashed serials such as 92-9000
#   \d{1,6}      – purely numeric identifiers of one to six digits
_MILITARY_SERIAL_RE = re.compile(r'\d{2}-\d{4}|\d{1,6}')


def looks_like_military_serial(reg: str) -> bool:
    """
    Return True when *reg* looks like a military-style serial rather than a
    civil registration.

    Recognised shapes:
      * ``NN-NNNN``, e.g. 92-9000, 82-8000, 98-0001
      * purely numeric IDs of 1 to 6 digits, e.g. 929000, 676, 01

    Such identifiers aren't standard civil registrations; callers store them
    under ``openairframes_id`` instead of ``registration_number``.

    The entire string has to match: ``fullmatch`` is used instead of
    ``match(...$)`` because ``$`` also matches just before a trailing
    newline, which would let values like ``"92-9000\\n"`` through.
    """
    return bool(_MILITARY_SERIAL_RE.fullmatch(reg))
| 94 | + |
| 95 | + |
def normalize_reg(raw: str) -> str:
    """
    Return *raw* tidied into a bare registration token.

    Surrounding whitespace and any trailing commas are trimmed first, then
    every carriage return / line feed still inside the value is removed and
    the result is trimmed once more.
    """
    trimmed = raw.strip().rstrip(',').strip()
    # Drop line-break characters wherever they occur, not just at the edges.
    without_breaks = ''.join(ch for ch in trimmed if ch not in ('\r', '\n'))
    return without_breaks.strip()
| 102 | + |
| 103 | + |
def parse_regs(cell_value: str) -> list[str]:
    """
    Extract the registrations held in one spreadsheet cell.

    The cell is split on commas and each piece is cleaned with
    ``normalize_reg``.  An empty list is returned when the cell is empty or
    blank, consists only of ``.`` or ``,``, or contains an ADS-B Exchange
    globe URL anywhere in its text.  Individual pieces are discarded when
    they are empty after cleaning, start with ``http`` or ``Link``, equal
    ``Section 1``, or consist only of one to three dots.
    """
    stripped_cell = cell_value.strip() if cell_value else ''
    if not stripped_cell:
        return []

    # A tracker URL anywhere in the cell disqualifies the whole cell;
    # lone punctuation is a placeholder, not data.
    if 'globe.adsbexchange.com' in cell_value or stripped_cell in ('.', ','):
        return []

    candidates = (normalize_reg(piece) for piece in cell_value.split(','))
    return [
        candidate
        for candidate in candidates
        if candidate
        and not candidate.startswith(('http', 'Link'))
        and candidate != 'Section 1'
        and candidate not in ('.', '..', '...')
    ]
| 133 | + |
| 134 | + |
def make_submission(
    reg: str,
    owner: str,
    category_tags: dict[str, str],
) -> dict:
    """
    Assemble one community_submission.v1 object for a single aircraft.

    The identifier is stored under ``openairframes_id`` when
    ``looks_like_military_serial(reg)`` is true and under
    ``registration_number`` otherwise.  The ``tags`` mapping always carries
    ``citation_0``; ``owner`` is added (stripped) when *owner* is non-empty,
    and *category_tags* is merged in last, so its keys win on collision.
    """
    if looks_like_military_serial(reg):
        id_field = "openairframes_id"
    else:
        id_field = "registration_number"

    # Insertion order (citation, owner, categories) is the order the keys
    # appear in when the result is serialised.
    tag_map: dict = {"citation_0": CITATION}
    if owner:
        tag_map["owner"] = owner.strip()
    tag_map.update(category_tags)

    return {id_field: reg, "tags": tag_map}
| 160 | + |
| 161 | + |
def _collect_submissions(rows: list[list[str]]) -> list[dict]:
    """
    Walk the data rows and build one submission per unique (reg, owner) pair.

    Rows before ``DATA_START_ROW`` and rows with fewer than three cells are
    ignored.  For every section in ``CATEGORY_COLUMNS`` the owner cell is
    cleaned, placeholder/link values are skipped, and each registration parsed
    from the registration cell becomes one submission.
    """
    submissions: list[dict] = []
    seen: set[tuple[str, str]] = set()  # (reg, owner) dedup

    for row in rows[DATA_START_ROW:]:
        if len(row) < 3:
            continue

        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
            # Short rows simply don't reach the later sections.
            if reg_col >= len(row) or name_col >= len(row):
                continue

            # Clean owner name: trim, drop trailing commas and line breaks.
            owner = row[name_col].strip().rstrip(',').strip()
            owner = owner.replace('\r', '').replace('\n', '').strip()
            if not owner or owner in ('.', ',', 'Section 1'):
                continue
            # Skip header-like values
            if owner.startswith('http') or owner.startswith('Link '):
                continue

            regs = parse_regs(row[reg_col])
            # Russia & Ukraine has an extra registration column; it is only
            # consulted when the first one yields nothing.
            if not regs and name_col == 34 and reg_col + 1 < len(row):
                regs = parse_regs(row[reg_col + 1])

            for reg in regs:
                key = (reg, owner)
                if key in seen:
                    continue
                seen.add(key)
                submissions.append(make_submission(reg, owner, cat_tags))

    return submissions


def main() -> None:
    """
    Convert the TheAirTraffic CSV export into a community submission file.

    The CSV path is taken from the first command-line argument, falling back
    to a hard-coded path when none is given.  The result is written to
    ``<project root>/community/<YYYY-MM-DD>/theairtraffic_<YYYY-MM-DD>.json``
    (UTC date; project root is the parent of this script's directory), and a
    sample entry plus a per-category count are printed.

    Exits with status 1 when the CSV file does not exist.
    """
    # NOTE(review): the fallback is a developer-machine path; pass the CSV
    # path explicitly when running anywhere else.
    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
    )

    if not csv_path.exists():
        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
        sys.exit(1)

    # Read CSV.  newline='' is what the csv module requires so that line
    # breaks inside quoted cells are handed to the reader untouched.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        rows = list(csv.reader(f))

    print(f"Read {len(rows)} rows from {csv_path.name}")

    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    submissions = _collect_submissions(rows)

    print(f"Generated {len(submissions)} submissions")

    # Write output
    proj_root = Path(__file__).resolve().parent.parent
    out_dir = proj_root / "community" / date_str
    out_dir.mkdir(parents=True, exist_ok=True)

    out_file = out_dir / f"theairtraffic_{date_str}.json"

    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(submissions, f, indent=2, ensure_ascii=False)

    print(f"Written to {out_file}")
    # Guard the sample: indexing an empty list would raise IndexError when
    # the CSV produced no submissions.
    if submissions:
        print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")

    # Quick stats
    cats: dict[str, int] = {}
    for s in submissions:
        c = s['tags'].get('owner_category_0', 'NONE')
        cats[c] = cats.get(c, 0) + 1
    print("\nCategory breakdown:")
    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {c}: {n}")


if __name__ == "__main__":
    main()
0 commit comments