Skip to content

Commit ebda047

Browse files
authored
Merge pull request #34 from PlaneQuery/develop
Develop to main: theairtraffic google sheet
2 parents 2bb0a5e + 3fdf443 commit ebda047

File tree

2 files changed

+311
-0
lines changed

2 files changed

+311
-0
lines changed

scripts/scrape_theairtraffic.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
4+
5+
Source: "TheAirTraffic Database - Aircraft 2.csv"
6+
Output: community/YYYY-MM-DD/theairtraffic_<date>_<hash>.json
7+
8+
Categories in the spreadsheet columns (paired: name, registrations, separator):
9+
Col 1-3: Business
10+
Col 4-6: Government
11+
Col 7-9: People
12+
Col 10-12: Sports
13+
Col 13-15: Celebrity
14+
Col 16-18: State Govt./Law
15+
Col 19-21: Other
16+
Col 22-24: Test Aircraft
17+
Col 25-27: YouTubers
18+
Col 28-30: Formula 1 VIP's
19+
Col 31-33: Active GII's and GIII's (test/demo aircraft)
20+
Col 34-37: Russia & Ukraine (extra col for old/new)
21+
Col 38-40: Helicopters & Blimps
22+
Col 41-43: Unique Reg's
23+
Col 44-46: Saudi & UAE
24+
Col 47-49: Schools
25+
Col 50-52: Special Charter
26+
Col 53-55: Unknown Owners
27+
Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours)
28+
"""
29+
30+
import csv
31+
import json
32+
import hashlib
33+
import re
34+
import sys
35+
import uuid
36+
from datetime import datetime, timezone
37+
from pathlib import Path
38+
39+
# ── Category mapping ────────────────────────────────────────────────────────
# Each spreadsheet section is described by the column holding the owner name
# and the tags attached to every aircraft listed in that section. In every
# section handled here the registration column sits immediately to the right
# of the name column, so only the name column is spelled out.
_SECTION_TAGS = (
    (1, {"owner_category_0": "business"}),
    (4, {"owner_category_0": "government"}),
    (7, {"owner_category_0": "celebrity"}),
    (10, {"owner_category_0": "sports"}),
    (13, {"owner_category_0": "celebrity"}),
    (16, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
    (19, {"owner_category_0": "other"}),
    (22, {"owner_category_0": "test_aircraft"}),
    (25, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
    (28, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
    (31, {"owner_category_0": "test_aircraft"}),
    # Russia & Ukraine: name in col 34; the registration may be in col 35 or 36.
    (34, {"owner_category_0": "russia_ukraine"}),
    (38, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
    (41, {"owner_category_0": "other"}),
    (44, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
    (47, {"owner_category_0": "education"}),
    (50, {"owner_category_0": "charter"}),
    (53, {"owner_category_0": "unknown"}),
    # Frequent Flyers: name column followed by the aircraft column.
    (56, {"owner_category_0": "celebrity"}),
)

# Entries are (name_col, reg_col, tags), where tags is a dict of tag keys that
# are added to a submission on top of "owner".
CATEGORY_COLUMNS = [
    (name_col, name_col + 1, tags) for name_col, tags in _SECTION_TAGS
]

# 0-based index of the first CSV row that holds data rather than headers.
DATA_START_ROW = 4

# ── Contributor info ────────────────────────────────────────────────────────
CONTRIBUTOR_NAME = "TheAirTraffic"
# Deterministic UUID v5 in the URL namespace, derived from the contributor's
# site URL (not from CONTRIBUTOR_NAME).
_CONTRIBUTOR_URL = "https://theairtraffic.com"
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, _CONTRIBUTOR_URL))

# Citation attached to every submission as the "citation_0" tag.
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
76+
77+
78+
def looks_like_military_serial(reg: str) -> bool:
    """
    Report whether ``reg`` has the shape of a numeric serial rather than a
    civil registration.

    Three shapes count as a serial:
      * ``NN-NNNN``, e.g. 92-9000, 82-8000, 98-0001
      * exactly six digits, e.g. 929000, 828000, 980001
      * one to five digits, e.g. "01", "02", "676"

    make_submission() stores matching values under ``openairframes_id``
    instead of ``registration_number``.
    """
    serial_shapes = (
        r'^\d{2}-\d{4}$',   # dashed serial
        r'^\d{6}$',         # six digits, no dash
        r'^\d{1,5}$',       # short numeric-only identifier
    )
    return any(re.match(shape, reg) for shape in serial_shapes)
94+
95+
96+
def normalize_reg(raw: str) -> str:
    """Return ``raw`` trimmed of surrounding whitespace and trailing commas,
    with every carriage return and line feed removed."""
    trimmed = raw.strip()
    trimmed = trimmed.rstrip(',')
    trimmed = trimmed.strip()
    # Line breaks can also sit in the middle of a cell value; drop them all
    # in one pass, then trim whatever whitespace that exposes at the edges.
    line_breaks = {ord('\r'): None, ord('\n'): None}
    return trimmed.translate(line_breaks).strip()
102+
103+
104+
def parse_regs(cell_value: str) -> list[str]:
    """
    Split one spreadsheet cell into its individual registrations.

    The cell is split on commas and each piece is cleaned with
    normalize_reg(). Returns an empty list for an empty cell, a cell that is
    only "." or ",", or a cell containing an ADS-B Exchange URL. Pieces that
    are empty, start with "http" or "Link", equal "Section 1", or consist of
    one to three dots are dropped.
    """
    stripped_cell = cell_value.strip() if cell_value else ''
    if not stripped_cell:
        return []

    # A cell holding an ADS-B Exchange link or bare punctuation carries no
    # registrations at all.
    if 'globe.adsbexchange.com' in cell_value or stripped_cell in ('.', ','):
        return []

    ignored_prefixes = ('http', 'Link')
    placeholder_tokens = ('Section 1', '.', '..', '...')

    candidates = (normalize_reg(piece) for piece in cell_value.split(','))
    return [
        token
        for token in candidates
        if token
        and not token.startswith(ignored_prefixes)
        and token not in placeholder_tokens
    ]
133+
134+
135+
def make_submission(
    reg: str,
    owner: str,
    category_tags: dict[str, str],
) -> dict:
    """
    Assemble one community_submission.v1 object.

    ``reg`` is stored under "openairframes_id" when it looks like a military
    serial and under "registration_number" otherwise. The "tags" dict holds
    the citation, then the stripped owner (if non-empty), then every entry of
    ``category_tags``.
    """
    if looks_like_military_serial(reg):
        id_field = "openairframes_id"
    else:
        id_field = "registration_number"

    tag_map: dict = {"citation_0": CITATION}
    if owner:
        tag_map["owner"] = owner.strip()
    # Applied last, so a category tag wins if it reuses an existing key.
    tag_map.update(category_tags)

    return {id_field: reg, "tags": tag_map}
160+
161+
162+
def _collect_submissions(rows: list[list[str]]) -> list[dict]:
    """Convert raw CSV rows into de-duplicated submission dicts.

    Rows before ``DATA_START_ROW`` and rows with fewer than three cells are
    skipped. For each remaining row, every (name column, registration column)
    pair in ``CATEGORY_COLUMNS`` is read; each registration found becomes one
    submission. A (registration, owner) pair is emitted only once, the first
    time it is seen.
    """
    submissions: list[dict] = []
    seen: set[tuple] = set()  # (reg, owner) pairs already emitted

    for row in rows[DATA_START_ROW:]:
        if len(row) < 3:
            continue

        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
            # Short rows simply do not reach the later sections.
            if reg_col >= len(row) or name_col >= len(row):
                continue

            # Clean the owner name: trim, drop a trailing comma, remove
            # embedded line breaks.
            owner = row[name_col].strip().rstrip(',').strip()
            owner = owner.replace('\r', '').replace('\n', '').strip()
            if not owner or owner in ('.', ',', 'Section 1'):
                continue
            # Skip header-like values
            if owner.startswith(('http', 'Link ')):
                continue

            regs = parse_regs(row[reg_col])
            # Russia & Ukraine has an extra registration column (old/new);
            # fall back to it when the first one yields nothing.
            if not regs and name_col == 34 and reg_col + 1 < len(row):
                regs = parse_regs(row[reg_col + 1])

            for reg in regs:
                key = (reg, owner)
                if key in seen:
                    continue
                seen.add(key)
                submissions.append(make_submission(reg, owner, cat_tags))

    return submissions


def main():
    """Read the spreadsheet CSV and write the community submission JSON.

    The CSV path is taken from the first command-line argument. Output goes
    to ``<project root>/community/<UTC date>/theairtraffic_<UTC date>.json``,
    where the project root is the parent of this script's directory. Exits
    with status 1 if the CSV is missing or no submissions could be parsed.
    """
    # NOTE(review): the fallback path below only exists on one developer's
    # machine; pass the CSV path explicitly when running anywhere else.
    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
    )

    if not csv_path.exists():
        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
        sys.exit(1)

    # newline='' lets the csv module handle line breaks inside quoted cells
    # itself, as the csv documentation requires.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        rows = list(csv.reader(f))

    print(f"Read {len(rows)} rows from {csv_path.name}")

    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    submissions = _collect_submissions(rows)
    print(f"Generated {len(submissions)} submissions")

    # Without this guard the sample print below would raise IndexError, and
    # an empty submission file would be left behind.
    if not submissions:
        print(f"ERROR: no submissions parsed from {csv_path}", file=sys.stderr)
        sys.exit(1)

    # Write output
    # NOTE(review): the module docstring describes the file name as
    # theairtraffic_<date>_<hash>.json, but no hash is appended here —
    # confirm which form is intended.
    proj_root = Path(__file__).resolve().parent.parent
    out_dir = proj_root / "community" / date_str
    out_dir.mkdir(parents=True, exist_ok=True)

    out_file = out_dir / f"theairtraffic_{date_str}.json"

    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(submissions, f, indent=2, ensure_ascii=False)

    print(f"Written to {out_file}")
    print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")

    # Quick stats: number of submissions per primary owner category,
    # most frequent first.
    cats: dict[str, int] = {}
    for s in submissions:
        c = s['tags'].get('owner_category_0', 'NONE')
        cats[c] = cats.get(c, 0) + 1
    print("\nCategory breakdown:")
    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {c}: {n}")


if __name__ == "__main__":
    main()

scripts/validate_theairtraffic.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env python3
"""Validate the generated theairtraffic JSON output.

Usage: validate_theairtraffic.py [path/to/output.json]

Without an argument, the newest ``community/<date>/theairtraffic_*.json``
file relative to the current working directory is validated, so run the
script from the repository root. Exits with status 1 when no output file is
found or when any schema issue is reported.
"""
import glob
import json
import re
import sys

# Matches output for any date; directories are named YYYY-MM-DD, so a plain
# lexicographic sort puts the newest file last.
OUTPUT_GLOB = "community/*/theairtraffic_*.json"

# An entry must carry at least one of these identifier fields.
IDENTIFIER_KEYS = ("registration_number", "transponder_code_hex", "openairframes_id")

# Tag keys: a lowercase letter followed by up to 63 lowercase letters,
# digits or underscores. Used with fullmatch() so a trailing newline is
# rejected too.
TAG_KEY_RE = re.compile(r"[a-z][a-z0-9_]{0,63}")


def _tags(entry):
    """Return the entry's tags dict, or an empty dict if it is missing."""
    tags = entry.get("tags")
    return tags if isinstance(tags, dict) else {}


def find_latest_output(pattern=OUTPUT_GLOB):
    """Return the last path matching ``pattern`` in sorted order, or None."""
    files = sorted(glob.glob(pattern))
    return files[-1] if files else None


def collect_schema_issues(data):
    """Return one message per schema problem found in ``data``.

    Checks that every entry has an identifier field, has a "tags" member,
    and that every tag key matches ``TAG_KEY_RE``.
    """
    issues = []
    for i, d in enumerate(data):
        if not any(k in d for k in IDENTIFIER_KEYS):
            issues.append(f" Entry {i}: no identifier!")
        if "tags" not in d:
            issues.append(f" Entry {i}: no tags!")
        # Check tag key format
        for k in _tags(d):
            if not TAG_KEY_RE.fullmatch(k):
                issues.append(f" Entry {i}: invalid tag key '{k}'")
    return issues


def main(path=None):
    """Validate one output file and print a report.

    ``path`` defaults to the first command-line argument, or else to the
    newest file matching ``OUTPUT_GLOB``. Returns 0 when the file has no
    schema issues and 1 otherwise (including when no file is found).
    """
    if path is None:
        path = sys.argv[1] if len(sys.argv) > 1 else find_latest_output()
    if not path:
        print("No output files found!")
        return 1

    print(f"Validating: {path}")

    # The generator writes UTF-8 with ensure_ascii=False, so read it back
    # as UTF-8 regardless of the platform default encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    print(f"Total entries: {len(data)}")

    # Check military serial handling
    mil = [d for d in data if "openairframes_id" in d]
    print(f"\nEntries using openairframes_id: {len(mil)}")
    for m in mil[:10]:
        print(f" {m['openairframes_id']} -> owner: {_tags(m).get('owner', '?')}")

    # Check youtuber entries
    yt = [d for d in data if _tags(d).get("owner_category_0") == "youtuber"]
    print(f"\nYouTuber entries: {len(yt)}")
    for y in yt[:5]:
        reg = y.get("registration_number", y.get("openairframes_id"))
        c0 = _tags(y).get("owner_category_0")
        c1 = _tags(y).get("owner_category_1")
        print(f" {reg} -> owner: {_tags(y).get('owner', '?')}, cat0: {c0}, cat1: {c1}")

    # Check US Govt / military
    gov = [d for d in data if _tags(d).get("owner") == "United States of America 747/757"]
    print(f"\nUSA 747/757 entries: {len(gov)}")
    for g in gov:
        oid = g.get("openairframes_id", g.get("registration_number"))
        print(f" {oid}")

    # Schema validation
    issues = collect_schema_issues(data)
    for message in issues:
        print(message)
    print(f"\nSchema issues: {len(issues)}")

    # Category breakdown, most frequent first
    cats = {}
    for s in data:
        c = _tags(s).get("owner_category_0", "NONE")
        cats[c] = cats.get(c, 0) + 1
    print("\nCategory breakdown:")
    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {c}: {n}")

    return 1 if issues else 0


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)