diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..a608310 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,3 @@ +# Intermediate pipeline outputs +/downloads.json +/uploads.json diff --git a/data/README.md b/data/README.md index 2a713e2..be9b32d 100644 --- a/data/README.md +++ b/data/README.md @@ -1,3 +1,183 @@ -# Datasets +# IQB Static Data Files -A set of sample datasets. \ No newline at end of file +This directory contains static measurement data used by +the IQB prototype for Phase 1 development. + +## Current Dataset + +**Period**: October 2024 (2024-10-01 to 2024-10-31) + +**Source**: [M-Lab NDT](https://www.measurementlab.net/tests/ndt/) unified views + +**Countries**: United States (US), Germany (DE), Brazil (BR) + +### Files + +- `us_2024_10.json` - United States, ~31M download samples, ~24M upload samples + +- `de_2024_10.json` - Germany, ~7M download samples, ~4M upload samples + +- `br_2024_10.json` - Brazil, ~5M download samples, ~3M upload samples + +### Data Structure + +Each JSON file contains: + +```JavaScript +{ + "metadata": { + "country_code": "US", + "country_name": "United States", + "period": "2024-10", + "period_description": "October 2024", + "dataset": "M-Lab NDT", + "download_samples": 31443312, + "upload_samples": 24288961 + }, + "metrics": { + "download_throughput_mbps": {"p1": 0.38, /* ... */, "p99": 891.82}, + "upload_throughput_mbps": {"p1": 0.06, /* ... */, "p99": 813.73}, + "latency_ms": {"p1": 0.16, /* ... */, "p99": 254.34}, + "packet_loss": {"p1": 0.0, /* ... */, "p99": 0.25} + } +} +``` + +**Percentiles included**: p1, p5, p10, p25, p50, p75, p90, p95, p99 + +## How This Data Was Generated + +### BigQuery Queries + +The data was extracted from M-Lab's public BigQuery tables using two queries: + +1. 
**Downloads** (`query_downloads.sql`): Queries +`measurement-lab.ndt.unified_downloads` for: + + - Download throughput (`a.MeanThroughputMbps`) + + - Latency (`a.MinRTT`) + + - Packet loss (`a.LossRate`) + +2. **Uploads** (`query_uploads.sql`): Queries +`measurement-lab.ndt.unified_uploads` for: + + - Upload throughput (`a.MeanThroughputMbps`) + +### Running the Data Generation Pipeline + +**Prerequisites**: + +- Google Cloud SDK (`gcloud`) installed + +- BigQuery CLI (`bq`) installed + +- `gcloud`-authenticated with an account subscribed to +[M-Lab Discuss mailing list](https://groups.google.com/a/measurementlab.net/g/discuss) + +- Python 3.11+ + +**Complete Pipeline** (recommended): + +```bash +cd data/ +python3 generate_data.py +``` + +This orchestrates the complete pipeline: + +1. Queries BigQuery for download metrics (throughput, latency, packet loss) + +2. Queries BigQuery for upload metrics (throughput) + +3. Merges the data into per-country JSON files + +Generated files: `us_2024_10.json`, `de_2024_10.json`, `br_2024_10.json`. + +**Individual Pipeline Stages** (for debugging): + +```bash +cd data/ + +# Stage 1a: Query downloads +python3 run_query.py query_downloads.sql -o downloads.json + +# Stage 1b: Query uploads +python3 run_query.py query_uploads.sql -o uploads.json + +# Stage 2: Merge data +python3 merge_data.py +``` + +**Pipeline Scripts**: + +- [generate_data.py](generate_data.py) - Orchestrates the complete pipeline + +- [run_query.py](run_query.py) - Executes a BigQuery query and saves results + +- [merge_data.py](merge_data.py) - Merges download and upload data into +per-country files + +### Modifying Queries + +To change the time period or countries, edit the SQL files: + +```sql +WHERE + date BETWEEN "2024-10-01" AND "2024-10-31" -- Change dates here + AND client.Geo.CountryCode IN ("US", "DE", "BR") -- Change countries here +``` + +Country codes follow the +[ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) standard. 
+ +## Notes + +- **Static data**: These files contain pre-aggregated percentiles +for the Phase 1 prototype. Phase 2 will add dynamic data fetching. + +- **Time granularity**: Data is aggregated over the entire +month (October 2024). The analyst decides which time window +to use when fetching data for running IQB calculations. + +- **Percentile selection**: The Streamlit UI allows users +to select which percentile(s) to use for IQB score calculations. + +- **File size**: Each file is ~1.4KB (uncompressed). No +compression needed. + +## M-Lab NDT Data Schema + +M-Lab provides two unified views: + +- `measurement-lab.ndt.unified_downloads` - Download tests + +- `measurement-lab.ndt.unified_uploads` - Upload tests + +Key fields used: + +- `a.MeanThroughputMbps` - Mean throughput in Mbps + +- `a.MinRTT` - Minimum round-trip time in milliseconds + +- `a.LossRate` - Packet loss rate (0.0-1.0) + +- `client.Geo.CountryCode` - ISO country code + +- `date` - Measurement date (YYYY-MM-DD) + +See [M-Lab NDT documentation](https://www.measurementlab.net/tests/ndt/#ndt-data-in-bigquery) +for details. 
+ +## Future Improvements (Phase 2+) + +- Dynamic data fetching from BigQuery + +- Support for additional datasets (Ookla, Cloudflare) + +- Finer time granularity (daily, weekly) + +- Sub-national geographic resolution (cities, ASNs) + +- Local database integration for caching aggregated data diff --git a/data/br_2024_10.json b/data/br_2024_10.json new file mode 100644 index 0000000..c991760 --- /dev/null +++ b/data/br_2024_10.json @@ -0,0 +1,57 @@ +{ + "metadata": { + "country_code": "BR", + "country_name": "Brazil", + "period": "2024-10", + "period_description": "October 2024", + "dataset": "M-Lab NDT", + "download_samples": 4944407, + "upload_samples": 3496328 + }, + "metrics": { + "download_throughput_mbps": { + "p1": 0.15979623373499155, + "p5": 0.9501991252036766, + "p10": 3.101174869710966, + "p25": 15.0340700432778, + "p50": 51.9831305263177, + "p75": 158.38962702858973, + "p90": 330.3352983503099, + "p95": 456.0950392154999, + "p99": 696.5613392781584 + }, + "upload_throughput_mbps": { + "p1": 0.042563080079753776, + "p5": 0.07560071683921148, + "p10": 0.08980854096320207, + "p25": 5.545812099052701, + "p50": 30.78175191467136, + "p75": 88.37694460346944, + "p90": 181.64033113619195, + "p95": 255.97876412741525, + "p99": 394.3416893812533 + }, + "latency_ms": { + "p1": 1.394, + "p5": 3.637, + "p10": 4.958, + "p25": 9.079, + "p50": 19.953, + "p75": 52.065, + "p90": 184.738, + "p95": 234.072, + "p99": 273.0 + }, + "packet_loss": { + "p1": 0.0, + "p5": 0.0, + "p10": 0.0, + "p25": 1.1042755272820004e-05, + "p50": 0.004822712745559209, + "p75": 0.05811090765473097, + "p90": 0.13649207990035975, + "p95": 0.1987869577393624, + "p99": 0.3652163739953438 + } + } +} \ No newline at end of file diff --git a/data/de_2024_10.json b/data/de_2024_10.json new file mode 100644 index 0000000..b83340b --- /dev/null +++ b/data/de_2024_10.json @@ -0,0 +1,57 @@ +{ + "metadata": { + "country_code": "DE", + "country_name": "Germany", + "period": "2024-10", + "period_description": 
"October 2024", + "dataset": "M-Lab NDT", + "download_samples": 7419055, + "upload_samples": 4377008 + }, + "metrics": { + "download_throughput_mbps": { + "p1": 0.22367850581560372, + "p5": 1.262769802856182, + "p10": 3.4166592054870026, + "p25": 13.817824595534129, + "p50": 45.24430302103892, + "p75": 100.56946051210859, + "p90": 248.78115747983244, + "p95": 377.8657642766346, + "p99": 741.7983223940372 + }, + "upload_throughput_mbps": { + "p1": 0.04798033204768874, + "p5": 0.07565187888251705, + "p10": 0.19852741925194242, + "p25": 3.5715003423978087, + "p50": 17.172955392453527, + "p75": 36.63458526768415, + "p90": 53.192909502396375, + "p95": 101.34444079000329, + "p99": 285.7324202068485 + }, + "latency_ms": { + "p1": 0.438, + "p5": 3.433, + "p10": 6.787, + "p25": 11.589, + "p50": 17.712, + "p75": 26.382, + "p90": 38.489, + "p95": 57.061, + "p99": 305.85 + }, + "packet_loss": { + "p1": 0.0, + "p5": 0.0, + "p10": 0.0, + "p25": 0.0, + "p50": 0.00034573047467282084, + "p75": 0.016581558328885995, + "p90": 0.07073353719313655, + "p95": 0.11517449630011735, + "p99": 0.2521127443846117 + } + } +} \ No newline at end of file diff --git a/data/generate_data.py b/data/generate_data.py new file mode 100755 index 0000000..5a6c90d --- /dev/null +++ b/data/generate_data.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Orchestrate the data generation pipeline for IQB static data. + +This script: +1. Runs BigQuery queries for downloads and uploads +2. 
Merges the results into per-country JSON files +""" + +import subprocess +import sys +from pathlib import Path + + +def run_command(cmd: list[str], description: str) -> None: + """Run a command and handle errors.""" + print(f"\n{'=' * 60}") + print(f"{description}") + print(f"{'=' * 60}") + + result = subprocess.run(cmd, capture_output=False) + + if result.returncode != 0: + print(f"\n✗ Failed: {description}", file=sys.stderr) + sys.exit(1) + + print(f"✓ Completed: {description}") + + +def main(): + # Ensure we're in the data directory + data_dir = Path(__file__).parent + + print("IQB Data Generation Pipeline") + print("=" * 60) + + # Stage 1a: Query downloads + run_command( + [ + "python3", + str(data_dir / "run_query.py"), + str(data_dir / "query_downloads.sql"), + "-o", + str(data_dir / "downloads.json"), + ], + "Stage 1a: Querying download metrics (throughput, latency, packet loss)", + ) + + # Stage 1b: Query uploads + run_command( + [ + "python3", + str(data_dir / "run_query.py"), + str(data_dir / "query_uploads.sql"), + "-o", + str(data_dir / "uploads.json"), + ], + "Stage 1b: Querying upload metrics (throughput)", + ) + + # Stage 2: Merge data + run_command( + ["python3", str(data_dir / "merge_data.py")], + "Stage 2: Merging download and upload data into per-country files", + ) + + print("\n" + "=" * 60) + print("✓ Pipeline completed successfully!") + print("=" * 60) + print("\nGenerated files:") + + for country in ["us", "de", "br"]: + file_path = data_dir / f"{country}_2024_10.json" + if file_path.exists(): + size = file_path.stat().st_size + print(f" - {file_path.name} ({size:,} bytes)") + + +if __name__ == "__main__": + main() diff --git a/data/merge_data.py b/data/merge_data.py new file mode 100755 index 0000000..0d0798e --- /dev/null +++ b/data/merge_data.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +"""Merge download and upload data into clean JSON files per country.""" + +import json + +# Load raw data +with open("downloads.json") as f: + downloads = 
"""Merge download and upload data into clean JSON files per country.

Reads ``downloads.json`` and ``uploads.json`` from the directory containing
this script and writes one ``<country>_2024_10.json`` file per country into
that same directory.
"""

import json
import sys
from pathlib import Path

# Inputs and outputs are resolved next to this script rather than relative
# to the caller's working directory.
DATA_DIR = Path(__file__).resolve().parent

# Percentiles present in the query output (columns "<prefix>_p<N>").
PERCENTILES = (1, 5, 10, 25, 50, 75, 90, 95, 99)

# Country names
COUNTRY_NAMES = {"US": "United States", "DE": "Germany", "BR": "Brazil"}


def _extract_percentiles(row: dict, prefix: str) -> dict[str, float]:
    """Return ``{"p1": ..., ..., "p99": ...}`` from the ``<prefix>_pN`` columns of *row*."""
    return {f"p{p}": float(row[f"{prefix}_p{p}"]) for p in PERCENTILES}


def build_country_record(dl: dict, ul: dict) -> dict:
    """Build the per-country output document from one download and one upload row.

    Args:
        dl: Row of the downloads query (``country_code``, ``sample_count``,
            ``download_pN``, ``latency_pN`` and ``loss_pN`` keys).
        ul: Row of the uploads query for the same country
            (``sample_count`` and ``upload_pN`` keys).

    Returns:
        A dict with ``metadata`` and ``metrics`` sections, ready for ``json.dump``.

    Raises:
        KeyError: If an expected column is missing from either row.
    """
    country_code = dl["country_code"]
    return {
        "metadata": {
            "country_code": country_code,
            # Fall back to the code itself for countries without a known name.
            "country_name": COUNTRY_NAMES.get(country_code, country_code),
            "period": "2024-10",
            "period_description": "October 2024",
            "dataset": "M-Lab NDT",
            "download_samples": int(dl["sample_count"]),
            "upload_samples": int(ul["sample_count"]),
        },
        "metrics": {
            "download_throughput_mbps": _extract_percentiles(dl, "download"),
            "upload_throughput_mbps": _extract_percentiles(ul, "upload"),
            "latency_ms": _extract_percentiles(dl, "latency"),
            "packet_loss": _extract_percentiles(dl, "loss"),
        },
    }


def merge(downloads: list[dict], uploads: list[dict]) -> dict[str, dict]:
    """Pair download and upload rows by country and build one record per country.

    Args:
        downloads: Rows of the downloads query.
        uploads: Rows of the uploads query.

    Returns:
        Mapping from country code to its output document, in the order the
        countries appear in *downloads*.

    Raises:
        ValueError: If a country in *downloads* has no row in *uploads*.
    """
    uploads_by_country: dict[str, dict] = {}
    for ul in uploads:
        # Keep the first row per country.
        uploads_by_country.setdefault(ul["country_code"], ul)

    merged: dict[str, dict] = {}
    for dl in downloads:
        country_code = dl["country_code"]
        try:
            ul = uploads_by_country[country_code]
        except KeyError:
            raise ValueError(
                f"No upload data found for country {country_code!r}"
            ) from None
        merged[country_code] = build_country_record(dl, ul)
    return merged


def main() -> None:
    """Load the raw query results, merge them and write the per-country files."""
    # Load raw data
    with open(DATA_DIR / "downloads.json", encoding="utf-8") as f:
        downloads = json.load(f)

    with open(DATA_DIR / "uploads.json", encoding="utf-8") as f:
        uploads = json.load(f)

    try:
        merged = merge(downloads, uploads)
    except ValueError as exc:
        print(f"✗ {exc}", file=sys.stderr)
        sys.exit(1)

    # Write one file per country
    for country_code, output in merged.items():
        filename = f"{country_code.lower()}_2024_10.json"
        with open(DATA_DIR / filename, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)

        print(f"✓ Created {filename}")

    print("\nData files created successfully!")


if __name__ == "__main__":
    main()
-- Per-country percentiles of download throughput, latency (MinRTT) and
-- packet loss from M-Lab NDT download tests.
--
-- Each metric's quantile array is computed once per country in the CTE
-- and then indexed, instead of re-evaluating APPROX_QUANTILES for every
-- output column. With 100 buckets, OFFSET(N) is the approximate N-th
-- percentile.
WITH quantiles AS (
  SELECT
    client.Geo.CountryCode AS country_code,
    COUNT(*) AS sample_count,
    APPROX_QUANTILES(a.MeanThroughputMbps, 100) AS download_q,
    APPROX_QUANTILES(a.MinRTT, 100) AS latency_q,
    APPROX_QUANTILES(a.LossRate, 100) AS loss_q
  FROM
    `measurement-lab.ndt.unified_downloads`
  WHERE
    date BETWEEN "2024-10-01" AND "2024-10-31"  -- Change dates here
    AND client.Geo.CountryCode IN ("US", "DE", "BR")  -- Change countries here
    AND a.MeanThroughputMbps IS NOT NULL
    AND a.MinRTT IS NOT NULL
  GROUP BY country_code
)
SELECT
  country_code,
  sample_count,
  download_q[OFFSET(1)] AS download_p1,
  download_q[OFFSET(5)] AS download_p5,
  download_q[OFFSET(10)] AS download_p10,
  download_q[OFFSET(25)] AS download_p25,
  download_q[OFFSET(50)] AS download_p50,
  download_q[OFFSET(75)] AS download_p75,
  download_q[OFFSET(90)] AS download_p90,
  download_q[OFFSET(95)] AS download_p95,
  download_q[OFFSET(99)] AS download_p99,
  latency_q[OFFSET(1)] AS latency_p1,
  latency_q[OFFSET(5)] AS latency_p5,
  latency_q[OFFSET(10)] AS latency_p10,
  latency_q[OFFSET(25)] AS latency_p25,
  latency_q[OFFSET(50)] AS latency_p50,
  latency_q[OFFSET(75)] AS latency_p75,
  latency_q[OFFSET(90)] AS latency_p90,
  latency_q[OFFSET(95)] AS latency_p95,
  latency_q[OFFSET(99)] AS latency_p99,
  loss_q[OFFSET(1)] AS loss_p1,
  loss_q[OFFSET(5)] AS loss_p5,
  loss_q[OFFSET(10)] AS loss_p10,
  loss_q[OFFSET(25)] AS loss_p25,
  loss_q[OFFSET(50)] AS loss_p50,
  loss_q[OFFSET(75)] AS loss_p75,
  loss_q[OFFSET(90)] AS loss_p90,
  loss_q[OFFSET(95)] AS loss_p95,
  loss_q[OFFSET(99)] AS loss_p99
FROM
  quantiles
ORDER BY country_code
-- Per-country percentiles of upload throughput from M-Lab NDT upload tests.
--
-- The quantile array is computed once per country in the CTE and then
-- indexed, instead of re-evaluating APPROX_QUANTILES for every output
-- column. With 100 buckets, OFFSET(N) is the approximate N-th percentile.
WITH quantiles AS (
  SELECT
    client.Geo.CountryCode AS country_code,
    COUNT(*) AS sample_count,
    APPROX_QUANTILES(a.MeanThroughputMbps, 100) AS upload_q
  FROM
    `measurement-lab.ndt.unified_uploads`
  WHERE
    date BETWEEN "2024-10-01" AND "2024-10-31"  -- Change dates here
    AND client.Geo.CountryCode IN ("US", "DE", "BR")  -- Change countries here
    AND a.MeanThroughputMbps IS NOT NULL
  GROUP BY country_code
)
SELECT
  country_code,
  sample_count,
  upload_q[OFFSET(1)] AS upload_p1,
  upload_q[OFFSET(5)] AS upload_p5,
  upload_q[OFFSET(10)] AS upload_p10,
  upload_q[OFFSET(25)] AS upload_p25,
  upload_q[OFFSET(50)] AS upload_p50,
  upload_q[OFFSET(75)] AS upload_p75,
  upload_q[OFFSET(90)] AS upload_p90,
  upload_q[OFFSET(95)] AS upload_p95,
  upload_q[OFFSET(99)] AS upload_p99
FROM
  quantiles
ORDER BY country_code
+ + Args: + query_file: Path to SQL query file + output_file: Path where to save JSON output (None = stdout) + project_id: GCP project ID for billing + """ + print(f"Running query: {query_file}", file=sys.stderr) + + # Read query + with open(query_file) as f: + query = f.read() + + # Execute BigQuery command + # stdout = data (JSON), stderr = logs + cmd = [ + "bq", + "query", + "--use_legacy_sql=false", + f"--project_id={project_id}", + "--format=json", + query, + ] + + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + + # Write data (stdout) to output file or stdout + if output_file: + with open(output_file, "w") as f: + f.write(result.stdout) + print(f"✓ Query completed: {output_file}", file=sys.stderr) + else: + # Output to stdout for piping + print(result.stdout) + + # Print logs (stderr) to console + if result.stderr: + print(result.stderr, file=sys.stderr) + + except subprocess.CalledProcessError as e: + print(f"✗ Query failed: {e}", file=sys.stderr) + if e.stderr: + print(e.stderr, file=sys.stderr) + sys.exit(1) + + +def main(): + # TODO(bassosimone): Should we use 'measurement-lab' as the project ID instead? + # The web console (https://console.cloud.google.com/bigquery?project=measurement-lab) + # uses measurement-lab as the project, so I am a bit unsure about what to use here. 
+ DEFAULT_PROJECT_ID = "mlab-sandbox" + + parser = argparse.ArgumentParser( + description="Execute BigQuery query and save results" + ) + parser.add_argument("query_file", type=Path, help="Path to SQL query file") + parser.add_argument( + "-o", "--output", type=Path, help="Path to output JSON file (default: stdout)" + ) + parser.add_argument( + "--project-id", + default=DEFAULT_PROJECT_ID, + help=f"GCP project ID for billing (default: {DEFAULT_PROJECT_ID})", + ) + + args = parser.parse_args() + + if not args.query_file.exists(): + print(f"Error: Query file not found: {args.query_file}", file=sys.stderr) + sys.exit(1) + + run_bq_query(args.query_file, args.output, args.project_id) + + +if __name__ == "__main__": + main() diff --git a/data/us_2024_10.json b/data/us_2024_10.json new file mode 100644 index 0000000..fdef478 --- /dev/null +++ b/data/us_2024_10.json @@ -0,0 +1,57 @@ +{ + "metadata": { + "country_code": "US", + "country_name": "United States", + "period": "2024-10", + "period_description": "October 2024", + "dataset": "M-Lab NDT", + "download_samples": 31443312, + "upload_samples": 24288961 + }, + "metrics": { + "download_throughput_mbps": { + "p1": 0.37354810526833476, + "p5": 2.7494108827310177, + "p10": 7.6575433038007406, + "p25": 29.94873577502137, + "p50": 96.36533017831101, + "p75": 268.1810327939917, + "p90": 474.1768162996085, + "p95": 625.4494125653449, + "p99": 893.2782851912168 + }, + "upload_throughput_mbps": { + "p1": 0.06279911698366483, + "p5": 0.15105079102447938, + "p10": 1.0130561597157441, + "p25": 8.030055616329323, + "p50": 20.95814566696693, + "p75": 65.73945359925672, + "p90": 223.9767416770114, + "p95": 370.4336035390081, + "p99": 813.7319533731953 + }, + "latency_ms": { + "p1": 0.16, + "p5": 0.808, + "p10": 2.886, + "p25": 7.778, + "p50": 16.124, + "p75": 30.0, + "p90": 51.303, + "p95": 80.55, + "p99": 251.545 + }, + "packet_loss": { + "p1": 0.0, + "p5": 0.0, + "p10": 0.0, + "p25": 0.0, + "p50": 0.000516724336793541, + "p75": 
0.019090240380880846, + "p90": 0.07332944466732425, + "p95": 0.12018590164702943, + "p99": 0.253111989432024 + } + } +} \ No newline at end of file