Skip to content

Commit b47324a

Browse files
authored
feat: add country-city-asn queries (#42)
This diff adds and uses queries for country, city, and ASN. We now generate more fine-grained data.
1 parent 466e4f0 commit b47324a

File tree

6 files changed

+166
-12
lines changed

6 files changed

+166
-12
lines changed

data/generate_data.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,42 @@ def generate_for_period(
7878
f"Stage 1b: Querying upload metrics for {period_str}",
7979
)
8080

81+
# Stage 1c: Query downloads by country/city/ASN (parquet cache only, no JSON output yet)
82+
run_command(
83+
[
84+
"python3",
85+
str(data_dir / "run_query.py"),
86+
"downloads_by_country_city_asn",
87+
"--start-date",
88+
start_date,
89+
"--end-date",
90+
end_date,
91+
"-o",
92+
str(cache_dir / "downloads_by_country_city_asn.json"),
93+
],
94+
f"Stage 1c: Querying download metrics by country/city/ASN for {period_str}",
95+
)
96+
97+
# Stage 1d: Query uploads by country/city/ASN (parquet cache only, no JSON output yet)
98+
run_command(
99+
[
100+
"python3",
101+
str(data_dir / "run_query.py"),
102+
"uploads_by_country_city_asn",
103+
"--start-date",
104+
start_date,
105+
"--end-date",
106+
end_date,
107+
"-o",
108+
str(cache_dir / "uploads_by_country_city_asn.json"),
109+
],
110+
f"Stage 1d: Querying upload metrics by country/city/ASN for {period_str}",
111+
)
112+
81113
# Stage 2: Merge data
82114
# Creates: {country}_{period_str}.json (e.g., us_2024_10.json)
115+
# Note: This only merges downloads_by_country and uploads_by_country
116+
# The country/city/ASN queries are cached as parquet only for now
83117
run_command(
84118
[
85119
"python3",

data/run_query.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -89,27 +89,25 @@ def run_bq_query(
8989
# Data directory is ./iqb/data (where this script lives)
9090
data_dir = Path(__file__).parent
9191

92-
# Step 1: Execute query and save results using IQBPipeline
92+
# Step 1: Get or create cache entry
9393
# This creates: ./iqb/data/cache/v1/{start}/{end}/{query_name}/
9494
# - data.parquet: query results (empty file if no results)
9595
# - stats.json: query metadata
96+
# fetch_if_missing=True makes this idempotent: skips query if cache exists
9697
pipeline = IQBPipeline(project_id=project_id, data_dir=data_dir)
97-
result = pipeline.execute_query_template(
98+
entry = pipeline.get_cache_entry(
9899
template=query_name,
99100
start_date=start_date,
100101
end_date=end_date,
102+
fetch_if_missing=True,
101103
)
102-
info = result.save_parquet()
104+
print(f"✓ Cache entry: {entry.data_path.parent.name}", file=sys.stderr)
105+
print(f" Data: {entry.data_path}", file=sys.stderr)
106+
print(f" Stats: {entry.stats_path}", file=sys.stderr)
103107

104-
print(f"✓ Parquet saved: {info.file_path}", file=sys.stderr)
105-
106-
# Save query statistics (timing, bytes processed, template hash)
107-
stats_path = result.save_stats()
108-
print(f"✓ Stats saved: {stats_path}", file=sys.stderr)
109-
110-
# Step 2: Convert parquet to JSON
108+
# Step 2: Convert the parquet file to JSON
111109
print("Converting parquet to JSON...", file=sys.stderr)
112-
table = pq.read_table(info.file_path)
110+
table = pq.read_table(entry.data_path)
113111
records = table.to_pylist()
114112

115113
# Check if query returned no results

library/src/iqb/pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
VALID_TEMPLATE_NAMES: Final[set[str]] = {
9494
"downloads_by_country",
9595
"uploads_by_country",
96+
"downloads_by_country_city_asn",
97+
"uploads_by_country_city_asn",
9698
}
9799

98100
# Cache file names
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
SELECT
2+
client.Geo.CountryCode as country_code,
3+
client.Geo.city as city,
4+
client.Network.ASNumber as asn,
5+
COUNT(*) as sample_count,
6+
7+
-- ============================================================================
8+
-- PERCENTILE LABELING CONVENTION FOR IQB QUALITY ASSESSMENT
9+
-- ============================================================================
10+
--
11+
-- For "higher is better" metrics (throughput):
12+
-- - Raw p95 = "95% of users have ≤ X Mbit/s"
13+
-- - Label: OFFSET(95) → download_p95 (standard statistical definition)
14+
-- - Interpretation: top ~5% of users have > p95 throughput
15+
--
16+
-- For "lower is better" metrics (latency, packet loss):
17+
-- - Raw p95 = "95% of users have ≤ X ms latency" (worst-case typical)
18+
-- - We want p95 to represent best-case typical (to match throughput semantics)
19+
-- - Solution: Invert labels - use raw p5 labeled as p95
20+
-- - Label: OFFSET(5) → latency_p95 (inverted!)
21+
-- - Interpretation: top ~5% of users (best latency) have < p95
22+
--
23+
-- Result: Uniform comparison logic where p95 always means "typical best
24+
-- performance" rather than "typical worst performance"
25+
--
26+
-- NOTE: This creates semantics where checking p95 thresholds asks
27+
-- "Can the top ~5% of users perform this use case?" - empirical validation
28+
-- against real data will determine if this interpretation is appropriate.
29+
-- ============================================================================
30+
31+
-- Download throughput (higher is better - NO INVERSION)
32+
-- Standard percentile labels matching statistical definition
33+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(1)] as download_p1,
34+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(5)] as download_p5,
35+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(10)] as download_p10,
36+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(25)] as download_p25,
37+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(50)] as download_p50,
38+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(75)] as download_p75,
39+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(90)] as download_p90,
40+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(95)] as download_p95,
41+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(99)] as download_p99,
42+
43+
-- Latency/MinRTT (lower is better - INVERTED LABELS!)
44+
-- ⚠️ OFFSET(99) = worst latency = top 1% worst users → labeled as p1
45+
-- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
46+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(99)] as latency_p1,
47+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(95)] as latency_p5,
48+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(90)] as latency_p10,
49+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(75)] as latency_p25,
50+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(50)] as latency_p50,
51+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(25)] as latency_p75,
52+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(10)] as latency_p90,
53+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(5)] as latency_p95,
54+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(1)] as latency_p99,
55+
56+
-- Packet Loss Rate (lower is better - INVERTED LABELS!)
57+
-- ⚠️ OFFSET(99) = worst loss = top 1% worst users → labeled as p1
58+
-- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
59+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(99)] as loss_p1,
60+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(95)] as loss_p5,
61+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(90)] as loss_p10,
62+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(75)] as loss_p25,
63+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(50)] as loss_p50,
64+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(25)] as loss_p75,
65+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(10)] as loss_p90,
66+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(5)] as loss_p95,
67+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(1)] as loss_p99
68+
FROM
69+
-- TODO(bassosimone): switch to union tables `measurement-lab.ndt.ndt7_union`
70+
-- when they have been blessed as the new stable tables.
71+
`measurement-lab.ndt.unified_downloads`
72+
WHERE
73+
date >= "{START_DATE}" AND date < "{END_DATE}"
74+
AND client.Geo.CountryCode IS NOT NULL
75+
AND client.Geo.city IS NOT NULL
76+
AND client.Network.ASNumber IS NOT NULL
77+
AND a.MeanThroughputMbps IS NOT NULL
78+
AND a.MinRTT IS NOT NULL
79+
AND a.LossRate IS NOT NULL
80+
GROUP BY country_code, city, asn
81+
ORDER BY country_code, city, asn

library/src/iqb/queries/uploads_by_country.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ SELECT
99
-- Upload throughput is "higher is better", so we use standard percentile
1010
-- labels (no inversion).
1111
--
12-
-- See query_downloads.sql for detailed explanation and rationale.
12+
-- See downloads_by_country.sql for detailed explanation and rationale.
1313
-- ============================================================================
1414

1515
-- Upload throughput (higher is better - NO INVERSION)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
SELECT
2+
client.Geo.CountryCode as country_code,
3+
client.Geo.city as city,
4+
client.Network.ASNumber as asn,
5+
COUNT(*) as sample_count,
6+
7+
-- ============================================================================
8+
-- PERCENTILE LABELING CONVENTION FOR IQB QUALITY ASSESSMENT
9+
-- ============================================================================
10+
--
11+
-- Upload throughput is "higher is better", so we use standard percentile
12+
-- labels (no inversion).
13+
--
14+
-- See downloads_by_country.sql for detailed explanation and rationale.
15+
-- ============================================================================
16+
17+
-- Upload throughput (higher is better - NO INVERSION)
18+
-- Standard percentile labels matching statistical definition
19+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(1)] as upload_p1,
20+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(5)] as upload_p5,
21+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(10)] as upload_p10,
22+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(25)] as upload_p25,
23+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(50)] as upload_p50,
24+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(75)] as upload_p75,
25+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(90)] as upload_p90,
26+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(95)] as upload_p95,
27+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(99)] as upload_p99
28+
FROM
29+
-- TODO(bassosimone): switch to union tables `measurement-lab.ndt.ndt7_union`
30+
-- when they have been blessed as the new stable tables.
31+
`measurement-lab.ndt.unified_uploads`
32+
WHERE
33+
date >= "{START_DATE}" AND date < "{END_DATE}"
34+
AND client.Geo.CountryCode IS NOT NULL
35+
AND client.Geo.city IS NOT NULL
36+
AND client.Network.ASNumber IS NOT NULL
37+
AND a.MeanThroughputMbps IS NOT NULL
38+
GROUP BY country_code, city, asn
39+
ORDER BY country_code, city, asn

0 commit comments

Comments
 (0)