|
-- ============================================================================
-- PERCENTILE LABELING CONVENTION FOR IQB QUALITY ASSESSMENT
-- ============================================================================
--
-- For "higher is better" metrics (throughput):
--   - Raw p95 = "95% of users have ≤ X Mbit/s"
--   - Label: OFFSET(95) → download_p95 (standard statistical definition)
--   - Interpretation: top ~5% of users have > p95 throughput
--
-- For "lower is better" metrics (latency, packet loss):
--   - Raw p95 = "95% of users have ≤ X ms latency" (worst-case typical)
--   - We want p95 to represent best-case typical (to match throughput semantics)
--   - Solution: Invert labels - use raw p5 labeled as p95
--   - Label: OFFSET(5) → latency_p95 (inverted!)
--   - Interpretation: top ~5% of users (best latency) have < p95
--
-- Result: Uniform comparison logic where p95 always means "typical best
-- performance" rather than "typical worst performance"
--
-- NOTE: This creates semantics where checking p95 thresholds asks
-- "Can the top ~5% of users perform this use case?" - empirical validation
-- against real data will determine if this interpretation is appropriate.
-- ============================================================================
WITH per_network_quantiles AS (
    -- One row per (country, subdivision, ASN): the sample count plus the
    -- full quantile array for each metric. Computing each array exactly once
    -- here — instead of nine identical APPROX_QUANTILES calls per metric in
    -- the SELECT list — removes 24 duplicated aggregate expressions and makes
    -- the inverted-label extraction in the outer SELECT easy to audit.
    SELECT
        client.Geo.CountryCode AS country_code,
        client.Geo.Subdivision1ISOCode AS subdivision1_iso_code,
        client.Geo.Subdivision1Name AS subdivision1_name,
        client.Network.ASNumber AS asn,
        client.Network.ASName AS as_name,
        COUNT(*) AS sample_count,
        -- APPROX_QUANTILES(x, 100) returns a 101-element array where
        -- OFFSET(k) ≈ the k-th percentile (OFFSET(0) = min, OFFSET(100) = max).
        APPROX_QUANTILES(a.MeanThroughputMbps, 100) AS download_q,
        APPROX_QUANTILES(a.MinRTT, 100) AS latency_q,
        APPROX_QUANTILES(a.LossRate, 100) AS loss_q
    FROM
        -- TODO(bassosimone): current unified_downloads/unified_uploads tables
        -- lack BYOS support. We'll eventually need to switch to better tables.
        `measurement-lab.ndt.unified_downloads`
    WHERE
        -- Half-open date range; {START_DATE}/{END_DATE} are substituted by
        -- the caller before the query is issued.
        date >= "{START_DATE}" AND date < "{END_DATE}"
        -- Drop rows with incomplete geo/network annotations or metrics so
        -- every group key and every quantile input is well defined.
        AND client.Geo.CountryCode IS NOT NULL
        AND client.Geo.Subdivision1ISOCode IS NOT NULL
        AND client.Geo.Subdivision1Name IS NOT NULL
        AND client.Network.ASNumber IS NOT NULL
        AND client.Network.ASName IS NOT NULL
        AND a.MeanThroughputMbps IS NOT NULL
        AND a.MinRTT IS NOT NULL
        AND a.LossRate IS NOT NULL
    GROUP BY country_code, subdivision1_iso_code, subdivision1_name, asn, as_name
)
SELECT
    country_code,
    subdivision1_iso_code,
    subdivision1_name,
    asn,
    as_name,
    sample_count,

    -- Download throughput (higher is better - NO INVERSION)
    -- Standard percentile labels matching statistical definition
    download_q[OFFSET(1)] AS download_p1,
    download_q[OFFSET(5)] AS download_p5,
    download_q[OFFSET(10)] AS download_p10,
    download_q[OFFSET(25)] AS download_p25,
    download_q[OFFSET(50)] AS download_p50,
    download_q[OFFSET(75)] AS download_p75,
    download_q[OFFSET(90)] AS download_p90,
    download_q[OFFSET(95)] AS download_p95,
    download_q[OFFSET(99)] AS download_p99,

    -- Latency/MinRTT (lower is better - INVERTED LABELS!)
    -- ⚠️ OFFSET(99) = worst latency = top 1% worst users → labeled as p1
    -- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
    latency_q[OFFSET(99)] AS latency_p1,
    latency_q[OFFSET(95)] AS latency_p5,
    latency_q[OFFSET(90)] AS latency_p10,
    latency_q[OFFSET(75)] AS latency_p25,
    latency_q[OFFSET(50)] AS latency_p50,
    latency_q[OFFSET(25)] AS latency_p75,
    latency_q[OFFSET(10)] AS latency_p90,
    latency_q[OFFSET(5)] AS latency_p95,
    latency_q[OFFSET(1)] AS latency_p99,

    -- Packet Loss Rate (lower is better - INVERTED LABELS!)
    -- ⚠️ OFFSET(99) = worst loss = top 1% worst users → labeled as p1
    -- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
    loss_q[OFFSET(99)] AS loss_p1,
    loss_q[OFFSET(95)] AS loss_p5,
    loss_q[OFFSET(90)] AS loss_p10,
    loss_q[OFFSET(75)] AS loss_p25,
    loss_q[OFFSET(50)] AS loss_p50,
    loss_q[OFFSET(25)] AS loss_p75,
    loss_q[OFFSET(10)] AS loss_p90,
    loss_q[OFFSET(5)] AS loss_p95,
    loss_q[OFFSET(1)] AS loss_p99
FROM per_network_quantiles
ORDER BY country_code, subdivision1_iso_code, subdivision1_name, asn, as_name
0 commit comments