Skip to content

Commit b47324a

Browse files
authored
feat: add country-city-asn queries (#42)
This diff adds and uses queries for country, city, and ASN. We now generate more fine-grained data.
1 parent 466e4f0 commit b47324a

File tree

6 files changed

+166
-12
lines changed

6 files changed

+166
-12
lines changed

data/generate_data.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,42 @@ def generate_for_period(
7878
f"Stage 1b: Querying upload metrics for {period_str}",
7979
)
8080

81+
# Stage 1c: Query downloads by country/city/ASN (parquet cache only, no JSON output yet)
82+
run_command(
83+
[
84+
"python3",
85+
str(data_dir / "run_query.py"),
86+
"downloads_by_country_city_asn",
87+
"--start-date",
88+
start_date,
89+
"--end-date",
90+
end_date,
91+
"-o",
92+
str(cache_dir / "downloads_by_country_city_asn.json"),
93+
],
94+
f"Stage 1c: Querying download metrics by country/city/ASN for {period_str}",
95+
)
96+
97+
# Stage 1d: Query uploads by country/city/ASN (parquet cache only, no JSON output yet)
98+
run_command(
99+
[
100+
"python3",
101+
str(data_dir / "run_query.py"),
102+
"uploads_by_country_city_asn",
103+
"--start-date",
104+
start_date,
105+
"--end-date",
106+
end_date,
107+
"-o",
108+
str(cache_dir / "uploads_by_country_city_asn.json"),
109+
],
110+
f"Stage 1d: Querying upload metrics by country/city/ASN for {period_str}",
111+
)
112+
81113
# Stage 2: Merge data
82114
# Creates: {country}_{period_str}.json (e.g., us_2024_10.json)
115+
# Note: This only merges downloads_by_country and uploads_by_country
116+
# The country/city/ASN queries are cached as parquet only for now
83117
run_command(
84118
[
85119
"python3",

data/run_query.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -89,27 +89,25 @@ def run_bq_query(
8989
# Data directory is ./iqb/data (where this script lives)
9090
data_dir = Path(__file__).parent
9191

92-
# Step 1: Execute query and save results using IQBPipeline
92+
# Step 1: Get or create cache entry
9393
# This creates: ./iqb/data/cache/v1/{start}/{end}/{query_name}/
9494
# - data.parquet: query results (empty file if no results)
9595
# - stats.json: query metadata
96+
# fetch_if_missing=True makes this idempotent: skips query if cache exists
9697
pipeline = IQBPipeline(project_id=project_id, data_dir=data_dir)
97-
result = pipeline.execute_query_template(
98+
entry = pipeline.get_cache_entry(
9899
template=query_name,
99100
start_date=start_date,
100101
end_date=end_date,
102+
fetch_if_missing=True,
101103
)
102-
info = result.save_parquet()
104+
print(f"✓ Cache entry: {entry.data_path.parent.name}", file=sys.stderr)
105+
print(f" Data: {entry.data_path}", file=sys.stderr)
106+
print(f" Stats: {entry.stats_path}", file=sys.stderr)
103107

104-
print(f"✓ Parquet saved: {info.file_path}", file=sys.stderr)
105-
106-
# Save query statistics (timing, bytes processed, template hash)
107-
stats_path = result.save_stats()
108-
print(f"✓ Stats saved: {stats_path}", file=sys.stderr)
109-
110-
# Step 2: Convert parquet to JSON
108+
# Step 2: Convert the parquet file to JSON
111109
print("Converting parquet to JSON...", file=sys.stderr)
112-
table = pq.read_table(info.file_path)
110+
table = pq.read_table(entry.data_path)
113111
records = table.to_pylist()
114112

115113
# Check if query returned no results

library/src/iqb/pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
VALID_TEMPLATE_NAMES: Final[set[str]] = {
9494
"downloads_by_country",
9595
"uploads_by_country",
96+
"downloads_by_country_city_asn",
97+
"uploads_by_country_city_asn",
9698
}
9799

98100
# Cache file names
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
SELECT
2+
client.Geo.CountryCode as country_code,
3+
client.Geo.city as city,
4+
client.Network.ASNumber as asn,
5+
COUNT(*) as sample_count,
6+
7+
-- ============================================================================
8+
-- PERCENTILE LABELING CONVENTION FOR IQB QUALITY ASSESSMENT
9+
-- ============================================================================
10+
--
11+
-- For "higher is better" metrics (throughput):
12+
-- - Raw p95 = "95% of users have ≤ X Mbit/s"
13+
-- - Label: OFFSET(95) → download_p95 (standard statistical definition)
14+
-- - Interpretation: top ~5% of users have > p95 throughput
15+
--
16+
-- For "lower is better" metrics (latency, packet loss):
17+
-- - Raw p95 = "95% of users have ≤ X ms latency" (worst-case typical)
18+
-- - We want p95 to represent best-case typical (to match throughput semantics)
19+
-- - Solution: Invert labels - use raw p5 labeled as p95
20+
-- - Label: OFFSET(5) → latency_p95 (inverted!)
21+
-- - Interpretation: top ~5% of users (best latency) have < p95
22+
--
23+
-- Result: Uniform comparison logic where p95 always means "typical best
24+
-- performance" rather than "typical worst performance"
25+
--
26+
-- NOTE: This creates semantics where checking p95 thresholds asks
27+
-- "Can the top ~5% of users perform this use case?" - empirical validation
28+
-- against real data will determine if this interpretation is appropriate.
29+
-- ============================================================================
30+
31+
-- Download throughput (higher is better - NO INVERSION)
32+
-- Standard percentile labels matching statistical definition
33+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(1)] as download_p1,
34+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(5)] as download_p5,
35+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(10)] as download_p10,
36+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(25)] as download_p25,
37+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(50)] as download_p50,
38+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(75)] as download_p75,
39+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(90)] as download_p90,
40+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(95)] as download_p95,
41+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(99)] as download_p99,
42+
43+
-- Latency/MinRTT (lower is better - INVERTED LABELS!)
44+
-- ⚠️ OFFSET(99) = worst latency = top 1% worst users → labeled as p1
45+
-- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
46+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(99)] as latency_p1,
47+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(95)] as latency_p5,
48+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(90)] as latency_p10,
49+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(75)] as latency_p25,
50+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(50)] as latency_p50,
51+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(25)] as latency_p75,
52+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(10)] as latency_p90,
53+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(5)] as latency_p95,
54+
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(1)] as latency_p99,
55+
56+
-- Packet Loss Rate (lower is better - INVERTED LABELS!)
57+
-- ⚠️ OFFSET(99) = worst loss = top 1% worst users → labeled as p1
58+
-- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
59+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(99)] as loss_p1,
60+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(95)] as loss_p5,
61+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(90)] as loss_p10,
62+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(75)] as loss_p25,
63+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(50)] as loss_p50,
64+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(25)] as loss_p75,
65+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(10)] as loss_p90,
66+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(5)] as loss_p95,
67+
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(1)] as loss_p99
68+
FROM
69+
-- TODO(bassosimone): switch to union tables `measurement-lab.ndt.ndt7_union`
70+
-- when they have been blessed as the new stable tables.
71+
`measurement-lab.ndt.unified_downloads`
72+
WHERE
73+
date >= "{START_DATE}" AND date < "{END_DATE}"
74+
AND client.Geo.CountryCode IS NOT NULL
75+
AND client.Geo.city IS NOT NULL
76+
AND client.Network.ASNumber IS NOT NULL
77+
AND a.MeanThroughputMbps IS NOT NULL
78+
AND a.MinRTT IS NOT NULL
79+
AND a.LossRate IS NOT NULL
80+
GROUP BY country_code, city, asn
81+
ORDER BY country_code, city, asn

library/src/iqb/queries/uploads_by_country.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ SELECT
99
-- Upload throughput is "higher is better", so we use standard percentile
1010
-- labels (no inversion).
1111
--
12-
-- See query_downloads.sql for detailed explanation and rationale.
12+
-- See downloads_by_country.sql for detailed explanation and rationale.
1313
-- ============================================================================
1414

1515
-- Upload throughput (higher is better - NO INVERSION)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
SELECT
2+
client.Geo.CountryCode as country_code,
3+
client.Geo.city as city,
4+
client.Network.ASNumber as asn,
5+
COUNT(*) as sample_count,
6+
7+
-- ============================================================================
8+
-- PERCENTILE LABELING CONVENTION FOR IQB QUALITY ASSESSMENT
9+
-- ============================================================================
10+
--
11+
-- Upload throughput is "higher is better", so we use standard percentile
12+
-- labels (no inversion).
13+
--
14+
-- See downloads_by_country.sql for detailed explanation and rationale.
15+
-- ============================================================================
16+
17+
-- Upload throughput (higher is better - NO INVERSION)
18+
-- Standard percentile labels matching statistical definition
19+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(1)] as upload_p1,
20+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(5)] as upload_p5,
21+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(10)] as upload_p10,
22+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(25)] as upload_p25,
23+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(50)] as upload_p50,
24+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(75)] as upload_p75,
25+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(90)] as upload_p90,
26+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(95)] as upload_p95,
27+
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(99)] as upload_p99
28+
FROM
29+
-- TODO(bassosimone): switch to union tables `measurement-lab.ndt.ndt7_union`
30+
-- when they have been blessed as the new stable tables.
31+
`measurement-lab.ndt.unified_uploads`
32+
WHERE
33+
date >= "{START_DATE}" AND date < "{END_DATE}"
34+
AND client.Geo.CountryCode IS NOT NULL
35+
AND client.Geo.city IS NOT NULL
36+
AND client.Network.ASNumber IS NOT NULL
37+
AND a.MeanThroughputMbps IS NOT NULL
38+
GROUP BY country_code, city, asn
39+
ORDER BY country_code, city, asn

0 commit comments

Comments
 (0)