Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions data/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,42 @@ def generate_for_period(
f"Stage 1b: Querying upload metrics for {period_str}",
)

# Stage 1c: Query downloads by country/city/ASN.
# run_query.py executes the BigQuery template, caches the result as
# parquet under the cache dir, and converts the parquet to JSON at the
# -o path given below.
# NOTE(review): the original comment here said "parquet cache only, no
# JSON output yet", but -o points at a .json file inside cache_dir —
# confirm whether that JSON artifact is meant to be consumed downstream
# or is only an incidental side effect of run_query.py.
run_command(
    [
        "python3",
        str(data_dir / "run_query.py"),
        "downloads_by_country_city_asn",
        "--start-date",
        start_date,
        "--end-date",
        end_date,
        "-o",
        str(cache_dir / "downloads_by_country_city_asn.json"),
    ],
    f"Stage 1c: Querying download metrics by country/city/ASN for {period_str}",
)

# Stage 1d: Query uploads by country/city/ASN.
# Same caching behavior as Stage 1c: parquet cache plus a JSON file
# written into cache_dir (not the final output directory).
run_command(
    [
        "python3",
        str(data_dir / "run_query.py"),
        "uploads_by_country_city_asn",
        "--start-date",
        start_date,
        "--end-date",
        end_date,
        "-o",
        str(cache_dir / "uploads_by_country_city_asn.json"),
    ],
    f"Stage 1d: Querying upload metrics by country/city/ASN for {period_str}",
)

# Stage 2: Merge data
# Creates: {country}_{period_str}.json (e.g., us_2024_10.json)
# Note: This only merges downloads_by_country and uploads_by_country
# The country/city/ASN queries are cached as parquet only for now
run_command(
[
"python3",
Expand Down
20 changes: 9 additions & 11 deletions data/run_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,27 +89,25 @@ def run_bq_query(
# Data directory is ./iqb/data (where this script lives)
data_dir = Path(__file__).parent

# Step 1: Execute query and save results using IQBPipeline
# Step 1: Get or create cache entry
# This creates: ./iqb/data/cache/v1/{start}/{end}/{query_name}/
# - data.parquet: query results (empty file if no results)
# - stats.json: query metadata
# fetch_if_missing=True makes this idempotent: skips query if cache exists
pipeline = IQBPipeline(project_id=project_id, data_dir=data_dir)
result = pipeline.execute_query_template(
entry = pipeline.get_cache_entry(
template=query_name,
start_date=start_date,
end_date=end_date,
fetch_if_missing=True,
)
info = result.save_parquet()
print(f"✓ Cache entry: {entry.data_path.parent.name}", file=sys.stderr)
print(f" Data: {entry.data_path}", file=sys.stderr)
print(f" Stats: {entry.stats_path}", file=sys.stderr)

print(f"✓ Parquet saved: {info.file_path}", file=sys.stderr)

# Save query statistics (timing, bytes processed, template hash)
stats_path = result.save_stats()
print(f"✓ Stats saved: {stats_path}", file=sys.stderr)

# Step 2: Convert parquet to JSON
# Step 2: Convert the parquet file to JSON
print("Converting parquet to JSON...", file=sys.stderr)
table = pq.read_table(info.file_path)
table = pq.read_table(entry.data_path)
records = table.to_pylist()

# Check if query returned no results
Expand Down
2 changes: 2 additions & 0 deletions library/src/iqb/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@
# Query template names accepted by the pipeline. Each entry corresponds
# to a same-named .sql file under library/src/iqb/queries/ (presumably
# the lookup key used when loading the template — verify against the
# template-loading code).
VALID_TEMPLATE_NAMES: Final[set[str]] = {
    "downloads_by_country",
    "uploads_by_country",
    "downloads_by_country_city_asn",
    "uploads_by_country_city_asn",
}

# Cache file names
Expand Down
81 changes: 81 additions & 0 deletions library/src/iqb/queries/downloads_by_country_city_asn.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
-- Download quality metrics aggregated per (country, city, ASN) group.
--
-- The {START_DATE} / {END_DATE} placeholders are substituted before
-- execution (template-style formatting; the file is not valid SQL until
-- they are filled in). The date filter below is a half-open interval:
-- [START_DATE, END_DATE).
--
-- APPROX_QUANTILES is an approximate aggregate: percentile values are
-- estimates, not exact order statistics.
SELECT
client.Geo.CountryCode as country_code,
client.Geo.city as city,
client.Network.ASNumber as asn,
-- Number of tests in the group; lets consumers judge how trustworthy
-- the percentile estimates are for sparse (city, ASN) combinations.
COUNT(*) as sample_count,

-- ============================================================================
-- PERCENTILE LABELING CONVENTION FOR IQB QUALITY ASSESSMENT
-- ============================================================================
--
-- For "higher is better" metrics (throughput):
-- - Raw p95 = "95% of users have ≤ X Mbit/s"
-- - Label: OFFSET(95) → download_p95 (standard statistical definition)
-- - Interpretation: top ~5% of users have > p95 throughput
--
-- For "lower is better" metrics (latency, packet loss):
-- - Raw p95 = "95% of users have ≤ X ms latency" (worst-case typical)
-- - We want p95 to represent best-case typical (to match throughput semantics)
-- - Solution: Invert labels - use raw p5 labeled as p95
-- - Label: OFFSET(5) → latency_p95 (inverted!)
-- - Interpretation: top ~5% of users (best latency) have < p95
--
-- Result: Uniform comparison logic where p95 always means "typical best
-- performance" rather than "typical worst performance"
--
-- NOTE: This creates semantics where checking p95 thresholds asks
-- "Can the top ~5% of users perform this use case?" - empirical validation
-- against real data will determine if this interpretation is appropriate.
-- ============================================================================

-- Download throughput (higher is better - NO INVERSION)
-- Standard percentile labels matching statistical definition
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(1)] as download_p1,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(5)] as download_p5,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(10)] as download_p10,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(25)] as download_p25,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(50)] as download_p50,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(75)] as download_p75,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(90)] as download_p90,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(95)] as download_p95,
APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(99)] as download_p99,

-- Latency/MinRTT (lower is better - INVERTED LABELS!)
-- Labels map pX ↔ OFFSET(100-X), e.g. OFFSET(99) ↔ p1, OFFSET(5) ↔ p95.
-- ⚠️ OFFSET(99) = worst latency = top 1% worst users → labeled as p1
-- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(99)] as latency_p1,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(95)] as latency_p5,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(90)] as latency_p10,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(75)] as latency_p25,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(50)] as latency_p50,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(25)] as latency_p75,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(10)] as latency_p90,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(5)] as latency_p95,
APPROX_QUANTILES(a.MinRTT, 100)[OFFSET(1)] as latency_p99,

-- Packet Loss Rate (lower is better - INVERTED LABELS!)
-- Same pX ↔ OFFSET(100-X) inversion as MinRTT above.
-- ⚠️ OFFSET(99) = worst loss = top 1% worst users → labeled as p1
-- ⚠️ OFFSET(5) = 5th percentile = best ~5% of users → labeled as p95
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(99)] as loss_p1,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(95)] as loss_p5,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(90)] as loss_p10,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(75)] as loss_p25,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(50)] as loss_p50,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(25)] as loss_p75,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(10)] as loss_p90,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(5)] as loss_p95,
APPROX_QUANTILES(a.LossRate, 100)[OFFSET(1)] as loss_p99
FROM
-- TODO(bassosimone): switch to union tables `measurement-lab.ndt.ndt7_union`
-- when they have been blessed as the new stable tables.
`measurement-lab.ndt.unified_downloads`
WHERE
-- Half-open date interval: includes START_DATE, excludes END_DATE.
date >= "{START_DATE}" AND date < "{END_DATE}"
-- Exclude rows missing any grouping key or any measured metric, so each
-- group's percentiles are computed over the same row set.
AND client.Geo.CountryCode IS NOT NULL
AND client.Geo.city IS NOT NULL
AND client.Network.ASNumber IS NOT NULL
AND a.MeanThroughputMbps IS NOT NULL
AND a.MinRTT IS NOT NULL
AND a.LossRate IS NOT NULL
GROUP BY country_code, city, asn
ORDER BY country_code, city, asn
2 changes: 1 addition & 1 deletion library/src/iqb/queries/uploads_by_country.sql
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ SELECT
-- Upload throughput is "higher is better", so we use standard percentile
-- labels (no inversion).
--
-- See query_downloads.sql for detailed explanation and rationale.
-- See downloads_by_country.sql for detailed explanation and rationale.
-- ============================================================================

-- Upload throughput (higher is better - NO INVERSION)
Expand Down
39 changes: 39 additions & 0 deletions library/src/iqb/queries/uploads_by_country_city_asn.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- Upload throughput percentiles per (country, city, ASN) group.
--
-- Upload throughput is a "higher is better" metric, so the percentile
-- labels follow the standard statistical convention — no label
-- inversion. See downloads_by_country.sql for the detailed rationale
-- behind the IQB percentile labeling convention.
--
-- {START_DATE} / {END_DATE} are template placeholders substituted
-- before the query is executed.
SELECT
  client.Geo.CountryCode AS country_code,
  client.Geo.city AS city,
  client.Network.ASNumber AS asn,
  -- Tests per group, so consumers can discount sparse groups.
  COUNT(*) AS sample_count,

  -- Upload throughput percentiles (standard, non-inverted labels).
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(1)] AS upload_p1,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(5)] AS upload_p5,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(10)] AS upload_p10,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(25)] AS upload_p25,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(50)] AS upload_p50,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(75)] AS upload_p75,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(90)] AS upload_p90,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(95)] AS upload_p95,
  APPROX_QUANTILES(a.MeanThroughputMbps, 100)[OFFSET(99)] AS upload_p99
FROM
  -- TODO(bassosimone): switch to union tables `measurement-lab.ndt.ndt7_union`
  -- when they have been blessed as the new stable tables.
  `measurement-lab.ndt.unified_uploads`
WHERE
  -- Half-open date interval: [START_DATE, END_DATE).
  date >= "{START_DATE}"
  AND date < "{END_DATE}"
  -- Drop rows missing a grouping key or the throughput measurement.
  AND client.Geo.CountryCode IS NOT NULL
  AND client.Geo.city IS NOT NULL
  AND client.Network.ASNumber IS NOT NULL
  AND a.MeanThroughputMbps IS NOT NULL
GROUP BY
  country_code,
  city,
  asn
ORDER BY
  country_code,
  city,
  asn
Loading