diff --git a/sql/2025/third-parties/a11y_overall_tech_usage_by_rank.sql b/sql/2025/third-parties/a11y_overall_tech_usage_by_rank.sql
new file mode 100644
index 00000000000..f3882c6ad08
--- /dev/null
+++ b/sql/2025/third-parties/a11y_overall_tech_usage_by_rank.sql
@@ -0,0 +1,61 @@
+#standardSQL
+# Overall A11Y technology usage by domain rank
+
+WITH a11y_technologies AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    url
+  FROM
+    `httparchive.technologies.2025_06_01_*`
+  WHERE
+    category = 'Accessibility'
+),
+
+pages AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    url,
+    rank_grouping
+  FROM
+    `httparchive.summary_pages.2025_06_01_*`,
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    rank <= rank_grouping
+),
+
+rank_totals AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    rank_grouping,
+    COUNT(0) AS total
+  FROM
+    `httparchive.summary_pages.2025_06_01_*`,
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    rank <= rank_grouping
+  GROUP BY
+    client,
+    rank_grouping
+)
+
+SELECT
+  client,
+  rank_grouping AS rank,
+  COUNT(DISTINCT url) AS freq,
+  total,
+  (COUNT(DISTINCT url) / total) * 100 AS pct
+FROM
+  a11y_technologies
+LEFT OUTER JOIN
+  pages
+USING (client, url)
+JOIN
+  rank_totals
+USING (client, rank_grouping)
+GROUP BY
+  rank_grouping,
+  total,
+  client
+ORDER BY
+  client,
+  rank
diff --git a/sql/2025/third-parties/a11y_technology_usage.sql b/sql/2025/third-parties/a11y_technology_usage.sql
new file mode 100644
index 00000000000..988478c30bb
--- /dev/null
+++ b/sql/2025/third-parties/a11y_technology_usage.sql
@@ -0,0 +1,35 @@
+#standardSQL
+# A11Y technology usage
+
+WITH a11y_technologies AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    COUNT(DISTINCT url) AS freq
+  FROM
+    `httparchive.technologies.2025_06_01_*`
+  WHERE
+    category = 'Accessibility'
+  GROUP BY
+    client
+),
+
+pages AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    COUNT(0) AS total
+  FROM
+    `httparchive.summary_pages.2025_06_01_*`
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  freq,
+  total,
+  (freq / total) * 100 AS pct
+FROM
+  a11y_technologies
+JOIN
+  pages
+USING (client)
diff --git a/sql/2025/third-parties/a11y_technology_usage_by_rank.sql b/sql/2025/third-parties/a11y_technology_usage_by_rank.sql
new file mode 100644
index 00000000000..cecc18fd76f
--- /dev/null
+++ b/sql/2025/third-parties/a11y_technology_usage_by_rank.sql
@@ -0,0 +1,65 @@
+#standardSQL
+# A11Y technology usage by domain rank
+
+WITH a11y_technologies AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    app,
+    url
+  FROM
+    `httparchive.technologies.2025_06_01_*`
+  WHERE
+    category = 'Accessibility'
+),
+
+pages AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    url,
+    rank_grouping
+  FROM
+    `httparchive.summary_pages.2025_06_01_*`,
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    rank <= rank_grouping
+),
+
+rank_totals AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    rank_grouping,
+    COUNT(0) AS total
+  FROM
+    `httparchive.summary_pages.2025_06_01_*`,
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    rank <= rank_grouping
+  GROUP BY
+    client,
+    rank_grouping
+)
+
+SELECT
+  client,
+  rank_grouping AS rank,
+  app,
+  COUNT(0) AS freq,
+  total,
+  (COUNT(0) / total) * 100 AS pct
+FROM
+  a11y_technologies
+LEFT OUTER JOIN
+  pages
+USING (client, url)
+JOIN
+  rank_totals
+USING (client, rank_grouping)
+GROUP BY
+  rank_grouping,
+  total,
+  client,
+  app
+ORDER BY
+  client,
+  rank,
+  pct DESC
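The two *_by_rank queries above share the same bucketing idiom: cross-joining each page against an inline array of rank cutoffs fans the row out into every bucket whose cutoff it meets, so each bucket is a cumulative "top N". A minimal standalone sketch of the idiom, runnable in BigQuery as-is with hypothetical rank values (not part of the patch):

    SELECT rank, rank_grouping
    FROM
      UNNEST([500, 5000, 50000]) AS rank,
      UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
    WHERE rank <= rank_grouping
    ORDER BY rank, rank_grouping;

A page with rank 500 lands in all five buckets; a page with rank 50000 only in the 100K bucket and above.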
diff --git a/sql/2025/third-parties/compressed_images_by_3p.sql b/sql/2025/third-parties/compressed_images_by_3p.sql
new file mode 100644
index 00000000000..36f36b9e20a
--- /dev/null
+++ b/sql/2025/third-parties/compressed_images_by_3p.sql
@@ -0,0 +1,81 @@
+#standardSQL
+# Compressed images (excluding SVG) by third parties
+
+WITH requests AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    pageid AS page,
+    url,
+    resp_content_encoding AS content_encoding,
+    type,
+    respBodySize AS size
+  FROM
+    `httparchive.summary_requests.2025_06_01_*`
+  WHERE
+    type = 'image' AND (
+      resp_content_encoding = 'gzip' OR
+      resp_content_encoding = 'br'
+    ) AND NOT (
+      resp_content_type LIKE 'image/svg%' OR
+      ENDS_WITH(url, '.svg')
+    )
+),
+
+third_party AS (
+  SELECT
+    NET.HOST(domain) AS domain,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-06-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain
+  HAVING
+    page_usage >= 50
+)
+
+SELECT
+  client,
+  content_encoding,
+  domain,
+  size,
+  SUM(size) OVER (PARTITION BY client) AS total_size,
+  size / SUM(size) OVER (PARTITION BY client) AS pct_size,
+  num_requests,
+  total_requests,
+  pct_requests
+FROM (
+  SELECT
+    client,
+    content_encoding,
+    domain,
+    COUNT(0) AS num_requests,
+    SUM(size) AS size,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests,
+    COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests,
+    RANK() OVER (PARTITION BY client, type, content_encoding ORDER BY COUNT(0) DESC) AS domain_rank
+  FROM
+    requests
+  LEFT JOIN
+    third_party
+  ON
+    NET.HOST(requests.url) = NET.HOST(third_party.domain)
+  WHERE
+    domain IS NOT NULL
+  GROUP BY
+    client,
+    type,
+    content_encoding,
+    domain
+)
+WHERE
+  domain_rank <= 100
+ORDER BY
+  client,
+  content_encoding,
+  size DESC
diff --git a/sql/2025/third-parties/consent_signal_basic_analysis.sql b/sql/2025/third-parties/consent_signal_basic_analysis.sql
new file mode 100644
index 00000000000..9ca789bcf11
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_basic_analysis.sql
@@ -0,0 +1,90 @@
+#standardSQL
+# Basic consent signal analysis (simplified version to ensure data returns)
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+    AND rank <= 50000 -- Expand to top 50K sites
+),
+
+-- Find requests with consent signals (no redirect filtering)
+consent_requests AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url,
+    NET.REG_DOMAIN(r.page) AS page_domain,
+    NET.REG_DOMAIN(r.url) AS url_domain,
+
+    -- Extract consent signals
+    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
+    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard,
+
+    -- Check if request has redirects
+    JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL AND
+    TO_JSON_STRING(JSON_EXTRACT(r.summary, '$.redirects')) != '[]' AS has_redirects
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-06-01'
+    AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
+    AND (
+      REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') OR
+      REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') OR
+      REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') OR
+      REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=')
+    )
+),
+
+-- Add any consent signal flag
+requests_with_signals AS (
+  SELECT
+    *,
+    (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_signal
+  FROM
+    consent_requests
+)
+
+-- Basic analysis
+SELECT
+  client,
+
+  -- Overall counts
+  COUNT(*) AS total_requests_with_consent_signals,
+  COUNT(DISTINCT page) AS total_pages_with_consent_signals,
+  COUNT(DISTINCT url_domain) AS total_domains_with_consent_signals,
+
+  -- Signal type breakdown
+  COUNTIF(has_usp_standard) AS usp_standard_requests,
+  COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
+  COUNTIF(has_tcf_standard) AS tcf_standard_requests,
+  COUNTIF(has_gpp_standard) AS gpp_standard_requests,
+
+  -- Percentage breakdown
+  COUNTIF(has_usp_standard) / COUNT(*) AS pct_usp_standard,
+  COUNTIF(has_usp_nonstandard) / COUNT(*) AS pct_usp_nonstandard,
+  COUNTIF(has_tcf_standard) / COUNT(*) AS pct_tcf_standard,
+  COUNTIF(has_gpp_standard) / COUNT(*) AS pct_gpp_standard,
+
+  -- Redirect availability
+  COUNTIF(has_redirects) AS requests_with_redirects,
+  COUNTIF(has_redirects) / COUNT(*) AS pct_requests_with_redirects
+
+FROM
+  requests_with_signals
+GROUP BY
+  client
+ORDER BY
+  client
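The classification in this query rests entirely on the four REGEXP_CONTAINS patterns, so it is worth sanity-checking how they label URLs. A standalone check against literal values (the hosts below are hypothetical, and this snippet is illustrative rather than part of the patch):

    SELECT
      url,
      REGEXP_CONTAINS(url, r'[?&]us_privacy=') AS has_usp_standard,
      REGEXP_CONTAINS(url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
      REGEXP_CONTAINS(url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard
    FROM
      UNNEST([
        'https://ads.example.com/pixel?us_privacy=1YNN',
        'https://tags.example.com/t.js?gdpr=1&gdpr_consent=CPc',
        'https://cdn.example.com/lib.js'
      ]) AS url;

The first two URLs each match exactly one signal family; the bare library URL matches none, which is what the WHERE-clause pre-filter relies on.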
diff --git a/sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql b/sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql
new file mode 100644
index 00000000000..1d6e9eca25a
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_prevalence_by_third_party_category.sql
@@ -0,0 +1,195 @@
+#standardSQL
+# Consent signal prevalence broken down by third-party category
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-06-01'
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+-- Get total requests per category and rank grouping for percentage calculations
+category_totals AS (
+  SELECT
+    r.client,
+    rank_grouping,
+    tp.category,
+    COUNT(*) AS total_category_requests,
+    COUNT(DISTINCT r.page) AS total_category_pages,
+    COUNT(DISTINCT tp.canonicalDomain) AS total_category_domains
+  FROM
+    requests r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    p.rank <= rank_grouping
+  GROUP BY
+    r.client,
+    rank_grouping,
+    tp.category
+),
+
+-- Extract consent signals from third-party requests
+consent_signals_by_category AS (
+  SELECT
+    r.client,
+    rank_grouping,
+    tp.category,
+    tp.canonicalDomain,
+    r.page,
+    r.url,
+
+    -- Single-pass consent signal detection
+    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
+    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
+    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard
+
+  FROM
+    requests r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    p.rank <= rank_grouping
+    -- Pre-filter: only process URLs that might contain consent-related parameters
+    AND REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+),
+
+-- Add computed flag for any consent signal
+signals_with_any AS (
+  SELECT
+    *,
+    (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_consent_signal
+  FROM
+    consent_signals_by_category
+),
+
+-- Aggregate consent signals by category
+category_signal_aggregates AS (
+  SELECT
+    client,
+    rank_grouping,
+    category,
+
+    -- USP Standard metrics
+    COUNTIF(has_usp_standard) AS usp_standard_requests,
+    COUNT(DISTINCT CASE WHEN has_usp_standard THEN page END) AS usp_standard_pages,
+    COUNT(DISTINCT CASE WHEN has_usp_standard THEN canonicalDomain END) AS usp_standard_domains,
+
+    -- USP Non-Standard metrics
+    COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
+    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN page END) AS usp_nonstandard_pages,
+    COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN canonicalDomain END) AS usp_nonstandard_domains,
+
+    -- TCF Standard metrics
+    COUNTIF(has_tcf_standard) AS tcf_standard_requests,
+    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN page END) AS tcf_standard_pages,
+    COUNT(DISTINCT CASE WHEN has_tcf_standard THEN canonicalDomain END) AS tcf_standard_domains,
+
+    -- GPP Standard metrics
+    COUNTIF(has_gpp_standard) AS gpp_standard_requests,
+    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN page END) AS gpp_standard_pages,
+    COUNT(DISTINCT CASE WHEN has_gpp_standard THEN canonicalDomain END) AS gpp_standard_domains,
+
+    -- Any consent signal metrics
+    COUNTIF(has_any_consent_signal) AS any_consent_requests,
+    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN page END) AS any_consent_pages,
+    COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN canonicalDomain END) AS any_consent_domains,
+
+    -- Totals for this filtered dataset
+    COUNT(0) AS total_filtered_requests
+  FROM
+    signals_with_any
+  GROUP BY
+    client,
+    rank_grouping,
+    category
+)
+
+-- Final output using UNNEST to avoid repetitive UNION ALL
+SELECT
+  agg.client,
+  agg.rank_grouping,
+  agg.category,
+  signal_data.signal_type,
+  signal_data.requests_with_signal,
+  totals.total_category_requests,
+  signal_data.requests_with_signal / totals.total_category_requests AS pct_requests_with_signal,
+  signal_data.pages_with_signal,
+  totals.total_category_pages,
+  signal_data.pages_with_signal / totals.total_category_pages AS pct_pages_with_signal,
+  signal_data.domains_with_signal,
+  totals.total_category_domains,
+  signal_data.domains_with_signal / totals.total_category_domains AS pct_domains_with_signal
+FROM
+  category_signal_aggregates agg
+JOIN
+  category_totals totals
+USING (client, rank_grouping, category)
+CROSS JOIN
+  UNNEST([
+    STRUCT('USP Standard' AS signal_type, usp_standard_requests AS requests_with_signal, usp_standard_pages AS pages_with_signal, usp_standard_domains AS domains_with_signal),
+    STRUCT('USP Non-Standard' AS signal_type, usp_nonstandard_requests AS requests_with_signal, usp_nonstandard_pages AS pages_with_signal, usp_nonstandard_domains AS domains_with_signal),
+    STRUCT('TCF Standard' AS signal_type, tcf_standard_requests AS requests_with_signal, tcf_standard_pages AS pages_with_signal, tcf_standard_domains AS domains_with_signal),
+    STRUCT('GPP Standard' AS signal_type, gpp_standard_requests AS requests_with_signal, gpp_standard_pages AS pages_with_signal, gpp_standard_domains AS domains_with_signal),
+    STRUCT('Any Consent Signal' AS signal_type, any_consent_requests AS requests_with_signal, any_consent_pages AS pages_with_signal, any_consent_domains AS domains_with_signal)
+  ]) AS signal_data
+WHERE
+  signal_data.requests_with_signal > 0 -- Only show categories with consent signals
+
+ORDER BY
+  client,
+  rank_grouping,
+  category,
+  signal_type
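The final SELECT above unpivots five sets of counters through a CROSS JOIN UNNEST over an inline STRUCT array instead of stacking five UNION ALL branches. The shape of that trick, reduced to two hypothetical metrics (illustrative only, not part of the patch):

    SELECT metric.signal_type, metric.requests_with_signal
    FROM
      (SELECT 12 AS usp_requests, 34 AS tcf_requests),
      UNNEST([
        STRUCT('USP Standard' AS signal_type, usp_requests AS requests_with_signal),
        STRUCT('TCF Standard' AS signal_type, tcf_requests AS requests_with_signal)
      ]) AS metric;

Each input row becomes one output row per STRUCT, so the wide aggregate is scanned once rather than once per signal type.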
diff --git a/sql/2025/third-parties/consent_signal_survival_rate_through_chains_optimized.sql b/sql/2025/third-parties/consent_signal_survival_rate_through_chains_optimized.sql
new file mode 100644
index 00000000000..69447ea5273
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_survival_rate_through_chains_optimized.sql
@@ -0,0 +1,214 @@
+#standardSQL
+# Optimized: Consent signal survival rate through inclusion chains (memory-efficient)
+
+CREATE TEMP FUNCTION extractConsentSignals(url STRING)
+RETURNS STRUCT<
+  has_usp_standard BOOL,
+  has_usp_nonstandard BOOL,
+  has_tcf_standard BOOL,
+  has_gpp_standard BOOL,
+  has_any_signal BOOL
+>
+LANGUAGE js AS """
+  try {
+    const signals = {
+      has_usp_standard: /[?&]us_privacy=/.test(url),
+      has_usp_nonstandard: /[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=/.test(url),
+      has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
+      has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
+    };
+
+    signals.has_any_signal = signals.has_usp_standard ||
+      signals.has_usp_nonstandard ||
+      signals.has_tcf_standard ||
+      signals.has_gpp_standard;
+
+    return signals;
+  } catch (e) {
+    return {
+      has_usp_standard: false,
+      has_usp_nonstandard: false,
+      has_tcf_standard: false,
+      has_gpp_standard: false,
+      has_any_signal: false
+    };
+  }
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+    AND rank <= 10000 -- Aggressive filtering: top 10K only
+),
+
+-- Pre-filter to only requests with consent signals or initiator info
+filtered_requests AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url,
+    NET.REG_DOMAIN(r.page) AS root_page,
+    NET.REG_DOMAIN(r.url) AS third_party,
+    NET.REG_DOMAIN(JSON_VALUE(r.payload, '$._initiator')) AS initiator_etld,
+    extractConsentSignals(r.url) AS consent_signals
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-06-01'
+    AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
+    AND (
+      -- Only process requests with consent signals OR that are part of chains
+      REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+      OR JSON_VALUE(r.payload, '$._initiator') IS NOT NULL
+    )
+),
+
+-- Simplified two-step chain analysis (avoid complex recursion)
+step_1_requests AS (
+  SELECT
+    client,
+    root_page,
+    third_party,
+    consent_signals,
+    COUNT(*) AS step1_count
+  FROM
+    filtered_requests
+  WHERE
+    initiator_etld = root_page -- Direct first-party to third-party requests
+    AND consent_signals.has_any_signal = TRUE
+  GROUP BY
+    client,
+    root_page,
+    third_party,
+    consent_signals
+),
+
+step_2_requests AS (
+  SELECT
+    fr.client,
+    s1.root_page,
+    s1.third_party AS step1_party,
+    fr.third_party AS step2_party,
+    s1.consent_signals AS step1_signals,
+    fr.consent_signals AS step2_signals,
+    COUNT(*) AS step2_count
+  FROM
+    filtered_requests fr
+  INNER JOIN
+    step_1_requests s1
+  ON
+    fr.client = s1.client
+    AND fr.root_page = s1.root_page
+    AND fr.initiator_etld = s1.third_party -- Third-party chain
+  GROUP BY
+    fr.client,
+    s1.root_page,
+    s1.third_party,
+    fr.third_party,
+    s1.consent_signals,
+    fr.consent_signals
+),
+
+-- Calculate survival stats by step
+step_1_stats AS (
+  SELECT
+    client,
+    1 AS step_number,
+
+    COUNTIF(consent_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(consent_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(consent_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(consent_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(consent_signals.has_any_signal) AS any_signal_count,
+
+    COUNT(*) AS total_requests,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    step_1_requests
+  GROUP BY
+    client
+),
+
+step_2_stats AS (
+  SELECT
+    client,
+    2 AS step_number,
+
+    COUNTIF(step2_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(step2_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(step2_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(step2_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(step2_signals.has_any_signal) AS any_signal_count,
+
+    COUNT(*) AS total_requests,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    step_2_requests
+  GROUP BY
+    client
+),
+
+-- Combine step statistics
+combined_stats AS (
+  SELECT * FROM step_1_stats
+  UNION ALL
+  SELECT * FROM step_2_stats
+),
+
+-- Get baselines for survival rate calculation
+baselines AS (
+  SELECT
+    client,
+    usp_standard_count AS usp_standard_baseline,
+    usp_nonstandard_count AS usp_nonstandard_baseline,
+    tcf_standard_count AS tcf_standard_baseline,
+    gpp_standard_count AS gpp_standard_baseline,
+    any_signal_count AS any_signal_baseline
+  FROM
+    combined_stats
+  WHERE
+    step_number = 1
+)
+
+-- Final survival rate output (simplified)
+SELECT
+  cs.client,
+  cs.step_number,
+  cs.total_requests,
+  cs.total_pages,
+
+  -- Signal counts and survival rates
+  cs.usp_standard_count,
+  SAFE_DIVIDE(cs.usp_standard_count, b.usp_standard_baseline) AS usp_standard_survival_rate,
+
+  cs.usp_nonstandard_count,
+  SAFE_DIVIDE(cs.usp_nonstandard_count, b.usp_nonstandard_baseline) AS usp_nonstandard_survival_rate,
+
+  cs.tcf_standard_count,
+  SAFE_DIVIDE(cs.tcf_standard_count, b.tcf_standard_baseline) AS tcf_standard_survival_rate,
+
+  cs.gpp_standard_count,
+  SAFE_DIVIDE(cs.gpp_standard_count, b.gpp_standard_baseline) AS gpp_standard_survival_rate,
+
+  cs.any_signal_count,
+  SAFE_DIVIDE(cs.any_signal_count, b.any_signal_baseline) AS any_signal_survival_rate
+
+FROM
+  combined_stats cs
+JOIN
+  baselines b
+USING (client)
+
+ORDER BY
+  client,
+  step_number
diff --git a/sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql
new file mode 100644
index 00000000000..342341c0325
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects.sql
@@ -0,0 +1,225 @@
+#standardSQL
+# Consent signal survival rate through HTTP redirects
+
+CREATE TEMP FUNCTION extractConsentSignals(url STRING)
+RETURNS STRUCT<
+  has_usp_standard BOOL,
+  has_usp_nonstandard BOOL,
+  has_tcf_standard BOOL,
+  has_gpp_standard BOOL,
+  has_any_signal BOOL,
+  signal_count INT64
+>
+LANGUAGE js AS """
+  try {
+    const signals = {
+      has_usp_standard: /[?&]us_privacy=/.test(url),
+      has_usp_nonstandard: /[?&](ccpa|usp_consent|uspString|sst\\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\\.us_privacy|cnsnt|ccpaconsent|usp_string)=/.test(url),
+      has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
+      has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
+    };
+
+    signals.signal_count = [
+      signals.has_usp_standard,
+      signals.has_usp_nonstandard,
+      signals.has_tcf_standard,
+      signals.has_gpp_standard
+    ].filter(Boolean).length;
+
+    signals.has_any_signal = signals.signal_count > 0;
+
+    return signals;
+  } catch (e) {
+    return {
+      has_usp_standard: false,
+      has_usp_nonstandard: false,
+      has_tcf_standard: false,
+      has_gpp_standard: false,
+      has_any_signal: false,
+      signal_count: 0
+    };
+  }
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+-- Get redirect chains from crawl.requests summary column
+redirect_chains AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url AS final_url,
+    JSON_EXTRACT(r.summary, '$.redirects') AS redirects,
+    JSON_EXTRACT_SCALAR(r.summary, '$.startedDateTime') AS startedDateTime,
+    JSON_EXTRACT_SCALAR(r.summary, '$.endedDateTime') AS endedDateTime,
+    NET.REG_DOMAIN(r.url) AS final_domain
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-06-01'
+    AND JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL
+    AND JSON_EXTRACT(r.summary, '$.redirects') != '[]'
+    -- AND p.rank <= 100000 -- Limit to top 100K sites
+    AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
+),
+
+-- Parse redirect chains and extract consent signals at each step
+parsed_redirects AS (
+  SELECT
+    client,
+    page,
+    final_url,
+    final_domain,
+    redirect_step.url AS step_url,
+    redirect_step.redirectURL AS next_url,
+    ROW_NUMBER() OVER (
+      PARTITION BY client, page, final_url
+      ORDER BY COALESCE(redirect_step.startedDateTime, redirect_chains.startedDateTime)
+    ) AS redirect_step_number,
+    extractConsentSignals(redirect_step.url) AS step_signals,
+    extractConsentSignals(COALESCE(redirect_step.redirectURL, final_url)) AS next_signals
+  FROM
+    redirect_chains,
+    UNNEST(JSON_EXTRACT_ARRAY(redirects)) AS redirect_step_json,
+    UNNEST([STRUCT(
+      JSON_EXTRACT_SCALAR(redirect_step_json, '$.url') AS url,
+      JSON_EXTRACT_SCALAR(redirect_step_json, '$.redirectURL') AS redirectURL,
+      JSON_EXTRACT_SCALAR(redirect_step_json, '$.startedDateTime') AS startedDateTime
+    )]) AS redirect_step
+  WHERE
+    redirect_step.url IS NOT NULL
+),
+
+-- Add final URL as the last step
+redirect_chains_with_final AS (
+  -- Intermediate redirect steps
+  SELECT
+    client,
+    page,
+    final_url,
+    final_domain,
+    redirect_step_number,
+    step_url AS url_at_step,
+    step_signals AS signals_at_step,
+    'redirect' AS step_type
+  FROM
+    parsed_redirects
+
+  UNION ALL
+
+  -- Final destination URL
+  SELECT
+    client,
+    page,
+    final_url,
+    final_domain,
+    MAX(redirect_step_number) + 1 AS redirect_step_number,
+    final_url AS url_at_step,
+    extractConsentSignals(final_url) AS signals_at_step,
+    'final' AS step_type
+  FROM
+    parsed_redirects
+  GROUP BY
+    client,
+    page,
+    final_url,
+    final_domain
+),
+
+-- Calculate signal preservation statistics by step
+step_survival_stats AS (
+  SELECT
+    client,
+    redirect_step_number,
+    step_type,
+
+    -- Signals present at this step
+    COUNTIF(signals_at_step.has_usp_standard) AS usp_standard_at_step,
+    COUNTIF(signals_at_step.has_usp_nonstandard) AS usp_nonstandard_at_step,
+    COUNTIF(signals_at_step.has_tcf_standard) AS tcf_standard_at_step,
+    COUNTIF(signals_at_step.has_gpp_standard) AS gpp_standard_at_step,
+    COUNTIF(signals_at_step.has_any_signal) AS any_signal_at_step,
+
+    -- Average signal count per URL
+    AVG(signals_at_step.signal_count) AS avg_signal_count_at_step,
+
+    COUNT(*) AS total_urls_at_step,
+    COUNT(DISTINCT page) AS total_pages_at_step
+  FROM
+    redirect_chains_with_final
+  WHERE
+    redirect_step_number <= 6 -- Limit to first 6 steps (most redirects are short)
+  GROUP BY
+    client,
+    redirect_step_number,
+    step_type
+),
+
+-- Get baseline (first step) for survival rate calculation
+baseline_stats AS (
+  SELECT
+    client,
+    usp_standard_at_step AS usp_standard_baseline,
+    usp_nonstandard_at_step AS usp_nonstandard_baseline,
+    tcf_standard_at_step AS tcf_standard_baseline,
+    gpp_standard_at_step AS gpp_standard_baseline,
+    any_signal_at_step AS any_signal_baseline,
+    avg_signal_count_at_step AS avg_signal_count_baseline
+  FROM
+    step_survival_stats
+  WHERE
+    redirect_step_number = 1
+)
+
+-- Final output with survival rates
+SELECT
+  ss.client,
+  ss.redirect_step_number,
+  ss.step_type,
+  ss.total_urls_at_step,
+  ss.total_pages_at_step,
+
+  -- Signal counts and survival rates
+  ss.usp_standard_at_step,
+  SAFE_DIVIDE(ss.usp_standard_at_step, bs.usp_standard_baseline) AS usp_standard_survival_rate,
+
+  ss.usp_nonstandard_at_step,
+  SAFE_DIVIDE(ss.usp_nonstandard_at_step, bs.usp_nonstandard_baseline) AS usp_nonstandard_survival_rate,
+
+  ss.tcf_standard_at_step,
+  SAFE_DIVIDE(ss.tcf_standard_at_step, bs.tcf_standard_baseline) AS tcf_standard_survival_rate,
+
+  ss.gpp_standard_at_step,
+  SAFE_DIVIDE(ss.gpp_standard_at_step, bs.gpp_standard_baseline) AS gpp_standard_survival_rate,
+
+  ss.any_signal_at_step,
+  SAFE_DIVIDE(ss.any_signal_at_step, bs.any_signal_baseline) AS any_signal_survival_rate,
+
+  -- Average signal degradation
+  ss.avg_signal_count_at_step,
+  bs.avg_signal_count_baseline,
+  ss.avg_signal_count_at_step - bs.avg_signal_count_baseline AS signal_count_change,
+  SAFE_DIVIDE(ss.avg_signal_count_at_step, bs.avg_signal_count_baseline) AS signal_count_retention_rate
+
+FROM
+  step_survival_stats ss
+JOIN
+  baseline_stats bs
+USING (client)
+
+ORDER BY
+  client,
+  redirect_step_number
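Across all of these survival-rate queries the rate is a plain ratio against the step-1 baseline, with SAFE_DIVIDE absorbing signal types that never appear at step 1. Reduced to hypothetical literal counts (illustrative, not part of the patch):

    SELECT
      SAFE_DIVIDE(40, 100) AS any_signal_survival_rate,  -- 0.4: 40 of 100 baseline signals survive
      SAFE_DIVIDE(7, 0) AS never_seen_at_baseline;       -- NULL rather than a division-by-zero error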
diff --git a/sql/2025/third-parties/consent_signal_survival_rate_through_redirects_optimized.sql b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects_optimized.sql
new file mode 100644
index 00000000000..304fe587ac3
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects_optimized.sql
@@ -0,0 +1,225 @@
+#standardSQL
+# Optimized: Consent signal survival rate through HTTP redirects (memory-efficient)
+
+CREATE TEMP FUNCTION extractConsentSignals(url STRING)
+RETURNS STRUCT<
+  has_usp_standard BOOL,
+  has_usp_nonstandard BOOL,
+  has_tcf_standard BOOL,
+  has_gpp_standard BOOL,
+  has_any_signal BOOL,
+  signal_count INT64
+>
+LANGUAGE js AS """
+  try {
+    if (!url || typeof url !== 'string') return {
+      has_usp_standard: false, has_usp_nonstandard: false,
+      has_tcf_standard: false, has_gpp_standard: false,
+      has_any_signal: false, signal_count: 0
+    };
+
+    const signals = {
+      has_usp_standard: /[?&]us_privacy=/.test(url),
+      has_usp_nonstandard: /[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=/.test(url),
+      has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
+      has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
+    };
+
+    signals.signal_count = [
+      signals.has_usp_standard, signals.has_usp_nonstandard,
+      signals.has_tcf_standard, signals.has_gpp_standard
+    ].filter(Boolean).length;
+
+    signals.has_any_signal = signals.signal_count > 0;
+    return signals;
+  } catch (e) {
+    return {
+      has_usp_standard: false, has_usp_nonstandard: false,
+      has_tcf_standard: false, has_gpp_standard: false,
+      has_any_signal: false, signal_count: 0
+    };
+  }
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+    AND rank <= 100000 -- Expanded to top 100K sites
+),
+
+-- Pre-filter requests with redirects and potential consent signals
+requests_with_redirects AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url AS final_url,
+    JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') AS redirect_url,
+    NET.REG_DOMAIN(r.url) AS final_domain
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-06-01'
+    AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
+    AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') IS NOT NULL
+    AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') != ''
+    AND (
+      -- Pre-filter: only URLs with consent signals in final URL or redirect URL
+      REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+      OR REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl'), r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+    )
+),
+
+-- Simplified redirect parsing - 2 step analysis
+redirect_steps AS (
+  SELECT
+    client,
+    page,
+    final_url,
+    final_domain,
+
+    -- Step 1: Original redirect URL (before redirect)
+    redirect_url AS step1_url,
+
+    -- Step 2: Final URL (after redirect)
+    final_url AS step2_url
+  FROM
+    requests_with_redirects
+  WHERE
+    redirect_url IS NOT NULL
+    AND redirect_url != ''
+),
+
+-- Extract consent signals for each step
+signals_by_step AS (
+  SELECT
+    client,
+    page,
+    final_domain,
+
+    -- Step 1 signals (original redirect URL)
+    step1_url,
+    extractConsentSignals(step1_url) AS step1_signals,
+
+    -- Step 2 signals (final URL after redirect)
+    step2_url,
+    extractConsentSignals(step2_url) AS step2_signals
+  FROM
+    redirect_steps
+  WHERE
+    step1_url IS NOT NULL
+),
+
+-- Calculate step-wise aggregations (memory efficient)
+step_aggregations AS (
+  -- Step 1 stats (original redirect URL)
+  SELECT
+    client,
+    1 AS redirect_step,
+    'original' AS step_type,
+
+    COUNTIF(step1_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(step1_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(step1_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(step1_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(step1_signals.has_any_signal) AS any_signal_count,
+
+    AVG(step1_signals.signal_count) AS avg_signal_count,
+    COUNT(*) AS total_urls,
+    COUNT(DISTINCT page) AS total_pages
+  FROM
+    signals_by_step
+  WHERE
+    step1_signals.has_any_signal = TRUE -- Only analyze chains that start with signals
+  GROUP BY
+    client
+
+  UNION ALL
+
+  -- Step 2 stats (final URL after redirect)
+  SELECT
+    client,
+    2 AS redirect_step,
+    'final' AS step_type,
+
+    COUNTIF(step2_signals.has_usp_standard) AS usp_standard_count,
+    COUNTIF(step2_signals.has_usp_nonstandard) AS usp_nonstandard_count,
+    COUNTIF(step2_signals.has_tcf_standard) AS tcf_standard_count,
+    COUNTIF(step2_signals.has_gpp_standard) AS gpp_standard_count,
+    COUNTIF(step2_signals.has_any_signal) AS any_signal_count,
+
+    AVG(step2_signals.signal_count) AS avg_signal_count,
+    COUNT(*) AS total_urls,
+    COUNT(DISTINCT page) AS total_pages
+  FROM
+    signals_by_step
+  WHERE
+    step1_signals.has_any_signal = TRUE -- Same baseline
+  GROUP BY
+    client
+),
+
+-- Calculate baselines (step 1)
+baselines AS (
+  SELECT
+    client,
+    usp_standard_count AS usp_standard_baseline,
+    usp_nonstandard_count AS usp_nonstandard_baseline,
+    tcf_standard_count AS tcf_standard_baseline,
+    gpp_standard_count AS gpp_standard_baseline,
+    any_signal_count AS any_signal_baseline,
+    avg_signal_count AS avg_signal_count_baseline
+  FROM
+    step_aggregations
+  WHERE
+    redirect_step = 1
+)
+
+-- Final output with survival rates
+SELECT
+  sa.client,
+  sa.redirect_step,
+  sa.step_type,
+  sa.total_urls,
+  sa.total_pages,
+
+  -- Signal survival rates
+  sa.usp_standard_count,
+  SAFE_DIVIDE(sa.usp_standard_count, b.usp_standard_baseline) AS usp_standard_survival_rate,
+
+  sa.usp_nonstandard_count,
+  SAFE_DIVIDE(sa.usp_nonstandard_count, b.usp_nonstandard_baseline) AS usp_nonstandard_survival_rate,
+
+  sa.tcf_standard_count,
+  SAFE_DIVIDE(sa.tcf_standard_count, b.tcf_standard_baseline) AS tcf_standard_survival_rate,
+
+  sa.gpp_standard_count,
+  SAFE_DIVIDE(sa.gpp_standard_count, b.gpp_standard_baseline) AS gpp_standard_survival_rate,
+
+  sa.any_signal_count,
+  SAFE_DIVIDE(sa.any_signal_count, b.any_signal_baseline) AS any_signal_survival_rate,
+
+  -- Signal count preservation
+  sa.avg_signal_count,
+  b.avg_signal_count_baseline,
+  sa.avg_signal_count - b.avg_signal_count_baseline AS signal_count_change,
+  SAFE_DIVIDE(sa.avg_signal_count, b.avg_signal_count_baseline) AS signal_count_retention_rate
+
+FROM
+  step_aggregations sa
+JOIN
+  baselines b
+USING (client)
+
+ORDER BY
+  client,
+  redirect_step
diff --git a/sql/2025/third-parties/consent_signal_survival_rate_through_redirects_working.sql b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects_working.sql
new file mode 100644
index 00000000000..7b810104590
--- /dev/null
+++ b/sql/2025/third-parties/consent_signal_survival_rate_through_redirects_working.sql
@@ -0,0 +1,168 @@
+#standardSQL
+# Working version: Consent signal survival rate through HTTP redirects
+
+CREATE TEMP FUNCTION extractConsentSignals(url STRING)
+RETURNS STRUCT<
+  has_usp_standard BOOL,
+  has_usp_nonstandard BOOL,
+  has_tcf_standard BOOL,
+  has_gpp_standard BOOL,
+  has_any_signal BOOL,
+  signal_count INT64
+>
+LANGUAGE js AS """
+  try {
+    if (!url || typeof url !== 'string') return {
+      has_usp_standard: false, has_usp_nonstandard: false,
+      has_tcf_standard: false, has_gpp_standard: false,
+      has_any_signal: false, signal_count: 0
+    };
+
+    const signals = {
+      has_usp_standard: /[?&]us_privacy=/.test(url),
+      has_usp_nonstandard: /[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=/.test(url),
+      has_tcf_standard: /[?&](gdpr|gdpr_consent|gdpr_pd)=/.test(url),
+      has_gpp_standard: /[?&](gpp|gpp_sid)=/.test(url)
+    };
+
+    signals.signal_count = [
+      signals.has_usp_standard, signals.has_usp_nonstandard,
+      signals.has_tcf_standard, signals.has_gpp_standard
+    ].filter(Boolean).length;
+
+    signals.has_any_signal = signals.signal_count > 0;
+    return signals;
+  } catch (e) {
+    return {
+      has_usp_standard: false, has_usp_nonstandard: false,
+      has_tcf_standard: false, has_gpp_standard: false,
+      has_any_signal: false, signal_count: 0
+    };
+  }
+""";
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+    AND rank <= 100000 -- Expanded to top 100K sites
+),
+
+-- First, find all third-party requests with consent signals (regardless of redirects)
+initial_consent_requests AS (
+  SELECT
+    r.client,
+    r.page,
+    r.url,
+    extractConsentSignals(r.url) AS url_signals
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  WHERE
+    r.date = '2025-06-01'
+    AND NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
+    AND REGEXP_CONTAINS(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+),
+
+-- Now look for those same requests that ALSO have redirect chains
+requests_with_redirects AS (
+  SELECT
+    icr.client,
+    icr.page,
+    icr.url,
+    icr.url_signals,
+    r.summary,
+    JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') AS redirect_url
+  FROM
+    initial_consent_requests icr
+  INNER JOIN
+    `httparchive.crawl.requests` r
+  ON
+    icr.client = r.client
+    AND icr.page = r.page
+    AND icr.url = r.url
+  WHERE
+    r.date = '2025-06-01'
+    AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') IS NOT NULL
+    AND JSON_EXTRACT_SCALAR(r.summary, '$.redirectUrl') != ''
+),
+
+-- Parse redirect chains (simplified - just look at redirect URL and final URL)
+parsed_redirects AS (
+  SELECT
+    client,
+    page,
+    url AS final_url,
+    url_signals AS final_signals,
+
+    -- Extract redirect URL
+    redirect_url AS redirect_url,
+
+    -- Simple redirect count (1 = single redirect)
+    CASE WHEN redirect_url IS NOT NULL AND redirect_url != '' THEN 1 ELSE 0 END AS redirect_count
+  FROM
+    requests_with_redirects
+  WHERE
+    redirect_url IS NOT NULL
+    AND redirect_url != ''
+),
+
+-- Extract signals from redirect steps
+redirect_analysis AS (
+  SELECT
+    client,
+    page,
+    final_url,
+    final_signals,
+    redirect_url,
+    redirect_count,
+    extractConsentSignals(redirect_url) AS redirect_signals
+  FROM
+    parsed_redirects
+  WHERE
+    redirect_url IS NOT NULL
+)
+
+-- Final analysis comparing signals across redirect steps
+SELECT
+  client,
+
+  -- Overall statistics
+  COUNT(*) AS total_redirect_chains_with_consent,
+  COUNT(DISTINCT page) AS pages_with_redirect_chains,
+  AVG(redirect_count) AS avg_redirect_count,
+
+  -- Step 1 (redirect URL) signal analysis
+  COUNTIF(redirect_signals.has_any_signal) AS step1_requests_with_signals,
+  COUNTIF(redirect_signals.has_usp_standard) AS step1_usp_standard,
+  COUNTIF(redirect_signals.has_usp_nonstandard) AS step1_usp_nonstandard,
+  COUNTIF(redirect_signals.has_tcf_standard) AS step1_tcf_standard,
+  COUNTIF(redirect_signals.has_gpp_standard) AS step1_gpp_standard,
+
+  -- Final URL signal analysis
+  COUNTIF(final_signals.has_any_signal) AS final_requests_with_signals,
+  COUNTIF(final_signals.has_usp_standard) AS final_usp_standard,
+  COUNTIF(final_signals.has_usp_nonstandard) AS final_usp_nonstandard,
+  COUNTIF(final_signals.has_tcf_standard) AS final_tcf_standard,
+  COUNTIF(final_signals.has_gpp_standard) AS final_gpp_standard,
+
+  -- Survival rates (what percentage of signals make it to final URL)
+  SAFE_DIVIDE(COUNTIF(final_signals.has_any_signal), COUNT(*)) AS overall_signal_survival_rate,
+  SAFE_DIVIDE(COUNTIF(final_signals.has_usp_standard), COUNTIF(redirect_signals.has_usp_standard)) AS usp_standard_survival_rate,
+  SAFE_DIVIDE(COUNTIF(final_signals.has_tcf_standard), COUNTIF(redirect_signals.has_tcf_standard)) AS tcf_standard_survival_rate,
+  SAFE_DIVIDE(COUNTIF(final_signals.has_gpp_standard), COUNTIF(redirect_signals.has_gpp_standard)) AS gpp_standard_survival_rate
+
+FROM
+  redirect_analysis
+GROUP BY
+  client
+ORDER BY
+  client
diff --git a/sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql b/sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql
new file mode 100644
index 00000000000..14da4eaf493
--- /dev/null
+++ b/sql/2025/third-parties/consent_signals_by_parameter_and_domain_optimized.sql
@@ -0,0 +1,201 @@
+#standardSQL
+# Optimized: Detailed breakdown of consent signals by individual parameters and top domains
+
+WITH pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-06-01'
+    -- Pre-filter: only process URLs that contain consent-related parameters
+    AND REGEXP_CONTAINS(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=')
+),
+
+third_party AS (
+  SELECT
+    domain,
+    canonicalDomain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-07-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    canonicalDomain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+-- Single-pass parameter extraction using one comprehensive regex
+parameter_extraction AS (
+  SELECT
+    r.client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    -- Extract all relevant parameters in one pass using REGEXP_EXTRACT_ALL
+    REGEXP_EXTRACT_ALL(r.url, r'[?&](us_privacy|ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') AS found_parameters
+  FROM
+    requests r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    p.rank <= rank_grouping
+),
+
+-- Flatten parameters and count occurrences
+flattened_parameters AS (
+  SELECT
+    client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    param
+  FROM
+    parameter_extraction,
+    UNNEST(found_parameters) AS param
+),
+
+-- Aggregate parameter counts
+parameter_counts AS (
+  SELECT
+    client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    param,
+    COUNT(*) AS param_count,
+    COUNT(DISTINCT CONCAT(client, canonicalDomain)) AS domain_count
+  FROM
+    flattened_parameters
+  GROUP BY
+    client,
+    canonicalDomain,
+    category,
+    rank_grouping,
+    param
+),
+
+-- Get total request counts for percentage calculations (from ALL third-party requests, not pre-filtered)
+totals AS (
+  SELECT
+    r.client,
+    rank_grouping,
+    COUNT(*) AS total_all_requests
+  FROM
+    `httparchive.crawl.requests` r
+  INNER JOIN
+    pages p
+  ON
+    r.client = p.client AND r.page = p.page
+  INNER JOIN
+    third_party tp
+  ON
+    NET.HOST(r.url) = NET.HOST(tp.domain),
+    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+  WHERE
+    r.date = '2025-06-01'
+    AND p.rank <= rank_grouping
+  GROUP BY
+    r.client,
+    rank_grouping
+),
+
+-- Categorize parameters
+categorized_params AS (
+  SELECT
+    client,
+    rank_grouping,
+    param,
+    CASE
+      WHEN param = 'us_privacy' THEN 'USP Standard'
+      WHEN param IN ('ccpa', 'usp_consent', 'uspString', 'uspConsent', 'ccpa_consent', 'usp', 'usprivacy', 'ccpaconsent', 'usp_string') THEN 'USP Non-Standard'
+      WHEN param IN ('gdpr', 'gdpr_consent', 'gdpr_pd') THEN 'TCF Standard'
+      WHEN param IN ('gpp', 'gpp_sid') THEN 'GPP Standard'
+    END AS signal_category,
+    SUM(param_count) AS total_requests,
+    COUNT(DISTINCT canonicalDomain) AS domains_using
+  FROM
+    parameter_counts
+  GROUP BY
+    client,
+    rank_grouping,
+    param,
+    signal_category
+)
+
+-- Parameter frequency analysis
+SELECT
+  'Parameter Frequency' AS analysis_type,
+  client,
+  rank_grouping,
+  param AS parameter_name,
+  signal_category,
+  total_requests,
+  domains_using,
+  total_requests / totals.total_all_requests AS pct_of_all_requests
+FROM
+  categorized_params
+JOIN
+  totals
+USING (client, rank_grouping)
+
+UNION ALL
+
+-- Top domains analysis (simplified)
+SELECT
+  'Top Domains' AS analysis_type,
+  client,
+  rank_grouping,
+  canonicalDomain AS parameter_name,
+  category AS signal_category,
+  SUM(param_count) AS total_requests,
+  COUNT(DISTINCT param) AS domains_using,
+  SUM(param_count) / MAX(totals.total_all_requests) AS pct_of_all_requests
+FROM
+  parameter_counts
+JOIN
+  totals
+USING (client, rank_grouping)
+GROUP BY
+  client,
+  rank_grouping,
+  canonicalDomain,
+  category
+HAVING
+  SUM(param_count) > 0
+
+ORDER BY
+  analysis_type,
+  client,
+  rank_grouping,
+  total_requests DESC
+LIMIT 1000
\ No newline at end of file
diff --git a/sql/2025/third-parties/content_encoding.sql b/sql/2025/third-parties/content_encoding.sql
new file mode 100644
index 00000000000..9da7ce700a1
--- /dev/null
+++ b/sql/2025/third-parties/content_encoding.sql
@@ -0,0 +1,51 @@
+#standardSQL
+# Content-Encoding by third parties
+
+WITH requests AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    pageid AS page,
+    url,
+    resp_content_encoding AS content_encoding
+  FROM
+    `httparchive.summary_requests.2025_06_01_*`
+),
+
+third_party AS (
+  SELECT
+    NET.HOST(domain) AS domain,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-06-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain
+  HAVING
+    page_usage >= 50
+)
+
+SELECT
+  client,
+  content_encoding,
+  COUNT(0) AS num_requests,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct
+FROM
+  requests
+LEFT JOIN
+  third_party
+ON
+  NET.HOST(requests.url) = NET.HOST(third_party.domain)
+WHERE
+  domain IS NOT NULL
+GROUP BY
+  client,
+  content_encoding
+ORDER BY
+  client,
+  num_requests DESC
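The pct column in the encoding queries divides each group's request count by a window total over the same partition, which avoids a second aggregation pass over the table. The pattern in isolation, over a hypothetical inline list of encodings (illustrative, not part of the patch):

    SELECT
      content_encoding,
      COUNT(0) AS num_requests,
      SUM(COUNT(0)) OVER () AS total,
      COUNT(0) / SUM(COUNT(0)) OVER () AS pct
    FROM
      UNNEST(['gzip', 'br', 'br', 'br']) AS content_encoding
    GROUP BY
      content_encoding;

The analytic SUM runs after GROUP BY, so each output row carries both its own count and the grand total.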
diff --git a/sql/2025/third-parties/content_encoding_by_content_type.sql b/sql/2025/third-parties/content_encoding_by_content_type.sql
new file mode 100644
index 00000000000..2364ff65454
--- /dev/null
+++ b/sql/2025/third-parties/content_encoding_by_content_type.sql
@@ -0,0 +1,55 @@
+#standardSQL
+# Content-Encoding by third parties by content type
+
+WITH requests AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    pageid AS page,
+    url,
+    resp_content_encoding AS content_encoding,
+    type
+  FROM
+    `httparchive.summary_requests.2025_06_01_*`
+),
+
+third_party AS (
+  SELECT
+    NET.HOST(domain) AS domain,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-06-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain
+  HAVING
+    page_usage >= 50
+)
+
+SELECT
+  client,
+  type,
+  content_encoding,
+  COUNT(0) AS num_requests,
+  SUM(COUNT(0)) OVER (PARTITION BY client, type) AS total,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, type) AS pct
+FROM
+  requests
+LEFT JOIN
+  third_party
+ON
+  NET.HOST(requests.url) = NET.HOST(third_party.domain)
+WHERE
+  domain IS NOT NULL
+GROUP BY
+  client,
+  type,
+  content_encoding
+ORDER BY
+  client,
+  type,
+  num_requests DESC
diff --git a/sql/2025/third-parties/csp_allowed_host_frequency.sql b/sql/2025/third-parties/csp_allowed_host_frequency.sql
new file mode 100644
index 00000000000..3132a51e317
--- /dev/null
+++ b/sql/2025/third-parties/csp_allowed_host_frequency.sql
@@ -0,0 +1,84 @@
+#standardSQL
+# CSP on home pages: most prevalent allowed hosts
+
+CREATE TEMPORARY FUNCTION getHeader(headers STRING, headername STRING)
+RETURNS STRING DETERMINISTIC
+LANGUAGE js AS '''
+  const parsed_headers = JSON.parse(headers);
+  const matching_headers = parsed_headers.filter(h => h.name.toLowerCase() == headername.toLowerCase());
+  if (matching_headers.length > 0) {
+    return matching_headers[0].value;
+  }
+  return null;
+''';
+
+WITH totals AS (
+  SELECT
+    client,
+    COUNT(0) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2025-06-01' AND
+    is_main_document
+  GROUP BY
+    client
+),
+
+csp_data AS (
+  SELECT
+    client,
+    page,
+    getHeader(TO_JSON_STRING(response_headers), 'Content-Security-Policy') AS csp_header
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2025-06-01' AND
+    is_main_document AND
+    response_headers IS NOT NULL
+),
+
+csp_expanded AS (
+  SELECT
+    client,
+    page,
+    csp_allowed_host
+  FROM
+    csp_data,
+    UNNEST(REGEXP_EXTRACT_ALL(csp_header, r'(?i)(https*://[^\s;]+)[\s;]')) AS csp_allowed_host
+  WHERE
+    csp_header IS NOT NULL
+),
+
+ranked_csp AS (
+  SELECT
+    client,
+    csp_allowed_host,
+    COUNT(DISTINCT page) AS freq,
+    total AS total_pages,
+    COUNT(DISTINCT page) / total AS pct,
+    RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS csp_allowed_host_rank
+  FROM
+    csp_expanded
+  JOIN
+    totals
+  USING (client)
+  GROUP BY
+    client,
+    total,
+    csp_allowed_host
+)
+
+SELECT
+  client,
+  csp_allowed_host,
+  freq,
+  total_pages,
+  pct
+FROM
+  ranked_csp
+WHERE
+  csp_allowed_host_rank <= 100
ORDER BY
+  client,
+  pct DESC;
diff --git a/sql/2025/third-parties/depth_of_gtm_calls.sql b/sql/2025/third-parties/depth_of_gtm_calls.sql
new file mode 100644
index 00000000000..75404eb2dde
--- /dev/null
+++ b/sql/2025/third-parties/depth_of_gtm_calls.sql
@@ -0,0 +1,93 @@
+CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  // Helper function to find all initiator_etlds for a given root_page
+  function findInitiators(page, visited, data) {
+    // Find all entries where the root_page matches and the initiator_etld hasn't been visited
+    const initiators = data
+      .filter(row => row.root_page === page && !visited.includes(row.initiator_etld))
+      .map(row => row.initiator_etld);
+
+    // Add the newly found initiators to the visited list
+    visited = visited.concat(initiators);
+
+    // Recursively process all new initiators
+    initiators.forEach(initiator => {
+      visited = findInitiators(initiator, visited, data);
+    });
+
+    return visited;
+  }
+
+  // Main call: Start recursion from the rootPage
+  // Use a Set to ensure that all returned values are distinct
+  return Array.from(new Set(findInitiators(rootPage, [], data)));
+""";
+
+
+CREATE TEMP FUNCTION mean_depth_and_next_element_after_gtm(input_array ARRAY<STRING>)
+RETURNS STRUCT<mean_depth FLOAT64, next_elements ARRAY<STRING>>
+LANGUAGE js AS """
+  // Initialize the array to hold names of next elements
+  const nextElements = [];
+
+  // Traverse the input array to find "googletagmanager.com" and capture the next element
+  for (let i = 0; i < input_array.length - 1; i++) { // -1 to avoid out-of-bounds
+    if (input_array[i] === 'googletagmanager.com') {
+      nextElements.push(input_array[i + 1]);
+    }
+  }
+
+  // If no "googletagmanager.com" is found, return NULL
+  if (nextElements.length === 0) {
+    return { mean_depth: null, next_elements: [] };
+  }
+
+  // Calculate mean depth for all next elements
+  const meanDepth = nextElements.length > 0
+    ? nextElements.reduce((sum, _, idx) => sum + (idx + 2), 0) / nextElements.length
+    : null;
+
+  // Return the result as a struct
+  return { mean_depth: meanDepth, next_elements: nextElements };
+""";
+
+
+WITH data AS (
+  -- TP interact with other tps
+  SELECT
+    *
+  FROM (
+    SELECT
+      client,
+      NET.REG_DOMAIN(root_page) AS root_page,
+      NET.REG_DOMAIN(url) AS third_party,
+      NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld
+    FROM
+      `httparchive.crawl.requests`
+    WHERE
+      NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND
+      date = '2025-06-01'
+  )
+  WHERE third_party != initiator_etld AND
+    root_page != initiator_etld
+  GROUP BY client, root_page, third_party, initiator_etld
+)
+
+SELECT client, next_elements_after_gtm, COUNT(0) AS c FROM (
+  SELECT
+    client,
+    result.mean_depth AS mean_depth_after_gtm,
+    result.next_elements AS next_elements_after_gtm
+  FROM (
+    SELECT
+      root_page,
+      client,
+      findAllInitiators(root_page, ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))) AS all_initiators
+    FROM data
+    GROUP BY root_page, client
+  ),
+  UNNEST([mean_depth_and_next_element_after_gtm(all_initiators)]) AS result
+  WHERE result.mean_depth IS NOT NULL
+  ORDER BY mean_depth_after_gtm
+) GROUP BY client, next_elements_after_gtm ORDER BY c;
diff --git a/sql/2025/third-parties/distribution_of_3XX_response_body_size.sql b/sql/2025/third-parties/distribution_of_3XX_response_body_size.sql
new file mode 100644
index 00000000000..928ae266500
--- /dev/null
+++ b/sql/2025/third-parties/distribution_of_3XX_response_body_size.sql
@@ -0,0 +1,64 @@
+#standardSQL
+# Distribution of response body size by redirected third parties
+# HTTP status codes documentation: https://developer.mozilla.org/docs/Web/HTTP/Status
+
+WITH requests AS (
+  SELECT
+    _TABLE_SUFFIX AS client,
+    pageid AS page,
+    url,
+    status,
+    respBodySize AS body_size
+  FROM
+    `httparchive.summary_requests.2025_06_01_*`
+),
+
+third_party AS (
+  SELECT
+    domain,
+    category,
+    COUNT(DISTINCT page) AS page_usage
+  FROM
+    `httparchive.almanac.third_parties` tp
+  JOIN
+    requests r
+  ON NET.HOST(r.url) = NET.HOST(tp.domain)
+  WHERE
+    date = '2025-06-01' AND
+    category != 'hosting'
+  GROUP BY
+    domain,
+    category
+  HAVING
+    page_usage >= 50
+),
+
+base AS (
+  SELECT
+    client,
+    domain,
+    IF(status BETWEEN 300 AND 399, 1, 0) AS redirected,
+    body_size
+  FROM
+    requests
+  LEFT JOIN
+    third_party
+  ON
+    NET.HOST(requests.url) = NET.HOST(third_party.domain)
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS approx_redirect_body_size
+FROM
+  base,
+  UNNEST(GENERATE_ARRAY(1, 100)) AS percentile
+WHERE
+  redirected = 1
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/third-parties/distribution_of_length_of_inclusion_chains.sql b/sql/2025/third-parties/distribution_of_length_of_inclusion_chains.sql
new file mode 100644
index 00000000000..ace78271aad
--- /dev/null
+++ b/sql/2025/third-parties/distribution_of_length_of_inclusion_chains.sql
@@ -0,0 +1,65 @@
+CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  // Helper function to find all initiator_etlds for a given root_page
+  function findInitiators(page, visited, data) {
+    // Find all entries where the root_page matches and the initiator_etld hasn't been visited
+    const initiators = data
+      .filter(row => row.root_page === page && !visited.includes(row.initiator_etld))
+      .map(row => row.initiator_etld);
+
+    // Add the newly found initiators to the visited list
+    visited = visited.concat(initiators);
+
+    // Recursively process all new initiators
+    initiators.forEach(initiator => {
+      visited = findInitiators(initiator, visited, data);
+    });
+
+    return visited;
+  }
+
+  // Main call: Start recursion from the rootPage
+  // Use a Set to ensure that all returned values are distinct
+  return Array.from(new Set(findInitiators(rootPage, [], data)));
+""";
+
+WITH data AS (
+  -- TP interact with other tps - only extract necessary fields
+  SELECT
+    client,
+    root_page,
+    third_party,
+    initiator_etld
+  FROM (
+    SELECT
+      client,
+      NET.REG_DOMAIN(root_page) AS root_page,
+      NET.REG_DOMAIN(url) AS third_party,
+      NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld
+    FROM
+      `httparchive.crawl.requests`
+    WHERE
+      date = '2025-06-01' AND
+      NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url)
+  )
+  WHERE third_party != initiator_etld AND
+    root_page != initiator_etld
+  GROUP BY client, root_page, third_party, initiator_etld
+)
+
+SELECT
+  client,
+  ARRAY_LENGTH(all_initiators) AS chain_length,
+  COUNT(*) AS pages_with_this_length
+FROM (
+  SELECT
+    root_page,
+    client,
+    findAllInitiators(root_page, ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))) AS all_initiators
+  FROM data
+  GROUP BY root_page, client
+)
+WHERE ARRAY_LENGTH(all_initiators) > 0
+GROUP BY client, chain_length
+ORDER BY client, chain_length;
\ No newline at end of file
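Chain membership in this query is keyed on eTLD+1 values: NET.REG_DOMAIN collapses full URLs to their registrable domain, including the initiator URL pulled from the payload's _initiator field. A standalone look at that normalization, with hypothetical URLs (illustrative, not part of the patch):

    SELECT
      NET.REG_DOMAIN('https://static.cdn.example.co.uk/lib.js') AS third_party,  -- 'example.co.uk'
      NET.REG_DOMAIN(JSON_VALUE(
        '{"_initiator": "https://www.googletagmanager.com/gtm.js"}', '$._initiator'
      )) AS initiator_etld;                                                      -- 'googletagmanager.com'

Because both sides of the chain comparison go through the same normalization, subdomain differences between the initiator and the included script do not break chain detection.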
diff --git a/sql/2025/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql b/sql/2025/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql
new file mode 100644
index 00000000000..a66b4cc4224
--- /dev/null
+++ b/sql/2025/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql
@@ -0,0 +1,61 @@
+#standardSQL
+# Pages with unminified third-party CSS
+
+CREATE TEMPORARY FUNCTION getUnminifiedCssUrls(audit STRING)
+RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS '''
+try {
+  var $ = JSON.parse(audit);
+  return $.details.items.map(({url, wastedBytes}) => {
+    return {url, wastedBytes};
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+WITH third_party_domains AS (
+  SELECT DISTINCT
+    NET.HOST(domain) AS domain
+  FROM
+    `httparchive.almanac.third_parties`
+),
+
+base AS (
+  SELECT
+    client,
+    page,
+    SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings,
+    SUM(potential_savings) AS potential_total_savings
+  FROM (
+    SELECT
+      _TABLE_SUFFIX AS client,
+      lighthouse.url AS page,
+      NET.HOST(data.url) AS domain,
+      data.wastedBytes AS potential_savings
+    FROM
+      `httparchive.lighthouse.2025_06_01_*` AS lighthouse,
+      UNNEST(getUnminifiedCssUrls(JSON_EXTRACT(report, "$.audits['unminified-css']"))) AS data
+  ) AS potential_third_parties
+  LEFT OUTER JOIN
+    third_party_domains
+  ON
+    potential_third_parties.domain = third_party_domains.domain
+  GROUP BY
+    client,
+    page
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes,
+  APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes
+FROM
+  base,
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql b/sql/2025/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql
new file mode 100644
index 00000000000..6bdfd03217e
--- /dev/null
+++ b/sql/2025/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql
@@ -0,0 +1,62 @@
+#standardSQL
+# Pages with unminified third-party JavaScript
+
+CREATE TEMPORARY FUNCTION getUnminifiedJavascriptUrls(audit STRING)
+RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS '''
+try {
+  var $ = JSON.parse(audit);
+  return $.details.items.map(({url, wastedBytes}) => {
+    return {url, wastedBytes};
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+
+WITH third_party_domains AS (
+  SELECT DISTINCT
+    NET.HOST(domain) AS domain
+  FROM
+    `httparchive.almanac.third_parties`
+),
+
+base AS (
+  SELECT
+    client,
+    page,
+    SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings,
+    SUM(potential_savings) AS potential_total_savings
+  FROM (
+    SELECT
+      _TABLE_SUFFIX AS client,
+      lighthouse.url AS page,
+      NET.HOST(data.url) AS domain,
+      data.wastedBytes AS potential_savings
+    FROM
+      `httparchive.lighthouse.2025_06_01_*` AS lighthouse,
+      UNNEST(getUnminifiedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS data
+  ) AS potential_third_parties
+  LEFT OUTER JOIN
+    third_party_domains
+  ON
+    potential_third_parties.domain = third_party_domains.domain
+  GROUP BY
+    client,
+    page
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes,
+  APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes
+FROM
+  base,
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
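All of the Lighthouse distribution queries read percentiles out of a 1000-bucket APPROX_QUANTILES sketch, so OFFSET(percentile * 10) maps p10 through p100 onto bucket indices 100 through 1000. In isolation, over a hypothetical 1-to-100 distribution (illustrative, not part of the patch):

    SELECT
      percentile,
      APPROX_QUANTILES(x, 1000)[OFFSET(percentile * 10)] AS value_at_percentile
    FROM
      UNNEST(GENERATE_ARRAY(1, 100)) AS x,
      UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
    GROUP BY
      percentile
    ORDER BY
      percentile;

With 1000 buckets the array holds 1001 boundary values, so OFFSET(1000) for p100 is the maximum, not out of range.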
potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2025/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql b/sql/2025/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql new file mode 100644 index 00000000000..ffbc9fb3f90 --- /dev/null +++ b/sql/2025/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql @@ -0,0 +1,62 @@ +#standardSQL +# Pages with unused third-party JavaScript + +CREATE TEMPORARY FUNCTION getUnusedJavascriptUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnusedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unused-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2025/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql b/sql/2025/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql new file mode 100644 index 00000000000..4d09f451b32 --- /dev/null +++ b/sql/2025/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql @@ -0,0 +1,61 @@ +#standardSQL +# Third-party pages with unoptimized images + +CREATE TEMPORARY FUNCTION getUnminifiedImageUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + 
UNNEST(getUnminifiedImageUrls(JSON_EXTRACT(report, "$.audits['uses-optimized-images']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2025/third-parties/distribution_of_size_and_time_by_third_parties.sql b/sql/2025/third-parties/distribution_of_size_and_time_by_third_parties.sql new file mode 100644 index 00000000000..e30e4c01b51 --- /dev/null +++ b/sql/2025/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -0,0 +1,65 @@ +#standardSQL +# Distribution of third party requests size and time by category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + respBodySize AS body_size, + time + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + body_size, + time + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +) + +SELECT + client, + category, + percentile, + APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS body_size, + APPROX_QUANTILES(time, 1000)[OFFSET(percentile * 10)] AS time -- noqa: L010 +FROM + base, + UNNEST(GENERATE_ARRAY(1, 100)) AS percentile +GROUP BY + client, + category, + percentile +ORDER BY + client, + category, + percentile diff --git a/sql/2025/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2025/third-parties/distribution_of_third_parties_by_frame.sql new file mode 100644 index 00000000000..690e18dd4d1 --- /dev/null +++ b/sql/2025/third-parties/distribution_of_third_parties_by_frame.sql @@ -0,0 +1,86 @@ +#standardSQL +# Distribution of third-parties embedded in main vs. 
in iframes + +WITH document_frameid AS ( + SELECT + client, + NET.HOST(page) AS page_host, + NET.HOST(url) AS frame_host, + CASE + WHEN is_main_document = true + THEN JSON_EXTRACT_SCALAR(payload, '$._frame_id') + END AS mainframe_id, + JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id, + is_main_document + FROM `httparchive.crawl.requests` AS requests + WHERE requests.date = '2025-06-01' AND requests.is_root_page = true +), + +page_frames AS ( + SELECT + client, + page_host, + frame_host, + CASE + WHEN frame_host != page_host + THEN true + ELSE false + END AS tp_flag, + is_main_document, + frame_id, + COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY page_host ORDER BY is_main_document DESC)) AS mainframe_id, + CASE + WHEN frame_id = COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY page_host ORDER BY is_main_document DESC)) + THEN 'mainframe' + ELSE 'iframe' + END AS frame_type + FROM document_frameid +), + +combined_frame_counts AS ( + SELECT + client, + page_host, + frame_host, + tp_flag, + CASE + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 + THEN 'mainframe-only' + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 + THEN 'iframe-only' + WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 + THEN 'both' + END AS frame_presence + FROM page_frames + GROUP BY client, page_host, frame_host, tp_flag +), + +aggregated_counts AS ( + SELECT + client, + COUNT(DISTINCT page_host) AS distinct_publisher_count, + COUNT(DISTINCT CASE WHEN tp_flag THEN frame_host ELSE null END) AS distinct_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_both, + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN frame_host ELSE null END) AS distinct_both_third_party_count + FROM combined_frame_counts + GROUP BY client +) + +SELECT + client, + distinct_publisher_count, + distinct_third_party_count, + distinct_publishers_mainframe_only, + distinct_publishers_iframe_only, + distinct_publishers_both, + distinct_mainframe_third_party_count, + distinct_mainframe_third_party_count / distinct_third_party_count AS pct_tps_in_mainframe_only, + distinct_iframe_third_party_count, + distinct_iframe_third_party_count / distinct_third_party_count AS pct_tps_in_iframe_only, + distinct_both_third_party_count, + distinct_both_third_party_count / distinct_third_party_count AS pct_tps_in_both +FROM aggregated_counts; diff --git a/sql/2025/third-parties/distribution_of_third_parties_by_number_of_websites.sql b/sql/2025/third-parties/distribution_of_third_parties_by_number_of_websites.sql new file mode 100644 index 00000000000..fefcac1e569 --- /dev/null +++ b/sql/2025/third-parties/distribution_of_third_parties_by_number_of_websites.sql @@ -0,0 +1,65 @@ +#standardSQL +# Distribution of third parties by number of websites + 
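+-- Note: a request is attributed to a third party by matching hostnames, e.g. +-- NET.HOST('https://cdn.example.com/a.js') returns 'cdn.example.com' (a hypothetical URL here); +-- domains seen on too few distinct pages are excluded by the HAVING clause below.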
+WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + canonicalDomain, + COUNT(DISTINCT page) AS pages_per_third_party + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + WHERE + canonicalDomain IS NOT NULL + GROUP BY + client, + canonicalDomain +) + +SELECT + client, + percentile, + APPROX_QUANTILES(pages_per_third_party, 1000)[OFFSET(percentile * 10)] AS approx_pages_per_third_party +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2025/third-parties/distribution_of_websites_by_number_of_third_parties.sql b/sql/2025/third-parties/distribution_of_websites_by_number_of_third_parties.sql new file mode 100644 index 00000000000..9b1165ac09e --- /dev/null +++ b/sql/2025/third-parties/distribution_of_websites_by_number_of_third_parties.sql @@ -0,0 +1,67 @@ +#standardSQL +# Distribution of websites by number of third party + +-- updated for crawl.requests +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' +), + + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(percentile * 10)] AS approx_third_parties_per_page +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2025/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql b/sql/2025/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql new file mode 100644 index 00000000000..503818667f3 --- /dev/null +++ b/sql/2025/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql @@ -0,0 +1,63 @@ +#standardSQL +# Distribution of websites by number of third party + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + COUNT(DISTINCT canonicalDomain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + 
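+    -- Note: COUNT(domain) above ignores NULLs, so with this LEFT JOIN a page whose requests + -- match no third-party domain still appears with third_parties_per_page = 0.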
NET.HOST(requests.url) = NET.HOST(third_party.domain) + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(percentile * 10)] AS approx_third_parties_per_page +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2025/third-parties/iframe_allow_attribute.sql b/sql/2025/third-parties/iframe_allow_attribute.sql new file mode 100644 index 00000000000..0d9e2aa47b9 --- /dev/null +++ b/sql/2025/third-parties/iframe_allow_attribute.sql @@ -0,0 +1,45 @@ +# standardSQL +# usage of different directives for allow attribute on iframes + +CREATE TEMP FUNCTION getNumWithAllowAttribute(payload STRING) AS (( + SELECT + COUNT(0) + FROM + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox')) AS iframeAttr + WHERE + JSON_EXTRACT_SCALAR(iframeAttr, '$.allow') IS NOT NULL +)); + +SELECT + client, + SPLIT(TRIM(allow_attr), ' ')[OFFSET(0)] AS directive, + total_iframes_with_allow, + COUNT(0) AS freq, + COUNT(0) / total_iframes_with_allow AS pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs + FROM + `httparchive.pages.2025_06_01_*` +), + UNNEST(iframeAttrs) AS iframeAttr, + UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT_SCALAR(iframeAttr, '$.allow'), r'(?i)([^,;]+)')) AS allow_attr +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + SUM(getNumWithAllowAttribute(payload)) AS total_iframes_with_allow + FROM + `httparchive.pages.2025_06_01_*` + GROUP BY + client +) USING (client) +GROUP BY + client, + directive, + total_iframes_with_allow +HAVING + pct > 0.001 +ORDER BY + client, + pct DESC diff --git a/sql/2025/third-parties/iframe_attribute_popular_hosts.sql b/sql/2025/third-parties/iframe_attribute_popular_hosts.sql new file mode 100644 index 00000000000..9a9de6cd71e --- /dev/null +++ b/sql/2025/third-parties/iframe_attribute_popular_hosts.sql @@ -0,0 +1,54 @@ +#standardSQL +# most common hostnames of iframes that have the allow or sandbox attribute + +CREATE TEMP FUNCTION hasPolicy(attr STRING, policy_type STRING) +RETURNS BOOL DETERMINISTIC +LANGUAGE js AS ''' + const $ = JSON.parse(attr); + return $[policy_type] !== null; +'''; + +SELECT + client, + policy_type, + hostname, + COUNTIF(has_policy) AS freq, + total_iframes, + COUNTIF(has_policy) / total_iframes AS pct +FROM ( + SELECT + client, + policy_type, + JSON_EXTRACT_SCALAR(iframeAttr, '$.hostname') AS hostname, + hasPolicy(iframeAttr, policy_type) AS has_policy + FROM ( + SELECT + _TABLE_SUFFIX AS client, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs + FROM + `httparchive.pages.2025_06_01_*` + ), + UNNEST(iframeAttrs) AS iframeAttr, + UNNEST(['allow', 'sandbox']) AS policy_type +) +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + SUM(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox'))) AS total_iframes + FROM + `httparchive.pages.2025_06_01_*` + GROUP BY + client +) +USING (client) +GROUP BY + client, + total_iframes, + policy_type, + hostname +HAVING + pct > 0.001 +ORDER BY + client, + policy_type, + pct DESC diff --git a/sql/2025/third-parties/length_of_chain_by_intiator.sql b/sql/2025/third-parties/length_of_chain_by_intiator.sql new file mode 100644 index 00000000000..fa36b7b0224 --- /dev/null +++ b/sql/2025/third-parties/length_of_chain_by_intiator.sql @@ 
-0,0 +1,72 @@ +CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>) +RETURNS ARRAY<STRING> +LANGUAGE js AS """ + // Helper function to find all initiator_etlds for a given root_page + function findInitiators(page, visited, data) { + // Find all entries where the root_page matches and the initiator_etld hasn't been visited + const initiators = data + .filter(row => row.root_page === page && !visited.includes(row.initiator_etld)) + .map(row => row.initiator_etld); + + // Add the newly found initiators to the visited list + visited = visited.concat(initiators); + + // Recursively process all new initiators + initiators.forEach(initiator => { + visited = findInitiators(initiator, visited, data); + }); + + return visited; + } + + // Main call: Start recursion from the rootPage + // Use a Set to ensure that all returned values are distinct + return Array.from(new Set(findInitiators(rootPage, [], data))); +"""; + +WITH data AS ( + -- Third parties interacting with other third parties + SELECT + * + FROM ( + SELECT + client, + NET.REG_DOMAIN(root_page) AS root_page, + NET.REG_DOMAIN(url) AS third_party, + NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld + FROM + `httparchive.crawl.requests` + WHERE + NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url) AND + date = '2025-06-01' + ) + WHERE third_party != initiator_etld AND + root_page != initiator_etld + GROUP BY client, root_page, third_party, initiator_etld +) + +-- Top initiators by chain length +SELECT + client, + first_initiator, + AVG(ARRAY_LENGTH(all_initiators)) AS avg_chain_length, + MAX(ARRAY_LENGTH(all_initiators)) AS max_chain_length, + COUNT(*) AS pages +FROM ( + SELECT + root_page, + client, + all_initiators, + all_initiators[OFFSET(0)] AS first_initiator -- First third-party in chain + FROM ( + SELECT + root_page, + client, + findAllInitiators(root_page, ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))) AS all_initiators + FROM data + GROUP BY root_page, client + ) + WHERE ARRAY_LENGTH(all_initiators) > 0 +) +GROUP BY client, first_initiator +ORDER BY avg_chain_length DESC; \ No newline at end of file diff --git a/sql/2025/third-parties/lighthouse_average_unminified_css_by_3p.sql b/sql/2025/third-parties/lighthouse_average_unminified_css_by_3p.sql new file mode 100644 index 00000000000..de202c6180c --- /dev/null +++ b/sql/2025/third-parties/lighthouse_average_unminified_css_by_3p.sql @@ -0,0 +1,44 @@ +#standardSQL +# Pages with unminified CSS by 1P/3P +CREATE TEMPORARY FUNCTION getUnminifiedJsUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +SELECT + client, + AVG(pct_1p_wasted_bytes) AS avg_pct_1p_wasted_bytes, + AVG(pct_3p_wasted_bytes) AS avg_pct_3p_wasted_bytes +FROM ( + SELECT + client, + page, + SUM(IF(is_3p, 0, wasted_bytes)) / SUM(wasted_bytes) AS pct_1p_wasted_bytes, + SUM(IF(is_3p, wasted_bytes, 0)) / SUM(wasted_bytes) AS pct_3p_wasted_bytes + FROM ( + SELECT + client, + page, + NET.HOST(unminified.url) IS NOT NULL AND NET.HOST(unminified.url) IN ( + SELECT domain FROM `httparchive.almanac.third_parties` WHERE date = '2025-06-01' AND category != 'hosting' + ) AS is_3p, + unminified.wastedBytes AS wasted_bytes + FROM + `httparchive.all.pages` AS allpages + CROSS JOIN + UNNEST(getUnminifiedJsUrls(JSON_EXTRACT(allpages.lighthouse, "$.audits['unminified-css']"))) AS unminified + WHERE allpages.date = '2025-06-01'
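+      -- Note: is_3p marks each wasted resource by membership in the almanac third-party list; + -- the outer query then averages each page's first- vs. third-party share of wasted bytes.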
AND allpages.is_root_page = TRUE + ) + GROUP BY + client, + page +) +GROUP BY + client diff --git a/sql/2025/third-parties/lighthouse_average_unminified_js_by_3p.sql b/sql/2025/third-parties/lighthouse_average_unminified_js_by_3p.sql new file mode 100644 index 00000000000..cc1ef92efae --- /dev/null +++ b/sql/2025/third-parties/lighthouse_average_unminified_js_by_3p.sql @@ -0,0 +1,42 @@ +#standardSQL +# Pages with unminified JS by 1P/3P +CREATE TEMPORARY FUNCTION getUnminifiedJsUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +SELECT + client, + AVG(pct_1p_wasted_bytes) AS avg_pct_1p_wasted_bytes, + AVG(pct_3p_wasted_bytes) AS avg_pct_3p_wasted_bytes +FROM ( + SELECT + client, + page, + SUM(IF(is_3p, 0, wasted_bytes)) / SUM(wasted_bytes) AS pct_1p_wasted_bytes, + SUM(IF(is_3p, wasted_bytes, 0)) / SUM(wasted_bytes) AS pct_3p_wasted_bytes + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(unminified.url) IS NOT NULL AND NET.HOST(unminified.url) IN ( + SELECT domain FROM `httparchive.almanac.third_parties` WHERE date = '2025-06-01' AND category != 'hosting' + ) AS is_3p, + unminified.wastedBytes AS wasted_bytes + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJsUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS unminified + ) + GROUP BY + client, + page +) +GROUP BY + client diff --git a/sql/2025/third-parties/lighthouse_third_party_facades.sql b/sql/2025/third-parties/lighthouse_third_party_facades.sql new file mode 100644 index 00000000000..c3884dd3208 --- /dev/null +++ b/sql/2025/third-parties/lighthouse_third_party_facades.sql @@ -0,0 +1,20 @@ +SELECT + client, + fail, + total, + pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-facades.score') AS FLOAT64) < 0.9) AS fail, + SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS total, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-facades.score') AS FLOAT64) < 0.9) / SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS pct + FROM + `httparchive.lighthouse.2025_06_01_*` + GROUP BY + client +) +WHERE + total > 100 +ORDER BY + client diff --git a/sql/2025/third-parties/lighthouse_unminified_css_by_3p.sql b/sql/2025/third-parties/lighthouse_unminified_css_by_3p.sql new file mode 100644 index 00000000000..3e14aa6394c --- /dev/null +++ b/sql/2025/third-parties/lighthouse_unminified_css_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unminified CSS + +CREATE TEMPORARY FUNCTION getUnminifiedCssUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + 
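+      -- Note: in Lighthouse byte-efficiency audits, wastedBytes is the estimated potential + -- savings for a resource and totalBytes its transfer size, so pct_potential_third_party_savings + -- in the final SELECT is savings as a share of bytes actually shipped.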
FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnminifiedCssUrls(JSON_EXTRACT(report, "$.audits['unminified-css']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2025/third-parties/lighthouse_unminified_js_by_3p.sql b/sql/2025/third-parties/lighthouse_unminified_js_by_3p.sql new file mode 100644 index 00000000000..652b5ebc784 --- /dev/null +++ b/sql/2025/third-parties/lighthouse_unminified_js_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unminified JavaScript + +CREATE TEMPORARY FUNCTION getUnminifiedJavascriptUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2025/third-parties/lighthouse_unminified_js_by_3p_by_url.sql b/sql/2025/third-parties/lighthouse_unminified_js_by_3p_by_url.sql new file mode 100644 index 00000000000..4398e6c290b --- /dev/null +++ b/sql/2025/third-parties/lighthouse_unminified_js_by_3p_by_url.sql @@ -0,0 +1,75 @@ +#standardSQL +# Third-party pages with unminified JavaScript + +CREATE TEMPORARY FUNCTION 
getUnminifiedJavascriptUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + potential_third_parties.url AS url, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + data.url AS url, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain, + url +) + +SELECT + client, + domain, + url, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain, + url +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2025/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql b/sql/2025/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql new file mode 100644 index 00000000000..3bd90aa259f --- /dev/null +++ b/sql/2025/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unoptimized images + +CREATE TEMPORARY FUNCTION getUnminifiedImageUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnminifiedImageUrls(JSON_EXTRACT(report, "$.audits['uses-optimized-images']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + 
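+  -- Note: rows are aggregated per third-party domain here; dividing the summed savings by + -- COUNT(DISTINCT page) below normalizes to potential savings bytes per page embedding that domain.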
COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2025/third-parties/lighthouse_unused_css_bytes_by_3p.sql b/sql/2025/third-parties/lighthouse_unused_css_bytes_by_3p.sql new file mode 100644 index 00000000000..e31fe6af59e --- /dev/null +++ b/sql/2025/third-parties/lighthouse_unused_css_bytes_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unused CSS + +CREATE TEMPORARY FUNCTION getUnusedCssUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnusedCssUrls(JSON_EXTRACT(report, "$.audits['unused-css-rules']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2025/third-parties/lighthouse_unused_js_bytes_by_3p.sql b/sql/2025/third-parties/lighthouse_unused_js_bytes_by_3p.sql new file mode 100644 index 00000000000..fe45635652e --- /dev/null +++ b/sql/2025/third-parties/lighthouse_unused_js_bytes_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unused JavaScript + +CREATE TEMPORARY FUNCTION getUnusedJavascriptUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + 
potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUnusedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unused-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2025/third-parties/number_of_third_parties_by_rank.sql b/sql/2025/third-parties/number_of_third_parties_by_rank.sql new file mode 100644 index 00000000000..c7559524cba --- /dev/null +++ b/sql/2025/third-parties/number_of_third_parties_by_rank.sql @@ -0,0 +1,83 @@ +#standardSQL +# Number of third-parties per websites by rank + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' AND + is_root_page = true +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` AS pg + WHERE + pg.date = '2025-06-01' AND + pg.is_root_page = true +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + rank, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING (client, page) + GROUP BY + client, + page, + rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql b/sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql new file mode 100644 index 00000000000..be79bc7536f --- /dev/null +++ b/sql/2025/third-parties/number_of_third_parties_by_rank_and_category.sql @@ -0,0 +1,84 @@ +#standardSQL +# Number of third-parties per websites by rank and category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + 
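+    -- Note: rank is the page's popularity rank; the final query UNNESTs [1000, 10000, ...] + -- and keeps rank <= rank_grouping, so each rank tier is cumulative + -- (the top-10K bucket also contains the top-1K sites).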
FROM + `httparchive.summary_pages.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category NOT IN ('hosting') + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + page, + rank, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING (client, page) + GROUP BY + client, + category, + page, + rank +) + +SELECT + client, + category, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + category, + rank_grouping +ORDER BY + client, + category, + rank_grouping diff --git a/sql/2025/third-parties/number_of_third_party_providers_by_rank.sql b/sql/2025/third-parties/number_of_third_party_providers_by_rank.sql new file mode 100644 index 00000000000..132271e46b3 --- /dev/null +++ b/sql/2025/third-parties/number_of_third_party_providers_by_rank.sql @@ -0,0 +1,78 @@ +#standardSQL +# Number of distinct third-party providers per websites by rank +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + rank, + COUNT(DISTINCT canonicalDomain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING (client, page) + GROUP BY + client, + page, + rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2025/third-parties/number_of_third_party_providers_by_rank_and_category.sql b/sql/2025/third-parties/number_of_third_party_providers_by_rank_and_category.sql new file mode 100644 index 00000000000..d96e77931c6 --- /dev/null +++ b/sql/2025/third-parties/number_of_third_party_providers_by_rank_and_category.sql @@ -0,0 +1,88 @@ +#standardSQL +# Number of third-party providers per websites by rank and category + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) 
= NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category NOT IN ('hosting') + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + page, + rank, + COUNT(DISTINCT canonicalDomain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING (client, page) + GROUP BY + client, + category, + page, + rank +) + +SELECT + client, + category, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + category, + rank_grouping +ORDER BY + client, + category, + rank_grouping diff --git a/sql/2025/third-parties/number_of_third_party_requests_by_rank.sql b/sql/2025/third-parties/number_of_third_party_requests_by_rank.sql new file mode 100644 index 00000000000..e78af3902a5 --- /dev/null +++ b/sql/2025/third-parties/number_of_third_party_requests_by_rank.sql @@ -0,0 +1,60 @@ +#standardSQL +# Number of third-party requests by rank +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.all.requests` AS req + WHERE + req.date = '2025-06-01' AND + req.is_root_page = true +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.all.pages` AS pg + WHERE + pg.date = '2025-06-01' AND + pg.is_root_page = true +), + +third_party AS ( + SELECT + tp.client, + tp.rank, + COUNT(DISTINCT r.url) AS distinct_tp_requests, + COUNT(r.url) AS tp_requests, + rank_grouping + FROM + pages tp + INNER JOIN + requests r + ON NET.HOST(tp.page) = NET.HOST(r.page) AND tp.client = r.client + CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + tp.rank <= rank_grouping + GROUP BY + tp.client, + tp.rank, + rank_grouping +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(distinct_tp_requests, 1000)[OFFSET(500)] AS median_distinct_tp_requests, + APPROX_QUANTILES(tp_requests, 1000)[OFFSET(500)] AS median_tp_requests +FROM + third_party +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping; diff --git a/sql/2025/third-parties/number_of_third_party_requests_per_page_by_rank.sql b/sql/2025/third-parties/number_of_third_party_requests_per_page_by_rank.sql new file mode 100644 index 00000000000..b626d197a07 --- /dev/null +++ b/sql/2025/third-parties/number_of_third_party_requests_per_page_by_rank.sql @@ -0,0 +1,61 @@ +#standardSQL +# Number of third-party requests per page by rank + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' AND + is_root_page = true +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' AND + is_root_page = true +), + +third_party AS ( + SELECT + tp.client, + tp.page, + tp.rank, + COUNT(DISTINCT r.url) AS distinct_tp_requests, + COUNT(r.url) AS tp_requests + FROM + pages tp + INNER JOIN + requests r + ON NET.HOST(tp.page) = NET.HOST(r.page) AND tp.client = r.client + GROUP BY + tp.client, + tp.page, + tp.rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(distinct_tp_requests, 1000)[OFFSET(500)] AS p50_distinct_tp_requests_per_page, + APPROX_QUANTILES(tp_requests, 1000)[OFFSET(500)] AS p50_tp_requests_per_page +FROM + third_party, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= 
rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping; diff --git a/sql/2025/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2025/third-parties/percent_of_third_parties_by_content_type.sql new file mode 100644 index 00000000000..f6a0a333b9d --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_parties_by_content_type.sql @@ -0,0 +1,55 @@ +#standardSQL +# Percent of third party requests by content type. + +WITH requests AS ( + SELECT + client, + page, + url, + type AS contentType + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-07-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +) + +SELECT + client, + contentType, + COUNT(0) AS requests, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +WHERE + domain IS NOT NULL +GROUP BY + client, + contentType +ORDER BY + client, + contentType diff --git a/sql/2025/third-parties/percent_of_third_parties_using_document_write.sql b/sql/2025/third-parties/percent_of_third_parties_using_document_write.sql new file mode 100644 index 00000000000..6684f52575b --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_parties_using_document_write.sql @@ -0,0 +1,71 @@ +#standardSQL +# Percent of third-parties that use document.write + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.source.url})); +} catch(e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['no-document-write']"))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + domain, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct +FROM + base +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total + FROM + `httparchive.lighthouse.2025_06_01_*` + GROUP BY + _TABLE_SUFFIX +) +USING (client) +GROUP BY + client, + domain, + total +ORDER BY + client, + freq DESC diff --git a/sql/2025/third-parties/percent_of_third_parties_using_legacy_javascript.sql b/sql/2025/third-parties/percent_of_third_parties_using_legacy_javascript.sql new file mode 100644 index 00000000000..a1a821420eb --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_parties_using_legacy_javascript.sql @@ -0,0 +1,71 @@ +#standardSQL +# Percent of third-party scripts that use legacy JavaScript + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.url})); +} catch(e) { + return []; +} +'''; + +WITH third_party_domains AS ( + 
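+  -- Note: SELECT DISTINCT collapses the almanac table, which can list a domain several times + -- (e.g. across dates and categories), into a plain allowlist of third-party hosts; unlike + -- most queries in this chapter, no date filter is applied here.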
SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['legacy-javascript']"))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + domain, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct +FROM + base +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total + FROM + `httparchive.lighthouse.2025_06_01_*` + GROUP BY + _TABLE_SUFFIX +) +USING (client) +GROUP BY + client, + domain, + total +ORDER BY + client, + freq DESC diff --git a/sql/2025/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql b/sql/2025/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql new file mode 100644 index 00000000000..6dc4152465c --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql @@ -0,0 +1,88 @@ +#standardSQL +# Percent of third-party scripts that use legacy JavaScript by URL + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.url})); +} catch(e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain, + url + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + data.url AS url, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['legacy-javascript']"))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain, + url +) + +SELECT + client, + domain, + url, + freq, + total, + pct +FROM ( + SELECT + base.client AS client, + domain, + url, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct, + RANK() OVER (PARTITION BY base.client ORDER BY COUNT(0) DESC) AS url_rank + FROM + base + JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total + FROM + `httparchive.lighthouse.2025_06_01_*` + GROUP BY + _TABLE_SUFFIX + ) + USING (client) + GROUP BY + client, + domain, + url, + total +) +WHERE + url_rank <= 100 +ORDER BY + client, + freq DESC diff --git a/sql/2025/third-parties/percent_of_third_party_cache.sql b/sql/2025/third-parties/percent_of_third_party_cache.sql new file mode 100644 index 00000000000..ba7375cf9cd --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_party_cache.sql @@ -0,0 +1,75 @@ +#standardSQL +# Percent of third party requests cached +# Cache-Control documentation: https://developer.mozilla.org/docs/Web/HTTP/Headers/Cache-Control#Directives + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + resp_cache_control, + status, + respOtherHeaders, + reqOtherHeaders, + type, + url, + pageid AS page + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + 
`httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + type, + IF( + ( + status IN (301, 302, 307, 308, 410) AND + NOT REGEXP_CONTAINS(resp_cache_control, r'(?i)private|no-store') AND + NOT REGEXP_CONTAINS(reqOtherHeaders, r'Authorization') + ) OR + ( + status IN (301, 302, 307, 308, 410) OR + REGEXP_CONTAINS(resp_cache_control, r'public|max-age|s-maxage') OR + REGEXP_CONTAINS(respOtherHeaders, r'Expires') + ), 1, 0 + ) AS cached + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + WHERE + domain IS NOT NULL +) + +SELECT + client, + type, + SUM(cached) AS cached_requests, + COUNT(0) AS total_requests, + SUM(cached) / COUNT(0) AS pct_cached_requests +FROM + base +GROUP BY + client, + type diff --git a/sql/2025/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2025/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql new file mode 100644 index 00000000000..e0496e976cf --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -0,0 +1,86 @@ +#standardSQL +# Percent of third party requests and bytes by category and content type. + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + type AS contentType, + respBodySize AS body_size + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + category, + contentType, + body_size + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +), + +requests_per_page_and_category AS ( + SELECT + client, + page, + category, + contentType, + SUM(SUM(body_size)) OVER (PARTITION BY page) AS total_page_size, + SUM(body_size) AS body_size, + SUM(COUNT(0)) OVER (PARTITION BY page) AS total_page_requests, + COUNT(0) AS requests + FROM + base + GROUP BY + client, + page, + category, + contentType +) + +SELECT + client, + category, + contentType, + SUM(requests) AS requests, + SAFE_DIVIDE(SUM(requests), SUM(SUM(requests)) OVER (PARTITION BY client, category)) AS pct_requests, + SUM(body_size) AS body_size, + SAFE_DIVIDE(SUM(body_size), SUM(SUM(body_size)) OVER (PARTITION BY client, category)) AS pct_body_size +FROM + requests_per_page_and_category +GROUP BY + client, + category, + contentType +ORDER BY + client, + category, + contentType diff --git a/sql/2025/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2025/third-parties/percent_of_third_party_with_security_headers.sql new file mode 100644 index 00000000000..51e75c658b2 --- /dev/null +++ b/sql/2025/third-parties/percent_of_third_party_with_security_headers.sql @@ -0,0 +1,73 @@ +#standardSQL +# Percent of third-party requests with security headers + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + RTRIM(urlShort, '/') AS origin, + respOtherHeaders + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + 
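+    -- Note: as elsewhere in this chapter, a domain must appear on at least 50 distinct pages + -- (HAVING page_usage >= 50) to be treated as a third party, excluding rarely embedded domains.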
domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +headers AS ( + SELECT + client, + requests.origin AS req_origin, + LOWER(respOtherHeaders) AS respOtherHeaders, + third_party.category AS req_category + FROM requests + INNER JOIN third_party + ON NET.HOST(requests.origin) = NET.HOST(third_party.domain) +), + +base AS ( + SELECT + client, + req_origin, + req_category, + IF(STRPOS(respOtherHeaders, 'strict-transport-security') > 0, 1, 0) AS hsts_header, + IF(STRPOS(respOtherHeaders, 'x-content-type-options') > 0, 1, 0) AS x_content_type_options_header, + IF(STRPOS(respOtherHeaders, 'x-frame-options') > 0, 1, 0) AS x_frame_options_header, + IF(STRPOS(respOtherHeaders, 'x-xss-protection') > 0, 1, 0) AS x_xss_protection_header + FROM headers +) + +SELECT + client, + req_category, + COUNT(0) AS total_requests, + SUM(hsts_header) / COUNT(0) AS pct_hsts_header_requests, + SUM(x_content_type_options_header) / COUNT(0) AS pct_x_content_type_options_header_requests, + SUM(x_frame_options_header) / COUNT(0) AS pct_x_frame_options_header_requests, + SUM(x_xss_protection_header) / COUNT(0) AS pct_x_xss_protection_header_requests +FROM + base +GROUP BY + client, + req_category +ORDER BY + client, + req_category diff --git a/sql/2025/third-parties/percent_of_websites_with_third_party.sql b/sql/2025/third-parties/percent_of_websites_with_third_party.sql new file mode 100644 index 00000000000..9a7164016c6 --- /dev/null +++ b/sql/2025/third-parties/percent_of_websites_with_third_party.sql @@ -0,0 +1,51 @@ +#standardSQL +# Percent of websites with third parties + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + respBodySize + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage, + COUNT(0) AS request_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage > 50 +) + +SELECT + client, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) AS pages_with_third_party, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party, + COUNTIF(domain IS NOT NULL) AS third_party_requests, + COUNT(0) AS total_requests, + COUNTIF(domain IS NOT NULL) / COUNT(0) AS pct_third_party_requests, + SUM(IF(domain IS NOT NULL, respBodySize, 0)) AS third_party_body_size, + SUM(respBodySize) AS total_body_size, + SUM(IF(domain IS NOT NULL, respBodySize, 0)) / SUM(respBodySize) AS pct_body_size +FROM + requests +LEFT JOIN third_party +ON NET.HOST(requests.url) = NET.HOST(third_party.domain) +GROUP BY + client diff --git a/sql/2025/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2025/third-parties/percent_of_websites_with_third_party_by_ranking.sql new file mode 100644 index 00000000000..4558d541b09 --- /dev/null +++ b/sql/2025/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -0,0 +1,69 @@ +#standardSQL +# Percent of websites with third parties by ranking + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' +), + 
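+-- Note: the third_party CTE below joins the almanac domain list back to this crawl's requests, +-- so page_usage counts pages observed in the current dataset rather than a global total.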
+third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' +) + +SELECT + client, + rank_grouping, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) AS pages_with_third_party, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party +FROM + pages +JOIN + requests +USING (client, page) +LEFT JOIN + third_party +ON NET.HOST(requests.url) = NET.HOST(third_party.domain), + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2025/third-parties/prevalence_of_consent_signals_in_third_party_requests_optimized.sql b/sql/2025/third-parties/prevalence_of_consent_signals_in_third_party_requests_optimized.sql new file mode 100644 index 00000000000..0a5334ddd83 --- /dev/null +++ b/sql/2025/third-parties/prevalence_of_consent_signals_in_third_party_requests_optimized.sql @@ -0,0 +1,193 @@ +#standardSQL +# Prevalence of specific consent signals (USP, TCF, GPP) in third-party requests + +WITH pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' +), + +requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +totals AS ( + SELECT + r.client, + rank_grouping, + COUNT(DISTINCT r.page) AS total_pages, + COUNT(0) AS total_requests + FROM + requests r + INNER JOIN + pages p + ON + r.client = p.client AND r.page = p.page, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + p.rank <= rank_grouping + GROUP BY + r.client, + rank_grouping +), + +third_party_requests AS ( + SELECT + r.client, + r.page, + r.url, + canonicalDomain, + category, + rank_grouping + FROM + requests r + INNER JOIN + pages p + ON + r.client = p.client AND r.page = p.page + INNER JOIN + third_party tp + ON + NET.HOST(r.url) = NET.HOST(tp.domain), + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + p.rank <= rank_grouping +), + +-- Single-pass consent signal detection using one comprehensive regex +consent_signals AS ( + SELECT + client, + page, + url, + canonicalDomain, + category, + rank_grouping, + + -- Extract all consent parameters in one pass + REGEXP_EXTRACT_ALL(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') AS found_params, + + -- Boolean flags derived from the extracted parameters (computed once) + REGEXP_CONTAINS(url, r'[?&]us_privacy=') AS has_usp_standard, + REGEXP_CONTAINS(url, 
r'[?&](ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string)=') AS has_usp_nonstandard, + REGEXP_CONTAINS(url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard, + REGEXP_CONTAINS(url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard + FROM + third_party_requests + WHERE + -- Pre-filter to reduce data processing + REGEXP_CONTAINS(url, r'[?&](us_privacy|ccpa|usp_consent|uspString|sst\.us_privacy|uspConsent|ccpa_consent|AV_CCPA|usp|usprivacy|_fw_us_privacy|D9v\.us_privacy|cnsnt|ccpaconsent|usp_string|gdpr|gdpr_consent|gdpr_pd|gpp|gpp_sid)=') +), + +-- Add computed flag for any consent signal +signals_with_any AS ( + SELECT + *, + (has_usp_standard OR has_usp_nonstandard OR has_tcf_standard OR has_gpp_standard) AS has_any_consent_signal + FROM + consent_signals +), + +-- Create a single aggregation table to avoid repetitive calculations +signal_aggregates AS ( + SELECT + client, + rank_grouping, + -- USP Standard metrics + COUNTIF(has_usp_standard) AS usp_standard_requests, + COUNT(DISTINCT CASE WHEN has_usp_standard THEN page END) AS usp_standard_pages, + COUNT(DISTINCT CASE WHEN has_usp_standard THEN canonicalDomain END) AS usp_standard_domains, + + -- USP Non-Standard metrics + COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests, + COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN page END) AS usp_nonstandard_pages, + COUNT(DISTINCT CASE WHEN has_usp_nonstandard THEN canonicalDomain END) AS usp_nonstandard_domains, + + -- TCF Standard metrics + COUNTIF(has_tcf_standard) AS tcf_standard_requests, + COUNT(DISTINCT CASE WHEN has_tcf_standard THEN page END) AS tcf_standard_pages, + COUNT(DISTINCT CASE WHEN has_tcf_standard THEN canonicalDomain END) AS tcf_standard_domains, + + -- GPP Standard metrics + COUNTIF(has_gpp_standard) AS gpp_standard_requests, + COUNT(DISTINCT CASE WHEN has_gpp_standard THEN page END) AS gpp_standard_pages, + COUNT(DISTINCT CASE WHEN has_gpp_standard THEN canonicalDomain END) AS gpp_standard_domains, + + -- Any consent signal metrics + COUNTIF(has_any_consent_signal) AS any_consent_requests, + COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN page END) AS any_consent_pages, + COUNT(DISTINCT CASE WHEN has_any_consent_signal THEN canonicalDomain END) AS any_consent_domains, + + -- Totals for this filtered dataset + COUNT(0) AS total_third_party_requests + FROM + signals_with_any + GROUP BY + client, + rank_grouping +) + +-- Final output using UNNEST to avoid repetitive UNION ALL +SELECT + agg.client, + agg.rank_grouping, + signal_data.signal_type, + signal_data.requests_with_signal, + agg.total_third_party_requests, + signal_data.requests_with_signal / agg.total_third_party_requests AS pct_requests_with_signal, + signal_data.pages_with_signal, + totals.total_pages, + signal_data.pages_with_signal / totals.total_pages AS pct_pages_with_signal, + signal_data.domains_with_signal +FROM + signal_aggregates agg +JOIN + totals +USING (client, rank_grouping) +CROSS JOIN + UNNEST([ + STRUCT('USP Standard' AS signal_type, usp_standard_requests AS requests_with_signal, usp_standard_pages AS pages_with_signal, usp_standard_domains AS domains_with_signal), + STRUCT('USP Non-Standard' AS signal_type, usp_nonstandard_requests AS requests_with_signal, usp_nonstandard_pages AS pages_with_signal, usp_nonstandard_domains AS domains_with_signal), + STRUCT('TCF Standard' AS signal_type, tcf_standard_requests AS requests_with_signal, tcf_standard_pages AS pages_with_signal, 
tcf_standard_domains AS domains_with_signal), + STRUCT('GPP Standard' AS signal_type, gpp_standard_requests AS requests_with_signal, gpp_standard_pages AS pages_with_signal, gpp_standard_domains AS domains_with_signal), + STRUCT('Any Consent Signal' AS signal_type, any_consent_requests AS requests_with_signal, any_consent_pages AS pages_with_signal, any_consent_domains AS domains_with_signal) + ]) AS signal_data + +ORDER BY + client, + rank_grouping, + signal_type diff --git a/sql/2025/third-parties/scripts_using_async_defer.sql b/sql/2025/third-parties/scripts_using_async_defer.sql new file mode 100644 index 00000000000..26c32fb8416 --- /dev/null +++ b/sql/2025/third-parties/scripts_using_async_defer.sql @@ -0,0 +1,76 @@ +#standardSQL +# 3P scripts using async or defer +# (capped to 1 hit per domain per page) +CREATE TEMPORARY FUNCTION getScripts(str STRING) +RETURNS ARRAY<STRUCT<src STRING, isAsync BOOL, isDefer BOOL>> +LANGUAGE js AS ''' + try { + var almanac = JSON.parse(str); + + if (Array.isArray(almanac) || typeof almanac != "object") { + return []; + } + + if (almanac.scripts && almanac.scripts.nodes) { + return almanac.scripts.nodes.map((n) => ({ + src: n.src, + isAsync: n.hasOwnProperty("async"), + isDefer: n.hasOwnProperty("defer"), + })); + } + + return []; + } catch (e) { + return []; + } +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain, + COUNTIF(isAsync) AS async_count, + COUNTIF(isDefer) AS defer_count + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.src) AS domain, + data.isAsync AS isAsync, + data.isDefer AS isDefer, + pages.url AS page + FROM + `httparchive.pages.2025_06_01_*` AS pages, + UNNEST(getScripts(JSON_EXTRACT_SCALAR(payload, '$._almanac'))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + COUNTIF(async_count > 0) AS freq_async, + COUNTIF(defer_count > 0) AS freq_defer, + COUNT(0) AS total, + COUNTIF(async_count > 0) / COUNT(0) AS pct_async, + COUNTIF(defer_count > 0) / COUNT(0) AS pct_defer +FROM + base +GROUP BY + client +ORDER BY + client diff --git a/sql/2025/third-parties/scripts_using_async_defer_by_3p.sql b/sql/2025/third-parties/scripts_using_async_defer_by_3p.sql new file mode 100644 index 00000000000..950bfdc7d27 --- /dev/null +++ b/sql/2025/third-parties/scripts_using_async_defer_by_3p.sql @@ -0,0 +1,81 @@ +#standardSQL +# 3P scripts using async or defer +# (capped to 1 hit per domain per page) +CREATE TEMPORARY FUNCTION getScripts(str STRING) +RETURNS ARRAY<STRUCT<src STRING, isAsync BOOL, isDefer BOOL>> +LANGUAGE js AS ''' + try { + var almanac = JSON.parse(str); + + if (Array.isArray(almanac) || typeof almanac != "object") { + return []; + } + + if (almanac.scripts && almanac.scripts.nodes) { + return almanac.scripts.nodes.map((n) => ({ + src: n.src, + isAsync: n.hasOwnProperty("async"), + isDefer: n.hasOwnProperty("defer"), + })); + } + + return []; + } catch (e) { + return []; + } +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain, + COUNTIF(isAsync) AS async_count, + COUNTIF(isDefer) AS defer_count + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.src) AS domain, + data.isAsync AS isAsync, + data.isDefer AS 
isDefer, + pages.url AS page + FROM + `httparchive.pages.2025_06_01_*` AS pages, + UNNEST(getScripts(JSON_EXTRACT_SCALAR(payload, '$._almanac'))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + domain, + COUNTIF(async_count > 0) AS freq_async, + COUNTIF(defer_count > 0) AS freq_defer, + COUNT(DISTINCT page) AS page_usage, + COUNTIF(async_count > 0) / COUNT(DISTINCT page) AS pct_async, + COUNTIF(defer_count > 0) / COUNT(DISTINCT page) AS pct_defer +FROM + base +GROUP BY + client, + domain +HAVING + page_usage >= 50 +ORDER BY + client, + page_usage DESC diff --git a/sql/2025/third-parties/tao_by_third_party.sql b/sql/2025/third-parties/tao_by_third_party.sql new file mode 100644 index 00000000000..7cbef01455f --- /dev/null +++ b/sql/2025/third-parties/tao_by_third_party.sql @@ -0,0 +1,105 @@ +#standardSQL +# Percent of third-party requests with "Timing-Allow-Origin" headers +# Header reference: https://developer.mozilla.org/docs/Web/HTTP/Headers/Timing-Allow-Origin + +CREATE TEMP FUNCTION get_tao(headers STRING) +RETURNS STRING LANGUAGE js AS ''' + try { + const regex = /timing-allow-origin = (\\*|(http.*?,? )+)/gm; + output = regex.exec(headers)[1]+", "; + output = output.replace(/, , $/, ", "); + return output; + } catch (e) { + return null; + } +'''; + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + RTRIM(urlShort, '/') AS origin, + respOtherHeaders + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + pageid AS page, + RTRIM(urlShort, '/') AS origin + FROM + `httparchive.summary_pages.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +headers AS ( + SELECT + requests.client AS client, + requests.origin AS req_origin, + pages.origin AS page_origin, + get_tao(LOWER(respOtherHeaders)) AS timing_allow_origin, + respOtherHeaders, + third_party.category AS req_category + FROM requests + LEFT JOIN pages + USING (client, page) + INNER JOIN third_party + ON NET.HOST(requests.origin) = NET.HOST(third_party.domain) +), + +base AS ( + SELECT + client, + IF(respOtherHeaders LIKE '%timing-allow-origin = %', 1, 0) AS tao_header_present, + IF( + page_origin = req_origin OR + timing_allow_origin = '*' OR + timing_allow_origin LIKE '*,%' OR + timing_allow_origin LIKE '%,*' OR + timing_allow_origin LIKE '%,*,%' OR + timing_allow_origin LIKE '%, *,%' OR + timing_allow_origin = page_origin OR + timing_allow_origin LIKE page_origin || ',' OR + timing_allow_origin LIKE '%,' || page_origin OR + timing_allow_origin LIKE '%, ' || page_origin OR + timing_allow_origin LIKE '%,' || page_origin || ',%' OR + timing_allow_origin LIKE '%, ' || page_origin || ',%', + 1, 0 + ) AS timing_allowed + FROM headers +) + +SELECT + client, + SUM(tao_header_present) AS tao_requests, + SUM(timing_allowed) AS timing_allowed_requests, + COUNT(0) AS total_requests, + SUM(tao_header_present) / COUNT(0) AS pct_tao_requests, + SUM(timing_allowed) / COUNT(0) AS pct_timing_allowed_requests +FROM + base +GROUP BY + client diff --git 
a/sql/2025/third-parties/third_parties_blocking_main_thread.sql b/sql/2025/third-parties/third_parties_blocking_main_thread.sql new file mode 100644 index 00000000000..58dbc45ffa3 --- /dev/null +++ b/sql/2025/third-parties/third_parties_blocking_main_thread.sql @@ -0,0 +1,71 @@ +#standardSQL +# Third-Party domains which block the main thread +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. So no need to use almanac.third-parties table +# +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + client, + domain, + category, + total_pages, + blocking_pages, + non_blocking_pages, + pct_blocking_pages, + pct_non_blocking_pages, + p50_transfer_size_kib, + p50_blocking_time, + total_pages_rank +FROM ( + SELECT + client, + domain, + category, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + COUNT(DISTINCT page) - COUNTIF(blocking > 0) AS non_blocking_pages, + COUNTIF(blocking > 0) / COUNT(0) AS pct_blocking_pages, + (COUNT(DISTINCT page) - COUNTIF(blocking > 0)) / COUNT(0) AS pct_non_blocking_pages, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(500)] AS p50_transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(500)] AS p50_blocking_time, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + JSON_VALUE(third_party_items, '$.entity.url') AS domain, + page, + JSON_VALUE(third_party_items, '$.entity.text') AS category, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-summary.details.summary.wastedMs') AS FLOAT64) > 250) AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.blockingTime') AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.transferSize') AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2025_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + client, + domain, + page, + category + ) + GROUP BY + client, + domain, + category + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC diff --git a/sql/2025/third-parties/third_parties_blocking_main_thread_percentiles.sql b/sql/2025/third-parties/third_parties_blocking_main_thread_percentiles.sql new file mode 100644 index 00000000000..81b1777c678 --- /dev/null +++ b/sql/2025/third-parties/third_parties_blocking_main_thread_percentiles.sql @@ -0,0 +1,59 @@ +#standardSQL +# Total of Third-Party domains which block the main thread by percentile +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. 
So no need to use almanac.third-parties table + # + # Based heavily on research by Houssein Djirdeh: + # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + client, + total_pages, + blocking_pages, + percentile, + transfer_size_kib, + blocking_time +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + percentile, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(percentile * 10)] AS transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS blocking_time, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + page, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-summary.details.summary.wastedMs') AS FLOAT64) > 250) AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.blockingTime') AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.transferSize') AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2025_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + client, + page + ), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + percentile + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + percentile diff --git a/sql/2025/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql b/sql/2025/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql new file mode 100644 index 00000000000..56905b2577d --- /dev/null +++ b/sql/2025/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-Party domains which block the main thread by percentile +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. 
So no need to use almanac.third-parties table + # + # Based heavily on research by Houssein Djirdeh: + # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + client, + domain, + category, + total_pages, + blocking_pages, + percentile, + transfer_size_kib, + blocking_time +FROM ( + SELECT + client, + domain, + category, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + percentile, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(percentile * 10)] AS transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS blocking_time, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + JSON_VALUE(third_party_items, '$.entity.url') AS domain, + page, + JSON_VALUE(third_party_items, '$.entity.text') AS category, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-summary.details.summary.wastedMs') AS FLOAT64) > 250) AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.blockingTime') AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.transferSize') AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2025_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + client, + domain, + page, + category + ), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + domain, + category, + percentile + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + category, + percentile diff --git a/sql/2025/third-parties/third_parties_blocking_rendering.sql b/sql/2025/third-parties/third_parties_blocking_rendering.sql new file mode 100644 index 00000000000..51c4628afbe --- /dev/null +++ b/sql/2025/third-parties/third_parties_blocking_rendering.sql @@ -0,0 +1,116 @@ +#standardSQL +# Third-Party domains which render block paint +# +# Unlike the blocking main thread queries, lighthouse only contains details if the +# third-party is render blocking (i.e. 
wastedMs/total_bytes are never 0) + # And also there are no categories given to each third-party + # So we join to the usual almanac.third_parties table to get those totals and categories + # + # Based heavily on research by Houssein Djirdeh: + # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +WITH total_third_party_usage AS ( + SELECT + _TABLE_SUFFIX AS client, + canonicalDomain, + category, + COUNT(DISTINCT pages.url) AS total_pages + FROM + `httparchive.summary_pages.2025_06_01_*` AS pages + INNER JOIN ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + url + FROM + `httparchive.summary_requests.2025_06_01_*` + ) AS requests + ON ( + pages._TABLE_SUFFIX = requests.client AND + pages.pageid = requests.pageid + ) + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(requests.url) = NET.HOST(domain) AND + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + client, + canonicalDomain, + category + HAVING + total_pages >= 50 +) + +SELECT + client, + canonicalDomain, + category, + total_pages, + blocking_pages, + non_blocking_pages, + blocking_pages_pct, + non_blocking_pages_pct, + p50_wastedMs, + p50_total_bytes_kib +FROM ( + SELECT + client, + canonicalDomain, + category, + total_pages, + COUNT(DISTINCT page) AS blocking_pages, + total_pages - COUNT(DISTINCT page) AS non_blocking_pages, + COUNT(DISTINCT page) / total_pages AS blocking_pages_pct, + (total_pages - COUNT(DISTINCT page)) / total_pages AS non_blocking_pages_pct, + APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(500)] AS p50_wastedMs, + APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(500)] AS p50_total_bytes_kib, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + canonicalDomain, + domain, + page, + category, + SUM(SAFE_CAST(JSON_VALUE(renderBlockingItems, '$.wastedMs') AS FLOAT64)) AS wasted_ms, + SUM(SAFE_CAST(JSON_VALUE(renderBlockingItems, '$.totalBytes') AS FLOAT64) / 1024) AS total_bytes_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2025_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.render-blocking-resources.details.items')) AS renderBlockingItems + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(JSON_VALUE(renderBlockingItems, '$.url')) = domain AND + date = '2025-06-01' + GROUP BY + client, + canonicalDomain, + domain, + page, + category + ) + INNER JOIN + total_third_party_usage + USING (client, canonicalDomain, category) + GROUP BY + client, + canonicalDomain, + category, + total_pages + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + category diff --git a/sql/2025/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2025/third-parties/third_parties_blocking_rendering_percentiles.sql new file mode 100644 index 00000000000..cdc0e1db3e9 --- /dev/null +++ b/sql/2025/third-parties/third_parties_blocking_rendering_percentiles.sql @@ -0,0 +1,114 @@ +#standardSQL +# Third-Party domains which render block paint by percentile +# +# Unlike the blocking main thread queries, lighthouse only contains details if the +# third-party is render blocking (i.e. 
wastedMs/total_bytes are never 0) + # And also there are no categories given to each third-party + # So we join to the usual almanac.third_parties table to get those totals and categories + # + # Based heavily on research by Houssein Djirdeh: + # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +WITH total_third_party_usage AS ( + SELECT + _TABLE_SUFFIX AS client, + canonicalDomain, + category, + COUNT(DISTINCT pages.url) AS total_pages + FROM + `httparchive.summary_pages.2025_06_01_*` AS pages + INNER JOIN ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + url + FROM + `httparchive.summary_requests.2025_06_01_*` + ) AS requests + ON ( + pages._TABLE_SUFFIX = requests.client AND + pages.pageid = requests.pageid + ) + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(requests.url) = NET.HOST(domain) AND + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + client, + canonicalDomain, + category + HAVING + total_pages >= 50 +) + +SELECT + client, + canonicalDomain, + category, + total_pages, + blocking_pages, + percentile, + wasted_ms, + total_bytes_kib +FROM ( + SELECT + client, + canonicalDomain, + category, + total_pages, + COUNT(DISTINCT page) AS blocking_pages, + percentile, + APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(percentile * 10)] AS wasted_ms, + APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(percentile * 10)] AS total_bytes_kib, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + canonicalDomain, + page, + category, + SUM(SAFE_CAST(JSON_VALUE(render_blocking_items, '$.wastedMs') AS FLOAT64)) AS wasted_ms, + SUM(SAFE_CAST(JSON_VALUE(render_blocking_items, '$.totalBytes') AS FLOAT64) / 1024) AS total_bytes_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2025_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.render-blocking-resources.details.items')) AS render_blocking_items + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(JSON_VALUE(render_blocking_items, '$.url')) = domain AND + date = '2025-06-01' + GROUP BY + client, + canonicalDomain, + page, + category + ) + INNER JOIN + total_third_party_usage + USING (client, canonicalDomain, category), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + canonicalDomain, + category, + total_pages, + percentile + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + category, + percentile diff --git a/sql/2025/third-parties/third_parties_using_legacy_javascript.sql b/sql/2025/third-parties/third_parties_using_legacy_javascript.sql new file mode 100644 index 00000000000..7dbe24800c3 --- /dev/null +++ b/sql/2025/third-parties/third_parties_using_legacy_javascript.sql @@ -0,0 +1,57 @@ +#standardSQL +# Third-parties that use legacy JavaScript + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.url})); +} catch(e) { + return []; +} +'''; + + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + COUNTIF(third_party_domains.domain IS NULL) / COUNT(0) AS pct_1p_legacy, + COUNTIF(third_party_domains.domain IS NOT NULL) / COUNT(0) AS pct_3p_legacy + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + 
NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2025_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['legacy-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + AVG(pct_1p_legacy) AS avg_pct_1p_legacy, + AVG(pct_3p_legacy) AS avg_pct_3p_legacy +FROM + base +GROUP BY + client +ORDER BY + client diff --git a/sql/2025/third-parties/top100_third_parties_by_median_body_size_and_time.sql b/sql/2025/third-parties/top100_third_parties_by_median_body_size_and_time.sql new file mode 100644 index 00000000000..053a4e809cc --- /dev/null +++ b/sql/2025/third-parties/top100_third_parties_by_median_body_size_and_time.sql @@ -0,0 +1,87 @@ +#standardSQL +# Top 100 third parties by median response body size, time + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + pageid AS page, + respBodySize AS body_size, + time + FROM + `httparchive.summary_requests.2025_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + canonicalDomain, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + canonicalDomain, + APPROX_QUANTILES(body_size, 1000)[OFFSET(500)] / 1024 AS median_body_size_kb, + APPROX_QUANTILES(time, 1000)[OFFSET(500)] / 1000 AS median_time_s -- noqa: L010 + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + GROUP BY + client, + category, + canonicalDomain +) + +SELECT + ranking, + client, + category, + canonicalDomain, + metric, + sorted_order +FROM ( + SELECT + 'median_body_size_kb' AS ranking, + client, + category, + canonicalDomain, + median_body_size_kb AS metric, + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_body_size_kb DESC) AS sorted_order + FROM base + UNION ALL + SELECT + 'median_time_s' AS ranking, + client, + category, + canonicalDomain, + median_time_s AS metric, + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_time_s DESC) AS sorted_order + FROM base +) +WHERE + sorted_order <= 100 +ORDER BY + ranking, + client, + metric DESC diff --git a/sql/2025/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2025/third-parties/top100_third_parties_by_number_of_websites.sql new file mode 100644 index 00000000000..f54d3cb220e --- /dev/null +++ b/sql/2025/third-parties/top100_third_parties_by_number_of_websites.sql @@ -0,0 +1,80 @@ +#standardSQL +# Top 100 third parties by number of websites + +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' +), + +totals AS ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages, + COUNT(0) AS total_requests + FROM + `httparchive.crawl.requests` + WHERE + date = '2025-06-01' + GROUP BY + client +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2025-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +) + +SELECT + client, + 
canonicalDomain, + COUNT(DISTINCT page) AS pages, + total_pages, + COUNT(DISTINCT page) / total_pages AS pct_pages, + COUNT(0) AS requests, + total_requests, + COUNT(0) / total_requests AS pct_requests, + DENSE_RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS sorted_order +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +JOIN + totals +USING (client) +WHERE + canonicalDomain IS NOT NULL +GROUP BY + client, + total_pages, + total_requests, + canonicalDomain +QUALIFY + sorted_order <= 100 +ORDER BY + pct_pages DESC, + client diff --git a/sql/2025/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2025/third-parties/top20_third_parties_by_client_and_frame_location.sql new file mode 100644 index 00000000000..8ee47c5f168 --- /dev/null +++ b/sql/2025/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -0,0 +1,106 @@ +#standardSQL +# Top 20 third-parties embedded in mainframe vs. in iframes + +WITH document_frameid AS ( + SELECT + client, + NET.HOST(page) AS page_host, + NET.HOST(url) AS frame_host, + CASE + WHEN is_main_document = true + THEN JSON_EXTRACT_SCALAR(payload, '$._frame_id') + END AS mainframe_id, + JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id, + is_main_document + FROM `httparchive.all.requests` AS requests + WHERE requests.date = '2025-06-01' AND requests.is_root_page = true +), + +page_frames AS ( + SELECT + client, + page_host, + frame_host, + CASE + WHEN frame_host != page_host + THEN true + ELSE false + END AS tp_flag, + is_main_document, + frame_id, + COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY client, page_host ORDER BY is_main_document DESC)) AS mainframe_id, + CASE + WHEN frame_id = COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY client, page_host ORDER BY is_main_document DESC)) + THEN 'mainframe' + ELSE 'iframe' + END AS frame_type + FROM document_frameid +), + +combined_frame_counts AS ( + SELECT + client, + page_host, + frame_host, + tp_flag, + CASE + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 + THEN 'mainframe-only' + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 + THEN 'iframe-only' + WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 + THEN 'both' + END AS frame_presence + FROM page_frames + GROUP BY client, page_host, frame_host, tp_flag +), + +grouped_data AS ( + SELECT + client, + frame_host, + COUNT(DISTINCT page_host) AS total_distinct_publisher_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN page_host ELSE NULL END) AS num_distinct_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN page_host ELSE NULL END) AS num_distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN page_host ELSE NULL END) AS num_distinct_publishers_both + FROM combined_frame_counts + GROUP BY client, frame_host +), + +ranked_publishers AS ( + SELECT + client, + frame_host, + num_distinct_publishers_mainframe_only, + num_distinct_publishers_iframe_only, + num_distinct_publishers_both, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY num_distinct_publishers_mainframe_only DESC) AS rank_mainframe, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY num_distinct_publishers_iframe_only DESC) AS rank_iframe, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY num_distinct_publishers_both 
DESC) AS rank_both + FROM grouped_data +) + +SELECT + client, + frame_host, + num_distinct_publishers_mainframe_only AS num_distinct_publishers, + 'mainframe' AS category +FROM ranked_publishers +WHERE rank_mainframe <= 20 AND num_distinct_publishers_mainframe_only > 0 +UNION ALL +SELECT + client, + frame_host, + num_distinct_publishers_iframe_only AS num_distinct_publishers, + 'iframe' AS category +FROM ranked_publishers +WHERE rank_iframe <= 20 AND num_distinct_publishers_iframe_only > 0 +UNION ALL +SELECT + client, + frame_host, + num_distinct_publishers_both AS num_distinct_publishers, + 'both' AS category +FROM ranked_publishers +WHERE rank_both <= 20 AND num_distinct_publishers_both > 0 +ORDER BY client, category, num_distinct_publishers DESC; diff --git a/sql/2025/third-parties/usage_of_lite_youtube_embed.sql b/sql/2025/third-parties/usage_of_lite_youtube_embed.sql new file mode 100644 index 00000000000..fa70b5741b4 --- /dev/null +++ b/sql/2025/third-parties/usage_of_lite_youtube_embed.sql @@ -0,0 +1,37 @@ +#standardSQL +# Percent of pages using lite-youtube-embed + +WITH totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total_pages + FROM + `httparchive.summary_pages.2025_06_01_*` + GROUP BY + client +), + +youtube_embed AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS youtube_embed_pages + FROM + `httparchive.technologies.2025_06_01_*` + WHERE + app = 'lite-youtube-embed' + GROUP BY + client +) + +SELECT + client, + youtube_embed_pages, + total_pages, + youtube_embed_pages / total_pages AS pct_youtube_embed_pages +FROM + totals +JOIN + youtube_embed +USING (client) +ORDER BY + client diff --git a/sql/2025/third-parties/usage_of_partytown.sql b/sql/2025/third-parties/usage_of_partytown.sql new file mode 100644 index 00000000000..251830b1870 --- /dev/null +++ b/sql/2025/third-parties/usage_of_partytown.sql @@ -0,0 +1,37 @@ +#standardSQL +# Percent of pages using Partytown + +WITH totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total_pages + FROM + `httparchive.summary_pages.2025_06_01_*` + GROUP BY + client +), + +partytown AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS partytown_pages + FROM + `httparchive.technologies.2025_06_01_*` + WHERE + app = 'Partytown' + GROUP BY + client +) + +SELECT + client, + partytown_pages, + total_pages, + partytown_pages / total_pages AS pct_partytown_pages +FROM + totals +JOIN + partytown +USING (client) +ORDER BY + client
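 +# A possible consolidation (sketch): the two usage queries above are identical +# apart from the `app` literal, so a single grouped query could report both +# technologies at once, assuming the same tables and crawl date: +# +# WITH totals AS ( +#   SELECT _TABLE_SUFFIX AS client, COUNT(DISTINCT url) AS total_pages +#   FROM `httparchive.summary_pages.2025_06_01_*` +#   GROUP BY client +# ), +# +# tech AS ( +#   SELECT _TABLE_SUFFIX AS client, app, COUNT(DISTINCT url) AS pages +#   FROM `httparchive.technologies.2025_06_01_*` +#   WHERE app IN ('lite-youtube-embed', 'Partytown') +#   GROUP BY client, app +# ) +# +# SELECT client, app, pages, total_pages, pages / total_pages AS pct_pages +# FROM tech +# JOIN totals USING (client) +# ORDER BY client, app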