Skip to content
61 changes: 61 additions & 0 deletions sql/2025/third-parties/a11y_overall_tech_usage_by_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#standardSQL
# Overall A11Y technology usage by domain rank
#
# For each client and each cumulative rank bucket (rank <= 1k, 10k, 100k, 1M,
# 10M) this reports:
#   freq  - distinct page URLs with at least one 'Accessibility' technology row
#   total - number of page rows that fall inside the bucket
#   pct   - freq as a percentage of total

WITH a11y_technologies AS (
  # One row per technology detection in the 'Accessibility' category.
  SELECT
    _TABLE_SUFFIX AS client,
    url
  FROM
    `httparchive.technologies.2025_06_01_*`
  WHERE
    category = 'Accessibility'
),

pages AS (
  # Every page is repeated once for each rank bucket it belongs to, so the
  # buckets are cumulative rather than disjoint.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    rank_grouping
  FROM
    `httparchive.summary_pages.2025_06_01_*`
  CROSS JOIN
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    rank <= rank_grouping
),

rank_totals AS (
  # Denominator: how many page rows each client has in each bucket.
  SELECT
    client,
    rank_grouping,
    COUNT(0) AS total
  FROM
    pages
  GROUP BY
    client,
    rank_grouping
)

SELECT
  client,
  rank_grouping AS rank,
  COUNT(DISTINCT url) AS freq,
  total,
  (COUNT(DISTINCT url) / total) * 100 AS pct
FROM
  a11y_technologies
# Technology rows without a matching page have no rank bucket and therefore
# can never match rank_totals, so an inner join yields the same rows.
INNER JOIN
  pages
USING (client, url)
INNER JOIN
  rank_totals
USING (client, rank_grouping)
GROUP BY
  client,
  rank_grouping,
  total
ORDER BY
  client,
  rank
35 changes: 35 additions & 0 deletions sql/2025/third-parties/a11y_technology_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#standardSQL
# A11Y technology usage
#
# Per client: the number of distinct page URLs that have at least one
# technology row in the 'Accessibility' category (freq), the number of page
# rows (total), and freq expressed as a percentage of total (pct).

WITH a11y_pages AS (
  # Numerator: distinct URLs with an 'Accessibility' technology.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS freq
  FROM
    `httparchive.technologies.2025_06_01_*`
  WHERE
    category = 'Accessibility'
  GROUP BY
    client
),

page_totals AS (
  # Denominator: all page rows for the client.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(0) AS total
  FROM
    `httparchive.summary_pages.2025_06_01_*`
  GROUP BY
    client
)

SELECT
  client,
  freq,
  total,
  (freq / total) * 100 AS pct
FROM
  a11y_pages
INNER JOIN
  page_totals
USING (client)
65 changes: 65 additions & 0 deletions sql/2025/third-parties/a11y_technology_usage_by_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#standardSQL
# A11Y technology usage by domain rank
#
# For each client, cumulative rank bucket (rank <= 1k, 10k, 100k, 1M, 10M) and
# 'Accessibility' technology (app) this reports:
#   freq  - distinct page URLs in the bucket on which the app was detected
#   total - number of page rows that fall inside the bucket
#   pct   - freq as a percentage of total

WITH a11y_technologies AS (
  # One row per detection of an 'Accessibility' technology on a page.
  SELECT
    _TABLE_SUFFIX AS client,
    app,
    url
  FROM
    `httparchive.technologies.2025_06_01_*`
  WHERE
    category = 'Accessibility'
),

pages AS (
  # Every page is repeated once for each rank bucket it belongs to, so the
  # buckets are cumulative rather than disjoint.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    rank_grouping
  FROM
    `httparchive.summary_pages.2025_06_01_*`,
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    rank <= rank_grouping
),

rank_totals AS (
  # Denominator: how many page rows each client has in each bucket.
  SELECT
    _TABLE_SUFFIX AS client,
    rank_grouping,
    COUNT(0) AS total
  FROM
    `httparchive.summary_pages.2025_06_01_*`,
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    rank <= rank_grouping
  GROUP BY
    client,
    rank_grouping
)

SELECT
  client,
  rank_grouping AS rank,
  app,
  # Count pages, not joined rows: a duplicated technology or page row would
  # otherwise inflate freq and pct. This matches the COUNT(DISTINCT url) used
  # by the overall (non per-app) rank query.
  COUNT(DISTINCT url) AS freq,
  total,
  (COUNT(DISTINCT url) / total) * 100 AS pct
FROM
  a11y_technologies
# Technology rows without a matching page have no rank bucket and therefore
# can never match rank_totals, so an inner join yields the same rows as the
# previous LEFT OUTER JOIN did.
INNER JOIN
  pages
USING (client, url)
INNER JOIN
  rank_totals
USING (client, rank_grouping)
GROUP BY
  rank_grouping,
  total,
  client,
  app
ORDER BY
  client,
  rank,
  pct DESC
81 changes: 81 additions & 0 deletions sql/2025/third-parties/compressed_images_by_3p.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#standardSQL
# Compressed images (excluding SVG) by third parties
#
# For each client and content encoding (gzip / br), lists the third-party
# domains ranked in the top 100 by number of compressed, non-SVG image
# requests, with their byte size and request count plus each one's share of
# the client-wide totals.
#
# Both shares (pct_size and pct_requests) use totals computed over ALL matched
# third-party domains of the client, i.e. before the top-100 cut. Previously
# total_size / pct_size were computed after the cut while total_requests /
# pct_requests were computed before it, so the two percentages did not share
# the same population.

WITH requests AS (
  # Image requests served with gzip or brotli content encoding, excluding SVG
  # (identified by response content type or a '.svg' URL suffix).
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    url,
    resp_content_encoding AS content_encoding,
    type,
    respBodySize AS size
  FROM
    `httparchive.summary_requests.2025_06_01_*`
  WHERE
    type = 'image' AND
    resp_content_encoding IN ('gzip', 'br') AND
    NOT (
      resp_content_type LIKE 'image/svg%' OR
      ENDS_WITH(url, '.svg')
    )
),

third_party AS (
  # Non-hosting third-party domains whose compressed images appear on at
  # least 50 distinct pages.
  SELECT
    NET.HOST(domain) AS domain,
    COUNT(DISTINCT page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` tp
  JOIN
    requests r
  ON NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    date = '2025-06-01' AND
    category != 'hosting'
  GROUP BY
    domain
  HAVING
    page_usage >= 50
)

SELECT
  client,
  content_encoding,
  domain,
  size,
  total_size,
  pct_size,
  num_requests,
  total_requests,
  pct_requests
FROM (
  SELECT
    client,
    content_encoding,
    domain,
    COUNT(0) AS num_requests,
    SUM(size) AS size,
    # Client-wide totals over every matched third-party domain.
    SUM(SUM(size)) OVER (PARTITION BY client) AS total_size,
    # SAFE_DIVIDE returns NULL instead of failing when the byte total is 0.
    SAFE_DIVIDE(SUM(size), SUM(SUM(size)) OVER (PARTITION BY client)) AS pct_size,
    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests,
    COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests,
    # RANK() leaves ties on the same rank, so slightly more than 100 domains
    # can pass the cut below.
    RANK() OVER (PARTITION BY client, type, content_encoding ORDER BY COUNT(0) DESC) AS domain_rank
  FROM
    requests
  # Only requests that match a qualifying third party are wanted; this is the
  # same row set as the former LEFT JOIN followed by `domain IS NOT NULL`.
  INNER JOIN
    third_party
  ON
    NET.HOST(requests.url) = NET.HOST(third_party.domain)
  GROUP BY
    client,
    type,
    content_encoding,
    domain
)
WHERE
  domain_rank <= 100
ORDER BY
  client,
  content_encoding,
  size DESC
90 changes: 90 additions & 0 deletions sql/2025/third-parties/consent_signal_basic_analysis.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#standardSQL
# Basic consent signal analysis (simplified version to ensure data returns)
#
# Per client, summarises third-party requests whose URL carries a consent
# signal query parameter (USP standard / non-standard, TCF, GPP): request,
# page and domain counts, the share of each signal type among those requests,
# and how many of those requests have a non-empty `redirects` entry in their
# summary.

WITH pages AS (
  SELECT
    client,
    page,
    rank
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-06-01' AND
    rank <= 50000 -- Expand to top 50K sites
),

-- Third-party requests made by those pages, each tagged with one flag per
-- consent signal family (no redirect filtering)
third_party_requests AS (
  SELECT
    r.client,
    r.page,
    r.url,
    NET.REG_DOMAIN(r.url) AS url_domain,

    -- Consent signal flags, detected from query parameter names
    REGEXP_CONTAINS(r.url, r'[?&]us_privacy=') AS has_usp_standard,
    REGEXP_CONTAINS(r.url, r'[?&](ccpa|usp_consent|uspString|uspConsent|ccpa_consent|usp|usprivacy|ccpaconsent|usp_string)=') AS has_usp_nonstandard,
    REGEXP_CONTAINS(r.url, r'[?&](gdpr|gdpr_consent|gdpr_pd)=') AS has_tcf_standard,
    REGEXP_CONTAINS(r.url, r'[?&](gpp|gpp_sid)=') AS has_gpp_standard,

    -- TRUE when the summary has a `redirects` value that is not an empty array
    (
      JSON_EXTRACT(r.summary, '$.redirects') IS NOT NULL AND
      TO_JSON_STRING(JSON_EXTRACT(r.summary, '$.redirects')) != '[]'
    ) AS has_redirects
  FROM
    `httparchive.crawl.requests` AS r
  INNER JOIN
    pages AS p
  ON
    r.client = p.client AND
    r.page = p.page
  WHERE
    r.date = '2025-06-01' AND
    NET.REG_DOMAIN(r.page) != NET.REG_DOMAIN(r.url) -- Third-party only
),

-- Keep only the requests that carry at least one consent signal
requests_with_signals AS (
  SELECT
    client,
    page,
    url_domain,
    has_usp_standard,
    has_usp_nonstandard,
    has_tcf_standard,
    has_gpp_standard,
    has_redirects
  FROM
    third_party_requests
  WHERE
    has_usp_standard OR
    has_usp_nonstandard OR
    has_tcf_standard OR
    has_gpp_standard
)

-- Basic analysis
SELECT
  client,

  -- Overall counts
  COUNT(*) AS total_requests_with_consent_signals,
  COUNT(DISTINCT page) AS total_pages_with_consent_signals,
  COUNT(DISTINCT url_domain) AS total_domains_with_consent_signals,

  -- Signal type breakdown
  COUNTIF(has_usp_standard) AS usp_standard_requests,
  COUNTIF(has_usp_nonstandard) AS usp_nonstandard_requests,
  COUNTIF(has_tcf_standard) AS tcf_standard_requests,
  COUNTIF(has_gpp_standard) AS gpp_standard_requests,

  -- Share of each signal type among signal-carrying requests (fractions, 0-1)
  COUNTIF(has_usp_standard) / COUNT(*) AS pct_usp_standard,
  COUNTIF(has_usp_nonstandard) / COUNT(*) AS pct_usp_nonstandard,
  COUNTIF(has_tcf_standard) / COUNT(*) AS pct_tcf_standard,
  COUNTIF(has_gpp_standard) / COUNT(*) AS pct_gpp_standard,

  -- Redirect availability
  COUNTIF(has_redirects) AS requests_with_redirects,
  COUNTIF(has_redirects) / COUNT(*) AS pct_requests_with_redirects

FROM
  requests_with_signals
GROUP BY
  client
ORDER BY
  client
Loading