diff --git a/sql/2025/accessibility/README copy.md b/sql/2025/accessibility/README copy.md new file mode 100644 index 00000000000..89f1de5fdc7 --- /dev/null +++ b/sql/2025/accessibility/README copy.md @@ -0,0 +1,20 @@ +# 2025 Accessibility queries + + + +## Resources + +- [📄 Planning doc][~google-doc] +- [📊 Results sheet][~google-sheets] +- [📝 Markdown file][~chapter-markdown] + +[~google-doc]: https://docs.google.com/document/d/1anCSQk9g_YDfZP6GtjqdC-vCfnCNZAUEQwjSr8AzqTw/edit +[~google-sheets]: https://docs.google.com/spreadsheets/d/1btB1r9QpdgTyToPhn7glcGAdMFs7eq4UcQSVIHBqiYQ/edit#gid=1778117656 +[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/accessibility.md diff --git a/sql/2025/accessibility/a11y_frontend_technology.sql b/sql/2025/accessibility/a11y_frontend_technology.sql new file mode 100644 index 00000000000..ba2cb7bd121 --- /dev/null +++ b/sql/2025/accessibility/a11y_frontend_technology.sql @@ -0,0 +1,84 @@ +#standardSQL +-- Web Almanac — Lighthouse category scores by framework (2025-07-01) +-- Google Sheet: a11y_frontend_technology +-- +-- Purpose +-- • Extract Lighthouse category scores (performance, accessibility, +-- best-practices, SEO) from JSON in the crawl dataset. +-- • Associate each crawled page with detected frontend frameworks or JS libraries. +-- • Limit to root pages only for consistency. +-- • De-duplicate multiple {page, framework} rows caused by UNNEST, by averaging +-- scores per page before computing framework-level averages. +-- +-- Method +-- 1. Extract scores with JSON_EXTRACT_SCALAR, cast to FLOAT64. +-- 2. Filter to categories: Web frameworks, JavaScript libraries, +-- Frontend frameworks, JavaScript frameworks. +-- 3. Aggregate in two steps: +-- a. Per {client, page, framework}, average scores to remove duplicates. +-- b. Global averages per {client, framework}.
+--
+-- Output columns
+--   client                   — "desktop" | "mobile"
+--   framework                — detected framework or JS library
+--   avg_performance_score    — average Lighthouse performance score (0–1)
+--   avg_accessibility_score  — average Lighthouse accessibility score (0–1)
+--   avg_best_practices_score — average Lighthouse best-practices score (0–1)
+--   avg_seo_score            — average Lighthouse SEO score (0–1)
+--   total_pages              — distinct page count per {client, framework}
+--
+-- Notes
+--   • Scores remain in 0–1 float scale (not percentages).
+--   • `is_root_page = TRUE` ensures only root URLs are included.
+--   • Optional: enable TABLESAMPLE for faster smoke testing.
+WITH framework_pages AS (
+  -- One row per {client, page, detected technology}, restricted to root pages
+  -- that have a Lighthouse report and to technologies tagged with at least one
+  -- of the four framework/library categories.
+  SELECT
+    client,
+    page,
+    tech.technology AS framework,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score
+  FROM
+    `httparchive.crawl.pages`
+  -- TABLESAMPLE SYSTEM (0.1 PERCENT) -- optional: cheap smoke test
+  CROSS JOIN
+    UNNEST(technologies) AS tech
+  WHERE
+    date = '2025-07-01' AND
+    is_root_page = TRUE AND
+    lighthouse IS NOT NULL AND
+    tech.technology IS NOT NULL AND
+    EXISTS (
+      SELECT 1
+      FROM UNNEST(tech.categories) AS category
+      WHERE category IN (
+        'Web frameworks',
+        'JavaScript libraries',
+        'Frontend frameworks',
+        'JavaScript frameworks'
+      )
+    )
+),
+
+page_framework_scores AS (
+  -- Collapse to exactly one row per {client, page, framework}. The scores are
+  -- page-level values repeated on every unnested row, so AVG simply returns
+  -- that value (taking any single row would work equally well).
+  SELECT
+    client,
+    page,
+    framework,
+    AVG(performance_score) AS performance_score,
+    AVG(accessibility_score) AS accessibility_score,
+    AVG(best_practices_score) AS best_practices_score,
+    AVG(seo_score) AS seo_score
+  FROM
+    framework_pages
+  GROUP BY
+    client,
+    page,
+    framework
+)
+
+-- Framework-level averages, most widely detected frameworks first.
+SELECT
+  client,
+  framework,
+  AVG(performance_score) AS avg_performance_score,
+  AVG(accessibility_score) AS avg_accessibility_score,
+  AVG(best_practices_score) AS avg_best_practices_score,
+  AVG(seo_score) AS avg_seo_score,
+  COUNT(DISTINCT page) AS total_pages
+FROM
+  page_framework_scores
+GROUP BY
+  client,
+  framework
+ORDER BY
+  total_pages DESC;
diff --git a/sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql b/sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql
new file mode 100644
index 00000000000..049d5b8d8fe
--- /dev/null
+++ b/sql/2025/accessibility/a11y_overall_tech_usage_by_domain_rank.sql
@@ -0,0 +1,93 @@
+#standardSQL
+-- Accessibility Technology (A11y) Usage by Domain Rank (2025-07-01)
+-- Google Sheet: a11y_overall_tech_usage_by_domain_rank
+--
+-- Purpose
+--   • Quantify adoption of accessibility-related technologies (e.g., overlays)
+--     across websites, segmented by domain rank tiers.
+--   • Provide both absolute counts of sites using A11y tech and percentages
+--     within each rank grouping.
+--
+-- Dataset
+--   • Source: `httparchive.crawl.pages`
+--   • Crawl date: 2025-07-01
+--   • Technologies: extracted from `technologies` and `categories` arrays.
+--   • Rank groupings: [1K, 10K, 100K, 1M, 10M, 100M].
+--
+-- Method
+--   1. Subquery A:
+--      – Expand rank thresholds with UNNEST.
+--      – Select distinct {client, page, is_root_page, rank_grouping}
+--        where `category = 'Accessibility'`.
+--   2. Subquery B:
+--      – Count all sites per {client, rank_grouping} as denominators
+--        (total sites in each rank band).
+--   3. Join Subquery A with Subquery B on {client, rank_grouping}.
+--   4. Aggregate results to compute distinct site counts and percentages.
+--
+-- Output columns
+--   client                   — "desktop" | "mobile"
+--   is_root_page             — TRUE if page is a root URL
+--   rank_grouping            — maximum rank threshold (e.g., 1000, 10000, …)
+--   total_in_rank            — total number of pages within the rank group,
+--                              for the same client and is_root_page value
+--   sites_with_a11y_tech     — count of distinct sites using A11y technology
+--   pct_sites_with_a11y_tech — fraction of sites in rank group using A11y tech
+--
+-- Notes
+--   • Percentages are relative to the total sites in each rank grouping.
+--   • Multiple rank thresholds allow trend analysis across different scales
+--     of the web (top 1K → top 100M).
+--   • `is_root_page` is preserved to allow filtering on root vs non-root pages.
+--   • Rank groupings are cumulative: a page with rank <= 1000 is counted in
+--     every grouping from 1000 up to 100000000.
+WITH a11y_pages AS (
+  -- Numerator rows: one row per {client, is_root_page, rank_grouping, page}
+  -- for pages with at least one technology in the 'Accessibility' category.
+  SELECT DISTINCT
+    client,
+    is_root_page,
+    rank_grouping,
+    page
+  FROM
+    `httparchive.crawl.pages`
+  CROSS JOIN
+    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+  CROSS JOIN
+    UNNEST(technologies) AS tech
+  CROSS JOIN
+    UNNEST(tech.categories) AS category
+  WHERE
+    date = '2025-07-01' AND
+    category = 'Accessibility' AND
+    rank <= rank_grouping # Include only pages within the rank grouping
+),
+
+rank_totals AS (
+  -- Denominator rows: all crawled pages per {client, is_root_page, rank_grouping}.
+  -- The denominator is keyed by is_root_page as well as {client, rank_grouping}
+  -- because the numerator is split by is_root_page; a denominator pooled over
+  -- root and secondary pages would understate every percentage.
+  SELECT
+    client,
+    is_root_page,
+    rank_grouping,
+    COUNT(0) AS total_in_rank
+  FROM
+    `httparchive.crawl.pages`
+  CROSS JOIN
+    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+  WHERE
+    date = '2025-07-01' AND
+    rank <= rank_grouping
+  GROUP BY
+    client,
+    is_root_page,
+    rank_grouping
+)
+
+SELECT
+  client,
+  is_root_page,
+  rank_grouping, # Grouping of domains by their rank (e.g., top 1000, top 10000, etc.)
+  total_in_rank, # Total number of pages within the rank grouping (same client and is_root_page)
+  COUNT(DISTINCT page) AS sites_with_a11y_tech, # Number of unique pages that use accessibility technology
+  COUNT(DISTINCT page) / total_in_rank AS pct_sites_with_a11y_tech # Fraction (0–1) of pages in the grouping using accessibility technology
+FROM
+  a11y_pages
+INNER JOIN
+  rank_totals
+USING (client, is_root_page, rank_grouping)
+GROUP BY
+  client,
+  is_root_page,
+  rank_grouping,
+  total_in_rank
+ORDER BY
+  client,
+  is_root_page,
+  rank_grouping;
diff --git a/sql/2025/accessibility/a11y_technology_usage.sql b/sql/2025/accessibility/a11y_technology_usage.sql
new file mode 100644
index 00000000000..fd0d43c5809
--- /dev/null
+++ b/sql/2025/accessibility/a11y_technology_usage.sql
@@ -0,0 +1,49 @@
+#standardSQL
+-- Accessibility Technology (A11y) Usage by Client (2025-07-01)
+-- Google Sheets: a11y_technology_usage
+--
+-- Purpose
+--   • Measure the adoption of accessibility-related technologies (e.g., overlays)
+--     across websites, segmented by client type (desktop vs mobile).
+--   • Provide absolute counts of sites with A11y tech and their percentage share
+--     relative to all sites.
+--
+-- Dataset
+--   • Source: `httparchive.crawl.pages`
+--   • Crawl date: 2025-07-01
+--   • Technologies: extracted via `UNNEST(technologies)` and `UNNEST(categories)`.
+--
+-- Method
+--   1. Count distinct sites (pages) per {client, is_root_page}.
+--   2. Count distinct sites where `category = 'Accessibility'`.
+--   3. Compute percentage as (# sites with A11y tech / total sites).
+--
+-- Output columns
+--   client                   — "desktop" | "mobile"
+--   is_root_page             — TRUE if page is a root URL
+--   total_sites              — number of distinct sites per client
+--   sites_with_a11y_tech     — number of distinct sites with Accessibility technology
+--   pct_sites_with_a11y_tech — fraction of sites using A11y tech (0–1 float)
+--
+-- Notes
+--   • `DISTINCT page` prevents double-counting when a site has multiple technologies.
+--   • Percentages are per client (desktop/mobile) and root-page grouping.
+--   • Useful for high-level comparison of A11y tech adoption across clients.
+SELECT
+  client, # Crawl client: "desktop" or "mobile"
+  is_root_page,
+  COUNT(DISTINCT page) AS total_sites, # All unique pages in the crawl, including pages with no detected technology
+  COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) AS sites_with_a11y_tech, # Unique pages with at least one Accessibility-category technology
+  COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) / COUNT(DISTINCT page) AS pct_sites_with_a11y_tech # Fraction (0–1) of pages using accessibility technology
+FROM
+  `httparchive.crawl.pages`
+# LEFT JOINs (not comma/CROSS JOINs): a page whose `technologies` array is empty
+# must still produce one row (with NULL tech/category) so that it is counted in
+# total_sites. An inner unnest would silently drop such pages from the
+# denominator and inflate pct_sites_with_a11y_tech.
+LEFT JOIN
+  UNNEST(technologies) AS tech
+LEFT JOIN
+  UNNEST(tech.categories) AS category
+WHERE
+  date = '2025-07-01' # Crawl date analysed
+GROUP BY
+  client,
+  is_root_page
+ORDER BY
+  client,
+  is_root_page;
diff --git a/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql
new file mode 100644
index 00000000000..ba70b580578
--- /dev/null
+++ b/sql/2025/accessibility/a11y_technology_usage_by_domain_rank.sql
@@ -0,0 +1,95 @@
+#standardSQL
+# Purpose
+#   Measure adoption of specific Accessibility-related technologies (apps/overlays)
+#   across domain rank buckets in the 2025-07-01 HTTP Archive crawl.
+#   Google Sheet: a11y_technology_usage_by_domain_rank
+#
+# Output columns
+#   • client = desktop or mobile
+#   • is_root_page = TRUE if page is the root of the site
+#   • rank_grouping = domain rank bucket (1k, 10k, …, 100M)
+#   • total_in_rank = total number of unique pages in the rank bucket
+#   • app = specific Accessibility technology detected (Wappalyzer name)
+#   • sites_with_app = number of unique pages using that technology
+#   • pct_sites_with_app = share of pages in the rank bucket using that technology
+#
+# Method
+#   1. Assign each page to a rank_grouping based on its domain rank.
+#   2. Compute totals per client / root flag / rank grouping (denominator).
+#   3. Expand technologies and categories, keeping only category = 'Accessibility'.
+#   4. Count distinct pages per technology and divide by the rank total.
+#
+# Notes
+#   • Unit of analysis = page URL, not host/site.
+#   • Percentages are returned as numeric fractions (0–1). Use FORMAT() if a
+#     human-readable percent string is needed.
+#   • Rank groupings are aligned with prior reporting thresholds (1k → 100M).
+WITH ranked_sites AS (
+  -- Tag every page of the crawl with the smallest rank threshold it falls under.
+  -- NOTE(review): these buckets are mutually exclusive (a page with rank <= 1000
+  -- is NOT also counted under 10000), whereas
+  -- a11y_overall_tech_usage_by_domain_rank.sql uses cumulative groupings.
+  -- Confirm which definition the results sheet expects.
+  -- Pages whose rank is NULL or above 100000000 get a NULL rank_grouping and
+  -- drop out at the equality join below.
+  SELECT
+    client,
+    is_root_page,
+    page,
+    technologies, -- carried through so the final query can unnest it
+    CASE
+      WHEN rank <= 1000 THEN 1000
+      WHEN rank <= 10000 THEN 10000
+      WHEN rank <= 100000 THEN 100000
+      WHEN rank <= 1000000 THEN 1000000
+      WHEN rank <= 10000000 THEN 10000000
+      WHEN rank <= 100000000 THEN 100000000
+    END AS rank_grouping
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01' -- Crawl date analysed
+),
+
+rank_totals AS (
+  -- Denominator: unique pages per {client, is_root_page, rank_grouping}
+  SELECT
+    client,
+    is_root_page,
+    rank_grouping,
+    COUNT(DISTINCT page) AS total_in_rank
+  FROM
+    ranked_sites
+  GROUP BY
+    client,
+    is_root_page,
+    rank_grouping
+)
+
+SELECT
+  r.client,
+  r.is_root_page,
+  r.rank_grouping,
+  rt.total_in_rank, -- Unique pages in the same client / root flag / rank bucket
+  tech.technology AS app, -- Accessibility technology detected on the page
+  COUNT(DISTINCT r.page) AS sites_with_app, -- Unique pages using this technology
+  SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank) AS pct_sites_with_app -- Fraction (0–1) of the bucket using this technology
+FROM
+  ranked_sites AS r
+CROSS JOIN
+  UNNEST(r.technologies) AS tech -- One row per detected technology
+CROSS JOIN
+  UNNEST(tech.categories) AS category -- One row per category of that technology
+INNER JOIN
+  rank_totals AS rt
+ON
+  r.client = rt.client AND
+  r.is_root_page = rt.is_root_page AND
+  r.rank_grouping = rt.rank_grouping
+WHERE
+  category = 'Accessibility' -- Keep only accessibility-related technologies
+GROUP BY
+  r.client,
+  r.is_root_page,
+  r.rank_grouping,
+  rt.total_in_rank,
+  tech.technology
+ORDER BY
+  tech.technology,
+  r.rank_grouping,
+  r.client,
+  r.is_root_page;
diff --git a/sql/2025/accessibility/alt_ending_in_image_extension.sql b/sql/2025/accessibility/alt_ending_in_image_extension.sql
new file mode 100644
index 00000000000..b2f5a45819c
--- /dev/null
+++ b/sql/2025/accessibility/alt_ending_in_image_extension.sql
@@ -0,0 +1,79 @@
+#standardSQL
+-- Alt Texts Ending in File Extensions (2025-07-01)
+-- Google Sheet: alt_ending_in_image_extension
+-- Measures how often alt attributes end in file extensions (e.g., .jpg, .png).
+-- Reports both site-level adoption (what % of sites do this) and alt-level frequency
+-- (what % of all non-empty alts end with extensions), broken down by client, root status,
+-- and specific file extension.
+
+-- Turns the `file_extension_alts.file_extensions` object of the a11y custom
+-- metric ({extension: count, ...}) into an array of {extension, total} structs.
+-- Returns an empty array when the object is missing or malformed.
+CREATE TEMPORARY FUNCTION getUsedExtensions(a11y JSON)
+RETURNS ARRAY<STRUCT<extension STRING, total INT64>> LANGUAGE js AS '''
+try {
+  return Object.entries(a11y.file_extension_alts.file_extensions).map(([extension, total]) => {
+    return {extension, total};
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+WITH alt_totals AS (
+  -- Per {client, is_root_page}: how many sites have any non-empty alt / any alt
+  -- ending in a file extension, and how many such alts exist in total.
+  SELECT
+    client,
+    is_root_page,
+    COUNTIF(total_non_empty_alt > 0) AS sites_with_non_empty_alt,
+    COUNTIF(total_with_file_extension > 0) AS sites_with_file_extension_alt,
+
+    SUM(total_non_empty_alt) AS total_non_empty_alts,
+    SUM(total_with_file_extension) AS total_alts_with_file_extensions
+  FROM (
+    SELECT
+      client,
+      is_root_page,
+      CAST(JSON_VALUE(custom_metrics.markup.images.img.alt.present) AS INT64) AS total_non_empty_alt,
+      CAST(JSON_VALUE(custom_metrics.a11y.file_extension_alts.total_with_file_extension) AS INT64) AS total_with_file_extension
+    FROM
+      `httparchive.crawl.pages`
+    WHERE
+      date = '2025-07-01'
+  )
+  GROUP BY
+    client,
+    is_root_page
+)
+
+SELECT
+  client,
+  is_root_page,
+  sites_with_non_empty_alt,
+  sites_with_file_extension_alt,
+  total_alts_with_file_extensions,
+
+  # Of sites with a non-empty alt, what % have an alt with a file extension
+  sites_with_file_extension_alt / sites_with_non_empty_alt AS pct_sites_with_file_extension_alt,
+  # Given a random alt, how often will it end in a file extension
+  total_alts_with_file_extensions / total_non_empty_alts AS pct_alts_with_file_extension,
+
+  extension_stat.extension AS extension,
+  # Number of sites with at least one alt ending in this file extension
+  COUNT(0) AS total_sites_using,
+  # Of sites with a non-empty alt, what % have an alt with this file extension
+  COUNT(0) / sites_with_non_empty_alt AS pct_applicable_sites_using,
+
+  # Total number of alts, across all sites, ending in this file extension
+  SUM(extension_stat.total) AS total_occurances,
+  # Given a random alt ending in a file extension, how often will it end in this file extension
+  SUM(extension_stat.total) / total_alts_with_file_extensions AS pct_total_occurances
+FROM
+  `httparchive.crawl.pages`
+CROSS JOIN
+  UNNEST(getUsedExtensions(custom_metrics.a11y)) AS extension_stat
+LEFT JOIN
+  alt_totals
+USING (client, is_root_page)
+WHERE
+  date = '2025-07-01'
+GROUP BY
+  client,
+  is_root_page,
+  sites_with_non_empty_alt,
+  sites_with_file_extension_alt,
+  total_non_empty_alts,
+  total_alts_with_file_extensions,
+  extension
+ORDER BY
+  client,
+  is_root_page,
+  total_occurances DESC
diff --git a/sql/2025/accessibility/anchors_with_role_button.sql b/sql/2025/accessibility/anchors_with_role_button.sql
new file mode 100644
index 00000000000..f14cc0e3658
--- /dev/null
+++ b/sql/2025/accessibility/anchors_with_role_button.sql
@@ -0,0 +1,33 @@
+#standardSQL
+-- Anchors with role="button" (2025-07-01)
+-- Google Sheet: anchors_with_role_button
+--
+-- Measures how often <a> elements are given role="button".
+-- Reports per client and root-page status:
+--   • # of sites with anchors
+--   • # of sites with at least one anchor role="button"
+--   • % of anchor-using sites that apply role="button"
+WITH anchor_counts AS (
+  -- Per-page anchor counts taken from the element_count and a11y custom metrics.
+  -- A missing element_count.a is treated as zero anchors.
+  SELECT
+    client,
+    is_root_page,
+    COALESCE(CAST(JSON_VALUE(custom_metrics.element_count.a) AS INT64), 0) AS total_anchors,
+    CAST(JSON_VALUE(custom_metrics.a11y.total_anchors_with_role_button) AS INT64) AS total_anchors_with_role_button
+  FROM
+    `httparchive.crawl.pages`
+  -- TABLESAMPLE SYSTEM (10 PERCENT) -- optional: cheap smoke test
+  WHERE
+    date = '2025-07-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  COUNTIF(total_anchors > 0) AS sites_with_anchors,
+  COUNTIF(total_anchors_with_role_button > 0) AS sites_with_anchor_role_button,
+
+  # Share of anchor-using sites that have at least one anchor with role='button'
+  COUNTIF(total_anchors_with_role_button > 0) / COUNTIF(total_anchors > 0) AS pct_sites_with_anchor_role_button
+FROM
+  anchor_counts
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2025/accessibility/audio_track_usage.sql b/sql/2025/accessibility/audio_track_usage.sql
new file mode 100644
index 00000000000..4a01092ba3e
--- /dev/null
+++ b/sql/2025/accessibility/audio_track_usage.sql
@@ -0,0 +1,40 @@
+-- standardSQL
+-- Web Almanac — Audio elements with usage (2025)
+-- Google Sheet: audio_track_usage
+--
+-- This query:
+--   • Uses `httparchive.crawl.pages` at the site level (client × is_root_page).
+--   • Extracts