diff --git a/sql/2025/seo/README.md b/sql/2025/seo/README.md
index 20c616a0285..0f9a861804a 100644
--- a/sql/2025/seo/README.md
+++ b/sql/2025/seo/README.md
@@ -15,6 +15,6 @@
 - [📊 Results sheet][~google-sheets]
 - [📝 Markdown file][~chapter-markdown]

-[~google-doc]: https://docs.google.com/document/d/1CM6CV86bDndFI1EaOrplvxdclldcToQ8GFkq2POpNlg
-[~google-sheets]: https://docs.google.com/spreadsheets/d/1MoWoxogYWH6fv5r485EttvVgJuw7dMzzcot66X3MWu4/edit
+[~google-doc]: https://docs.google.com/document/d/1SZL_TF3IGyq_yLATjZ7OA2bAXPzvRD5nJrCMR4ZeiYk/edit
+[~google-sheets]: https://docs.google.com/spreadsheets/d/1lAQKcOF7l6xz9v7yvnI9I1F8yiSqcz3Xx6u-5ady1DQ/edit#gid=1778117656
 [~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2025/seo.md
diff --git a/sql/2025/seo/anchor-rel-attribute-usage-2025.sql b/sql/2025/seo/anchor-rel-attribute-usage-2025.sql
new file mode 100644
index 00000000000..7227ade564d
--- /dev/null
+++ b/sql/2025/seo/anchor-rel-attribute-usage-2025.sql
@@ -0,0 +1,66 @@
+#standardSQL
+# Anchor rel attribute usage
+# This query reports whether a rel attribute value was ever used on a page and calculates usage statistics.
+
+CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  rel ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {rel: []};
+// Returns only the keys whose value is > 0
+function getKey(dict) {
+  const arr = [],
+    obj = Object.keys(dict);
+  for (var x in obj) {
+    if (dict[obj[x]] > 0) {
+      arr.push(obj[x]);
+    }
+  }
+  return arr;
+}
+try {
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
+  if (wpt_bodies.anchors && wpt_bodies.anchors.rendered && wpt_bodies.anchors.rendered.rel_attributes) {
+    result.rel = getKey(wpt_bodies.anchors.rendered.rel_attributes);
+  }
+} catch (e) {}
+return result;
+''';
+
+WITH rel_stats_table AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getRelStatsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  rel,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  rel_stats_table,
+  UNNEST(wpt_bodies_info.rel) AS rel
+GROUP BY
+  client,
+  is_root_page,
+  rel
+ORDER BY
+  sites DESC,
+  rel,
+  client DESC;
diff --git a/sql/2025/seo/anchor-same-site-occurance-stats-2025.sql b/sql/2025/seo/anchor-same-site-occurance-stats-2025.sql
new file mode 100644
index 00000000000..fcfba69252e
--- /dev/null
+++ b/sql/2025/seo/anchor-same-site-occurance-stats-2025.sql
@@ -0,0 +1,74 @@
+#standardSQL
+# Anchor same-site occurrence stats
+# This query aims to highlight sites with few same-site links, like SPAs.
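+#
+# Illustrative only: the UDF below assumes a _wpt_bodies payload shaped roughly
+# like this (field names taken from the code; the values are invented):
+#   {"anchors": {"rendered": {"same_site": 12, "same_page": {"dynamic": {
+#     "onclick_attributes": {"window_location": 1, "window_open": 0},
+#     "href_javascript": 2}}}}}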
+
+CREATE TEMPORARY FUNCTION getLinkDescriptionsWptBodies(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  links_same_site INT64,
+  links_window_location INT64,
+  links_window_open INT64,
+  links_href_javascript INT64
+> LANGUAGE js AS '''
+var result = {
+  links_same_site: 0,
+  links_window_location: 0,
+  links_window_open: 0,
+  links_href_javascript: 0
+};
+try {
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
+
+  if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
+    var anchors_rendered = wpt_bodies.anchors.rendered;
+
+    result.links_same_site = anchors_rendered.same_site || 0;
+    result.links_window_location = anchors_rendered.same_page.dynamic.onclick_attributes.window_location || 0;
+    result.links_window_open = anchors_rendered.same_page.dynamic.onclick_attributes.window_open || 0;
+    result.links_href_javascript = anchors_rendered.same_page.dynamic.href_javascript || 0;
+  }
+
+} catch (e) {}
+return result;
+''';
+
+WITH same_links_info AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getLinkDescriptionsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  wpt_bodies_info.links_same_site AS links_same_site,
+  is_root_page,
+  COUNT(DISTINCT page) AS sites, -- count of pages at each links_same_site value
+  SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT page)) AS pct_links_same_site, -- percentage of same-site links
+  AVG(wpt_bodies_info.links_window_location) AS avg_links_window_location,
+  AVG(wpt_bodies_info.links_window_open) AS avg_links_window_open,
+  AVG(wpt_bodies_info.links_href_javascript) AS avg_links_href_javascript,
+  AVG(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS avg_links_any,
+  MAX(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS max_links_any,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct -- share of pages within the client/page-type group
+FROM
+  same_links_info
+GROUP BY
+  client,
+  is_root_page,
+  wpt_bodies_info.links_same_site
+ORDER BY
+  links_same_site ASC;
diff --git a/sql/2025/seo/content-language-2025.sql b/sql/2025/seo/content-language-2025.sql
new file mode 100644
index 00000000000..dc975d97727
--- /dev/null
+++ b/sql/2025/seo/content-language-2025.sql
@@ -0,0 +1,53 @@
+CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_string STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS '''
+var result = [];
+try {
+  var almanac = JSON.parse(almanac_string);
+
+  if (Array.isArray(almanac) || typeof almanac != 'object') return ["NO PAYLOAD"];
+
+  if (almanac && almanac["meta-nodes"] && almanac["meta-nodes"].nodes && almanac["meta-nodes"].nodes.filter) {
+    result = almanac["meta-nodes"].nodes.filter(n => n["http-equiv"] && n["http-equiv"].toLowerCase().trim() == 'content-language' && n.content).map(am => am.content.toLowerCase().trim());
+  }
+
+  if (result.length === 0)
+    result.push("NO TAG");
+
+} catch (e) {result.push("ERROR "+e);} // results show some issues with the validity of the payload
+return result;
+''';
+WITH content_language_usage AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getContentLanguagesAlmanac(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS content_languages
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  content_language,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  content_language_usage,
+  UNNEST(content_languages) AS content_language
+GROUP BY
+  client,
+  is_root_page,
+  content_language
+ORDER BY
+  sites DESC,
+  client DESC;
diff --git a/sql/2025/seo/core-web-vitals-2025.sql b/sql/2025/seo/core-web-vitals-2025.sql
new file mode 100644
index 00000000000..24f7b2360f2
--- /dev/null
+++ b/sql/2025/seo/core-web-vitals-2025.sql
@@ -0,0 +1,45 @@
+CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
+  good / (good + needs_improvement + poor) >= 0.75
+);
+CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
+  good + needs_improvement + poor > 0
+);
+SELECT
+  date,
+  device,
+  # Origins with good LCP divided by origins with any LCP.
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
+    COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
+  ) AS pct_good_lcp,
+  # Origins with good FID divided by origins with any FID.
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)),
+    COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))
+  ) AS pct_good_fid,
+  # Origins with good CLS divided by origins with any CLS.
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
+    COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
+  ) AS pct_good_cls,
+  # Origins with good LCP, FID (where measured), and CLS divided by origins
+  # with any LCP and CLS.
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
+      IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND
+      IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND
+      IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL
+    ))
+  ) AS pct_good_cwv
+FROM
+  `chrome-ux-report.materialized.device_summary`
+WHERE
+  date BETWEEN '2019-11-01' AND '2025-06-01' AND
+  device IN ('desktop', 'phone')
+GROUP BY
+  date,
+  device
+ORDER BY
+  date DESC
diff --git a/sql/2025/seo/hreflang-header-usage-2025.sql b/sql/2025/seo/hreflang-header-usage-2025.sql
new file mode 100644
index 00000000000..ce83e497bdc
--- /dev/null
+++ b/sql/2025/seo/hreflang-header-usage-2025.sql
@@ -0,0 +1,60 @@
+#standardSQL
+# hreflang header usage
+
+# Returns all the data we need from _wpt_bodies
+CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  hreflangs ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {
+  hreflangs: []
+};
+
+try {
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
+
+  if (wpt_bodies.hreflangs && wpt_bodies.hreflangs.http_header && wpt_bodies.hreflangs.http_header.values) {
+    result.hreflangs = wpt_bodies.hreflangs.http_header.values.map(v => v); // seems to fix a coercion issue!
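+    // (Illustrative note: the identity map copies the values into a plain JS
+    // array, which appears to avoid the object-wrapper coercion hinted at above.)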
+  }
+
+} catch (e) {}
+return result;
+''';
+
+WITH hreflang_usage AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getHreflangWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS hreflang_wpt_bodies_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  hreflang_usage,
+  UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang
+GROUP BY
+  hreflang,
+  client,
+  is_root_page
+ORDER BY
+  sites DESC,
+  client DESC;
diff --git a/sql/2025/seo/hreflang-link-tag-usage-2025.sql b/sql/2025/seo/hreflang-link-tag-usage-2025.sql
new file mode 100644
index 00000000000..693ed455858
--- /dev/null
+++ b/sql/2025/seo/hreflang-link-tag-usage-2025.sql
@@ -0,0 +1,58 @@
+#standardSQL
+# hreflang link tag usage
+
+# Returns all the data we need from _wpt_bodies
+CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  hreflangs ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {
+  hreflangs: []
+};
+
+try {
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
+
+  if (wpt_bodies.hreflangs && wpt_bodies.hreflangs.rendered && wpt_bodies.hreflangs.rendered.values) {
+    result.hreflangs = wpt_bodies.hreflangs.rendered.values.map(v => v); // seems to fix a coercion issue!
+  }
+
+} catch (e) {}
+return result;
+''';
+
+WITH link_tag AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getHreflangWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS hreflang_wpt_bodies_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  link_tag,
+  UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang
+GROUP BY
+  hreflang,
+  is_root_page,
+  client
+ORDER BY
+  client DESC;
diff --git a/sql/2025/seo/html-response-content-language-2025.sql b/sql/2025/seo/html-response-content-language-2025.sql
new file mode 100644
index 00000000000..294938fb405
--- /dev/null
+++ b/sql/2025/seo/html-response-content-language-2025.sql
@@ -0,0 +1,35 @@
+WITH subquery AS (
+  SELECT
+    client,
+    page,
+    request_headers,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  header.name AS request_header_name,
+  header.value AS request_header_value,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER ()) AS pct
+FROM
+  subquery,
+  UNNEST(request_headers) AS header
+GROUP BY
+  client,
+  is_root_page,
+  header.name,
+  header.value
+ORDER BY
+  sites DESC,
+  client;
diff --git a/sql/2025/seo/html-response-vary-header-used-2025.sql b/sql/2025/seo/html-response-vary-header-used-2025.sql
new file mode 100644
index 00000000000..94d5c90b04f
--- /dev/null
+++ b/sql/2025/seo/html-response-vary-header-used-2025.sql
@@ -0,0 +1,30 @@
+WITH subquery AS (
+  SELECT
+    client,
+    request_headers,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  # Note: only inspects the name of the first header per request.
+  REGEXP_CONTAINS(LOWER(IFNULL(request_headers[SAFE_OFFSET(0)].name, '')), r'user-agent') AS resp_vary_user_agent,
+  COUNT(0) AS freq,
+  SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER ()) AS pct
+FROM
+  subquery
+GROUP BY
+  client,
+  is_root_page,
+  resp_vary_user_agent
+ORDER BY
+  freq DESC,
+  client;
diff --git a/sql/2025/seo/iframe-loading-property-usage-2025.sql b/sql/2025/seo/iframe-loading-property-usage-2025.sql
new file mode 100644
index 00000000000..1aaa38916d6
--- /dev/null
+++ b/sql/2025/seo/iframe-loading-property-usage-2025.sql
@@ -0,0 +1,69 @@
+#standardSQL
+# Iframe loading property usage
+# Note: This query only reports whether an attribute was ever used on a page. It is not a per-iframe report.
+
+# Returns all the data we need from _markup
+CREATE TEMPORARY FUNCTION getIframeMarkupInfo(markup_string STRING)
+RETURNS STRUCT<
+  loading ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {};
+
+// Returns only the keys whose value is > 0
+function getKey(dict) {
+  const arr = [],
+    obj = Object.keys(dict);
+  for (var x in obj) {
+    if (dict[obj[x]] > 0) {
+      arr.push(obj[x]);
+    }
+  }
+  return arr;
+}
+
+try {
+  var markup = JSON.parse(markup_string);
+
+  if (Array.isArray(markup) || typeof markup != 'object') return result;
+
+  if (markup.iframes && markup.iframes.loading) {
+    result.loading = getKey(markup.iframes.loading);
+  }
+} catch (e) {}
+return result;
+''';
+
+WITH iframe_loading_table AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getIframeMarkupInfo(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS iframe_markup_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  loading,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  iframe_loading_table,
+  UNNEST(iframe_markup_info.loading) AS loading
+GROUP BY
+  client,
+  is_root_page,
+  loading
+ORDER BY
+  client,
+  is_root_page
diff --git a/sql/2025/seo/image-alt-stats-2025.sql b/sql/2025/seo/image-alt-stats-2025.sql
new file mode 100644
index 00000000000..e724c181c5b
--- /dev/null
+++ b/sql/2025/seo/image-alt-stats-2025.sql
@@ -0,0 +1,93 @@
+#standardSQL
+# Image alt stats
+
+# Returns all the data we need from _markup
+CREATE TEMPORARY FUNCTION get_markup_info(markup_string STRING)
+RETURNS STRUCT<
+  images_img_total INT64,
+  images_with_alt_present INT64,
+  images_with_alt_blank INT64,
+  images_with_alt_missing INT64
+> LANGUAGE js AS '''
+var result = {
+  images_img_total: 0,
+  images_with_alt_present: 0,
+  images_with_alt_blank: 0,
+  images_with_alt_missing: 0
+};
+try {
+  var markup = JSON.parse(markup_string);
+
+  if (Array.isArray(markup) || typeof markup != 'object') return result;
+
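+  // Illustrative _markup extract (invented values) matching the checks below:
+  //   {"images": {"img": {"total": 10,
+  //     "alt": {"present": 7, "blank": 2, "missing": 1}}}}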
+  if (markup.images) {
+    if (markup.images.img) {
+      var img = markup.images.img;
+      result.images_img_total = img.total;
+
+      if (img.alt) {
+        result.images_with_alt_present = img.alt.present;
+        result.images_with_alt_blank = img.alt.blank;
+        result.images_with_alt_missing = img.alt.missing;
+      }
+    }
+  }
+
+} catch (e) {}
+return result;
+''';
+
+WITH processed_data AS (
+  SELECT
+    client,
+    root_page,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    get_markup_info(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS markup_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  percentile,
+  client,
+  is_root_page,
+  # Images per page
+  APPROX_QUANTILES(markup_info.images_img_total, 1000)[OFFSET(percentile * 10)] AS img_count,
+
+  # Percent of images with non-blank alt text
+  APPROX_QUANTILES(SAFE_DIVIDE(markup_info.images_with_alt_present, markup_info.images_img_total), 1000)[OFFSET(percentile * 10)] AS images_with_alt_present_percent,
+
+  # Percent of images with blank alt text
+  APPROX_QUANTILES(SAFE_DIVIDE(markup_info.images_with_alt_blank, markup_info.images_img_total), 1000)[OFFSET(percentile * 10)] AS images_with_alt_blank_percent,
+
+  # Percent of images without an alt attribute
+  APPROX_QUANTILES(SAFE_DIVIDE(markup_info.images_with_alt_missing, markup_info.images_img_total), 1000)[OFFSET(percentile * 10)] AS images_with_alt_missing_percent,
+
+  # Number of images with non-blank alt text
+  APPROX_QUANTILES(markup_info.images_with_alt_present, 1000)[OFFSET(percentile * 10)] AS images_with_alt_present,
+
+  # Number of images with blank alt text
+  APPROX_QUANTILES(markup_info.images_with_alt_blank, 1000)[OFFSET(percentile * 10)] AS images_with_alt_blank,
+
+  # Number of images without an alt attribute
+  APPROX_QUANTILES(markup_info.images_with_alt_missing, 1000)[OFFSET(percentile * 10)] AS images_with_alt_missing,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  processed_data,
+  UNNEST([10, 25, 50, 75, 90]) AS percentile
+GROUP BY
+  percentile,
+  is_root_page,
+  client
+ORDER BY
+  percentile,
+  client;
diff --git a/sql/2025/seo/image-loading-property-usage-2025.sql b/sql/2025/seo/image-loading-property-usage-2025.sql
new file mode 100644
index 00000000000..42441c44278
--- /dev/null
+++ b/sql/2025/seo/image-loading-property-usage-2025.sql
@@ -0,0 +1,62 @@
+# Extracts the img loading attribute values from the _markup custom metric
+CREATE TEMPORARY FUNCTION getLoadingPropertyMarkupInfo(markup_string STRING)
+RETURNS STRUCT<
+  loading ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {};
+
+// Returns only the keys whose value is > 0
+function getKey(dict) {
+  const arr = [],
+    obj = Object.keys(dict);
+  for (var x in obj) {
+    if (dict[obj[x]] > 0) {
+      arr.push(obj[x]);
+    }
+  }
+  return arr;
+}
+
+try {
+  var markup = JSON.parse(markup_string);
+
+  if (Array.isArray(markup) || typeof markup != 'object') return result;
+
+  if (markup.images && markup.images.img && markup.images.img.loading) {
+    result.loading = getKey(markup.images.img.loading);
+  }
+} catch (e) {}
+return result;
+''';
+
+WITH image_loading AS (
+  SELECT
+    client,
+    root_page,
+    is_root_page,
+    page,
+    getLoadingPropertyMarkupInfo(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS loading_property_markup_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  loading,
+  COUNT(DISTINCT root_page) AS sites,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNTIF(is_root_page = TRUE) AS count_homepage,
+  COUNTIF(is_root_page = FALSE) AS count_secondarypage,
+  COUNTIF(is_root_page = TRUE) / COUNT(DISTINCT page) AS homepage_pct,
+  COUNTIF(is_root_page = FALSE) / COUNT(DISTINCT page) AS secondary_pct
+FROM
+  image_loading,
+  UNNEST(loading_property_markup_info.loading) AS loading
+GROUP BY
+  client,
+  loading
+ORDER BY
+  client,
+  loading;
diff --git a/sql/2025/seo/invalid-head-elements-2025.sql b/sql/2025/seo/invalid-head-elements-2025.sql
new file mode 100644
index 00000000000..1cd1d2d6b87
--- /dev/null
+++ b/sql/2025/seo/invalid-head-elements-2025.sql
@@ -0,0 +1,47 @@
+WITH pages AS (
+  SELECT
+    client,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    page,
+    payload
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+total_sites AS (
+  -- Total number of distinct pages (URLs) per client and page type
+  SELECT
+    client,
+    is_root_page,
+    COUNT(DISTINCT page) AS total_sites
+  FROM
+    pages
+  GROUP BY
+    client,
+    is_root_page
+)
+
+SELECT
+  p.client,
+  p.is_root_page,
+  element,
+  COUNT(DISTINCT p.page) AS invalid_sites, -- count of distinct pages containing the invalid element
+  ts.total_sites
+FROM
+  pages p
+JOIN
+  total_sites ts ON p.client = ts.client AND p.is_root_page = ts.is_root_page,
+  UNNEST(JSON_EXTRACT_ARRAY(p.payload, '$._valid-head.invalidElements')) AS element
+GROUP BY
+  p.client,
+  p.is_root_page,
+  ts.total_sites,
+  element
+ORDER BY
+  p.client;
diff --git a/sql/2025/seo/invalid-head-sites-2025.sql b/sql/2025/seo/invalid-head-sites-2025.sql
new file mode 100644
index 00000000000..5ba97aba2cf
--- /dev/null
+++ b/sql/2025/seo/invalid-head-sites-2025.sql
@@ -0,0 +1,42 @@
+#standardSQL
+# Counts of invalid head elements in HTML
+WITH totals AS (
+  SELECT
+    client,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    payload,
+    page,
+    JSON_QUERY(payload, '$._valid-head.invalidHead') AS invalidHead,
+    ARRAY_LENGTH(JSON_EXTRACT_ARRAY(payload, '$._valid-head.invalidElements')) AS invalidCount
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+  GROUP BY
+    client,
+    page,
+    is_root_page,
+    payload
+)
+
+SELECT
+  client,
+  is_root_page,
+  COUNTIF(invalidHead = 'true') AS invalidHeads,
+  SUM(invalidCount) AS invalidCount,
+  COUNTIF(invalidHead = 'true') / COUNT(DISTINCT page) AS pct_invalidHeads,
+  COUNT(DISTINCT page) AS sites,
+  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
+  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
+FROM
+  totals
+GROUP BY
+  client,
+  is_root_page
+ORDER BY
+  client
diff --git a/sql/2025/seo/lighthouse-seo-stats-2025.sql b/sql/2025/seo/lighthouse-seo-stats-2025.sql
new file mode 100644
index 00000000000..cc4d70d511e
--- /dev/null
+++ b/sql/2025/seo/lighthouse-seo-stats-2025.sql
@@ -0,0 +1,60 @@
+CREATE TEMPORARY FUNCTION getAudits(audits STRING)
+RETURNS ARRAY<STRUCT<id STRING, weight FLOAT64, title STRING, description STRING, score FLOAT64>>
+LANGUAGE js AS '''
+var auditsObj = JSON.parse(audits);
+var results = [];
+
+for (var auditId in auditsObj) {
+  if (auditsObj.hasOwnProperty(auditId)) {
+    var audit = auditsObj[auditId];
+    results.push({
+      id: auditId,
+      weight: audit.weight || 0,
+      title: audit.title || '',
description: audit.description || '', + score: audit.score ?? null + }); + } +} +return results; +'''; + +WITH lighthouse_extraction AS ( + SELECT + client, + CASE + WHEN is_root_page = FALSE THEN 'Secondarypage' + WHEN is_root_page = TRUE THEN 'Homepage' + ELSE 'No Assigned Page' + END + AS is_root_page, + page, + lighthouse AS report + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' +) + +SELECT + client, + audits.id AS id, + is_root_page, + COUNTIF(audits.score > 0) AS num_pages, + COUNT(DISTINCT page) AS sites, + COUNTIF(audits.score IS NOT NULL) AS total_applicable, + SAFE_DIVIDE(COUNTIF(audits.score > 0), COUNTIF(audits.score IS NOT NULL)) AS pct, + APPROX_QUANTILES(audits.weight, 100)[OFFSET(50)] AS median_weight, + MAX(audits.title) AS title, + MAX(audits.description) AS description, + SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total +FROM + lighthouse_extraction, + UNNEST(getAudits(JSON_EXTRACT(report, '$.audits'))) AS audits +GROUP BY + client, + is_root_page, + audits.id +ORDER BY + client, + median_weight DESC, + id; diff --git a/sql/2025/seo/mark-up-stats-2025.sql b/sql/2025/seo/mark-up-stats-2025.sql new file mode 100644 index 00000000000..7572e91596c --- /dev/null +++ b/sql/2025/seo/mark-up-stats-2025.sql @@ -0,0 +1,101 @@ +#standardSQL +# Markup stats + + +# returns all the data we need from _markup +CREATE TEMPORARY FUNCTION getMarkupStatsInfo(markup_string STRING) +RETURNS STRUCT< + images_img_total INT64, + images_alt_missing_total INT64, + images_alt_blank_total INT64, + images_alt_present_total INT64, + + has_html_amp_attribute BOOL, + has_rel_amphtml_tag BOOL, + has_html_amp_emoji_attribute BOOL +> LANGUAGE js AS ''' +var result = { + images_img_total: 0, + images_alt_missing_total: 0, + images_alt_blank_total: 0, + images_alt_present_total: 0, + has_html_amp_attribute: false, + has_rel_amphtml_tag: false, + has_html_amp_emoji_attribute: false +}; +try { + var markup = JSON.parse(markup_string); + + if (Array.isArray(markup) || typeof markup != 'object') return result; + + if (markup.images) { + if (markup.images.img) { + var img = markup.images.img; + result.images_img_total = img.total; + + if (img.alt) { + var alt = img.alt; + result.images_alt_missing_total = alt.missing; + result.images_alt_blank_total = alt.blank; + result.images_alt_present_total = alt.present; // present does not include blank + } + } + } + + if (markup.amp) { + result.has_html_amp_attribute = markup.amp.html_amp_attribute_present; + result.has_html_amp_emoji_attribute = markup.amp.html_amp_emoji_attribute_present; + result.has_rel_amphtml_tag = markup.amp.rel_amphtml; + } +} catch (e) {} +return result; +'''; + + +WITH markup_extraction AS ( + SELECT + client, + CASE + WHEN is_root_page = FALSE THEN 'Secondarypage' + WHEN is_root_page = TRUE THEN 'Homepage' + ELSE 'No Assigned Page' + END + AS is_root_page, + page, + getMarkupStatsInfo(JSON_EXTRACT_SCALAR(payload, '$._markup')) AS markup_info + FROM + `httparchive.crawl.pages` + WHERE date = '2025-06-01' +) + +SELECT + client, + COUNT(0) AS total, + is_root_page, + # Pages with img + SAFE_DIVIDE(COUNTIF(markup_info.images_img_total > 0), COUNT(0)) AS pct_has_img, + + # percent pages with an img alt + SUM(markup_info.images_img_total) AS total_img, + SUM(markup_info.images_alt_present_total) AS total_img_alt_present, + SUM(markup_info.images_alt_blank_total) AS total_img_alt_blank, + SUM(markup_info.images_alt_missing_total) AS total_img_alt_missing, + 
SAFE_DIVIDE(SUM(markup_info.images_alt_missing_total), SUM(markup_info.images_img_total)) AS pct_images_with_img_alt_missing,
+  SAFE_DIVIDE(SUM(markup_info.images_alt_present_total), SUM(markup_info.images_img_total)) AS pct_images_with_img_alt_present, # present does not include blank
+  SAFE_DIVIDE(SUM(markup_info.images_alt_blank_total), SUM(markup_info.images_img_total)) AS pct_images_with_img_alt_blank,
+  SAFE_DIVIDE(SUM(markup_info.images_alt_blank_total) + SUM(markup_info.images_alt_present_total), SUM(markup_info.images_img_total)) AS pct_images_with_img_alt_blank_or_present,
+
+  # Pages with an AMP attribute on the html element
+  COUNTIF(markup_info.has_html_amp_attribute) AS has_html_amp_attribute,
+  COUNTIF(markup_info.has_html_amp_emoji_attribute) AS has_html_amp_emoji_attribute,
+  SAFE_DIVIDE(COUNTIF(markup_info.has_html_amp_attribute), COUNT(0)) AS pct_has_html_amp_attribute,
+  SAFE_DIVIDE(COUNTIF(markup_info.has_html_amp_emoji_attribute), COUNT(0)) AS pct_has_html_amp_emoji_attribute,
+  SAFE_DIVIDE(COUNTIF(markup_info.has_html_amp_emoji_attribute OR markup_info.has_html_amp_attribute), COUNT(0)) AS pct_has_html_amp_or_emoji_attribute,
+
+  # Pages with rel=amphtml
+  SAFE_DIVIDE(COUNTIF(markup_info.has_rel_amphtml_tag), COUNT(0)) AS pct_has_rel_amphtml_tag
+FROM
+  markup_extraction
+GROUP BY
+  client,
+  is_root_page
diff --git a/sql/2025/seo/media-property-usage-link-tags-rel-alternate-2025.sql b/sql/2025/seo/media-property-usage-link-tags-rel-alternate-2025.sql
new file mode 100644
index 00000000000..ac1130d5027
--- /dev/null
+++ b/sql/2025/seo/media-property-usage-link-tags-rel-alternate-2025.sql
@@ -0,0 +1,63 @@
+#standardSQL
+# Media property usage of link tags with rel=alternate
+
+# Returns all the data we need from _almanac
+CREATE TEMPORARY FUNCTION getMediaPropertyAlmanacInfo(almanac_string STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS '''
+var result = [];
+try {
+  var almanac = JSON.parse(almanac_string);
+
+  if (Array.isArray(almanac) || typeof almanac != 'object') return ["NO PAYLOAD"];
+
+  if (almanac && almanac["link-nodes"] && almanac["link-nodes"].nodes && almanac["link-nodes"].nodes.filter) {
+    result = almanac["link-nodes"].nodes.filter(n => n.rel && n.rel.split(' ').find(r => r.trim().toLowerCase() == 'alternate') && n.media).map(am => am.media.toLowerCase().trim().replace("d(", "d (").replace(": ", ":"));
+  }
+
+  if (result.length === 0)
+    result.push("NO TAG");
+
+} catch (e) {result.push("ERROR "+e);} // results show some issues with the validity of the payload
+return result;
+''';
+
+WITH page_almanac_info AS (
+  SELECT
+    client,
+    getMediaPropertyAlmanacInfo(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS media_property_almanac_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+total_pages AS (
+  SELECT
+    client,
+    COUNT(0) AS total
+  FROM
+    page_almanac_info
+  GROUP BY
+    client
+)
+
+SELECT
+  page_almanac_info.client,
+  media,
+  total_pages.total,
+  COUNT(0) AS count,
+  SAFE_DIVIDE(COUNT(0), total_pages.total) AS pct
+FROM
+  page_almanac_info,
+  UNNEST(page_almanac_info.media_property_almanac_info) AS media
+JOIN
+  total_pages
+ON page_almanac_info.client = total_pages.client
+GROUP BY
+  total_pages.total,
+  media,
+  page_almanac_info.client
+ORDER BY
+  count DESC
+LIMIT 1000
diff --git a/sql/2025/seo/meta-tag-usage-by-name-2025.sql b/sql/2025/seo/meta-tag-usage-by-name-2025.sql
new file mode 100644
index 00000000000..060d910584a
--- /dev/null
+++ b/sql/2025/seo/meta-tag-usage-by-name-2025.sql
@@ -0,0 +1,62 @@
+#standardSQL
+# Meta tag usage by name
+
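+# Illustrative _almanac extract (invented values) for the parsing below:
+#   {"meta-nodes": {"nodes": [
+#     {"name": "description", "content": "..."},
+#     {"property": "og:title", "content": "..."}]}}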
+# Returns all the data we need from _almanac
+CREATE TEMPORARY FUNCTION getMetaTagAlmanacInfo(almanac_string STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS '''
+var result = [];
+try {
+  var almanac = JSON.parse(almanac_string);
+
+  if (Array.isArray(almanac) || typeof almanac != 'object') return [];
+
+  if (almanac && almanac["meta-nodes"] && almanac["meta-nodes"].nodes) {
+    result = almanac["meta-nodes"].nodes
+      .filter(n => n["name"]) // only nodes with a name attribute
+      .map(am => am["name"].toLowerCase().trim()) // array of meta tag names
+      .filter((v, i, a) => a.indexOf(v) === i); // remove duplicates
+  }
+
+} catch (e) {} // results show some issues with the validity of the payload
+return result;
+''';
+
+WITH page_almanac_info AS (
+  SELECT
+    client,
+    getMetaTagAlmanacInfo(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS meta_tag_almanac_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+total_pages AS (
+  SELECT
+    client,
+    COUNT(0) AS total
+  FROM
+    page_almanac_info
+  GROUP BY
+    client
+)
+
+SELECT
+  page_almanac_info.client,
+  meta_tag_name,
+  total_pages.total,
+  COUNT(0) AS count,
+  SAFE_DIVIDE(COUNT(0), total_pages.total) AS pct
+FROM
+  page_almanac_info,
+  UNNEST(page_almanac_info.meta_tag_almanac_info) AS meta_tag_name
+JOIN
+  total_pages
+ON page_almanac_info.client = total_pages.client
+GROUP BY
+  total_pages.total,
+  meta_tag_name,
+  page_almanac_info.client
+ORDER BY
+  count DESC
+LIMIT 1000
diff --git a/sql/2025/seo/meta-tag-usage-by-property-2025.sql b/sql/2025/seo/meta-tag-usage-by-property-2025.sql
new file mode 100644
index 00000000000..7cb23f76f3d
--- /dev/null
+++ b/sql/2025/seo/meta-tag-usage-by-property-2025.sql
@@ -0,0 +1,62 @@
+#standardSQL
+# Meta tag usage by property
+
+# Returns all the data we need from _almanac
+CREATE TEMPORARY FUNCTION getMetaTagAlmanacInfo(almanac_string STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS '''
+var result = [];
+try {
+  var almanac = JSON.parse(almanac_string);
+  if (Array.isArray(almanac) || typeof almanac != 'object') return [];
+
+  if (almanac && almanac["meta-nodes"] && almanac["meta-nodes"].nodes && almanac["meta-nodes"].nodes.filter) {
+    result = almanac["meta-nodes"].nodes
+      .filter(n => n["property"]) // just those with a property attribute
+      .map(am => am["property"].toLowerCase().trim()) // array of the property values
+      .filter((v, i, a) => a.indexOf(v) === i); // remove duplicates
+  }
+
+} catch (e) {} // results show some issues with the validity of the payload
+return result;
+''';
+
+WITH page_almanac_info AS (
+  SELECT
+    client,
+    getMetaTagAlmanacInfo(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS meta_tag_almanac_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+),
+
+total_pages AS (
+  SELECT
+    client,
+    COUNT(0) AS total
+  FROM
+    page_almanac_info
+  GROUP BY
+    client
+)
+
+SELECT
+  page_almanac_info.client,
+  meta_tag_property,
+  total_pages.total,
+  COUNT(0) AS count,
+  SAFE_DIVIDE(COUNT(0), total_pages.total) AS pct
+FROM
+  page_almanac_info,
+  UNNEST(page_almanac_info.meta_tag_almanac_info) AS meta_tag_property,
+  total_pages
+WHERE
+  page_almanac_info.client = total_pages.client
+GROUP BY
+  total_pages.total,
+  meta_tag_property,
+  page_almanac_info.client
+ORDER BY
+  count DESC
+LIMIT 1000
diff --git a/sql/2025/seo/outgoing_links_by_rank-2025.sql b/sql/2025/seo/outgoing_links_by_rank-2025.sql
new file mode 100644
index 00000000000..c677e53390a
--- /dev/null
+++ b/sql/2025/seo/outgoing_links_by_rank-2025.sql
@@ -0,0 +1,74 @@
+#standardSQL
+# Internal and external link metrics by quantile and rank
+WITH page_metrics AS (
+  SELECT
+    client,
+    page,
+ is_root_page, + IF(rank <= rank_bucket, rank_bucket, NULL) AS rank, + ANY_VALUE(custom_metrics.wpt_bodies.anchors) AS anchors + FROM httparchive.crawl.pages, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_bucket + WHERE + date = '2025-06-01' + GROUP BY + client, + page, + is_root_page, + rank + HAVING rank IS NOT NULL +), + +metric_details AS ( + SELECT + client, + is_root_page, + percentile, + rank, + APPROX_QUANTILES(INT64(anchors.rendered.same_site), 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_site, + APPROX_QUANTILES(INT64(anchors.rendered.same_property), 1000)[OFFSET(percentile * 10)] AS outgoing_links_same_property, + APPROX_QUANTILES(INT64(anchors.rendered.other_property), 1000)[OFFSET(percentile * 10)] AS outgoing_links_other_property + FROM page_metrics, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + is_root_page, + rank, + percentile + ORDER BY + client, + is_root_page, + rank, + percentile +), + +page_counts AS ( + SELECT + client, + is_root_page, + rank, + COUNT(DISTINCT page) AS total_pages + FROM page_metrics + GROUP BY + client, + is_root_page, + rank +) + +SELECT + client, + is_root_page, + rank, + total_pages, + percentile, + outgoing_links_same_site, + outgoing_links_same_property, + outgoing_links_other_property +FROM metric_details +LEFT JOIN page_counts +USING (client, is_root_page, rank) +ORDER BY + client, + is_root_page, + rank, + percentile diff --git a/sql/2025/seo/pages-canonical-stats-2025.sql b/sql/2025/seo/pages-canonical-stats-2025.sql new file mode 100644 index 00000000000..55b54cf242c --- /dev/null +++ b/sql/2025/seo/pages-canonical-stats-2025.sql @@ -0,0 +1,174 @@ +#standardSQL +# page canonical metrics by device + +# Note: Contains redundant stats to seo-stats.sql in order to start better segmenting metrics away from monolithic queries. 
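+
+# Illustrative _wpt_bodies.canonicals extract (invented values; note the custom
+# metric's own spellings "canoncials" and "missmatch", which the parser below
+# reads verbatim):
+#   {"canonicals": {"canonicals": ["https://example.com/"], "self_canonical": true,
+#     "other_canonical": false, "http_header_link_canoncials": [],
+#     "raw": {"html_link_canoncials": ["https://example.com/"]},
+#     "rendered": {"html_link_canoncials": ["https://example.com/"]}}}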
+ + +# JS parsing of payload +CREATE TEMPORARY FUNCTION getCanonicalMetrics(payload STRING) +RETURNS STRUCT< + has_wpt_bodies BOOL, + has_canonicals BOOL, + has_self_canonical BOOL, + is_canonicalized BOOL, + has_http_canonical BOOL, + has_rendered_canonical BOOL, + has_raw_canonical BOOL, + has_canonical_mismatch BOOL, + rendering_changed_canonical BOOL, + http_header_changed_canonical BOOL, + has_relative_canonical BOOL, + has_absolute_canonical BOOL, + js_error BOOL +> LANGUAGE js AS ''' + +var result = {has_wpt_bodies: true, + has_canonicals: false, + has_self_canonical: false, + is_canonicalized: false, + has_http_canonical: false, + has_rendered_canonical: false, + has_raw_canonical: false, + canonical_missmatch: false, + rendering_changed_canonical: false, + http_header_changed_canonical: false, + has_relative_canonical: false, + has_absolute_canonical: false, + js_error: false}; + + function compareStringArrays(array1, array2) { + if (!array1 && !array2) return true; // both missing + if (!Array.isArray(array1) || !Array.isArray(array2)) return false; + if (!array1 && array2.length > 0) return false; + if (!array2 && array1.length > 0) return false; + if (array1.length != array2.length) return false; + + array1 = array1.slice(); + array1.sort(); + array2 = array2.slice(); + array2.sort(); + for (var i = 0; i < array1.length; i++) { + if (array1[i] != array2[i]) { + return false; + } + } + return true; + } + + +try { + + var $ = JSON.parse(payload); + var wpt_bodies = JSON.parse($._wpt_bodies); + + if (!wpt_bodies){ + result.has_wpt_bodies = false; + return result; + } + + var canonicals = wpt_bodies.canonicals; + + if (canonicals) { + + if (canonicals.canonicals && canonicals.canonicals.length) { + result.has_canonicals = canonicals.canonicals.length > 0; + } + if (canonicals.self_canonical) { + result.has_self_canonical = canonicals.self_canonical; + } + if (canonicals.other_canonical) { + result.is_canonicalized = canonicals.other_canonical; + } + if (canonicals.http_header_link_canoncials) { + result.has_http_canonical = canonicals.http_header_link_canoncials.length > 0; + } + if (canonicals.rendered && canonicals.rendered.html_link_canoncials) { + result.has_rendered_canonical = canonicals.rendered.html_link_canoncials.length > 0; + } + if (canonicals.raw && canonicals.raw.html_link_canoncials) { + result.has_raw_canonical = canonicals.raw.html_link_canoncials.length > 0; + } + if (canonicals.canonical_missmatch) { + result.has_canonical_mismatch = canonicals.canonical_missmatch; + } + if (canonicals.raw && canonicals.rendered) { + result.rendering_changed_canonical = !compareStringArrays(canonicals.raw.html_link_canoncials, canonicals.rendered.html_link_canoncials); + } + if (canonicals.raw && canonicals.http_header_link_canoncials && canonicals.http_header_link_canoncials.length > 0) { + result.http_header_changed_canonical = !compareStringArrays(canonicals.raw.html_link_canoncials, canonicals.http_header_link_canoncials); + } + + if (result.has_canonicals){ + result.has_relative_canonical = [].map.call(canonicals.canonicals, (e) => {return e.startsWith('/')}).indexOf(true) > -1; + result.has_absolute_canonical = [].map.call(canonicals.canonicals, (e) => {return e.startsWith('http')}).indexOf(true) > -1; + } + + } + + return result; + +} catch (e) { + result.js_error = true; + return result; +} +'''; + + +SELECT + client, + COUNT(0) AS total, + canonical_metrics.js_error AS js_error, + + # Pages with canonical + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_canonicals), COUNT(0)) 
AS pct_has_canonical, + + # Pages with self-canonical + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_self_canonical), COUNT(0)) AS pct_has_self_canonical, + + # Pages canonicalized + SAFE_DIVIDE(COUNTIF(canonical_metrics.is_canonicalized), COUNT(0)) AS pct_is_canonicalized, + + # Pages with canonical in HTTP header + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_http_canonical), COUNT(0)) AS pct_http_canonical, + + # Pages with canonical in raw html + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_raw_canonical), COUNT(0)) AS pct_has_raw_canonical, + + # Pages with canonical in rendered html + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_rendered_canonical), COUNT(0)) AS pct_has_rendered_canonical, + + # Pages with canonical in rendered but not raw html + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_rendered_canonical AND NOT canonical_metrics.has_raw_canonical), COUNT(0)) AS pct_has_rendered_but_not_raw_canonical, + + # Pages with canonical mismatch + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_canonical_mismatch), COUNT(0)) AS pct_has_canonical_mismatch, + + # Pages with canonical conflict between raw and rendered + SAFE_DIVIDE(COUNTIF(canonical_metrics.rendering_changed_canonical), COUNT(0)) AS pct_has_conflict_rendering_changed_canonical, + + # Pages with canonical conflict between raw and http header + SAFE_DIVIDE(COUNTIF(canonical_metrics.http_header_changed_canonical), COUNT(0)) AS pct_has_conflict_http_header_changed_canonical, + + # Pages with canonical conflict between raw and http header + SAFE_DIVIDE(COUNTIF(canonical_metrics.http_header_changed_canonical OR canonical_metrics.rendering_changed_canonical), COUNT(0)) AS pct_has_conflict_http_header_or_rendering_changed_canonical, + + # Pages with canonicals that are absolute + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_absolute_canonical), COUNTIF(canonical_metrics.has_canonicals)) AS pct_canonicals_absolute, + + # Pages with canonicals that are relative + SAFE_DIVIDE(COUNTIF(canonical_metrics.has_relative_canonical), COUNTIF(canonical_metrics.has_canonicals)) AS pct_canonicals_relative + +FROM ( + SELECT + client AS client, + getCanonicalMetrics(payload) AS canonical_metrics + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' +) +WHERE + canonical_metrics.has_wpt_bodies +GROUP BY + client, + js_error diff --git a/sql/2025/seo/pages-containing-a-video-element-2025.sql b/sql/2025/seo/pages-containing-a-video-element-2025.sql new file mode 100644 index 00000000000..a0125f9ed2b --- /dev/null +++ b/sql/2025/seo/pages-containing-a-video-element-2025.sql @@ -0,0 +1,44 @@ +#standardSQL +# Pages containing a video element + + +# returns all the data we need from _almanac +CREATE TEMPORARY FUNCTION getVideosAlmanacInfo(almanac_string STRING) +RETURNS STRUCT< + videos_total INT64 +> LANGUAGE js AS ''' +var result = { + videos_total: 0 +}; +try { + var almanac = JSON.parse(almanac_string); + + if (Array.isArray(almanac) || typeof almanac != 'object') return result; + + if (almanac.videos && almanac.videos.total) { + result.videos_total = almanac.videos.total; + } +} catch (e) {} +return result; +'''; + +SELECT + client, + COUNT(0) AS total, + + # Pages with videos + COUNTIF(videos_almanac_info.videos_total > 0) AS has_videos, + SAFE_DIVIDE(COUNTIF(videos_almanac_info.videos_total > 0), COUNT(0)) AS pct_has_videos + +FROM + ( + SELECT + client AS client, + getVideosAlmanacInfo(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS videos_almanac_info + FROM + `httparchive.crawl.pages` + WHERE + date = '2025-06-01' + ) +GROUP BY + client diff --git 
a/sql/2025/seo/robots-meta-usage-2025.sql b/sql/2025/seo/robots-meta-usage-2025.sql
new file mode 100644
index 00000000000..8474575323f
--- /dev/null
+++ b/sql/2025/seo/robots-meta-usage-2025.sql
@@ -0,0 +1,80 @@
+#standardSQL
+# Robots meta/header user agent directive usage
+
+CREATE TEMPORARY FUNCTION parseRobotsMeta(robotsMetaJson STRING)
+RETURNS ARRAY<STRUCT<
+  report STRING,
+  bot STRING,
+  noindex INT64,
+  index INT64,
+  follow INT64,
+  none INT64,
+  nofollow INT64,
+  noarchive INT64,
+  nosnippet INT64,
+  unavailable_after INT64,
+  max_snippet INT64,
+  max_image_preview INT64,
+  max_video_preview INT64,
+  notranslate INT64,
+  noimageindex INT64,
+  nocache INT64,
+  indexifembedded INT64
+>> LANGUAGE js AS '''
+
+var results = [];
+
+if (typeof robotsMetaJson === 'string') {
+  var robotsMetaParsed = JSON.parse(robotsMetaJson);
+  const reports = ['main_frame_robots_rendered', 'main_frame_robots_raw', 'main_frame_robots_headers', 'iframe_robots_raw', 'iframe_robots_headers'];
+  for (const report of reports) {
+    var reportData = robotsMetaParsed[report];
+    // Normalize each directive count to 0/1 and flatten to one row per report
+    var result = typeof reportData === 'object' ? Object.entries(reportData).map(([bot, botData]) => {
+      return Object.assign({
+        'report': report,
+        'bot': bot
+      }, Object.fromEntries(Object.entries(botData).map(([k, v]) => [k.replaceAll('-', '_'), v / Math.max(v, 1)])));
+    })[0] : null;
+
+    if (result)
+      results.push(result);
+
+  }
+}
+return results;
+''';
+WITH robots_data AS (
+  SELECT
+    client,
+    page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    JSON_EXTRACT(payload, '$._robots_meta') AS robots_meta_json
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  data.bot AS bot,
+  data.report AS report,
+  COUNT(0) AS count,
+  COUNT(DISTINCT page) AS sites,
+  SAFE_DIVIDE(SUM(data.noindex), COUNT(0)) AS noindex,
+  SAFE_DIVIDE(SUM(data.index), COUNT(0)) AS index,
+  SAFE_DIVIDE(SUM(data.follow), COUNT(0)) AS follow,
+  SAFE_DIVIDE(SUM(data.none), COUNT(0)) AS none,
+  SAFE_DIVIDE(SUM(data.nofollow), COUNT(0)) AS nofollow,
+  SAFE_DIVIDE(SUM(data.noarchive), COUNT(0)) AS noarchive,
+  SAFE_DIVIDE(SUM(data.nosnippet), COUNT(0)) AS nosnippet,
+  SAFE_DIVIDE(SUM(data.unavailable_after), COUNT(0)) AS unavailable_after,
+  SAFE_DIVIDE(SUM(data.max_snippet), COUNT(0)) AS max_snippet,
+  SAFE_DIVIDE(SUM(data.max_image_preview), COUNT(0)) AS max_image_preview,
+  SAFE_DIVIDE(SUM(data.max_video_preview), COUNT(0)) AS max_video_preview,
+  SAFE_DIVIDE(SUM(data.notranslate), COUNT(0)) AS notranslate,
+  SAFE_DIVIDE(SUM(data.noimageindex), COUNT(0)) AS noimageindex,
+  SAFE_DIVIDE(SUM(data.nocache), COUNT(0)) AS nocache,
+  SAFE_DIVIDE(SUM(data.indexifembedded), COUNT(0)) AS indexifembedded
+FROM
+  robots_data,
+  UNNEST(parseRobotsMeta(robots_meta_json)) AS data
+GROUP BY
+  client,
+  is_root_page,
+  bot,
+  report
+HAVING
+  count >= 20
+ORDER BY
+  count DESC
diff --git a/sql/2025/seo/robots-text-size-2025.sql b/sql/2025/seo/robots-text-size-2025.sql
new file mode 100644
index 00000000000..22592d36706
--- /dev/null
+++ b/sql/2025/seo/robots-text-size-2025.sql
@@ -0,0 +1,47 @@
+#standardSQL
+
+# robots.txt size by size bins (size in KiB)
+# Note: The main story is robots.txt files over 500 KiB, which is Google's limit.
+# This is the reason size bins were used instead of quantiles.
+
+# Helper to get robots.txt size in kibibytes (KiB).
+# Note: Assumes mostly ASCII (1 byte = 1 character). Size is collected by a
+# custom measurement as string length.
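+# For scale (illustrative): a mostly ASCII robots.txt of 512,000 characters is
+# 512000 / 1024 = 500 KiB, i.e. exactly at Google's processing limit.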
+CREATE TEMPORARY FUNCTION getRobotsSize(payload STRING)
+RETURNS FLOAT64 LANGUAGE js AS '''
+try {
+  var $ = JSON.parse(payload);
+  var robots = JSON.parse($._robots_txt);
+  return robots['size'] / 1024;
+} catch (e) {
+  return 0;
+}
+''';
+
+SELECT
+  client,
+  COUNT(DISTINCT(site)) AS sites,
+  SAFE_DIVIDE(COUNTIF(robots_size > 0 AND robots_size <= 100), COUNT(DISTINCT(site))) AS pct_0_100,
+  SAFE_DIVIDE(COUNTIF(robots_size > 100 AND robots_size <= 200), COUNT(DISTINCT(site))) AS pct_100_200,
+  SAFE_DIVIDE(COUNTIF(robots_size > 200 AND robots_size <= 300), COUNT(DISTINCT(site))) AS pct_200_300,
+  SAFE_DIVIDE(COUNTIF(robots_size > 300 AND robots_size <= 400), COUNT(DISTINCT(site))) AS pct_300_400,
+  SAFE_DIVIDE(COUNTIF(robots_size > 400 AND robots_size <= 500), COUNT(DISTINCT(site))) AS pct_400_500,
+  SAFE_DIVIDE(COUNTIF(robots_size > 500), COUNT(DISTINCT(site))) AS pct_gt500,
+  SAFE_DIVIDE(COUNTIF(robots_size = 0), COUNT(DISTINCT(site))) AS pct_missing,
+  COUNTIF(robots_size > 500) AS count_gt500,
+  COUNTIF(robots_size = 0) AS count_missing
+FROM (
+  SELECT
+    client,
+    page AS site,
+    getRobotsSize(payload) AS robots_size
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+GROUP BY
+  client
+ORDER BY
+  client DESC
diff --git a/sql/2025/seo/robots-txt-size-2025.sql b/sql/2025/seo/robots-txt-size-2025.sql
new file mode 100644
index 00000000000..2487d23ef70
--- /dev/null
+++ b/sql/2025/seo/robots-txt-size-2025.sql
@@ -0,0 +1,32 @@
+#standardSQL
+# robots.txt size
+SELECT
+  client,
+  COUNT(DISTINCT(site)) AS sites,
+  SAFE_DIVIDE(COUNTIF(robots_size = 0), COUNT(DISTINCT(site))) AS pct_0,
+  SAFE_DIVIDE(COUNTIF(robots_size > 0 AND robots_size <= 100), COUNT(DISTINCT(site))) AS pct_0_100,
+  SAFE_DIVIDE(COUNTIF(robots_size > 100 AND robots_size <= 200), COUNT(DISTINCT(site))) AS pct_100_200,
+  SAFE_DIVIDE(COUNTIF(robots_size > 200 AND robots_size <= 300), COUNT(DISTINCT(site))) AS pct_200_300,
+  SAFE_DIVIDE(COUNTIF(robots_size > 300 AND robots_size <= 400), COUNT(DISTINCT(site))) AS pct_300_400,
+  SAFE_DIVIDE(COUNTIF(robots_size > 400 AND robots_size <= 500), COUNT(DISTINCT(site))) AS pct_400_500,
+  SAFE_DIVIDE(COUNTIF(robots_size > 500), COUNT(DISTINCT(site))) AS pct_gt500,
+  SAFE_DIVIDE(COUNTIF(robots_size IS NULL), COUNT(DISTINCT(site))) AS pct_missing,
+  COUNTIF(robots_size > 500) AS count_gt500,
+  COUNTIF(robots_size IS NULL) AS count_missing
+FROM (
+  SELECT
+    client,
+    root_page AS site,
+    custom_metrics.robots_txt,
+    FLOAT64(custom_metrics.robots_txt.size_kib) AS robots_size
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page AND -- no need to crawl inner pages for this one
+    custom_metrics.robots_txt.status IS NOT NULL
+)
+GROUP BY
+  client
+ORDER BY
+  client DESC
diff --git a/sql/2025/seo/robots-txt-status-codes-2025.sql b/sql/2025/seo/robots-txt-status-codes-2025.sql
new file mode 100644
index 00000000000..9060c8e79d7
--- /dev/null
+++ b/sql/2025/seo/robots-txt-status-codes-2025.sql
@@ -0,0 +1,43 @@
+#standardSQL
+# robots.txt status codes
+
+# Returns all the data we need from _robots_txt
+CREATE TEMPORARY FUNCTION getRobotsStatusInfo(robots_txt_string STRING)
+RETURNS STRUCT<
+  status_code STRING
+> LANGUAGE js AS '''
+var result = {};
+try {
+  var robots_txt = JSON.parse(robots_txt_string);
+
+  if (Array.isArray(robots_txt) || typeof robots_txt != 'object') return result;
+
+  if (robots_txt.status) {
+    result.status_code = '' + robots_txt.status;
+  }
+
+} catch (e) {}
+return result;
+''';
+
+SELECT
+  client,
+  robots_txt_status_info.status_code AS status_code,
+  COUNT(0) AS total,
+  SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client)) AS pct
+FROM (
+  SELECT
+    client,
+    getRobotsStatusInfo(JSON_EXTRACT_SCALAR(payload, '$._robots_txt')) AS robots_txt_status_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+GROUP BY
+  client,
+  status_code
+ORDER BY
+  total DESC
diff --git a/sql/2025/seo/robots-txt-user-agent-usage-2025.sql b/sql/2025/seo/robots-txt-user-agent-usage-2025.sql
new file mode 100644
index 00000000000..2c886139d3b
--- /dev/null
+++ b/sql/2025/seo/robots-txt-user-agent-usage-2025.sql
@@ -0,0 +1,48 @@
+#standardSQL
+# robots.txt user agent usage
+
+# Returns all the data we need from _robots_txt
+CREATE TEMPORARY FUNCTION getRobotsTxtUserAgents(robots_txt_string STRING)
+RETURNS STRUCT<
+  user_agents ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {
+  user_agents: []
+};
+try {
+  var robots_txt = JSON.parse(robots_txt_string);
+  var uas = robots_txt.record_counts.by_useragent;
+  result.user_agents = typeof uas === 'object' ? Object.keys(uas).map(ua => ua.toLowerCase()) : [];
+} catch (e) {}
+return result;
+''';
+
+WITH robots AS (
+  SELECT
+    client,
+    root_page,
+    getRobotsTxtUserAgents(JSON_EXTRACT_SCALAR(payload, '$._robots_txt')) AS robots_txt_user_agent_info,
+    COUNT(DISTINCT root_page) OVER (PARTITION BY client) AS total_sites
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  user_agent,
+  COUNT(DISTINCT root_page) AS sites,
+  COUNT(DISTINCT root_page) / ANY_VALUE(total_sites) AS pct
+FROM
+  robots,
+  UNNEST(robots_txt_user_agent_info.user_agents) AS user_agent
+GROUP BY
+  user_agent,
+  client
+HAVING
+  sites >= 20
+ORDER BY
+  sites DESC
diff --git a/sql/2025/seo/seo-stats-2025.sql b/sql/2025/seo/seo-stats-2025.sql
new file mode 100644
index 00000000000..116c82862d5
--- /dev/null
+++ b/sql/2025/seo/seo-stats-2025.sql
@@ -0,0 +1,495 @@
+#standardSQL
+# SEO stats
+
+# Note: Canonical metrics moved to pages-canonical-stats.sql; the duplicates kept here should eventually be removed.
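+
+# Illustrative robots extract (invented values) showing the raw-vs-rendered
+# comparison performed below: if the raw HTML reports
+#   {"robots": {"raw": {"otherbot": {"status_index": true, "status_follow": true}}}}
+# and the rendered page flips status_index to false, then
+# rendering_changed_robots_meta_tag is reported as true.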
+ +# returns all the data we need from _wpt_bodies +CREATE TEMPORARY FUNCTION getSeoStatsWptBodies(wpt_bodies_string STRING) +RETURNS STRUCT< + + # tags + n_titles INT64, + title_words INT64, + n_meta_descriptions INT64, + n_h1 INT64, + n_h2 INT64, + n_h3 INT64, + n_h4 INT64, + n_non_empty_h1 INT64, + n_non_empty_h2 INT64, + n_non_empty_h3 INT64, + n_non_empty_h4 INT64, + has_same_h1_title BOOL, + + # robots + robots_has_robots_meta_tag BOOL, + robots_has_x_robots_tag BOOL, + rendering_changed_robots_meta_tag BOOL, + + # canonical + has_canonicals BOOL, + has_self_canonical BOOL, + is_canonicalized BOOL, + has_http_canonical BOOL, + has_rendered_canonical BOOL, + has_raw_canonical BOOL, + has_canonical_mismatch BOOL, + rendering_changed_canonical BOOL, + http_header_changed_canonical BOOL, + + # hreflang + rendering_changed_hreflang BOOL, + has_hreflang BOOL, + has_http_hreflang BOOL, + has_rendered_hreflang BOOL, + has_raw_hreflang BOOL, + + # structured data + has_raw_jsonld_or_microdata BOOL, + has_rendered_jsonld_or_microdata BOOL, + rendering_changes_structured_data BOOL, + + # meta robots + rendered_otherbot_status_index BOOL, + rendered_otherbot_status_follow BOOL, + rendered_otherbot_noarchive BOOL, + rendered_otherbot_nosnippet BOOL, + rendered_otherbot_unavailable_after BOOL, + rendered_otherbot_max_snippet BOOL, + rendered_otherbot_max_image_preview BOOL, + rendered_otherbot_max_video_preview BOOL, + rendered_otherbot_notranslate BOOL, + rendered_otherbot_noimageindex BOOL, + rendered_otherbot_nocache BOOL, + + rendered_googlebot_status_index BOOL, + rendered_googlebot_status_follow BOOL, + rendered_googlebot_noarchive BOOL, + rendered_googlebot_nosnippet BOOL, + rendered_googlebot_unavailable_after BOOL, + rendered_googlebot_max_snippet BOOL, + rendered_googlebot_max_image_preview BOOL, + rendered_googlebot_max_video_preview BOOL, + rendered_googlebot_notranslate BOOL, + rendered_googlebot_noimageindex BOOL, + rendered_googlebot_nocache BOOL, + + rendered_googlebot_news_status_index BOOL, + rendered_googlebot_news_status_follow BOOL, + rendered_googlebot_news_noarchive BOOL, + rendered_googlebot_news_nosnippet BOOL, + rendered_googlebot_news_unavailable_after BOOL, + rendered_googlebot_news_max_snippet BOOL, + rendered_googlebot_news_max_image_preview BOOL, + rendered_googlebot_news_max_video_preview BOOL, + rendered_googlebot_news_notranslate BOOL, + rendered_googlebot_news_noimageindex BOOL, + rendered_googlebot_news_nocache BOOL +> LANGUAGE js AS ''' +var result = {}; +try { + var wpt_bodies = JSON.parse(wpt_bodies_string); + + if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result; + + // checks if two string arrays contain the same strings + function compareStringArrays(array1, array2) { + if (!array1 && !array2) return true; // both missing + if (!array1 && array2.length > 0) return false; + if (!array2 && array1.length > 0) return false; + if (array1.length != array2.length) return false; + + array1 = array1.slice(); + array1.sort(); + array2 = array2.slice(); + array2.sort(); + + for (var i = 0; i < array1.length; i++) { + if (array1[i] != array2[i]) { + return false; + } + } + + return true; + } + + var title = wpt_bodies.title; + if (title) { + if (title.rendered) { + var title_rendered = title.rendered; + //Number of words in the title tag + if (title_rendered.primary) { + result.title_words = title_rendered.primary.words; + } + + //If the webpage has a title + result.n_titles = title_rendered.total + } + } + + var meta_description = 
wpt_bodies.meta_description; + if (meta_description) { + + if (meta_description.rendered) { + //If the webpage has a meta description + result.n_meta_descriptions = meta_description.rendered.total; + } + } + + var headings = wpt_bodies.headings; + if (headings) { + var headings_rendered = headings.rendered; + if (headings_rendered) { + + //If the webpage has h1 + result.n_h1 = headings_rendered.h1.total; + + //If the webpage has h2 + result.n_h2 = headings_rendered.h2.total; + + //If the webpage has h3 + result.n_h3 = headings_rendered.h3.total; + + //If the webpage has h4 + result.n_h4 = headings_rendered.h4.total; + + //If the webpage has a non empty h1 + result.n_non_empty_h1 = headings_rendered.h1.non_empty_total; + + //If the webpage has a non empty h2 + result.n_non_empty_h2 = headings_rendered.h2.non_empty_total; + + //If the webpage has a non empty h3 + result.n_non_empty_h3 = headings_rendered.h3.non_empty_total; + + //If the webpage has a non empty h4 + result.n_non_empty_h4 = headings_rendered.h4.non_empty_total; + + + //If h1 and title tag are the same + result.has_same_h1_title = headings_rendered.primary.matches_title; + } + } + + var robots = wpt_bodies.robots; + if (robots) { + result.robots_has_robots_meta_tag = robots.has_robots_meta_tag; + result.robots_has_x_robots_tag = robots.has_x_robots_tag; + + // added to rendered + // has_rendered_robots_meta_tag ??? + // added to raw + // raw and rendered are different + + //rendering_changed_robots_meta_tag + // if the raw and rendered data are different. + if (robots.raw && robots.rendered) { + var rendered = robots.rendered; + var raw = robots.raw; + if ( + rendered.otherbot.status_index !== raw.otherbot.status_index || + rendered.otherbot.status_follow !== raw.otherbot.status_follow || + rendered.googlebot.status_index !== raw.googlebot.status_index || + rendered.googlebot.status_follow !== raw.googlebot.status_follow || + rendered.googlebot_news.status_index !== raw.googlebot_news.status_index || + rendered.googlebot_news.status_follow !== raw.googlebot_news.status_follow || + JSON.stringify(rendered.google) !== JSON.stringify(raw.google) + ) + { + result.rendering_changed_robots_meta_tag = true; + } + else + { + result.rendering_changed_robots_meta_tag = false; + } + + result.rendered_otherbot_status_index = rendered.otherbot.status_index; + result.rendered_otherbot_status_follow = rendered.otherbot.status_follow; + result.rendered_otherbot_noarchive = rendered.otherbot.noarchive === true; + result.rendered_otherbot_nosnippet = rendered.otherbot.nosnippet === true; + result.rendered_otherbot_unavailable_after = rendered.otherbot.unavailable_after === true; + result.rendered_otherbot_max_snippet = rendered.otherbot.max_snippet === true; + result.rendered_otherbot_max_image_preview = rendered.otherbot.max_image_preview === true; + result.rendered_otherbot_max_video_preview = rendered.otherbot.max_video_preview === true; + result.rendered_otherbot_notranslate = rendered.otherbot.notranslate === true; + result.rendered_otherbot_noimageindex = rendered.otherbot.noimageindex === true; + result.rendered_otherbot_nocache = rendered.otherbot.nocache === true; + + result.rendered_googlebot_status_index = rendered.googlebot.status_index; + result.rendered_googlebot_status_follow = rendered.googlebot.status_follow; + result.rendered_googlebot_noarchive = rendered.googlebot.noarchive === true; + result.rendered_googlebot_nosnippet = rendered.googlebot.nosnippet === true; + result.rendered_googlebot_unavailable_after = 
+      result.rendered_googlebot_status_index = rendered.googlebot.status_index;
+      result.rendered_googlebot_status_follow = rendered.googlebot.status_follow;
+      result.rendered_googlebot_noarchive = rendered.googlebot.noarchive === true;
+      result.rendered_googlebot_nosnippet = rendered.googlebot.nosnippet === true;
+      result.rendered_googlebot_unavailable_after = rendered.googlebot.unavailable_after === true;
+      result.rendered_googlebot_max_snippet = rendered.googlebot.max_snippet === true;
+      result.rendered_googlebot_max_image_preview = rendered.googlebot.max_image_preview === true;
+      result.rendered_googlebot_max_video_preview = rendered.googlebot.max_video_preview === true;
+      result.rendered_googlebot_notranslate = rendered.googlebot.notranslate === true;
+      result.rendered_googlebot_noimageindex = rendered.googlebot.noimageindex === true;
+      result.rendered_googlebot_nocache = rendered.googlebot.nocache === true;
+
+      result.rendered_googlebot_news_status_index = rendered.googlebot_news.status_index;
+      result.rendered_googlebot_news_status_follow = rendered.googlebot_news.status_follow;
+      result.rendered_googlebot_news_noarchive = rendered.googlebot_news.noarchive === true;
+      result.rendered_googlebot_news_nosnippet = rendered.googlebot_news.nosnippet === true;
+      result.rendered_googlebot_news_unavailable_after = rendered.googlebot_news.unavailable_after === true;
+      result.rendered_googlebot_news_max_snippet = rendered.googlebot_news.max_snippet === true;
+      result.rendered_googlebot_news_max_image_preview = rendered.googlebot_news.max_image_preview === true;
+      result.rendered_googlebot_news_max_video_preview = rendered.googlebot_news.max_video_preview === true;
+      result.rendered_googlebot_news_notranslate = rendered.googlebot_news.notranslate === true;
+      result.rendered_googlebot_news_noimageindex = rendered.googlebot_news.noimageindex === true;
+      result.rendered_googlebot_news_nocache = rendered.googlebot_news.nocache === true;
+    }
+  }
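+
+  // Note: "canoncials" and "missmatch" below are not typos in this query;
+  // they match the key names used in the _wpt_bodies payload, so they must
+  // be kept as-is for the lookups to work.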
+  var canonicals = wpt_bodies.canonicals;
+  if (canonicals) {
+
+    if (canonicals.canonicals && canonicals.canonicals.length) {
+      result.has_canonicals = canonicals.canonicals.length > 0;
+    }
+
+    if (canonicals.self_canonical) {
+      result.has_self_canonical = canonicals.self_canonical;
+    }
+
+    if (canonicals.other_canonical) {
+      result.is_canonicalized = canonicals.other_canonical;
+    }
+
+    if (canonicals.http_header_link_canoncials) {
+      result.has_http_canonical = canonicals.http_header_link_canoncials.length > 0;
+    }
+
+    result.has_rendered_canonical = false; // default to false: used in a NOT, so it must always be set
+    if (canonicals.rendered && canonicals.rendered.html_link_canoncials) {
+      result.has_rendered_canonical = canonicals.rendered.html_link_canoncials.length > 0;
+    }
+
+    result.has_raw_canonical = false; // default to false: used in a NOT, so it must always be set
+    if (canonicals.raw && canonicals.raw.html_link_canoncials) {
+      result.has_raw_canonical = canonicals.raw.html_link_canoncials.length > 0;
+    }
+
+    if (canonicals.canonical_missmatch) {
+      result.has_canonical_mismatch = canonicals.canonical_missmatch;
+    }
+
+    if (canonicals.raw && canonicals.rendered) {
+      result.rendering_changed_canonical = !compareStringArrays(canonicals.raw.html_link_canoncials, canonicals.rendered.html_link_canoncials);
+    }
+
+    if (canonicals.raw && canonicals.http_header_link_canoncials && canonicals.http_header_link_canoncials.length > 0) {
+      result.http_header_changed_canonical = !compareStringArrays(canonicals.raw.html_link_canoncials, canonicals.http_header_link_canoncials);
+    }
+  }
+
+  var hreflangs = wpt_bodies.hreflangs;
+  if (hreflangs) {
+
+    if (hreflangs.raw && hreflangs.raw.values && hreflangs.rendered && hreflangs.rendered.values) {
+      result.rendering_changed_hreflang = !compareStringArrays(hreflangs.raw.values, hreflangs.rendered.values);
+    }
+
+    if (hreflangs.rendered && hreflangs.rendered.values) {
+      result.has_hreflang = hreflangs.rendered.values.length > 0;
+    }
+
+    if (hreflangs.http_header && hreflangs.http_header.values) {
+      result.has_http_hreflang = hreflangs.http_header.values.length > 0;
+    }
+
+    result.has_rendered_hreflang = false; // default to false: used in a NOT, so it must always be set
+    if (hreflangs.rendered && hreflangs.rendered.values) {
+      result.has_rendered_hreflang = hreflangs.rendered.values.length > 0;
+    }
+
+    result.has_raw_hreflang = false; // default to false: used in a NOT, so it must always be set
+    if (hreflangs.raw && hreflangs.raw.values) {
+      result.has_raw_hreflang = hreflangs.raw.values.length > 0;
+    }
+  }
+
+  var structured_data = wpt_bodies.structured_data;
+  if (structured_data) {
+    result.has_raw_jsonld_or_microdata = structured_data.raw && structured_data.raw.jsonld_and_microdata_types && structured_data.raw.jsonld_and_microdata_types.length > 0;
+    result.has_rendered_jsonld_or_microdata = structured_data.rendered && structured_data.rendered.jsonld_and_microdata_types && structured_data.rendered.jsonld_and_microdata_types.length > 0;
+
+    if (structured_data.raw && structured_data.rendered) {
+      result.rendering_changes_structured_data = JSON.stringify(structured_data.raw) !== JSON.stringify(structured_data.rendered);
+    }
+  }
+} catch (e) {}
+return result;
+''';
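+
+# The subquery below returns one row per page, so COUNT(0) counts pages here.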
+SELECT
+  client,
+  COUNT(0) AS total,
+  is_root_page,
+
+  # Title tag inclusion
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_titles > 0), COUNT(0)) AS pct_has_title_tag,
+
+  # Meta description inclusion
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_meta_descriptions > 0), COUNT(0)) AS pct_has_meta_description,
+
+  # H1-H4 inclusion
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_h1 > 0), COUNT(0)) AS pct_has_h1,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_h2 > 0), COUNT(0)) AS pct_has_h2,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_h3 > 0), COUNT(0)) AS pct_has_h3,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_h4 > 0), COUNT(0)) AS pct_has_h4,
+
+  # Non-empty H1-H4 inclusion
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_non_empty_h1 > 0), COUNT(0)) AS pct_has_non_empty_h1,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_non_empty_h2 > 0), COUNT(0)) AS pct_has_non_empty_h2,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_non_empty_h3 > 0), COUNT(0)) AS pct_has_non_empty_h3,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.n_non_empty_h4 > 0), COUNT(0)) AS pct_has_non_empty_h4,
+
+  # Same title and H1
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_same_h1_title), COUNT(0)) AS pct_has_same_h1_title,
+
+  # Meta robots inclusion
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.robots_has_robots_meta_tag), COUNT(0)) AS pct_has_meta_robots,
+
+  # X-Robots-Tag HTTP header inclusion
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.robots_has_x_robots_tag), COUNT(0)) AS pct_has_x_robots_tag,
+
+  # Both meta robots and X-Robots-Tag
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.robots_has_robots_meta_tag AND wpt_bodies_info.robots_has_x_robots_tag), COUNT(0)) AS pct_has_meta_robots_and_x_robots_tag,
+
+  # Rendering changed robots
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendering_changed_robots_meta_tag), COUNT(0)) AS pct_rendering_changed_robots_meta_tag,
+
+  # Pages with canonical
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_canonicals), COUNT(0)) AS pct_has_canonical,
+
+  # Pages with self-canonical
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_self_canonical), COUNT(0)) AS pct_has_self_canonical,
+
+  # Pages canonicalized (declaring a canonical that points to a different URL)
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.is_canonicalized), COUNT(0)) AS pct_is_canonicalized,
+
+  # Pages with canonical in HTTP header
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_http_canonical), COUNT(0)) AS pct_http_canonical,
+
+  # Pages with canonical in raw HTML
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_raw_canonical), COUNT(0)) AS pct_has_raw_canonical,
+
+  # Pages with canonical in rendered HTML
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_rendered_canonical), COUNT(0)) AS pct_has_rendered_canonical,
+
+  # Pages with canonical in rendered but not raw HTML
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_rendered_canonical AND NOT wpt_bodies_info.has_raw_canonical), COUNT(0)) AS pct_has_rendered_but_not_raw_canonical,
+
+  # Pages with canonical mismatch
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_canonical_mismatch), COUNT(0)) AS pct_has_canonical_mismatch,
+
+  # Pages with canonical conflict between raw and rendered
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendering_changed_canonical), COUNT(0)) AS pct_has_conflict_rendering_changed_canonical,
+
+  # Pages with canonical conflict between raw and HTTP header
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.http_header_changed_canonical), COUNT(0)) AS pct_has_conflict_http_header_changed_canonical,
+
+  # Pages with canonical conflict between raw and HTTP header, or between raw and rendered
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.http_header_changed_canonical OR wpt_bodies_info.rendering_changed_canonical), COUNT(0)) AS pct_has_conflict_http_header_or_rendering_changed_canonical,
+
+  # Pages with hreflang conflict between raw and rendered
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendering_changed_hreflang), COUNT(0)) AS pct_has_conflict_raw_rendered_hreflang,
+
+  # Pages with hreflang
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_hreflang), COUNT(0)) AS pct_has_hreflang,
+
+  # Pages with hreflang in HTTP header
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_http_hreflang), COUNT(0)) AS pct_has_http_hreflang,
+
+  # Pages with rendered hreflang
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_rendered_hreflang), COUNT(0)) AS pct_has_rendered_hreflang,
+
+  # Pages with raw hreflang
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_raw_hreflang), COUNT(0)) AS pct_has_raw_hreflang,
+
+  # Pages with hreflang in rendered but not raw HTML
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_rendered_hreflang AND NOT wpt_bodies_info.has_raw_hreflang), COUNT(0)) AS pct_has_rendered_but_not_raw_hreflang,
+
+  # Pages with raw JSON-LD or microdata
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_raw_jsonld_or_microdata), COUNT(0)) AS pct_has_raw_jsonld_or_microdata,
+
+  # Pages with rendered JSON-LD or microdata
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_rendered_jsonld_or_microdata), COUNT(0)) AS pct_has_rendered_jsonld_or_microdata,
+
+  # Pages with only rendered JSON-LD or microdata
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.has_rendered_jsonld_or_microdata AND NOT wpt_bodies_info.has_raw_jsonld_or_microdata), COUNT(0)) AS pct_has_only_rendered_jsonld_or_microdata,
+
+  # Pages where rendering changed the JSON-LD or microdata
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendering_changes_structured_data), COUNT(0)) AS pct_rendering_changes_structured_data,
+
+  # Pages served over HTTPS
+  SAFE_DIVIDE(COUNTIF(protocol = 'https'), COUNT(0)) AS pct_https,
+
+  # Meta robots directives (rendered)
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_status_index), COUNT(0)) AS pct_rendered_otherbot_status_index,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_status_follow), COUNT(0)) AS pct_rendered_otherbot_status_follow,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_noarchive), COUNT(0)) AS pct_rendered_otherbot_noarchive,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_nosnippet), COUNT(0)) AS pct_rendered_otherbot_nosnippet,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_unavailable_after), COUNT(0)) AS pct_rendered_otherbot_unavailable_after,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_max_snippet), COUNT(0)) AS pct_rendered_otherbot_max_snippet,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_max_image_preview), COUNT(0)) AS pct_rendered_otherbot_max_image_preview,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_max_video_preview), COUNT(0)) AS pct_rendered_otherbot_max_video_preview,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_notranslate), COUNT(0)) AS pct_rendered_otherbot_notranslate,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_noimageindex), COUNT(0)) AS pct_rendered_otherbot_noimageindex,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_otherbot_nocache), COUNT(0)) AS pct_rendered_otherbot_nocache,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_status_index), COUNT(0)) AS pct_rendered_googlebot_status_index,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_status_follow), COUNT(0)) AS pct_rendered_googlebot_status_follow,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_noarchive), COUNT(0)) AS pct_rendered_googlebot_noarchive,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_nosnippet), COUNT(0)) AS pct_rendered_googlebot_nosnippet,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_unavailable_after), COUNT(0)) AS pct_rendered_googlebot_unavailable_after,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_max_snippet), COUNT(0)) AS pct_rendered_googlebot_max_snippet,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_max_image_preview), COUNT(0)) AS pct_rendered_googlebot_max_image_preview,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_max_video_preview), COUNT(0)) AS pct_rendered_googlebot_max_video_preview,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_notranslate), COUNT(0)) AS pct_rendered_googlebot_notranslate,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_noimageindex), COUNT(0)) AS pct_rendered_googlebot_noimageindex,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_nocache), COUNT(0)) AS pct_rendered_googlebot_nocache,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_status_index), COUNT(0)) AS pct_rendered_googlebot_news_status_index,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_status_follow), COUNT(0)) AS pct_rendered_googlebot_news_status_follow,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_noarchive), COUNT(0)) AS pct_rendered_googlebot_news_noarchive,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_nosnippet), COUNT(0)) AS pct_rendered_googlebot_news_nosnippet,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_unavailable_after), COUNT(0)) AS pct_rendered_googlebot_news_unavailable_after,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_max_snippet), COUNT(0)) AS pct_rendered_googlebot_news_max_snippet,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_max_image_preview), COUNT(0)) AS pct_rendered_googlebot_news_max_image_preview,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_max_video_preview), COUNT(0)) AS pct_rendered_googlebot_news_max_video_preview,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_notranslate), COUNT(0)) AS pct_rendered_googlebot_news_notranslate,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_noimageindex), COUNT(0)) AS pct_rendered_googlebot_news_noimageindex,
+  SAFE_DIVIDE(COUNTIF(wpt_bodies_info.rendered_googlebot_news_nocache), COUNT(0)) AS pct_rendered_googlebot_news_nocache
+
+FROM (
+  SELECT
+    client,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END
+    AS is_root_page,
+    SPLIT(page, ':')[OFFSET(0)] AS protocol,
+    getSeoStatsWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+GROUP BY
+  client,
+  is_root_page
diff --git a/sql/2025/seo/seo-stats-by-percentile-2025.sql b/sql/2025/seo/seo-stats-by-percentile-2025.sql
new file mode 100644
index 00000000000..1e7afcfa5e3
--- /dev/null
+++ b/sql/2025/seo/seo-stats-by-percentile-2025.sql
@@ -0,0 +1,160 @@
+#standardSQL
+# SEO stats by percentile
+
+# returns all the data we need from _wpt_bodies
+CREATE TEMPORARY FUNCTION get_wpt_bodies_info(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  title_words INT64,
+  title_characters INT64,
+  links_other_property INT64,
+  links_same_site INT64,
+  links_same_property INT64,
+  visible_words_rendered_count INT64,
+  visible_words_raw_count INT64,
+  meta_description_words INT64,
+  meta_description_characters INT64,
+  image_links INT64,
+  text_links INT64,
+  hash_link INT64,
+  hash_only_link INT64,
+  javascript_void_links INT64,
+  same_page_jumpto_total INT64,
+  same_page_dynamic_total INT64,
+  same_page_other_total INT64,
+
+  valid_data BOOL
+> LANGUAGE js AS '''
+
+function allPropsAreInt(props) {
+  const keys = Object.keys(props);
+  for (const key of keys) {
+    if (!Number.isInteger(props[key])) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+try {
+  var result = {};
+
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') {
+    result.valid_data = false;
+    return result;
+  }
+
+  if (wpt_bodies.title) {
+    if (wpt_bodies.title.rendered) {
+      result.title_words = wpt_bodies.title.rendered.primary.words;
+      result.title_characters = wpt_bodies.title.rendered.primary.characters;
+    }
+  }
+
+  if (wpt_bodies.visible_words) {
+    result.visible_words_rendered_count = wpt_bodies.visible_words.rendered;
+    result.visible_words_raw_count = wpt_bodies.visible_words.raw;
+  }
+
+  if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
+    var anchors_rendered = wpt_bodies.anchors.rendered;
+
+    result.links_other_property = anchors_rendered.other_property;
+    result.links_same_site = anchors_rendered.same_site;
+    result.links_same_property = anchors_rendered.same_property;
+
+    result.image_links = anchors_rendered.image_links;
+    result.text_links = anchors_rendered.text_links;
+
+    result.hash_link = anchors_rendered.hash_link;
+    result.hash_only_link = anchors_rendered.hash_only_link;
+    result.javascript_void_links = anchors_rendered.javascript_void_links;
+    result.same_page_jumpto_total = anchors_rendered.same_page.jumpto.total;
+    result.same_page_dynamic_total = anchors_rendered.same_page.dynamic.total;
+    result.same_page_other_total = anchors_rendered.same_page.other.total;
+  }
+
+  if (wpt_bodies.meta_description && wpt_bodies.meta_description.rendered && wpt_bodies.meta_description.rendered.primary) {
+    result.meta_description_characters = wpt_bodies.meta_description.rendered.primary.characters;
+    result.meta_description_words = wpt_bodies.meta_description.rendered.primary.words;
+  }
+
+  // There was an invalid value somewhere. Throw out all the results for this page.
+  if (!allPropsAreInt(result)) {
+    return {
+      valid_data: false,
+    };
+  }
+
+  result.valid_data = true;
+  return result;
+} catch (e) {
+  return {
+    valid_data: false,
+  };
+}
+''';
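+
+# APPROX_QUANTILES(x, 1000) returns 1001 quantile boundaries, so
+# [OFFSET(percentile * 10)] selects the requested percentile
+# (e.g. OFFSET(500) is the median).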
+SELECT
+  percentile,
+  client,
+  is_root_page,
+  COUNT(DISTINCT page) AS total,
+
+  # title
+  APPROX_QUANTILES(wpt_bodies_info.title_words, 1000)[OFFSET(percentile * 10)] AS title_words,
+  APPROX_QUANTILES(wpt_bodies_info.title_characters, 1000)[OFFSET(percentile * 10)] AS title_characters,
+
+  # meta description
+  APPROX_QUANTILES(wpt_bodies_info.meta_description_words, 1000)[OFFSET(percentile * 10)] AS meta_description_words,
+  APPROX_QUANTILES(wpt_bodies_info.meta_description_characters, 1000)[OFFSET(percentile * 10)] AS meta_description_characters,
+
+  # links
+  APPROX_QUANTILES(wpt_bodies_info.links_other_property, 1000)[OFFSET(percentile * 10)] AS outgoing_links_external,
+  APPROX_QUANTILES(wpt_bodies_info.links_same_property + wpt_bodies_info.links_same_site + wpt_bodies_info.links_other_property, 1000)[OFFSET(percentile * 10)] AS outgoing_links,
+  APPROX_QUANTILES(wpt_bodies_info.links_same_property + wpt_bodies_info.links_same_site, 1000)[OFFSET(percentile * 10)] AS outgoing_links_internal,
+
+  APPROX_QUANTILES(wpt_bodies_info.image_links, 1000)[OFFSET(percentile * 10)] AS image_links,
+  APPROX_QUANTILES(wpt_bodies_info.text_links, 1000)[OFFSET(percentile * 10)] AS text_links,
+
+  APPROX_QUANTILES(wpt_bodies_info.hash_link, 1000)[OFFSET(percentile * 10)] AS hash_links,
+  APPROX_QUANTILES(wpt_bodies_info.hash_only_link, 1000)[OFFSET(percentile * 10)] AS hash_only_links,
+  APPROX_QUANTILES(wpt_bodies_info.javascript_void_links, 1000)[OFFSET(percentile * 10)] AS javascript_void_links,
+  APPROX_QUANTILES(wpt_bodies_info.same_page_jumpto_total, 1000)[OFFSET(percentile * 10)] AS same_page_jumpto_links,
+  APPROX_QUANTILES(wpt_bodies_info.same_page_dynamic_total, 1000)[OFFSET(percentile * 10)] AS same_page_dynamic_links,
+  APPROX_QUANTILES(wpt_bodies_info.same_page_other_total, 1000)[OFFSET(percentile * 10)] AS same_page_other_links,
+
+  # Share of links that are image links
+  APPROX_QUANTILES(SAFE_DIVIDE(wpt_bodies_info.image_links, wpt_bodies_info.image_links + wpt_bodies_info.text_links), 1000)[OFFSET(percentile * 10)] AS image_links_percent,
+
+  # words
+  APPROX_QUANTILES(wpt_bodies_info.visible_words_rendered_count, 1000)[OFFSET(percentile * 10)] AS visible_words_rendered,
+  APPROX_QUANTILES(wpt_bodies_info.visible_words_raw_count, 1000)[OFFSET(percentile * 10)] AS visible_words_raw
+
+FROM (
+  SELECT
+    client,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END
+    AS is_root_page,
+    percentile,
+    page,
+    get_wpt_bodies_info(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS wpt_bodies_info
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST([10, 25, 50, 75, 90]) AS percentile
+  WHERE
+    date = '2025-06-01'
+)
+WHERE
+  wpt_bodies_info.valid_data
+GROUP BY
+  percentile,
+  is_root_page,
+  client
+ORDER BY
+  percentile,
+  client
diff --git a/sql/2025/seo/structured-data-formats-2025.sql b/sql/2025/seo/structured-data-formats-2025.sql
new file mode 100644
index 00000000000..dd3bcc49aa2
--- /dev/null
+++ b/sql/2025/seo/structured-data-formats-2025.sql
@@ -0,0 +1,70 @@
+#standardSQL
+# Structured data formats
+
+# returns all the data we need from _wpt_bodies
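+# items_by_format maps each structured data format found on the page
+# (e.g. jsonld, microdata) to a count; only formats with a count > 0
+# are returned.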
+CREATE TEMPORARY FUNCTION getStructuredDataWptBodies(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  items_by_format ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {
+  items_by_format: []
+};
+
+// Function to retrieve only the keys whose value is > 0
+function getKey(dict){
+  const arr = [],
+    obj = Object.keys(dict);
+  for (var x in obj){
+    if(dict[obj[x]] > 0){
+      arr.push(obj[x]);
+    }
+  }
+  return arr;
+}
+
+try {
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
+
+  if (wpt_bodies.structured_data && wpt_bodies.structured_data.rendered && wpt_bodies.structured_data.rendered.items_by_format) {
+    result.items_by_format = getKey(wpt_bodies.structured_data.rendered.items_by_format);
+  }
+
+} catch (e) {}
+return result;
+''';
+
+WITH structured_data AS (
+  SELECT
+    client,
+    root_page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    page,
+    getStructuredDataWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS structured_data_wpt_bodies_info,
+    COUNT(DISTINCT root_page) OVER (PARTITION BY client) AS total_sites
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  format,
+  COUNT(DISTINCT root_page) AS sites,
+  COUNT(DISTINCT root_page) / ANY_VALUE(total_sites) AS pct
+FROM
+  structured_data,
+  UNNEST(structured_data_wpt_bodies_info.items_by_format) AS format
+GROUP BY
+  client,
+  is_root_page,
+  format
+ORDER BY
+  sites DESC
diff --git a/sql/2025/seo/structured-data-schema-types-2025.sql b/sql/2025/seo/structured-data-schema-types-2025.sql
new file mode 100644
index 00000000000..53bdd5bf3f1
--- /dev/null
+++ b/sql/2025/seo/structured-data-schema-types-2025.sql
@@ -0,0 +1,56 @@
+#standardSQL
+# Structured data schema types
+
+# returns all the data we need from _wpt_bodies
+CREATE TEMPORARY FUNCTION getStructuredSchemaWptBodies(wpt_bodies_string STRING)
+RETURNS STRUCT<
+  jsonld_and_microdata_types ARRAY<STRING>
+> LANGUAGE js AS '''
+var result = {};
+try {
+  var wpt_bodies = JSON.parse(wpt_bodies_string);
+
+  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;
+
+  if (wpt_bodies.structured_data && wpt_bodies.structured_data.rendered) {
+    var temp = wpt_bodies.structured_data.rendered.jsonld_and_microdata_types;
+    result.jsonld_and_microdata_types = temp.map(a => a.name);
+  }
+} catch (e) {}
+return result;
+''';
+
+WITH structured_data AS (
+  SELECT
+    client,
+    root_page,
+    CASE
+      WHEN is_root_page = FALSE THEN 'Secondarypage'
+      WHEN is_root_page = TRUE THEN 'Homepage'
+      ELSE 'No Assigned Page'
+    END AS is_root_page,
+    getStructuredSchemaWptBodies(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')) AS structured_schema_wpt_bodies_info,
+    COUNT(DISTINCT root_page) OVER (PARTITION BY client) AS total_sites
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  is_root_page,
+  type,
+  COUNT(DISTINCT root_page) AS sites,
+  COUNT(DISTINCT root_page) / ANY_VALUE(total_sites) AS pct
+FROM
+  structured_data,
+  UNNEST(structured_schema_wpt_bodies_info.jsonld_and_microdata_types) AS type
+GROUP BY
+  type,
+  is_root_page,
+  client
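+# keep only schema types that appear on more than 50 sites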
+HAVING
+  sites > 50
+ORDER BY
+  sites DESC
diff --git a/sql/2025/seo/unused-css-js-2025.sql b/sql/2025/seo/unused-css-js-2025.sql
new file mode 100644
index 00000000000..f9a06701d46
--- /dev/null
+++ b/sql/2025/seo/unused-css-js-2025.sql
@@ -0,0 +1,44 @@
+#standardSQL
+# Gather Lighthouse unused CSS and JS by CrUX rank
+
+SELECT
+  client,
+  rank_grouping,
+  CASE
+    WHEN rank_grouping = 100000000 THEN 'all'
+    ELSE FORMAT("%'d", rank_grouping)
+  END AS ranking,
+  COUNT(DISTINCT page) AS pages,
+  SUM(unused_javascript) / COUNT(DISTINCT page) AS unused_javascript_kib_avg,
+  SUM(unused_css_rules) / COUNT(DISTINCT page) AS unused_css_rules_kib_avg
+
+FROM (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.crawl.pages`
+  WHERE date = '2025-06-01'
+)
+
+LEFT JOIN (
+  SELECT
+    client,
+    page,
+    SAFE_DIVIDE(CAST(JSON_EXTRACT_SCALAR(report, '$.audits.unused-javascript.details.overallSavingsBytes') AS INT64), 1024) AS unused_javascript,
+    SAFE_DIVIDE(CAST(JSON_EXTRACT_SCALAR(report, '$.audits.unused-css-rules.details.overallSavingsBytes') AS INT64), 1024) AS unused_css_rules
+  FROM
+    `httparchive.crawl.pages`
+  WHERE date = '2025-06-01'
+)
+
+USING (client, page),
+  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping
+GROUP BY
+  client,
+  rank_grouping
+ORDER BY
+  rank_grouping
diff --git a/sql/2025/seo/videos_per_page-2025.sql b/sql/2025/seo/videos_per_page-2025.sql
new file mode 100644
index 00000000000..bf20c0778ce
--- /dev/null
+++ b/sql/2025/seo/videos_per_page-2025.sql
@@ -0,0 +1,50 @@
+#standardSQL
+# Videos per page
+
+# returns all the data we need from _almanac
+CREATE TEMPORARY FUNCTION getVideosAlmanacInfo(almanac_string STRING)
+RETURNS STRUCT<
+  videos_total INT64
+> LANGUAGE js AS '''
+var result = {
+  videos_total: 0
+};
+try {
+  var almanac = JSON.parse(almanac_string);
+
+  if (Array.isArray(almanac) || typeof almanac != 'object') return result;
+
+  if (almanac.videos && almanac.videos.total) {
+    result.videos_total = almanac.videos.total;
+  }
+} catch (e) {}
+return result;
+''';
+
+SELECT
+  percentile,
+  client,
+  COUNT(DISTINCT page) AS total,
+
+  # videos per page
+  APPROX_QUANTILES(video_almanac_info.videos_total, 1000)[OFFSET(percentile * 10)] AS videos_count
+
+FROM (
+  SELECT
+    client,
+    percentile,
+    page,
+    getVideosAlmanacInfo(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS video_almanac_info
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST([10, 25, 50, 75, 90]) AS percentile
+  WHERE date = '2025-06-01'
+)
+WHERE
+  video_almanac_info.videos_total > 0
+GROUP BY
+  percentile,
+  client
+ORDER BY
+  percentile,
+  client