Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
4 changes: 2 additions & 2 deletions sql/2025/seo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
- [📊 Results sheet][~google-sheets]
- [📝 Markdown file][~chapter-markdown]

[~google-doc]: https://docs.google.com/document/d/1CM6CV86bDndFI1EaOrplvxdclldcToQ8GFkq2POpNlg
[~google-sheets]: https://docs.google.com/spreadsheets/d/1MoWoxogYWH6fv5r485EttvVgJuw7dMzzcot66X3MWu4/edit
[~google-doc]: https://docs.google.com/document/d/1SZL_TF3IGyq_yLATjZ7OA2bAXPzvRD5nJrCMR4ZeiYk/edit
[~google-sheets]: https://docs.google.com/spreadsheets/d/1lAQKcOF7l6xz9v7yvnI9I1F8yiSqcz3Xx6u-5ady1DQ/edit#gid=1778117656
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2025/seo.md
66 changes: 66 additions & 0 deletions sql/2025/seo/anchor-rel-attribute-usage-2025.sql

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Query returning no data

Copy link

@chr156r33n chr156r33n Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now works, new SQL:

-- Returns the rel attribute values that wpt_bodies reports with a positive
-- count under anchors.rendered.rel_attributes. Anything that cannot be parsed
-- or has a different shape yields an empty array.
CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<rel ARRAY<STRING>>
LANGUAGE js AS '''
var output = {rel: []};

// Names of the entries in counts whose numeric value is above zero.
function positiveKeys(counts) {
  return Object.keys(counts || {}).filter(function (name) {
    return Number(counts[name]) > 0;
  });
}

try {
  var parsed = JSON.parse(wpt_bodies_string);
  var isPlainObject = typeof parsed === 'object' && !Array.isArray(parsed);
  if (!isPlainObject) return output;

  var rendered = parsed.anchors && parsed.anchors.rendered;
  if (rendered && rendered.rel_attributes) {
    output.rel = positiveKeys(rendered.rel_attributes);
  }
} catch (e) {
  // Best effort: unparseable or null input keeps the empty default.
}
return output;
''';

WITH rel_stats_table AS (
  -- One row per crawled page, labelled as home or secondary page, with the
  -- rel values extracted by getRelStatsWptBodies.
  SELECT
    client,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    -- wpt_bodies is addressed directly on the custom_metrics record, which
    -- avoids serialising the whole record with TO_JSON first. It is still
    -- converted to a string because the UDF parses a STRING argument.
    getRelStatsWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS wpt_bodies_info
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-06-01'
)

SELECT
  client,
  is_root_page,
  rel,
  -- Distinct pages on which this rel value has a positive count.
  COUNT(DISTINCT page) AS sites,
  -- Sum of the per-rel page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- Rows for this rel value divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM rel_stats_table
CROSS JOIN UNNEST(wpt_bodies_info.rel) AS rel
GROUP BY client, is_root_page, rel
ORDER BY sites DESC, rel, client DESC;

Output - new sample - https://docs.google.com/spreadsheets/d/1mBFI6sXDuqP72No4VpZUv-5xBI8U_AUdgzmmz_c9FDY/edit?usp=sharing

Copy link

@chr156r33n chr156r33n Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Key differences (old vs new)

  1. Source location

    • Old: JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')
    • New: TO_JSON_STRING(JSON_QUERY(TO_JSON(custom_metrics), '$.wpt_bodies'))
    • _wpt_bodies moved from inside payload (JSON string) to custom_metrics (STRUCT).
  2. Type handling

    • Old UDFs received a STRING.
    • JSON_QUERY now returns JSON, so wrap with TO_JSON_STRING(...) before passing to JS UDFs.
  3. JSON path stability

    • Old crawls: wpt_bodies.anchors.rendered.rel_attributes
    • Newer crawls: wpt_bodies.anchors.raw.rel_attributes
  4. Other points

    • REGEXP_CONTAINS on JSON → wrap in TO_JSON_STRING(...).
    • Aggregating BOOLs → use COUNTIF(...).
    • custom_metrics is a STRUCT → convert with TO_JSON(...) before pathing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This:

TO_JSON_STRING(JSON_QUERY(TO_JSON(custom_metrics), '$.wpt_bodies'))

can probably be simplified to this:

custom_metrics.wpt_bodies

the beauty of the new JSON type columns is you can reference them directly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Old UDFs received a STRING.

You can change these to expect JSON input instead. That saves you converting to a string before passing it in, and then converting it back to JSON inside the UDF.

Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#standardSQL
# Anchor rel attribute usage
# This query reports if a rel attribute value was ever used on a page, and calculates various statistics.

# Returns the rel attribute values listed under
# anchors.rendered.rel_attributes whose count is greater than zero.
# Input that fails to parse, or that lacks that path, yields an empty array.
CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  rel ARRAY<STRING>
> LANGUAGE js AS '''
var output = {rel: []};

// Keeps only the keys whose value is > 0.
function positiveKeys(counts) {
  return Object.keys(counts).filter(function (name) {
    return counts[name] > 0;
  });
}

try {
  var parsed = JSON.parse(wpt_bodies_string);
  var isPlainObject = typeof parsed == 'object' && !Array.isArray(parsed);
  if (!isPlainObject) return output;

  var rendered = parsed.anchors && parsed.anchors.rendered;
  if (rendered && rendered.rel_attributes) {
    output.rel = positiveKeys(rendered.rel_attributes);
  }
} catch (e) {
  // Best effort: keep the empty default when parsing or traversal fails.
}
return output;
''';

WITH rel_stats_table AS (
  # One row per crawled page, labelled as home or secondary page, with the
  # rel values extracted by getRelStatsWptBodies.
  SELECT
    client,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    # wpt_bodies is read from custom_metrics rather than from
    # payload._wpt_bodies, which returned no data for this crawl. It is
    # serialised because the UDF parses a STRING argument.
    getRelStatsWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS wpt_bodies_info
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-06-01'
)

SELECT
  client,
  is_root_page,
  rel,
  # Distinct pages on which this rel value has a positive count.
  COUNT(DISTINCT page) AS sites,
  # Sum of the per-rel page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  # Rows for this rel value divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  rel_stats_table
CROSS JOIN
  UNNEST(wpt_bodies_info.rel) AS rel
GROUP BY
  client,
  is_root_page,
  rel
ORDER BY
  sites DESC,
  rel,
  client DESC;
74 changes: 74 additions & 0 deletions sql/2025/seo/anchor-same-site-occurance-stats-2025.sql
Copy link

@chr156r33n chr156r33n Aug 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link

@chr156r33n chr156r33n Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed, SQL, new sampled output - https://docs.google.com/spreadsheets/d/1MdX8mhyuuz5vPyiHq4BSQ0S1Tf2dytMcQFsNUt5dMqA/edit?usp=sharing

#standardSQL
-- Anchor same site occurrence stats

-- Extracts four link counters from anchors.rendered in wpt_bodies:
-- same_site, plus same_page.dynamic's onclick window_location / window_open
-- counts and its href_javascript count. Missing or non-numeric values, and
-- input that cannot be parsed, come back as 0.
CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  links_same_site INT64,
  links_window_location INT64,
  links_window_open INT64,
  links_href_javascript INT64
>
LANGUAGE js AS '''
var counters = {
  links_same_site: 0,
  links_window_location: 0,
  links_window_open: 0,
  links_href_javascript: 0
};

// Numeric value of v, or 0 when v is missing, NaN or zero.
function toCount(v) {
  return Number(v) || 0;
}

try {
  var bodies = JSON.parse(wpt_bodies_string);
  if (typeof bodies !== 'object' || Array.isArray(bodies)) return counters;

  var rendered = null;
  if (bodies && bodies.anchors && bodies.anchors.rendered) {
    rendered = bodies.anchors.rendered;
  }
  if (!rendered) return counters;

  var dynamic = {};
  if (rendered.same_page && rendered.same_page.dynamic) {
    dynamic = rendered.same_page.dynamic;
  }
  var onclick = dynamic.onclick_attributes || {};

  counters.links_same_site = toCount(rendered.same_site);
  counters.links_window_location = toCount(onclick.window_location);
  counters.links_window_open = toCount(onclick.window_open);
  counters.links_href_javascript = toCount(dynamic.href_javascript);
} catch (e) {
  // Best effort: keep whatever defaults are already in counters.
}
return counters;
''';

WITH same_links_info AS (
  -- One row per crawled page, labelled as home or secondary page, with the
  -- link counters extracted by getLinkDesciptionsWptBodies.
  SELECT
    client,
    page,
    CASE WHEN is_root_page THEN 'Homepage' ELSE 'Secondarypage' END AS is_root_page,
    -- wpt_bodies is addressed directly on the custom_metrics record, which
    -- avoids serialising the whole record with TO_JSON first. It is still
    -- converted to a string because the UDF parses a STRING argument.
    getLinkDesciptionsWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS wpt_bodies_info
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-06-01'
)

SELECT
  client,
  wpt_bodies_info.links_same_site AS links_same_site,
  is_root_page,
  -- Distinct pages that have exactly this many same-site links.
  COUNT(DISTINCT page) AS sites,
  -- Rows per distinct page within the bucket.
  SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT page)) AS pct_links_same_site,
  AVG(wpt_bodies_info.links_window_location) AS avg_links_window_location,
  AVG(wpt_bodies_info.links_window_open) AS avg_links_window_open,
  AVG(wpt_bodies_info.links_href_javascript) AS avg_links_href_javascript,
  AVG(
    wpt_bodies_info.links_window_location +
    wpt_bodies_info.links_window_open +
    wpt_bodies_info.links_href_javascript
  ) AS avg_links_any,
  MAX(
    wpt_bodies_info.links_window_location +
    wpt_bodies_info.links_window_open +
    wpt_bodies_info.links_href_javascript
  ) AS max_links_any,
  -- Sum of the per-bucket page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  -- Rows in this bucket divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM same_links_info
GROUP BY client, is_root_page, wpt_bodies_info.links_same_site
ORDER BY links_same_site ASC;

Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#standardSQL
# Anchor same site occurrence stats
# This query aims to highlight sites with few same-site links, like SPAs.

# Extracts four link counters from anchors.rendered in wpt_bodies:
# same_site, plus same_page.dynamic's onclick window_location / window_open
# counts and its href_javascript count.
# Each level of the nested path is guarded, so a missing same_page, dynamic or
# onclick_attributes key no longer throws and zeroes the counters read after
# it. Missing or non-numeric values, and unparseable input, come back as 0.
CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  links_same_site INT64,
  links_window_location INT64,
  links_window_open INT64,
  links_href_javascript INT64
> LANGUAGE js AS '''
var result = {
  links_same_site: 0,
  links_window_location: 0,
  links_window_open: 0,
  links_href_javascript: 0
};

// Numeric value of v, or 0 when v is missing, NaN or zero.
function toCount(v) {
  return Number(v) || 0;
}

try {
  var wpt_bodies = JSON.parse(wpt_bodies_string);

  if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result;

  if (wpt_bodies && wpt_bodies.anchors && wpt_bodies.anchors.rendered) {
    var anchors_rendered = wpt_bodies.anchors.rendered;

    // Fall back to empty objects so each counter is resolved independently.
    var dynamic = (anchors_rendered.same_page && anchors_rendered.same_page.dynamic) || {};
    var onclick_attributes = dynamic.onclick_attributes || {};

    result.links_same_site = toCount(anchors_rendered.same_site);
    result.links_window_location = toCount(onclick_attributes.window_location);
    result.links_window_open = toCount(onclick_attributes.window_open);
    result.links_href_javascript = toCount(dynamic.href_javascript);
  }
} catch (e) {
  // Best effort: unparseable input keeps the zeroed defaults.
}
return result;
''';

WITH same_links_info AS (
  # One row per crawled page, labelled as home or secondary page, with the
  # link counters extracted by getLinkDesciptionsWptBodies.
  SELECT
    client,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    # wpt_bodies is read from custom_metrics rather than from
    # payload._wpt_bodies. It is serialised because the UDF parses a STRING
    # argument.
    getLinkDesciptionsWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS wpt_bodies_info
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-06-01'
)

SELECT
  client,
  wpt_bodies_info.links_same_site AS links_same_site,
  is_root_page,
  # Distinct pages that have exactly this many same-site links.
  COUNT(DISTINCT page) AS sites,
  # Rows per distinct page within the bucket.
  SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT page)) AS pct_links_same_site,
  AVG(wpt_bodies_info.links_window_location) AS avg_links_window_location,
  AVG(wpt_bodies_info.links_window_open) AS avg_links_window_open,
  AVG(wpt_bodies_info.links_href_javascript) AS avg_links_href_javascript,
  AVG(
    wpt_bodies_info.links_window_location +
    wpt_bodies_info.links_window_open +
    wpt_bodies_info.links_href_javascript
  ) AS avg_links_any,
  MAX(
    wpt_bodies_info.links_window_location +
    wpt_bodies_info.links_window_open +
    wpt_bodies_info.links_href_javascript
  ) AS max_links_any,
  # Sum of the per-bucket page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  # Rows in this bucket divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  same_links_info
GROUP BY
  client,
  is_root_page,
  wpt_bodies_info.links_same_site
ORDER BY
  links_same_site ASC;
53 changes: 53 additions & 0 deletions sql/2025/seo/content-language-2025.sql

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Returns the lower-cased, trimmed content of every meta node whose http-equiv
# is content-language. Sentinel values: "NO PAYLOAD" when the parsed input is
# an array or not an object, "NO TAG" when no matching node is found, and
# "ERROR <message>" when parsing or traversal throws.
CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_string STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
var languages = [];
try {
  var almanac = JSON.parse(almanac_string);

  if (typeof almanac != 'object' || Array.isArray(almanac)) return ["NO PAYLOAD"];

  var metaNodes = almanac && almanac["meta-nodes"];
  var nodes = metaNodes && metaNodes.nodes;
  if (nodes && nodes.filter) {
    var tagged = nodes.filter(function (node) {
      var httpEquiv = node["http-equiv"];
      return httpEquiv && httpEquiv.toLowerCase().trim() == 'content-language' && node.content;
    });
    languages = tagged.map(function (node) {
      return node.content.toLowerCase().trim();
    });
  }

  if (languages.length === 0) {
    languages.push("NO TAG");
  }
} catch (e) {
  // Surfaces payload validity problems in the results instead of hiding them.
  languages.push("ERROR " + e);
}
return languages;
''';
WITH content_language_usage AS (
  # One row per crawled page, labelled as home or secondary page, with the
  # content-language values returned by getContentLanguagesAlmanac.
  SELECT
    client,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    # Custom metrics are read from custom_metrics instead of the payload
    # '$._almanac' path. Serialised because the UDF parses a STRING argument.
    # NOTE(review): assumes the almanac metric lives under
    # custom_metrics.other -- confirm against the crawl.pages schema.
    getContentLanguagesAlmanac(TO_JSON_STRING(custom_metrics.other.almanac)) AS content_languages
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-06-01'
)

SELECT
  client,
  is_root_page,
  content_language,
  # Distinct pages reporting this content-language value (or sentinel).
  COUNT(DISTINCT page) AS sites,
  # Sum of the per-value page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  # Rows for this value divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  content_language_usage
CROSS JOIN
  UNNEST(content_languages) AS content_language
GROUP BY
  client,
  is_root_page,
  content_language
ORDER BY
  sites DESC,
  client DESC;
45 changes: 45 additions & 0 deletions sql/2025/seo/core-web-vitals-2025.sql

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# TRUE when the "good" share of the three buckets is at least 75%.
# Returns NULL (instead of raising a division-by-zero error) when the three
# buckets sum to zero, and NULL when any bucket is NULL.
CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
  SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);
# TRUE when the three buckets sum to more than zero; NULL if any is NULL.
CREATE TEMP FUNCTION IS_NON_ZERO(
  good FLOAT64,
  needs_improvement FLOAT64,
  poor FLOAT64
) RETURNS BOOL AS (
  (good + needs_improvement + poor) > 0
);
# Share of origins passing each Core Web Vital, per month and device.
SELECT
  date,
  device,
  # Origins with good LCP divided by origins with any LCP.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))
  ) AS pct_good_lcp,
  # Origins with good FID divided by origins with any FID.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))
  ) AS pct_good_fid,
  # Origins with good CLS divided by origins with any CLS.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))
  ) AS pct_good_cls,
  # Origins with good LCP, FID, and CLS divided by origins with any LCP and CLS.
  # FID is optional: IS NOT FALSE lets an origin with a NULL FID result pass.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(
      IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
      IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND
      IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL
    )),
    COUNT(DISTINCT IF(
      IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND
      IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL
    ))
  ) AS pct_good_cwv,
  # Origins with good INP divided by origins with any INP. INP replaced FID as
  # the responsiveness Core Web Vital, so it is reported alongside the FID
  # columns, which are kept for the historical range.
  # NOTE(review): assumes device_summary exposes fast_inp / avg_inp / slow_inp
  # -- confirm against the table schema.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)),
    COUNT(DISTINCT IF(IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))
  ) AS pct_good_inp,
  # Same as pct_good_cwv, but with INP as the optional responsiveness metric.
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(
      IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND
      IS_GOOD(fast_inp, avg_inp, slow_inp) IS NOT FALSE AND
      IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL
    )),
    COUNT(DISTINCT IF(
      IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND
      IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL
    ))
  ) AS pct_good_cwv_inp
FROM
  `chrome-ux-report.materialized.device_summary`
WHERE
  date BETWEEN '2019-11-01' AND '2025-06-01' AND
  device IN ('desktop', 'phone')
GROUP BY
  date,
  device
ORDER BY
  date DESC,
  device
60 changes: 60 additions & 0 deletions sql/2025/seo/hreflang-header-usage-2025.sql

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Query returns no data

Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#standardSQL
# hreflang header usage

# Returns all the data we need from _wpt_bodies
# Returns the values found at hreflangs.http_header.values in wpt_bodies, or an
# empty array when the path is absent or the input cannot be processed.
CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  hreflangs ARRAY<STRING>
> LANGUAGE js AS '''
var output = {hreflangs: []};

try {
  var parsed = JSON.parse(wpt_bodies_string);
  var isPlainObject = typeof parsed == 'object' && !Array.isArray(parsed);
  if (!isPlainObject) return output;

  var header = parsed.hreflangs && parsed.hreflangs.http_header;
  if (header && header.values) {
    // Identity map kept on purpose: copying the array seems to fix a coercion issue.
    output.hreflangs = header.values.map(function (value) {
      return value;
    });
  }
} catch (e) {
  // Best effort: keep the empty default.
}
return output;
''';

WITH hreflang_usage AS (
  # One row per crawled page, labelled as home or secondary page, with the
  # hreflang values returned by getHreflangWptBodies.
  SELECT
    client,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    # wpt_bodies is read from custom_metrics rather than from
    # payload._wpt_bodies, which returned no data for this crawl. It is
    # serialised because the UDF parses a STRING argument.
    getHreflangWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS hreflang_wpt_bodies_info
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-06-01'
)

SELECT
  client,
  is_root_page,
  NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang,
  # Distinct pages carrying this hreflang value.
  COUNT(DISTINCT page) AS sites,
  # Sum of the per-value page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  # Rows for this value divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  hreflang_usage
CROSS JOIN
  UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang
GROUP BY
  hreflang,
  client,
  is_root_page
ORDER BY
  sites DESC,
  client DESC;
58 changes: 58 additions & 0 deletions sql/2025/seo/hreflang-link-tag-usage-2025.sql

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Query returns no data

Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#standardSQL
# hreflang link tag usage

# Returns all the data we need from _wpt_bodies
# Returns the values found at hreflangs.rendered.values in wpt_bodies, or an
# empty array when the path is absent or the input cannot be processed.
CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING)
RETURNS STRUCT<
  hreflangs ARRAY<STRING>
> LANGUAGE js AS '''
var output = {hreflangs: []};

try {
  var parsed = JSON.parse(wpt_bodies_string);
  var isPlainObject = typeof parsed == 'object' && !Array.isArray(parsed);
  if (!isPlainObject) return output;

  var rendered = parsed.hreflangs && parsed.hreflangs.rendered;
  if (rendered && rendered.values) {
    // Identity map kept on purpose: copying the array seems to fix a coercion issue.
    output.hreflangs = rendered.values.map(function (value) {
      return value;
    });
  }
} catch (e) {
  // Best effort: keep the empty default.
}
return output;
''';

WITH link_tag AS (
  # One row per crawled page, labelled as home or secondary page, with the
  # hreflang values returned by getHreflangWptBodies.
  SELECT
    client,
    page,
    CASE
      WHEN is_root_page = FALSE THEN 'Secondarypage'
      WHEN is_root_page = TRUE THEN 'Homepage'
      ELSE 'No Assigned Page'
    END AS is_root_page,
    # wpt_bodies is read from custom_metrics rather than from
    # payload._wpt_bodies, which returned no data for this crawl. It is
    # serialised because the UDF parses a STRING argument.
    getHreflangWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS hreflang_wpt_bodies_info
  FROM
    `httparchive.crawl.pages`
  WHERE
    date = '2025-06-01'
)

SELECT
  client,
  is_root_page,
  NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang,
  # Distinct pages carrying this hreflang value.
  COUNT(DISTINCT page) AS sites,
  # Sum of the per-value page counts within the client / page-type partition.
  SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total,
  # Rows for this value divided by that partition total.
  COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct
FROM
  link_tag
CROSS JOIN
  UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang
GROUP BY
  hreflang,
  is_root_page,
  client
# Most-used values first, matching the hreflang header query's ordering.
ORDER BY
  sites DESC,
  client DESC;
Loading