-
-
Notifications
You must be signed in to change notification settings - Fork 199
SEO queries 2025 #4186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
SEO queries 2025 #4186
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| #standardSQL | ||
| # Anchor rel attribute usage | ||
| # This query reports if a rel attribute value was ever used on a page, and calculates various statistics. | ||
|
|
||
| # Extracts the rel attribute values recorded for rendered anchors in the wpt_bodies JSON string. | ||
| # Returns STRUCT<rel ARRAY<STRING>>; rel stays empty when the input cannot be parsed or has no rel_attributes. | ||
| CREATE TEMPORARY FUNCTION getRelStatsWptBodies(wpt_bodies_string STRING) | ||
| RETURNS STRUCT< | ||
| rel ARRAY<STRING> | ||
| > LANGUAGE js AS ''' | ||
| var output = {rel: []}; | ||
| // Keep only the keys whose value is greater than zero. | ||
| function keysWithPositiveValue(counts) { | ||
| return Object.keys(counts).filter(function (key) { | ||
| return counts[key] > 0; | ||
| }); | ||
| } | ||
| try { | ||
| var parsed = JSON.parse(wpt_bodies_string); | ||
| // Arrays and non-object values do not carry the expected structure. | ||
| if (Array.isArray(parsed) || typeof parsed != 'object') return output; | ||
| var rendered = parsed.anchors && parsed.anchors.rendered; | ||
| if (rendered && rendered.rel_attributes) { | ||
| output.rel = keysWithPositiveValue(rendered.rel_attributes); | ||
| } | ||
| } catch (e) { | ||
| // Best effort: any parse or lookup failure yields the empty rel list. | ||
| } | ||
| return output; | ||
| '''; | ||
|
|
||
| # rel_stats_table: one row per crawled page with the rel values returned by getRelStatsWptBodies. | ||
| WITH rel_stats_table AS ( | ||
| SELECT | ||
| client, | ||
| root_page, | ||
| page, | ||
| CASE | ||
| WHEN is_root_page = FALSE THEN 'Secondarypage' | ||
| WHEN is_root_page = TRUE THEN 'Homepage' | ||
| ELSE 'No Assigned Page' | ||
| END | ||
| AS is_root_page, | ||
| # wpt_bodies is read from the custom_metrics STRUCT (a JSON column) and serialised for the JS UDF; | ||
| # the previous JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies') lookup returned no data. | ||
| getRelStatsWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS wpt_bodies_info | ||
| FROM | ||
| `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-06-01' | ||
| ) | ||
|
|
||
| # Per client, page type and rel value: | ||
| #   sites = distinct pages using the rel value | ||
| #   total = sum of the per-rel page counts within the client/page-type partition | ||
| #   pct   = rows for this rel value divided by that total | ||
| SELECT | ||
| client, | ||
| is_root_page, | ||
| rel, | ||
| COUNT(DISTINCT page) AS sites, | ||
| SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total, | ||
| COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct | ||
| FROM | ||
| rel_stats_table, | ||
| UNNEST(wpt_bodies_info.rel) AS rel | ||
| GROUP BY | ||
| client, | ||
| is_root_page, | ||
| rel | ||
| ORDER BY | ||
| sites DESC, | ||
| rel, | ||
| client DESC; |
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Outputs, but missing data - sampled output - https://docs.google.com/spreadsheets/d/1_doMqIEhTr7mYpKOWaJ8pxXIJ3L_A5tD5uKoYDnDdEs/edit?usp=sharing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed, SQL, new sampled output - https://docs.google.com/spreadsheets/d/1MdX8mhyuuz5vPyiHq4BSQ0S1Tf2dytMcQFsNUt5dMqA/edit?usp=sharing |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| #standardSQL | ||
| # Anchor same site occurrence stats | ||
| # This query aims to highlight sites with few same-site links, like SPAs. | ||
|
|
||
| # Reads the same-site link count and the dynamic-link counters for rendered anchors from the wpt_bodies JSON string. | ||
| # Returns a STRUCT of four INT64 counters; each one is 0 when the input cannot be parsed or the field is absent. | ||
| CREATE TEMPORARY FUNCTION getLinkDesciptionsWptBodies(wpt_bodies_string STRING) | ||
| RETURNS STRUCT< | ||
| links_same_site INT64, | ||
| links_window_location INT64, | ||
| links_window_open INT64, | ||
| links_href_javascript INT64 | ||
| > LANGUAGE js AS ''' | ||
| var result = { | ||
| links_same_site: 0, | ||
| links_window_location: 0, | ||
| links_window_open: 0, | ||
| links_href_javascript: 0 | ||
| }; | ||
| try { | ||
| var wpt_bodies = JSON.parse(wpt_bodies_string); | ||
|
|
||
| if (Array.isArray(wpt_bodies) || typeof wpt_bodies != 'object') return result; | ||
|
|
||
| if (wpt_bodies.anchors && wpt_bodies.anchors.rendered) { | ||
| var anchors_rendered = wpt_bodies.anchors.rendered; | ||
| // Resolve each nesting level defensively. A missing same_page, dynamic or onclick_attributes | ||
| // object used to throw, and the swallowed error left every later counter at 0. | ||
| var dynamic_links = (anchors_rendered.same_page && anchors_rendered.same_page.dynamic) || {}; | ||
| var onclick_attributes = dynamic_links.onclick_attributes || {}; | ||
|
|
||
| result.links_same_site = anchors_rendered.same_site || 0; | ||
| result.links_window_location = onclick_attributes.window_location || 0; | ||
| result.links_window_open = onclick_attributes.window_open || 0; | ||
| result.links_href_javascript = dynamic_links.href_javascript || 0; | ||
| } | ||
|
|
||
| } catch (e) { | ||
| // Best effort: unparsable input keeps the zeroed defaults. | ||
| } | ||
| return result; | ||
| '''; | ||
|
|
||
| # same_links_info: one row per crawled page with the link counters returned by getLinkDesciptionsWptBodies. | ||
| WITH same_links_info AS ( | ||
| SELECT | ||
| client, | ||
| root_page, | ||
| page, | ||
| CASE | ||
| WHEN is_root_page = FALSE THEN 'Secondarypage' | ||
| WHEN is_root_page = TRUE THEN 'Homepage' | ||
| ELSE 'No Assigned Page' | ||
| END | ||
| AS is_root_page, | ||
| # wpt_bodies is read from the custom_metrics STRUCT (a JSON column) and serialised for the JS UDF; | ||
| # the previous JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies') lookup returned no data. | ||
| getLinkDesciptionsWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS wpt_bodies_info | ||
| FROM | ||
| `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-06-01' | ||
| ) | ||
|
|
||
| # Groups pages by client, page type and their exact links_same_site count. | ||
| SELECT | ||
| client, | ||
| wpt_bodies_info.links_same_site AS links_same_site, | ||
| is_root_page, | ||
| COUNT(DISTINCT page) AS sites, -- Distinct pages that have this links_same_site value | ||
| SAFE_DIVIDE(COUNT(0), COUNT(DISTINCT page)) AS pct_links_same_site, -- Rows per distinct page in the group (a ratio, despite the column name) | ||
| AVG(wpt_bodies_info.links_window_location) AS avg_links_window_location, | ||
| AVG(wpt_bodies_info.links_window_open) AS avg_links_window_open, | ||
| AVG(wpt_bodies_info.links_href_javascript) AS avg_links_href_javascript, | ||
| AVG(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS avg_links_any, | ||
| MAX(wpt_bodies_info.links_window_location + wpt_bodies_info.links_window_open + wpt_bodies_info.links_href_javascript) AS max_links_any, | ||
| SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total, -- Pages across all links_same_site values for this client/page type | ||
| COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct -- Share of this group within its client/page-type partition | ||
| FROM | ||
| same_links_info | ||
| GROUP BY | ||
| client, | ||
| is_root_page, | ||
| wpt_bodies_info.links_same_site | ||
| ORDER BY | ||
| links_same_site ASC; |
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| # Returns the lower-cased, trimmed content of every meta node whose http-equiv is content-language. | ||
| # Sentinel values: "NO PAYLOAD" when the input is null, an array or not an object; "NO TAG" when no | ||
| # matching meta node exists; "ERROR <message>" when parsing or processing throws. | ||
| CREATE TEMPORARY FUNCTION getContentLanguagesAlmanac(almanac_string STRING) | ||
| RETURNS ARRAY<STRING> | ||
| LANGUAGE js AS ''' | ||
| var result = []; | ||
| try { | ||
| var almanac = JSON.parse(almanac_string); | ||
|
|
||
| // typeof null is 'object', so null must be rejected explicitly; otherwise a missing payload | ||
| // falls through and is misreported as "NO TAG". | ||
| if (almanac === null || Array.isArray(almanac) || typeof almanac != 'object') return ["NO PAYLOAD"]; | ||
|
|
||
| if (almanac["meta-nodes"] && almanac["meta-nodes"].nodes && almanac["meta-nodes"].nodes.filter) { | ||
| result = almanac["meta-nodes"].nodes.filter(n => n["http-equiv"] && n["http-equiv"].toLowerCase().trim() == 'content-language' && n.content).map(am => am.content.toLowerCase().trim()); | ||
| } | ||
|
|
||
| if (result.length === 0) | ||
| result.push("NO TAG"); | ||
|
|
||
| } catch (e) {result.push("ERROR "+e);} // results show some issues with the validity of the payload | ||
| return result; | ||
| '''; | ||
| # content_language_usage: one row per crawled page with the values returned by getContentLanguagesAlmanac. | ||
| # NOTE(review): confirm that payload still exposes '$._almanac' on this table for the 2025-06-01 crawl. | ||
| WITH content_language_usage AS ( | ||
| SELECT | ||
| client, | ||
| root_page, | ||
| page, | ||
| CASE is_root_page | ||
| WHEN FALSE THEN 'Secondarypage' | ||
| WHEN TRUE THEN 'Homepage' | ||
| ELSE 'No Assigned Page' | ||
| END AS is_root_page, | ||
| getContentLanguagesAlmanac(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS content_languages | ||
| FROM | ||
| `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-06-01' | ||
| ) | ||
|
|
||
| # Per client, page type and content-language value: distinct pages, the partition total, and the share. | ||
| SELECT | ||
| client, | ||
| is_root_page, | ||
| content_language, | ||
| COUNT(DISTINCT page) AS sites, | ||
| SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total, | ||
| COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct | ||
| FROM | ||
| content_language_usage | ||
| CROSS JOIN | ||
| UNNEST(content_languages) AS content_language | ||
| GROUP BY | ||
| content_language, | ||
| is_root_page, | ||
| client | ||
| ORDER BY | ||
| sites DESC, | ||
| client DESC; |
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| # TRUE when the good bucket is at least 75% of the three buckets combined. | ||
| # SAFE_DIVIDE yields NULL instead of a division-by-zero error when all three buckets are 0. | ||
| CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( | ||
| SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75 | ||
| ); | ||
| # TRUE when the three buckets add up to more than zero. | ||
| CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( | ||
| 0 < good + needs_improvement + poor | ||
| ); | ||
| # Per date and device: the fraction of distinct origins that satisfy IS_GOOD among those with non-zero data. | ||
| SELECT | ||
| date, | ||
| device, | ||
| # Origins with good LCP divided by origins with any LCP. | ||
| SAFE_DIVIDE( | ||
| COUNT(DISTINCT IF(IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), | ||
| COUNT(DISTINCT IF(IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL)) | ||
| ) AS pct_good_lcp, | ||
| # Origins with good FID divided by origins with any FID. | ||
| SAFE_DIVIDE( | ||
| COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)), | ||
| COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL)) | ||
| ) AS pct_good_fid, | ||
| # Origins with good CLS divided by origins with any CLS. | ||
| SAFE_DIVIDE( | ||
| COUNT(DISTINCT IF(IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), | ||
| COUNT(DISTINCT IF(IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL)) | ||
| ) AS pct_good_cwv_placeholder_never_emitted, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Query returns no data |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| #standardSQL | ||
| # hreflang header usage | ||
|
|
||
| # Pulls the hreflang values found in the HTTP header section of the wpt_bodies JSON string. | ||
| # Returns STRUCT<hreflangs ARRAY<STRING>>; the array is empty when the input cannot be parsed or has no values. | ||
| CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING) | ||
| RETURNS STRUCT< | ||
| hreflangs ARRAY<STRING> | ||
| > LANGUAGE js AS ''' | ||
| var output = { | ||
| hreflangs: [] | ||
| }; | ||
|
|
||
| try { | ||
| var parsed = JSON.parse(wpt_bodies_string); | ||
|
|
||
| // Arrays and non-object values do not carry the expected structure. | ||
| if (Array.isArray(parsed) || typeof parsed != 'object') return output; | ||
|
|
||
| var header = parsed.hreflangs && parsed.hreflangs.http_header; | ||
| if (header && header.values) { | ||
| // Copy element by element; the original author notes this seems to fix a coercion issue. | ||
| output.hreflangs = header.values.map(function (value) { | ||
| return value; | ||
| }); | ||
| } | ||
|
|
||
| } catch (e) { | ||
| // Best effort: any parse or lookup failure yields the empty list. | ||
| } | ||
| return output; | ||
| '''; | ||
|
|
||
| # hreflang_usage: one row per crawled page with the HTTP-header hreflang values returned by getHreflangWptBodies. | ||
| WITH hreflang_usage AS ( | ||
| SELECT | ||
| client, | ||
| root_page, | ||
| page, | ||
| CASE | ||
| WHEN is_root_page = FALSE THEN 'Secondarypage' | ||
| WHEN is_root_page = TRUE THEN 'Homepage' | ||
| ELSE 'No Assigned Page' | ||
| END AS is_root_page, | ||
| # wpt_bodies is read from the custom_metrics STRUCT (a JSON column) and serialised for the JS UDF; | ||
| # the previous JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies') lookup returned no data. | ||
| getHreflangWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS hreflang_wpt_bodies_info | ||
| FROM | ||
| `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-06-01' | ||
| ) | ||
|
|
||
| # Per client, page type and normalised hreflang value: distinct pages, the partition total, and the share. | ||
| SELECT | ||
| client, | ||
| is_root_page, | ||
| NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang, | ||
| COUNT(DISTINCT page) AS sites, | ||
| SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total, | ||
| COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct | ||
| FROM | ||
| hreflang_usage, | ||
| UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang | ||
| GROUP BY | ||
| hreflang, | ||
| client, | ||
| is_root_page | ||
| ORDER BY | ||
| sites DESC, | ||
| client DESC; |
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Query returns no data |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| #standardSQL | ||
| # hreflang link tag usage | ||
|
|
||
| # Pulls the hreflang values found in the rendered section of the wpt_bodies JSON string. | ||
| # Returns STRUCT<hreflangs ARRAY<STRING>>; the array is empty when the input cannot be parsed or has no values. | ||
| CREATE TEMPORARY FUNCTION getHreflangWptBodies(wpt_bodies_string STRING) | ||
| RETURNS STRUCT< | ||
| hreflangs ARRAY<STRING> | ||
| > LANGUAGE js AS ''' | ||
| var output = { | ||
| hreflangs: [] | ||
| }; | ||
|
|
||
| try { | ||
| var parsed = JSON.parse(wpt_bodies_string); | ||
|
|
||
| // Arrays and non-object values do not carry the expected structure. | ||
| if (Array.isArray(parsed) || typeof parsed != 'object') return output; | ||
|
|
||
| var rendered = parsed.hreflangs && parsed.hreflangs.rendered; | ||
| if (rendered && rendered.values) { | ||
| // Copy element by element; the original author notes this seems to fix a coercion issue. | ||
| output.hreflangs = rendered.values.map(function (value) { | ||
| return value; | ||
| }); | ||
| } | ||
|
|
||
| } catch (e) { | ||
| // Best effort: any parse or lookup failure yields the empty list. | ||
| } | ||
| return output; | ||
| '''; | ||
|
|
||
| # link_tag: one row per crawled page with the rendered hreflang values returned by getHreflangWptBodies. | ||
| WITH link_tag AS ( | ||
| SELECT | ||
| client, | ||
| root_page, | ||
| page, | ||
| CASE | ||
| WHEN is_root_page = FALSE THEN 'Secondarypage' | ||
| WHEN is_root_page = TRUE THEN 'Homepage' | ||
| ELSE 'No Assigned Page' | ||
| END AS is_root_page, | ||
| # wpt_bodies is read from the custom_metrics STRUCT (a JSON column) and serialised for the JS UDF; | ||
| # the previous JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies') lookup returned no data. | ||
| getHreflangWptBodies(TO_JSON_STRING(custom_metrics.wpt_bodies)) AS hreflang_wpt_bodies_info | ||
| FROM | ||
| `httparchive.crawl.pages` | ||
| WHERE | ||
| date = '2025-06-01' | ||
| ) | ||
|
|
||
| # Per client, page type and normalised hreflang value: distinct pages, the partition total, and the share. | ||
| SELECT | ||
| client, | ||
| is_root_page, | ||
| NORMALIZE_AND_CASEFOLD(hreflang) AS hreflang, | ||
| COUNT(DISTINCT page) AS sites, | ||
| SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS total, | ||
| COUNT(0) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client, is_root_page) AS pct | ||
| FROM | ||
| link_tag, | ||
| UNNEST(hreflang_wpt_bodies_info.hreflangs) AS hreflang | ||
| GROUP BY | ||
| hreflang, | ||
| is_root_page, | ||
| client | ||
| ORDER BY | ||
| client DESC; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Query returning no data
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now works, new SQL:
Output - new sample - https://docs.google.com/spreadsheets/d/1mBFI6sXDuqP72No4VpZUv-5xBI8U_AUdgzmmz_c9FDY/edit?usp=sharing
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Key differences (old vs new)
Source location
`JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies')` → `TO_JSON_STRING(JSON_QUERY(TO_JSON(custom_metrics), '$.wpt_bodies'))`. `_wpt_bodies` moved from inside `payload` (JSON string) to `custom_metrics` (STRUCT).
Type handling
`JSON_QUERY` now returns JSON, so wrap with `TO_JSON_STRING(...)` before passing to JS UDFs.
JSON path stability
`wpt_bodies.anchors.rendered.rel_attributes` → `wpt_bodies.anchors.raw.rel_attributes`
Other points
`REGEXP_CONTAINS` on JSON → wrap in `TO_JSON_STRING(...)`. `COUNTIF(...)`. `custom_metrics` is a STRUCT → convert with `TO_JSON(...)` before pathing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This:
`TO_JSON_STRING(JSON_QUERY(TO_JSON(custom_metrics), '$.wpt_bodies'))` can probably be simplified to this:
the beauty of the new JSON type columns is you can reference them directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can change these to expect JSON input instead, to save you converting to a string before passing it and then converting back to JSON in the UDF.