diff --git a/sql/2025/sustainability/cache_header_usage.sql b/sql/2025/sustainability/cache_header_usage.sql
new file mode 100644
index 00000000000..1c492dfd0ba
--- /dev/null
+++ b/sql/2025/sustainability/cache_header_usage.sql
@@ -0,0 +1,57 @@
+#standardSQL
+# The distribution of cache header adoption on websites by client.
+# Header names are compared case-insensitively (HTTP field names are
+# case-insensitive), so `Cache-Control` and `cache-control` both count.
+
+SELECT
+  client,
+  COUNT(0) AS total_requests,
+
+  COUNTIF(uses_cache_control) AS total_using_cache_control,
+  COUNTIF(uses_max_age) AS total_using_max_age,
+  COUNTIF(uses_expires) AS total_using_expires,
+  COUNTIF(uses_max_age AND uses_expires) AS total_using_max_age_and_expires,
+  COUNTIF(uses_cache_control AND uses_expires) AS total_using_both_cc_and_expires,
+  COUNTIF(NOT uses_cache_control AND NOT uses_expires) AS total_using_neither_cc_and_expires,
+  COUNTIF(uses_cache_control AND NOT uses_expires) AS total_using_only_cache_control,
+  COUNTIF(NOT uses_cache_control AND uses_expires) AS total_using_only_expires,
+
+  COUNTIF(uses_cache_control) / COUNT(0) AS pct_cache_control,
+  COUNTIF(uses_max_age) / COUNT(0) AS pct_using_max_age,
+  COUNTIF(uses_expires) / COUNT(0) AS pct_using_expires,
+  COUNTIF(uses_max_age AND uses_expires) / COUNT(0) AS pct_using_max_age_and_expires,
+  COUNTIF(uses_cache_control AND uses_expires) / COUNT(0) AS pct_using_both_cc_and_expires,
+  COUNTIF(NOT uses_cache_control AND NOT uses_expires) / COUNT(0) AS pct_using_neither_cc_nor_expires,
+  COUNTIF(uses_cache_control AND NOT uses_expires) / COUNT(0) AS pct_using_only_cache_control,
+  COUNTIF(NOT uses_cache_control AND uses_expires) / COUNT(0) AS pct_using_only_expires
+
+FROM (
+  # One row per (client, url): does any response header on it match?
+  SELECT
+    client,
+    url,
+    LOGICAL_OR(LOWER(header.name) = 'expires' AND header.value IS NOT NULL AND TRIM(header.value) != '') AS uses_expires,
+    LOGICAL_OR(LOWER(header.name) = 'cache-control' AND header.value IS NOT NULL AND TRIM(header.value) != '') AS uses_cache_control,
+    LOGICAL_OR(LOWER(header.name) = 'cache-control' AND REGEXP_CONTAINS(header.value, r'(?i)max-age\s*=\s*[0-9]+')) AS uses_max_age,
+
+    LOGICAL_OR(LOWER(header.name) = 'etag' AND (header.value IS NULL OR TRIM(header.value) = '')) AS uses_no_etag,
+    LOGICAL_OR(LOWER(header.name) = 'etag' AND header.value IS NOT NULL AND TRIM(header.value) != '') AS uses_etag,
+    LOGICAL_OR(LOWER(header.name) = 'last-modified' AND header.value IS NOT NULL AND TRIM(header.value) != '') AS uses_last_modified,
+
+    LOGICAL_OR(LOWER(header.name) = 'etag' AND REGEXP_CONTAINS(TRIM(header.value), '^W/".*"')) AS uses_weak_etag,
+    LOGICAL_OR(LOWER(header.name) = 'etag' AND REGEXP_CONTAINS(TRIM(header.value), '^".*"')) AS uses_strong_etag
+
+  FROM
+    `httparchive.crawl.requests`,
+    UNNEST(response_headers) AS header
+  WHERE
+    date = '2025-07-01'
+  GROUP BY
+    client,
+    url
+)
+
+GROUP BY
+  client
+ORDER BY
+  client;
diff --git a/sql/2025/sustainability/cdn_adoption.sql b/sql/2025/sustainability/cdn_adoption.sql
new file mode 100644
index 00000000000..2e3676586e5
--- /dev/null
+++ b/sql/2025/sustainability/cdn_adoption.sql
@@ -0,0 +1,33 @@
+#standardSQL
+# The distribution of CDN adoption on websites by client.
+
+SELECT
+  client,
+  total,
+  IF(cdn = '', 'No CDN', cdn) AS cdn,
+  COUNT(0) AS freq,
+  ROUND(100 * COUNT(0) / total, 2) AS pct
+FROM (
+  SELECT
+    client,
+    COUNT(0) AS total,
+    ARRAY_CONCAT_AGG(
+      SPLIT(JSON_EXTRACT_SCALAR(summary, '$.cdn'), ', ')
+    ) AS cdn_list
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page = TRUE
+  GROUP BY
+    client
+),
+  UNNEST(cdn_list) AS cdn
+GROUP BY
+  client,
+  cdn,
+  total
+ORDER BY
+  pct DESC,
+  client ASC,
+  cdn ASC;
diff --git a/sql/2025/sustainability/cms_bytes_per_type.sql b/sql/2025/sustainability/cms_bytes_per_type.sql
new file mode 100644
index 00000000000..5dc9b5a650d
--- /dev/null
+++ b/sql/2025/sustainability/cms_bytes_per_type.sql
@@ -0,0 +1,140 @@
+#standardSQL
+# Median resource weights by CMS
+
+# Declare variables to calculate the carbon emissions of transferred bytes:
+# the per-segment factors are in kWh/GB, grid_intensity is in gCO2/kWh.
+# Source: https://sustainablewebdesign.org/calculating-digital-emissions/
+
+DECLARE grid_intensity NUMERIC DEFAULT 494;
+DECLARE embodied_emissions_data_centers NUMERIC DEFAULT 0.012;
+DECLARE embodied_emissions_network NUMERIC DEFAULT 0.013;
+DECLARE embodied_emissions_user_devices NUMERIC DEFAULT 0.081;
+DECLARE operational_emissions_data_centers NUMERIC DEFAULT 0.055;
+DECLARE operational_emissions_network NUMERIC DEFAULT 0.059;
+DECLARE operational_emissions_user_devices NUMERIC DEFAULT 0.080;
+
+WITH page_bytes AS (
+  -- Parse the byte counters once per (page, CMS) row
+  SELECT
+    client,
+    page,
+    tech.technology AS cms,
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytes_total,
+    CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64) AS bytes_html,
+    CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytes_js,
+    CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64) AS bytes_css,
+    CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytes_img,
+    CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64) AS bytes_font,
+    -- Page weight in GB, the unit the per-segment factors are expressed in
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024 AS total_gb,
+    -- Emissions per GB summed over all six segments (operational + embodied)
+    (
+      operational_emissions_data_centers * grid_intensity +
+      operational_emissions_network * grid_intensity +
+      operational_emissions_user_devices * grid_intensity +
+      embodied_emissions_data_centers * grid_intensity +
+      embodied_emissions_network * grid_intensity +
+      embodied_emissions_user_devices * grid_intensity
+    ) AS emissions_per_gb
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST(technologies) AS tech
+  WHERE
+    date = '2025-07-01' AND
+    is_root_page = TRUE AND
+    'CMS' IN UNNEST(tech.categories)
+),
+
+cms_data AS (
+  SELECT
+    client,
+    page,
+    cms,
+    bytes_total / 1024 AS total_kb,
+
+    -- Operational emissions calculations
+    total_gb * operational_emissions_data_centers * grid_intensity AS op_emissions_dc,
+    total_gb * operational_emissions_network * grid_intensity AS op_emissions_networks,
+    total_gb * operational_emissions_user_devices * grid_intensity AS op_emissions_devices,
+
+    -- Embodied emissions calculations
+    total_gb * embodied_emissions_data_centers * grid_intensity AS em_emissions_dc,
+    total_gb * embodied_emissions_network * grid_intensity AS em_emissions_networks,
+    total_gb * embodied_emissions_user_devices * grid_intensity AS em_emissions_devices,
+
+    -- Total emissions (operational + embodied)
+    (
+      total_gb * operational_emissions_data_centers * grid_intensity +
+      total_gb * operational_emissions_network * grid_intensity +
+      total_gb * operational_emissions_user_devices * grid_intensity
+    ) AS total_operational_emissions,
+
+    (
+      total_gb * embodied_emissions_data_centers * grid_intensity +
+      total_gb * embodied_emissions_network * grid_intensity +
+      total_gb * embodied_emissions_user_devices * grid_intensity
+    ) AS total_embodied_emissions,
+
+    (
+      total_gb * operational_emissions_data_centers * grid_intensity +
+      total_gb * operational_emissions_network * grid_intensity +
+      total_gb * operational_emissions_user_devices * grid_intensity +
+      total_gb * embodied_emissions_data_centers * grid_intensity +
+      total_gb * embodied_emissions_network * grid_intensity +
+      total_gb * embodied_emissions_user_devices * grid_intensity
+    ) AS total_emissions,
+
+    -- Proportions of each resource type; SAFE_DIVIDE gives NULL instead of an error when bytes_total is 0
+    SAFE_DIVIDE(bytes_html, bytes_total) AS html_proportion,
+    SAFE_DIVIDE(bytes_js, bytes_total) AS js_proportion,
+    SAFE_DIVIDE(bytes_css, bytes_total) AS css_proportion,
+    SAFE_DIVIDE(bytes_img, bytes_total) AS img_proportion,
+    SAFE_DIVIDE(bytes_font, bytes_total) AS font_proportion,
+
+    -- Resource-specific emissions: share of the page's bytes times the page's total emissions
+    SAFE_DIVIDE(bytes_html, bytes_total) * (total_gb * emissions_per_gb) AS total_html_emissions,
+    SAFE_DIVIDE(bytes_js, bytes_total) * (total_gb * emissions_per_gb) AS total_js_emissions,
+    SAFE_DIVIDE(bytes_css, bytes_total) * (total_gb * emissions_per_gb) AS total_css_emissions,
+    SAFE_DIVIDE(bytes_img, bytes_total) * (total_gb * emissions_per_gb) AS total_img_emissions,
+    SAFE_DIVIDE(bytes_font, bytes_total) * (total_gb * emissions_per_gb) AS total_font_emissions,
+
+    -- Resource-specific size in KB
+    bytes_html / 1024 AS html_kb,
+    bytes_js / 1024 AS js_kb,
+    bytes_css / 1024 AS css_kb,
+    bytes_img / 1024 AS img_kb,
+    bytes_font / 1024 AS font_kb
+  FROM
+    page_bytes
+)
+
+SELECT
+  client,
+  cms,
+  COUNT(0) AS pages,
+  -- Median resource weights and emissions
+  APPROX_QUANTILES(total_kb, 1000)[OFFSET(500)] AS median_total_kb,
+  APPROX_QUANTILES(total_operational_emissions, 1000)[OFFSET(500)] AS median_operational_emissions,
+  APPROX_QUANTILES(total_embodied_emissions, 1000)[OFFSET(500)] AS median_embodied_emissions,
+  APPROX_QUANTILES(total_emissions, 1000)[OFFSET(500)] AS median_total_emissions,
+
+  -- Resource-specific medians
+  APPROX_QUANTILES(html_kb, 1000)[OFFSET(500)] AS median_html_kb,
+  APPROX_QUANTILES(total_html_emissions, 1000)[OFFSET(500)] AS median_total_html_emissions,
+  APPROX_QUANTILES(js_kb, 1000)[OFFSET(500)] AS median_js_kb,
+  APPROX_QUANTILES(total_js_emissions, 1000)[OFFSET(500)] AS median_total_js_emissions,
+  APPROX_QUANTILES(css_kb, 1000)[OFFSET(500)] AS median_css_kb,
+  APPROX_QUANTILES(total_css_emissions, 1000)[OFFSET(500)] AS median_total_css_emissions,
+  APPROX_QUANTILES(img_kb, 1000)[OFFSET(500)] AS median_img_kb,
+  APPROX_QUANTILES(total_img_emissions, 1000)[OFFSET(500)] AS median_total_img_emissions,
+  APPROX_QUANTILES(font_kb, 1000)[OFFSET(500)] AS median_font_kb,
+  APPROX_QUANTILES(total_font_emissions, 1000)[OFFSET(500)] AS median_total_font_emissions
+FROM
+  cms_data
+GROUP BY
+  client,
+  cms
+ORDER BY
+  pages DESC,
+  cms,
+  client;
diff --git a/sql/2025/sustainability/content-visibility.sql b/sql/2025/sustainability/content-visibility.sql
new file mode 100644
index 00000000000..7b724c99389
--- /dev/null
+++ b/sql/2025/sustainability/content-visibility.sql
@@ -0,0 +1,74 @@
+#standardSQL
+CREATE TEMPORARY FUNCTION HASCONTENTVISIBILITY(css STRING)
+RETURNS ARRAY<STRUCT<property STRING, freq INT64>>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+try {
+  
var ast = JSON.parse(css);
+
+  let ret = {};
+
+  walkDeclarations(ast, ({property}) => {
+    // Normalize to lowercase first, then strip hacks like *property, _property etc
+    property = property.toLowerCase().replace(/[^a-z-]/g, "");
+
+    if (matches(property, 'content-visibility')) {
+      incrementByKey(ret, property);
+    }
+  });
+
+  return Object.entries(ret).map(([property, freq]) => {
+    return {property, freq};
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+WITH totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    `httparchive.crawl.parsed_css`
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page
+  GROUP BY
+    client
+),
+
+content_visibility_pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS pages_with_content_visibility
+  FROM
+    `httparchive.crawl.parsed_css`,
+    UNNEST(HASCONTENTVISIBILITY(TO_JSON_STRING(css)))
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  totals.client,
+  totals.total_pages,
+  COALESCE(
+    content_visibility_pages.pages_with_content_visibility, 0
+  ) AS pages_with_content_visibility,
+  ROUND(
+    COALESCE(
+      content_visibility_pages.pages_with_content_visibility, 0
+    ) * 100.0 / totals.total_pages,
+    2
+  ) AS pct_pages
+FROM
+  totals
+LEFT JOIN
+  content_visibility_pages
+ON totals.client = content_visibility_pages.client
+ORDER BY
+  totals.client
diff --git a/sql/2025/sustainability/ecommerce_bytes_per_type.sql b/sql/2025/sustainability/ecommerce_bytes_per_type.sql
new file mode 100644
index 00000000000..27fff6087d1
--- /dev/null
+++ b/sql/2025/sustainability/ecommerce_bytes_per_type.sql
@@ -0,0 +1,144 @@
+#standardSQL
+# Median resource weights by ecommerce platform with detailed CO2e breakdown
+# Source: https://sustainablewebdesign.org/calculating-digital-emissions/
+# Declare variables to calculate the carbon emissions per gigabyte (kWh/GB)
+
+DECLARE grid_intensity NUMERIC DEFAULT 494;
+DECLARE embodied_emissions_data_centers NUMERIC DEFAULT 0.012;
+DECLARE embodied_emissions_network NUMERIC DEFAULT 0.013;
+DECLARE embodied_emissions_user_devices NUMERIC DEFAULT 0.081;
+DECLARE operational_emissions_data_centers NUMERIC DEFAULT 0.055;
+DECLARE operational_emissions_network NUMERIC DEFAULT 0.059;
+DECLARE operational_emissions_user_devices NUMERIC DEFAULT 0.080;
+
+WITH page_bytes AS (
+  -- Parse the byte counters once per (page, ecommerce platform) row
+  SELECT
+    client,
+    page,
+    tech.technology AS ecommerce,
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytes_total,
+    CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64) AS bytes_html,
+    CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytes_js,
+    CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64) AS bytes_css,
+    CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytes_img,
+    CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64) AS bytes_font,
+    -- Page weight in GB, the unit the per-segment factors are expressed in
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024 AS total_gb,
+    -- Emissions per GB summed over all six segments (operational + embodied)
+    (
+      operational_emissions_data_centers * grid_intensity +
+      operational_emissions_network * grid_intensity +
+      operational_emissions_user_devices * grid_intensity +
+      embodied_emissions_data_centers * grid_intensity +
+      embodied_emissions_network * grid_intensity +
+      embodied_emissions_user_devices * grid_intensity
+    ) AS emissions_per_gb
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST(technologies) AS tech
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page = TRUE AND
+    EXISTS (
+      SELECT 1
+      FROM UNNEST(tech.categories) AS category
+      WHERE category = 'Ecommerce' AND
+        tech.technology NOT IN ('Cart Functionality', 'Google Analytics Enhanced eCommerce')
+    )
+),
+
+ecommerce_data AS (
+  SELECT
+    client,
+    page,
+    ecommerce,
+    bytes_total / 1024 AS total_kb,
+
+    -- Operational emissions calculations
+    total_gb * operational_emissions_data_centers * grid_intensity AS op_emissions_dc,
+    total_gb * operational_emissions_network * grid_intensity AS op_emissions_networks,
+    total_gb * operational_emissions_user_devices * grid_intensity AS op_emissions_devices,
+
+    -- Embodied emissions calculations
+    total_gb * embodied_emissions_data_centers * grid_intensity AS em_emissions_dc,
+    total_gb * embodied_emissions_network * grid_intensity AS em_emissions_networks,
+    total_gb * embodied_emissions_user_devices * grid_intensity AS em_emissions_devices,
+
+    -- Total emissions (operational + embodied)
+    (
+      total_gb * operational_emissions_data_centers * grid_intensity +
+      total_gb * operational_emissions_network * grid_intensity +
+      total_gb * operational_emissions_user_devices * grid_intensity
+    ) AS total_operational_emissions,
+
+    (
+      total_gb * embodied_emissions_data_centers * grid_intensity +
+      total_gb * embodied_emissions_network * grid_intensity +
+      total_gb * embodied_emissions_user_devices * grid_intensity
+    ) AS total_embodied_emissions,
+
+    (
+      total_gb * operational_emissions_data_centers * grid_intensity +
+      total_gb * operational_emissions_network * grid_intensity +
+      total_gb * operational_emissions_user_devices * grid_intensity +
+      total_gb * embodied_emissions_data_centers * grid_intensity +
+      total_gb * embodied_emissions_network * grid_intensity +
+      total_gb * embodied_emissions_user_devices * grid_intensity
+    ) AS total_emissions,
+
+    -- Proportions of each resource type; SAFE_DIVIDE gives NULL instead of an error when bytes_total is 0
+    SAFE_DIVIDE(bytes_html, bytes_total) AS html_proportion,
+    SAFE_DIVIDE(bytes_js, bytes_total) AS js_proportion,
+    SAFE_DIVIDE(bytes_css, bytes_total) AS css_proportion,
+    SAFE_DIVIDE(bytes_img, bytes_total) AS img_proportion,
+    SAFE_DIVIDE(bytes_font, bytes_total) AS font_proportion,
+
+    -- Resource-specific emissions: share of the page's bytes times the page's total emissions
+    SAFE_DIVIDE(bytes_html, bytes_total) * (total_gb * emissions_per_gb) AS total_html_emissions,
+    SAFE_DIVIDE(bytes_js, bytes_total) * (total_gb * emissions_per_gb) AS total_js_emissions,
+    SAFE_DIVIDE(bytes_css, bytes_total) * (total_gb * emissions_per_gb) AS total_css_emissions,
+    SAFE_DIVIDE(bytes_img, bytes_total) * (total_gb * emissions_per_gb) AS total_img_emissions,
+    SAFE_DIVIDE(bytes_font, bytes_total) * (total_gb * emissions_per_gb) AS total_font_emissions,
+
+    -- Resource-specific size in KB
+    bytes_html / 1024 AS html_kb,
+    bytes_js / 1024 AS js_kb,
+    bytes_css / 1024 AS css_kb,
+    bytes_img / 1024 AS img_kb,
+    bytes_font / 1024 AS font_kb
+  FROM
+    page_bytes
+)
+
+SELECT
+  client,
+  ecommerce,
+  COUNT(0) AS pages,
+
+  -- Median resource weights and emissions
+  APPROX_QUANTILES(total_kb, 1000)[OFFSET(500)] AS median_total_kb,
+  APPROX_QUANTILES(total_operational_emissions, 1000)[OFFSET(500)] AS median_operational_emissions,
+  APPROX_QUANTILES(total_embodied_emissions, 1000)[OFFSET(500)] AS median_embodied_emissions,
+  APPROX_QUANTILES(total_emissions, 1000)[OFFSET(500)] AS median_total_emissions,
+
+  -- Resource-specific medians
+  APPROX_QUANTILES(html_kb, 1000)[OFFSET(500)] AS median_html_kb,
+  APPROX_QUANTILES(total_html_emissions, 1000)[OFFSET(500)] AS median_total_html_emissions,
+  APPROX_QUANTILES(js_kb, 1000)[OFFSET(500)] AS median_js_kb,
+  APPROX_QUANTILES(total_js_emissions, 1000)[OFFSET(500)] AS median_total_js_emissions,
+  APPROX_QUANTILES(css_kb, 1000)[OFFSET(500)] AS median_css_kb,
+  APPROX_QUANTILES(total_css_emissions, 1000)[OFFSET(500)] AS median_total_css_emissions,
+  APPROX_QUANTILES(img_kb, 1000)[OFFSET(500)] AS median_img_kb,
+  APPROX_QUANTILES(total_img_emissions, 1000)[OFFSET(500)] AS median_total_img_emissions,
+  APPROX_QUANTILES(font_kb, 1000)[OFFSET(500)] AS median_font_kb,
+  APPROX_QUANTILES(total_font_emissions, 1000)[OFFSET(500)] AS median_total_font_emissions
+FROM
+  ecommerce_data
+GROUP BY
+  client,
+  ecommerce
+ORDER BY
+  pages DESC,
+  ecommerce,
+  client;
diff --git a/sql/2025/sustainability/favicons.sql b/sql/2025/sustainability/favicons.sql
new file mode 100644
index 00000000000..1263fcba109
--- /dev/null
+++ b/sql/2025/sustainability/favicons.sql
@@ -0,0 +1,70 @@
+#standardSQL
+# Temporary function to extract favicon image extensions from the JSON payload
+CREATE TEMPORARY FUNCTION getFaviconImage(almanac JSON)
+RETURNS STRING LANGUAGE js AS '''
+var result = 'NO_DATA';
+try {
+  if (Array.isArray(almanac) || typeof almanac != 'object') return result;
+
+  if (almanac["link-nodes"] && almanac["link-nodes"].nodes && almanac["link-nodes"].nodes.find) {
+    var faviconNode = almanac["link-nodes"].nodes.find(n => n.rel && n.rel.split(' ').find(r => r.trim().toLowerCase() == 'icon'));
+
+    if (faviconNode) {
+      if (faviconNode.href) {
+        var temp = faviconNode.href;
+
+        if (temp.includes('?')) {
+          temp = temp.substring(0, temp.indexOf('?'));
+        }
+
+        if (temp.includes('.')) {
+          temp = temp.substring(temp.lastIndexOf('.')+1);
+
+          result = temp.toLowerCase().trim();
+        }
+        else {
+          result = "NO_EXTENSION";
+        }
+
+      } else {
+        result = "NO_HREF";
+      }
+    } else {
+      result = "NO_ICON";
+    }
+  }
+  else {
+    result = "NO_DATA";
+  }
+
+} catch (e) {}
+return result;
+''';
+
+# Main query to analyze favicon image extensions per client
+WITH favicons AS (
+  SELECT
+    client,
+    getFaviconImage(custom_metrics.other.almanac) AS image_type_extension,
+    COUNT(0) AS freq,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
+    COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS percentage_of_total
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-07-01'
+  GROUP BY
+    client,
+    image_type_extension
+)
+
+SELECT
+  *,
+  percentage_of_total AS pct
+FROM
+  favicons
+ORDER BY
+  pct DESC
+LIMIT
+  1000;
+  
\ No newline at end of file
diff --git a/sql/2025/sustainability/global_emissions_per_page.sql b/sql/2025/sustainability/global_emissions_per_page.sql
new file mode 100644
index 00000000000..f397e4c9bce
--- /dev/null
+++ b/sql/2025/sustainability/global_emissions_per_page.sql
@@ -0,0 +1,81 @@
+#standardSQL
+# Percentiles of page weight and estimated emissions per resource type and client,
+# following the SWDM v4 methodology (operational plus embodied energy)
+
+-- SWDM v4 energy factor per segment in kWh/GB, each one operational + embodied
+DECLARE energy_per_GB_datacenter NUMERIC DEFAULT CAST(0.055 + 0.012 AS NUMERIC);
+DECLARE energy_per_GB_network NUMERIC DEFAULT CAST(0.059 + 0.013 AS NUMERIC);
+DECLARE energy_per_GB_device NUMERIC DEFAULT CAST(0.080 + 0.081 AS NUMERIC);
+
+-- Energy per GB for the whole system: the three segment factors added up
+DECLARE kw_per_GB NUMERIC DEFAULT CAST(energy_per_GB_datacenter + energy_per_GB_network + energy_per_GB_device AS NUMERIC);
+
+-- Carbon intensity of electricity generation, global average (gCO2/kWh)
+DECLARE global_grid_intensity NUMERIC DEFAULT 494;
+
+-- Emissions in gCO2: bytes expressed in GB, times energy per GB, times grid intensity
+CREATE TEMP FUNCTION calculate_emissions(
+  bytes FLOAT64,
+  kw_per_GB FLOAT64,
+  grid_intensity FLOAT64
+) RETURNS FLOAT64 AS (
+  bytes / 1024 / 1024 / 1024 * kw_per_GB * grid_intensity
+);
+
+WITH page_weights AS (
+  -- Byte counters of every root page in the crawl
+  SELECT
+    client,
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytes_total,
+    CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64) AS bytes_html,
+    CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytes_js,
+    CAST(COALESCE(JSON_VALUE(summary, '$.bytesCss'), JSON_VALUE(summary, '$.bytesStyle')) AS INT64) AS bytes_css,
+    CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytes_img,
+    CAST(JSON_VALUE(summary, '$.bytesOther') AS INT64) AS bytes_other,
+    CAST(JSON_VALUE(summary, '$.bytesHtmlDoc') AS INT64) AS bytes_html_doc,
+    CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64) AS bytes_font
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    is_root_page AND
+    date = '2025-06-01'
+)
+
+SELECT
+  percentile,
+  client,
+  -- Per resource type: weight in KB followed by its estimated emissions
+  -- Whole page
+  APPROX_QUANTILES(bytes_total / 1024, 1000)[OFFSET(percentile * 10)] AS total_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_total, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS total_emissions,
+  -- HTML
+  APPROX_QUANTILES(bytes_html / 1024, 1000)[OFFSET(percentile * 10)] AS html_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_html, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS html_emissions,
+  -- JavaScript
+  APPROX_QUANTILES(bytes_js / 1024, 1000)[OFFSET(percentile * 10)] AS js_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_js, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS js_emissions,
+  -- CSS
+  APPROX_QUANTILES(bytes_css / 1024, 1000)[OFFSET(percentile * 10)] AS css_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_css, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS css_emissions,
+  -- Images
+  APPROX_QUANTILES(bytes_img / 1024, 1000)[OFFSET(percentile * 10)] AS img_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_img, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS img_emissions,
+  -- Everything else
+  APPROX_QUANTILES(bytes_other / 1024, 1000)[OFFSET(percentile * 10)] AS other_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_other, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS other_emissions,
+  -- Main HTML document
+  APPROX_QUANTILES(bytes_html_doc / 1024, 1000)[OFFSET(percentile * 10)] AS html_doc_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_html_doc, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS html_doc_emissions,
+  -- Fonts
+  APPROX_QUANTILES(bytes_font / 1024, 1000)[OFFSET(percentile * 10)] AS font_kbytes,
+  APPROX_QUANTILES(calculate_emissions(bytes_font, kw_per_GB, global_grid_intensity), 1000)[OFFSET(percentile * 10)] AS font_emissions
+FROM
+  page_weights
+CROSS JOIN
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/sustainability/green_third_party_requests.sql b/sql/2025/sustainability/green_third_party_requests.sql
new file mode 100644
index 00000000000..c5f6c4c3104
--- /dev/null
+++ b/sql/2025/sustainability/green_third_party_requests.sql
@@ -0,0 +1,159 @@
+#standardSQL
+# Median third-parties & green third-party requests per websites by rank
+
+WITH third_party_date AS (
+  SELECT MAX(date) AS date
+  FROM `httparchive.almanac.third_parties`
+  WHERE date <= '2025-06-01'
+),
+
+gwf_date AS (
+  SELECT MAX(date) AS date
+  FROM `httparchive.almanac.green_web_foundation`
+  WHERE date <= '2025-06-01'
+),
+
+requests AS (
+  SELECT
+    client,
+    url,
+    page
+  FROM
+    `httparchive.crawl.requests`
+  WHERE
+    date = '2025-06-01'
+),
+
+green AS (
+  SELECT
+    TRUE AS is_green,
+    NET.HOST(url) AS host
+  FROM
+    `httparchive.almanac.green_web_foundation` g
+  JOIN gwf_date d
+  ON g.date = d.date
+),
+
+pages AS (
+  SELECT
+    client,
+    rank,
+    page
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page
+),
+
+third_party AS (
+  SELECT
+    tp.domain,
+    COUNT(DISTINCT r.page) AS page_usage
+  FROM
+    
`httparchive.almanac.third_parties` AS tp + INNER JOIN + requests AS r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + tp.date = (SELECT date FROM third_party_date) AND + tp.category NOT IN ('hosting') + GROUP BY + tp.domain + HAVING + page_usage >= 50 +), + +green_tp AS ( + SELECT tp.domain + FROM + `httparchive.almanac.third_parties` AS tp + INNER JOIN + green AS g + ON NET.HOST(g.host) = NET.HOST(tp.domain) + WHERE + tp.date = (SELECT date FROM third_party_date) AND + tp.category NOT IN ('hosting') + GROUP BY + tp.domain +), + +base AS ( + SELECT + r.client, + r.page, + p.rank, + COUNT(tp.domain) AS third_parties_per_page + FROM + requests AS r + LEFT JOIN + third_party AS tp + ON + NET.HOST(r.url) = NET.HOST(tp.domain) + INNER JOIN + pages AS p + ON r.client = p.client AND r.page = p.page + GROUP BY + r.client, + r.page, + p.rank +), + +base_green AS ( + SELECT + r.client, + r.page, + p.rank, + COUNT(gtp.domain) AS green_third_parties_per_page + FROM + requests AS r + LEFT JOIN + green_tp AS gtp + ON + NET.HOST(r.url) = NET.HOST(gtp.domain) + INNER JOIN + pages AS p + ON r.client = p.client AND r.page = p.page + GROUP BY + r.client, + r.page, + p.rank +) + +SELECT + b.client, + rank_grouping, + CASE + WHEN rank_grouping = 0 THEN '' + WHEN rank_grouping = 100000000 THEN 'all' + ELSE FORMAT("%'d", rank_grouping) + END AS ranking, + APPROX_QUANTILES( + b.third_parties_per_page, 1000 + )[OFFSET(500)] AS p50_third_parties_per_page, + APPROX_QUANTILES( + bg.green_third_parties_per_page, 1000 + )[OFFSET(500)] AS p50_green_third_parties_per_page, + APPROX_QUANTILES( + SAFE_DIVIDE( + bg.green_third_parties_per_page, + b.third_parties_per_page + ), 1000 + )[OFFSET(500)] AS pct_green +FROM + base AS b, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping +INNER JOIN + base_green AS bg +ON + b.client = bg.client AND + b.page = bg.page AND + b.rank = bg.rank +WHERE + b.rank <= rank_grouping +GROUP BY + b.client, + rank_grouping +ORDER BY + 
b.client,
+  rank_grouping
diff --git a/sql/2025/sustainability/green_web_hosting.sql b/sql/2025/sustainability/green_web_hosting.sql
new file mode 100644
index 00000000000..d2611cefcbc
--- /dev/null
+++ b/sql/2025/sustainability/green_web_hosting.sql
@@ -0,0 +1,66 @@
+# standardSQL
+# What percentage of URLs are hosted on a known green web hosting provider?
+
+WITH gwf_date AS (  -- latest green_web_foundation snapshot dated on or before 2025-06-01
+  SELECT MAX(date) AS date
+  FROM `httparchive.almanac.green_web_foundation`
+  WHERE date <= '2025-06-01'
+),
+
+green AS (  -- one row per green host in the selected snapshot
+  SELECT DISTINCT  -- several urls can share a host; dedupe so the LEFT JOIN below cannot multiply pages
+    TRUE AS is_green,
+    NET.HOST(url) AS host
+  FROM
+    `httparchive.almanac.green_web_foundation` g
+  JOIN
+    gwf_date d
+  ON g.date = d.date
+),
+
+pages AS (  -- root pages of the 2025-06-01 crawl, keyed by host
+  SELECT
+    client,
+    rank,
+    NET.HOST(root_page) AS host
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    is_root_page = TRUE AND
+    date = '2025-06-01'
+)
+
+-- Apply rank grouping
+SELECT
+  client,
+  rank_grouping,
+  CASE
+    WHEN rank_grouping = 0 THEN ''
+    WHEN rank_grouping = 100000000 THEN 'all'
+    ELSE FORMAT("%'d", rank_grouping)
+  END AS ranking,
+  COUNTIF(is_green) AS total_green,  -- is_green is NULL for unmatched pages, which COUNTIF ignores
+  COUNT(0) AS total_sites,
+  ROUND(100 * SAFE_DIVIDE(COUNTIF(is_green), COUNT(0)), 2) AS pct_green
+FROM (
+  -- Left join green hosting information
+  SELECT
+    p.client,
+    p.host,
+    p.rank,
+    g.is_green
+  FROM
+    pages AS p
+  LEFT JOIN
+    green AS g
+  ON p.host = g.host
+),
+  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping  -- cumulative buckets: a page counts in every grouping at or above its rank
+GROUP BY
+  client,
+  rank_grouping
+ORDER BY
+  client,
+  rank_grouping;
diff --git a/sql/2025/sustainability/page_byte_pre_type.sql b/sql/2025/sustainability/page_byte_pre_type.sql
new file mode 100644
index 00000000000..7fce03ed2f0
--- /dev/null
+++ b/sql/2025/sustainability/page_byte_pre_type.sql
@@ -0,0 +1,118 @@
+#standardSQL
+
+-- Energy consumption factors from SWDM v4 (in TWh/ZB)
+# 290 TWh / 5.29 ZB
+DECLARE ENERGY_PER_GB_DATACENTER NUMERIC DEFAULT 0.00006829493087557603;  -- NOTE(review): 290 / 5.29 / 1000 is about 0.0548; this value does not match and is not referenced by the query below - confirm
+# 310 TWh / 5.29 ZB
+DECLARE ENERGY_PER_GB_NETWORK NUMERIC DEFAULT 
0.05859598853868195;
+# 421 TWh / 5.29 ZB
+DECLARE ENERGY_PER_GB_DEVICE NUMERIC DEFAULT 0.07956802188162324;
+
+# (290 + 310 + 421) TWh / 5.29 ZB * 1e9 kWh/TWh / 1e12 GB/ZB = kWh per GB
+DECLARE KW_PER_GB NUMERIC DEFAULT 0.19300566251415094;
+
+-- Global average carbon intensity of electricity generation (gCO2/kWh)
+DECLARE GLOBAL_GRID_INTENSITY NUMERIC DEFAULT 494;
+
+-- Emissions in gCO2 for a byte count: (bytes / 1024^3) * kWh-per-GB * gCO2-per-kWh
+CREATE TEMP FUNCTION calculate_emissions(
+  bytes FLOAT64,  -- transferred size in bytes
+  kw_per_GB FLOAT64,  -- energy intensity (callers pass KW_PER_GB)
+  grid_intensity FLOAT64  -- carbon intensity (callers pass GLOBAL_GRID_INTENSITY)
+) RETURNS FLOAT64 AS (
+  (BYTES / 1024 / 1024 / 1024) * -- Convert bytes to GB (1 GB taken as 1024^3 bytes)
+  (KW_PER_GB) *
+  GRID_INTENSITY
+);
+
+WITH PAGE_DATA AS (  -- per root page: byte totals by resource type read from the summary JSON
+  SELECT
+    CLIENT,
+    cast(json_value(SUMMARY, '$.bytesTotal') AS INT64) AS BYTESTOTAL,
+    cast(json_value(SUMMARY, '$.bytesHtml') AS INT64) AS BYTESHTML,
+    cast(
+      coalesce(
+        json_value(SUMMARY, '$.bytesCss'),
+        json_value(SUMMARY, '$.bytesStyle')  -- fallback key used when bytesCss is absent
+      ) AS INT64
+    ) AS BYTESCSS,
+    cast(json_value(SUMMARY, '$.bytesJS') AS INT64) AS BYTESJS,
+    cast(json_value(SUMMARY, '$.bytesImg') AS INT64) AS BYTESIMG,
+    cast(json_value(SUMMARY, '$.bytesOther') AS INT64) AS BYTESOTHER,
+    cast(json_value(SUMMARY, '$.bytesHtmlDoc') AS INT64) AS BYTESHTMLDOC,
+    cast(json_value(SUMMARY, '$.bytesFont') AS INT64) AS BYTESFONT
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    DATE = '2025-06-01' AND IS_ROOT_PAGE
+)
+
+SELECT
+  PERCENTILE,
+  CLIENT,
+  -- Total resources
+  approx_quantiles(
+    BYTESTOTAL / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS TOTAL_KBYTES,  -- 1000 quantile buckets, so percentile p sits at offset p * 10
+  approx_quantiles(
+    calculate_emissions(BYTESTOTAL, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS TOTAL_EMISSIONS,
+  -- HTML resources
+  approx_quantiles(
+    BYTESHTML / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS HTML_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESHTML, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS HTML_EMISSIONS,
+  -- JavaScript resources
+  approx_quantiles(
+    BYTESJS / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS 
JS_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESJS, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS JS_EMISSIONS,
+  -- CSS resources
+  approx_quantiles(
+    BYTESCSS / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS CSS_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESCSS, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS CSS_EMISSIONS,
+  -- Image resources
+  approx_quantiles(
+    BYTESIMG / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS IMG_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESIMG, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS IMG_EMISSIONS,
+  -- Other resources
+  approx_quantiles(
+    BYTESOTHER / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS OTHER_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESOTHER, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS OTHER_EMISSIONS,
+  -- HTML document
+  approx_quantiles(
+    BYTESHTMLDOC / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS HTML_DOC_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESHTMLDOC, KW_PER_GB, GLOBAL_GRID_INTENSITY),
+    1000
+  )[offset(PERCENTILE * 10)] AS HTML_DOC_EMISSIONS,
+  -- Font resources
+  approx_quantiles(
+    BYTESFONT / 1024, 1000
+  )[offset(PERCENTILE * 10)] AS FONT_KBYTES,
+  approx_quantiles(
+    calculate_emissions(BYTESFONT, KW_PER_GB, GLOBAL_GRID_INTENSITY), 1000
+  )[offset(PERCENTILE * 10)] AS FONT_EMISSIONS
+FROM
+  PAGE_DATA,
+  unnest([10, 25, 50, 75, 90, 100]) AS PERCENTILE  -- one output row per client and requested percentile
+GROUP BY
+  PERCENTILE,
+  CLIENT
+ORDER BY
+  CLIENT,
+  PERCENTILE
diff --git a/sql/2025/sustainability/query_run_size.sql b/sql/2025/sustainability/query_run_size.sql
new file mode 100644
index 00000000000..a79df28f1f3
--- /dev/null
+++ b/sql/2025/sustainability/query_run_size.sql
@@ -0,0 +1,15 @@
+# standardSQL
+# Monthly query run size average (MB) and total (TB)
+# (0.012+0.013+0.081+0.055+0.059+0.080) x 494 x [Total TB] x 1024 = Total kg CO2e (NOTE(review): 494 is documented as gCO2/kWh in page_byte_pre_type.sql, so this product looks like grams - confirm the unit)
+
+SELECT
+  AVG(
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) 
+  ) / 1048576 AS avg_size_mb,  -- 1048576 = 1024^2 bytes per MB
+  SUM(
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)
+  ) / 1099511627776 AS total_size_tb  -- 1099511627776 = 1024^4 bytes per TB
+FROM
+  `httparchive.crawl.pages`
+WHERE
+  date = '2025-06-01'
diff --git a/sql/2025/sustainability/responsive_images.sql b/sql/2025/sustainability/responsive_images.sql
new file mode 100644
index 00000000000..63b3ee93d86
--- /dev/null
+++ b/sql/2025/sustainability/responsive_images.sql
@@ -0,0 +1,70 @@
+#standardSQL
+# Percent of sites using images with srcset (with or without sizes), or the picture element
+
+WITH page_data AS (
+  SELECT
+    client,
+    -- Count occurrences in HTML of the main document
+    ARRAY_LENGTH(REGEXP_EXTRACT_ALL(COALESCE(response_body, ''), r'(?is)<(?:img|source)[^>]*srcset\s*=')) AS num_srcset_all,
+    ARRAY_LENGTH(REGEXP_EXTRACT_ALL(COALESCE(response_body, ''), r'(?is)<(?:img|source)[^>]*sizes\s*=')) AS num_srcset_sizes,  -- matches any sizes= on img/source; the pattern does not require srcset on the same tag
+    -- Presence of the picture element. NOTE(review): the regex on the next line and the statements up to the first countif(...) are missing from this copy (text looks stripped during extraction) - restore from the original file
+    IF(REGEXP_CONTAINS(COALESCE(response_body, ''), r'(?is) 0),
+      count(0)
+    ) * 100,
+    2
+  ) AS pages_with_srcset_pct,
+  round(
+    safe_divide(
+      countif(num_srcset_sizes > 0),
+      count(0)
+    ) * 100,
+    2
+  ) AS pages_with_srcset_sizes_pct,
+  round(
+    safe_divide(
+      (
+        countif(num_srcset_all > 0) -
+        countif(num_srcset_sizes > 0)
+      ),
+      count(0)
+    ) * 100,
+    2
+  ) AS pages_with_srcset_wo_sizes_pct,  -- difference of the two page counts above
+  round(
+    safe_divide(
+      sum(num_srcset_sizes),
+      nullif(sum(num_srcset_all), 0)
+    ) * 100,
+    2
+  ) AS instances_of_srcset_sizes_pct,
+  round(
+    safe_divide(
+      (
+        sum(num_srcset_all) -
+        sum(num_srcset_sizes)
+      ),
+      nullif(sum(num_srcset_all), 0)
+    ) * 100,
+    2
+  ) AS instances_of_srcset_wo_sizes_pct,
+  round(
+    safe_divide(countif(coalesce(picture_total, 0) > 0), count(0)) * 100,
+    2
+  ) AS pages_with_picture_pct
+FROM page_data
+GROUP BY
+  client
+ORDER BY
+  client
diff --git a/sql/2025/sustainability/script_count.sql b/sql/2025/sustainability/script_count.sql
new file mode 100644
index 00000000000..7a7e83b3e56
--- /dev/null
+++ b/sql/2025/sustainability/script_count.sql
@@ -0,0 +1,107 @@
+#standardSQL
+# 
Breakdown of inline vs external scripts
+WITH script_data AS (  -- per page: script tag counts read from custom_metrics.javascript -> $.script_tags
+  SELECT
+    client,
+    page,
+    CAST(
+      JSON_EXTRACT_SCALAR(
+        JSON_EXTRACT(
+          TO_JSON_STRING(custom_metrics.javascript),
+          '$.script_tags'
+        ),
+        '$.total'
+      ) AS INT64
+    ) AS total_scripts,
+    CAST(
+      JSON_EXTRACT_SCALAR(
+        JSON_EXTRACT(
+          TO_JSON_STRING(custom_metrics.javascript),
+          '$.script_tags'
+        ),
+        '$.inline'
+      ) AS INT64
+    ) AS inline_scripts,
+    CAST(
+      JSON_EXTRACT_SCALAR(
+        JSON_EXTRACT(
+          TO_JSON_STRING(custom_metrics.javascript),
+          '$.script_tags'
+        ),
+        '$.src'
+      ) AS INT64
+    ) AS external_scripts,
+    SAFE_DIVIDE(
+      CAST(
+        JSON_EXTRACT_SCALAR(
+          JSON_EXTRACT(
+            TO_JSON_STRING(custom_metrics.javascript),
+            '$.script_tags'
+          ),
+          '$.inline'
+        ) AS INT64
+      ),
+      CAST(
+        JSON_EXTRACT_SCALAR(
+          JSON_EXTRACT(
+            TO_JSON_STRING(custom_metrics.javascript),
+            '$.script_tags'
+          ),
+          '$.total'
+        ) AS INT64
+      )
+    ) AS pct_inline_script,  -- per-page ratio; the final SELECT does not read it and recomputes from the counts
+    SAFE_DIVIDE(
+      CAST(
+        JSON_EXTRACT_SCALAR(
+          JSON_EXTRACT(
+            TO_JSON_STRING(custom_metrics.javascript),
+            '$.script_tags'
+          ),
+          '$.src'
+        ) AS INT64
+      ),
+      CAST(
+        JSON_EXTRACT_SCALAR(
+          JSON_EXTRACT(
+            TO_JSON_STRING(custom_metrics.javascript),
+            '$.script_tags'
+          ),
+          '$.total'
+        ) AS INT64
+      )
+    ) AS pct_external_script  -- per-page ratio; the final SELECT does not read it and recomputes from the counts
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01' AND  -- NOTE(review): no is_root_page filter here, unlike stylesheet_count.sql - confirm intended
+    JSON_EXTRACT_SCALAR(
+      JSON_EXTRACT(
+        TO_JSON_STRING(custom_metrics.javascript), '$.script_tags'
+      ),
+      '$.total'
+    ) IS NOT NULL
+)
+
+SELECT
+  client,
+  COUNT(DISTINCT page) AS pages_analyzed,
+  SUM(total_scripts) AS total_scripts,
+  SUM(inline_scripts) AS inline_scripts,
+  SUM(external_scripts) AS external_scripts,
+  ROUND(100 * SAFE_DIVIDE(
+    SUM(external_scripts), SUM(total_scripts)
+  ), 2) AS pct_external_script,  -- share over all scripts of the client (0-100)
+  ROUND(100 * SAFE_DIVIDE(SUM(inline_scripts), SUM(total_scripts)), 2) AS pct_inline_script,  -- share over all scripts of the client (0-100)
+  APPROX_QUANTILES(
+    SAFE_DIVIDE(external_scripts, total_scripts), 1000
+  )[OFFSET(500)] AS median_external,  -- median of the per-page ratio (0-1)
+  APPROX_QUANTILES(
+    SAFE_DIVIDE(inline_scripts, 
total_scripts), 1000
+  )[OFFSET(500)] AS median_inline  -- median of the per-page ratio (0-1)
+FROM
+  script_data
+GROUP BY
+  client
+ORDER BY
+  client;
diff --git a/sql/2025/sustainability/ssg_bytes_per_type.sql b/sql/2025/sustainability/ssg_bytes_per_type.sql
new file mode 100644
index 00000000000..02eec5beeee
--- /dev/null
+++ b/sql/2025/sustainability/ssg_bytes_per_type.sql
@@ -0,0 +1,169 @@
+#standardSQL
+
+# Median resource weights by static site generator with detailed CO2e breakdown
+# Source: https://sustainablewebdesign.org/calculating-digital-emissions/
+# Declare variables to calculate the carbon emissions per gigabyte (kWh/GB)
+
+DECLARE grid_intensity NUMERIC DEFAULT 494;  -- NOTE(review): presumably gCO2e/kWh, as documented in page_byte_pre_type.sql - confirm
+DECLARE embodied_emissions_data_centers NUMERIC DEFAULT 0.012;
+DECLARE embodied_emissions_network NUMERIC DEFAULT 0.013;
+DECLARE embodied_emissions_user_devices NUMERIC DEFAULT 0.081;
+DECLARE operational_emissions_data_centers NUMERIC DEFAULT 0.055;
+DECLARE operational_emissions_network NUMERIC DEFAULT 0.059;
+DECLARE operational_emissions_user_devices NUMERIC DEFAULT 0.080;
+
+WITH ssg_data AS (  -- one row per (root page, detected technology) that passes the technology filter at the end of this CTE
+  SELECT
+    client,
+    page,
+    tech.technology AS ssg,
+    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 AS total_kb,
+
+    -- Operational emissions calculations
+    -- Each term is (bytesTotal / 1024^3) * per-GB factor * grid_intensity
+    (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_data_centers * grid_intensity AS op_emissions_dc,
+    (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_network * grid_intensity AS op_emissions_networks,
+    (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_user_devices * grid_intensity AS op_emissions_devices,
+
+    -- Embodied emissions calculations
+    (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_data_centers * grid_intensity AS em_emissions_dc,
+    (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_network * grid_intensity AS 
em_emissions_networks,
+    (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_user_devices * grid_intensity AS em_emissions_devices,
+
+    -- Total emissions (operational + embodied)
+    (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_data_centers * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_network * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_user_devices * grid_intensity
+    ) AS total_operational_emissions,
+
+    (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_data_centers * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_network * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_user_devices * grid_intensity
+    ) AS total_embodied_emissions,
+
+    (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_data_centers * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_network * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * operational_emissions_user_devices * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_data_centers * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_network * grid_intensity +
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * embodied_emissions_user_devices * grid_intensity
+    ) AS total_emissions,
+
+    -- Proportions of each resource type relative to total bytes (SAFE_DIVIDE: a page with bytesTotal = 0 yields NULL instead of aborting the query)
+    SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) AS html_proportion,
+    SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) AS js_proportion,
+    SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) AS css_proportion,
+    SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) AS img_proportion,
+    SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) AS font_proportion,
+
+    -- Resource-specific emissions calculations
+    -- Each is (resource bytes / total bytes) * (total GB * sum of all six per-GB factors * grid_intensity)
+    (SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) * (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * (
+        operational_emissions_data_centers * grid_intensity +
+        operational_emissions_network * grid_intensity +
+        operational_emissions_user_devices * grid_intensity +
+        embodied_emissions_data_centers * grid_intensity +
+        embodied_emissions_network * grid_intensity +
+        embodied_emissions_user_devices * grid_intensity
+      )
+    )) AS total_html_emissions,
+
+    (SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) * (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * (
+        operational_emissions_data_centers * grid_intensity +
+        operational_emissions_network * grid_intensity +
+        operational_emissions_user_devices * grid_intensity +
+        embodied_emissions_data_centers * grid_intensity +
+        embodied_emissions_network * grid_intensity +
+        embodied_emissions_user_devices * grid_intensity
+      )
+    )) AS total_js_emissions,
+
+    (SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) * (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * (
+        operational_emissions_data_centers * grid_intensity +
+        
operational_emissions_network * grid_intensity +
+        operational_emissions_user_devices * grid_intensity +
+        embodied_emissions_data_centers * grid_intensity +
+        embodied_emissions_network * grid_intensity +
+        embodied_emissions_user_devices * grid_intensity
+      )
+    )) AS total_css_emissions,
+
+    -- Image share of total bytes times the page's total emissions
+    (SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) * (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * (
+        operational_emissions_data_centers * grid_intensity +
+        operational_emissions_network * grid_intensity +
+        operational_emissions_user_devices * grid_intensity +
+        embodied_emissions_data_centers * grid_intensity +
+        embodied_emissions_network * grid_intensity +
+        embodied_emissions_user_devices * grid_intensity
+      )
+    )) AS total_img_emissions,
+
+    -- Font share of total bytes times the page's total emissions
+    (SAFE_DIVIDE(CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64), CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64)) * (
+      (CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024) * (
+        operational_emissions_data_centers * grid_intensity +
+        operational_emissions_network * grid_intensity +
+        operational_emissions_user_devices * grid_intensity +
+        embodied_emissions_data_centers * grid_intensity +
+        embodied_emissions_network * grid_intensity +
+        embodied_emissions_user_devices * grid_intensity
+      )
+    )) AS total_font_emissions,
+
+    -- Resource-specific size in KB
+    CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64) / 1024 AS html_kb,
+    CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) / 1024 AS js_kb,
+    CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64) / 1024 AS css_kb,
+    CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) / 1024 AS img_kb,
+    CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64) / 1024 AS font_kb
+
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST(technologies) AS tech  -- one row per technology detected on the page
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page = TRUE AND
+    EXISTS (  -- keep the technology only if one of its categories passes the test below
+      SELECT 1
+      FROM UNNEST(tech.categories) AS category
+      WHERE LOWER(category) = 
'static site generator' OR
+        tech.technology IN ('Next.js', 'Nuxt.js')  -- evaluated per category element, so it cannot match when tech.categories is empty
+    )
+)
+
+SELECT
+  client,
+  ssg,
+  COUNT(0) AS pages,  -- rows of ssg_data for this client and technology
+
+  -- Median resource weights and emissions
+  APPROX_QUANTILES(total_kb, 1000)[OFFSET(500)] AS median_total_kb,
+  APPROX_QUANTILES(total_operational_emissions, 1000)[OFFSET(500)] AS median_operational_emissions,
+  APPROX_QUANTILES(total_embodied_emissions, 1000)[OFFSET(500)] AS median_embodied_emissions,
+  APPROX_QUANTILES(total_emissions, 1000)[OFFSET(500)] AS median_total_emissions,
+
+  -- Resource-specific medians
+  APPROX_QUANTILES(html_kb, 1000)[OFFSET(500)] AS median_html_kb,
+  APPROX_QUANTILES(total_html_emissions, 1000)[OFFSET(500)] AS median_total_html_emissions,
+  APPROX_QUANTILES(js_kb, 1000)[OFFSET(500)] AS median_js_kb,
+  APPROX_QUANTILES(total_js_emissions, 1000)[OFFSET(500)] AS median_total_js_emissions,
+  APPROX_QUANTILES(css_kb, 1000)[OFFSET(500)] AS median_css_kb,
+  APPROX_QUANTILES(total_css_emissions, 1000)[OFFSET(500)] AS median_total_css_emissions,
+  APPROX_QUANTILES(img_kb, 1000)[OFFSET(500)] AS median_img_kb,
+  APPROX_QUANTILES(total_img_emissions, 1000)[OFFSET(500)] AS median_total_img_emissions,
+  APPROX_QUANTILES(font_kb, 1000)[OFFSET(500)] AS median_font_kb,
+  APPROX_QUANTILES(total_font_emissions, 1000)[OFFSET(500)] AS median_total_font_emissions
+
+FROM
+  ssg_data
+GROUP BY
+  client,
+  ssg
+ORDER BY
+  pages DESC,
+  ssg ASC,
+  client ASC;
diff --git a/sql/2025/sustainability/stylesheet_count.sql b/sql/2025/sustainability/stylesheet_count.sql
new file mode 100644
index 00000000000..34ceba4dce8
--- /dev/null
+++ b/sql/2025/sustainability/stylesheet_count.sql
@@ -0,0 +1,42 @@
+#standardSQL
+# Breakdown of inline vs external stylesheets
+WITH stylesheet_data AS (  -- per root page: stylesheet counts read from custom_metrics.javascript -> $.document
+  SELECT
+    client,
+    page,
+    CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.stylesheets') AS INT64) AS external_stylesheets,
+    CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.inlineStyles') AS 
INT64) AS inline_stylesheets,
+    SAFE_DIVIDE(
+      CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.inlineStyles') AS INT64),
+      CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.stylesheets') AS INT64) +
+      CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.inlineStyles') AS INT64)
+    ) AS pct_inline_stylesheets,  -- per-page ratio; the final SELECT does not read it and recomputes from the counts
+    SAFE_DIVIDE(
+      CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.stylesheets') AS INT64),
+      CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.stylesheets') AS INT64) +
+      CAST(JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.inlineStyles') AS INT64)
+    ) AS pct_external_stylesheets  -- per-page ratio; the final SELECT does not read it and recomputes from the counts
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+    AND
+    is_root_page = TRUE AND
+    JSON_EXTRACT_SCALAR(TO_JSON_STRING(custom_metrics.javascript), '$.document.stylesheets') IS NOT NULL  -- skip pages without the metric
+)
+
+SELECT
+  client,
+  COUNT(DISTINCT page) AS pages_analyzed,
+  SUM(external_stylesheets) AS external_stylesheets,
+  SUM(inline_stylesheets) AS inline_stylesheets,
+  ROUND(100 * SAFE_DIVIDE(SUM(inline_stylesheets), SUM(inline_stylesheets + external_stylesheets)), 2) AS pct_inline_stylesheets,  -- share over all stylesheets of the client (0-100)
+  ROUND(100 * SAFE_DIVIDE(SUM(external_stylesheets), SUM(inline_stylesheets + external_stylesheets)), 2) AS pct_external_stylesheets,  -- share over all stylesheets of the client (0-100)
+  APPROX_QUANTILES(SAFE_DIVIDE(inline_stylesheets, inline_stylesheets + external_stylesheets), 1000)[OFFSET(500)] AS median_inline_stylesheets,  -- median of the per-page ratio (0-1)
+  APPROX_QUANTILES(SAFE_DIVIDE(external_stylesheets, inline_stylesheets + external_stylesheets), 1000)[OFFSET(500)] AS median_external_stylesheets  -- median of the per-page ratio (0-1)
+FROM
+  stylesheet_data
+GROUP BY
+  client
+ORDER BY
+  client;
diff --git a/sql/2025/sustainability/text_compression.sql b/sql/2025/sustainability/text_compression.sql
new file mode 100644
index 00000000000..174ac0c6e2f
--- /dev/null
+++ b/sql/2025/sustainability/text_compression.sql
@@ -0,0 +1,32 @@
+WITH 
content_encoding AS (
+  SELECT
+    client,
+    LOWER(h.value) AS encoding
+  FROM `httparchive.crawl.requests` r
+  LEFT JOIN UNNEST(r.response_headers) AS h
+    ON LOWER(h.name) = 'content-encoding'
+  WHERE
+    date = '2025-06-01'
+    AND is_root_page
+    AND is_main_document
+),
+
+compression_rollup AS (
+  SELECT
+    client,
+    CASE
+      WHEN encoding = 'gzip' THEN 'Gzip'
+      WHEN encoding = 'br' THEN 'Brotli'
+      WHEN encoding IS NULL OR encoding = '' THEN 'no text compression'
+      ELSE 'other'
+    END AS compression_type,
+    COUNT(0) AS num_requests,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
+    ROUND(COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) * 100, 2) AS pct
+  FROM content_encoding
+  GROUP BY client, compression_type
+)
+
+SELECT client, compression_type, num_requests, total, pct
+FROM compression_rollup
+ORDER BY client ASC, num_requests DESC
diff --git a/sql/2025/sustainability/unminified_css_bytes.sql b/sql/2025/sustainability/unminified_css_bytes.sql
new file mode 100644
index 00000000000..334fdabd07f
--- /dev/null
+++ b/sql/2025/sustainability/unminified_css_bytes.sql
@@ -0,0 +1,28 @@
+#standardSQL
+# Distribution of unminified CSS request bytes per page
+
+WITH savings AS (
+  -- Per page: overallSavingsBytes of the unminified-css audit, divided by 1024
+  SELECT
+    client,
+    CAST(JSON_VALUE(lighthouse, '$.audits.unminified-css.details.overallSavingsBytes') AS INT64) / 1024 AS kilobytes
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(kilobytes, 1000)[OFFSET(percentile * 10)] AS css_kilobytes
+FROM
+  savings
+CROSS JOIN
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/sustainability/unminified_js_bytes.sql b/sql/2025/sustainability/unminified_js_bytes.sql
new file mode 100644
index 00000000000..01c3b2b9c96
--- /dev/null
+++ b/sql/2025/sustainability/unminified_js_bytes.sql
@@ -0,0 +1,26 @@
+#standardSQL
+# Distribution of unminified JS request bytes per page
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(
+    CAST(
+      JSON_VALUE(
+        lighthouse,
+        
'$.audits.unminified-javascript.details.overallSavingsBytes'
+      ) AS INT64
+    ) / 1024,
+    1000
+  )[OFFSET(percentile * 10)] AS js_kilobytes
+FROM
+  `httparchive.crawl.pages`,
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+WHERE
+  date = '2025-06-01'
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/sustainability/unused_css_bytes.sql b/sql/2025/sustainability/unused_css_bytes.sql
new file mode 100644
index 00000000000..39f76c446ef
--- /dev/null
+++ b/sql/2025/sustainability/unused_css_bytes.sql
@@ -0,0 +1,28 @@
+#standardSQL
+# Distribution of unused CSS request bytes per page
+
+WITH savings AS (
+  -- Per page: overallSavingsBytes of the unused-css-rules audit, divided by 1024
+  SELECT
+    client,
+    CAST(JSON_VALUE(lighthouse, '$.audits.unused-css-rules.details.overallSavingsBytes') AS INT64) / 1024 AS kilobytes
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(kilobytes, 1000)[OFFSET(percentile * 10)] AS css_kilobytes
+FROM
+  savings
+CROSS JOIN
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/sustainability/unused_js_bytes.sql b/sql/2025/sustainability/unused_js_bytes.sql
new file mode 100644
index 00000000000..3a945e923dc
--- /dev/null
+++ b/sql/2025/sustainability/unused_js_bytes.sql
@@ -0,0 +1,28 @@
+#standardSQL
+# Distribution of unused JS request bytes per page
+
+WITH savings AS (
+  -- Per page: overallSavingsBytes of the unused-javascript audit, divided by 1024
+  SELECT
+    client,
+    CAST(JSON_VALUE(lighthouse, '$.audits.unused-javascript.details.overallSavingsBytes') AS INT64) / 1024 AS kilobytes
+  FROM
+    `httparchive.crawl.pages`
+  WHERE
+    date = '2025-06-01'
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(kilobytes, 1000)[OFFSET(percentile * 10)] AS js_kilobytes
+FROM
+  savings
+CROSS JOIN
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2025/sustainability/use_of_prefers_dark_mode_usage.sql 
b/sql/2025/sustainability/use_of_prefers_dark_mode_usage.sql
new file mode 100644
index 00000000000..e3fd0a65e9d
--- /dev/null
+++ b/sql/2025/sustainability/use_of_prefers_dark_mode_usage.sql
@@ -0,0 +1,45 @@
+#standardSQL
+# The distribution of websites by client that use the prefers-color-scheme:dark media query.
+
+WITH stylesheet_flags AS (
+  -- One row per parsed_css row: does its $.stylesheet.rules array hold a dark-mode @media rule?
+  SELECT
+    client,
+    page,
+    EXISTS (
+      SELECT 1
+      FROM UNNEST(JSON_EXTRACT_ARRAY(css, '$.stylesheet.rules')) AS rule
+      WHERE JSON_EXTRACT_SCALAR(rule, '$.type') = 'media' AND
+        -- CSS allows whitespace around the colon, e.g. "(prefers-color-scheme: dark)"
+        REGEXP_CONTAINS(LOWER(JSON_EXTRACT_SCALAR(rule, '$.media')), r'prefers-color-scheme\s*:\s*dark')
+    ) AS has_dark_mode_rule
+  FROM
+    `httparchive.crawl.parsed_css`
+  WHERE
+    date = '2025-06-01'
+),
+
+page_flags AS (
+  -- Collapse to one row per page so each page is counted once, however many stylesheets it has
+  SELECT
+    client,
+    page,
+    LOGICAL_OR(has_dark_mode_rule) AS is_dark_mode_page
+  FROM
+    stylesheet_flags
+  GROUP BY
+    client,
+    page
+)
+
+SELECT
+  client,
+  COUNT(0) AS total_pages,
+  COUNTIF(is_dark_mode_page) AS pages_using_dark_mode,
+  SAFE_DIVIDE(COUNTIF(is_dark_mode_page), COUNT(0)) * 100 AS percentage_of_pages
+FROM
+  page_flags
+GROUP BY
+  client
+ORDER BY
+  percentage_of_pages DESC, client ASC;
diff --git a/sql/2025/sustainability/video_autoplay_values.sql b/sql/2025/sustainability/video_autoplay_values.sql
new file mode 100644
index 00000000000..af1cbc5579f
--- /dev/null
+++ b/sql/2025/sustainability/video_autoplay_values.sql
@@ -0,0 +1,39 @@
+WITH video_data AS (
+  -- One row per entry of the $.videos.nodes array on a root page (no row LIMIT: the full crawl is needed for correct shares)
+  SELECT
+    client,
+    LOWER(
+      COALESCE(
+        JSON_EXTRACT_SCALAR(video_nodes, '$.autoplay'),
+        '(autoplay not used)'
+      )
+    ) AS autoplay_value
+  FROM
+    `httparchive.crawl.pages`,
+    UNNEST(
+      JSON_EXTRACT_ARRAY(
+        JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.videos.nodes'
+      )
+    ) AS video_nodes
+  WHERE
+    date = '2025-06-01' AND
+    is_root_page
+)
+
+SELECT
+  client,
+  IF(autoplay_value = '', '(empty)', autoplay_value) AS autoplay_value,
+  COUNT(0) AS autoplay_value_count,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_videos,
+  ROUND(
+    SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY client)) * 100, 2
+  ) AS autoplay_value_pct
+FROM
+  
video_data
+GROUP BY
+  client,
+  autoplay_value
+QUALIFY
+  autoplay_value_count > 10
+ORDER BY
+  client ASC,
+  autoplay_value_count DESC
diff --git a/sql/2025/sustainability/video_preload_values.sql b/sql/2025/sustainability/video_preload_values.sql
new file mode 100644
index 00000000000..41ca1e150ef
--- /dev/null
+++ b/sql/2025/sustainability/video_preload_values.sql
@@ -0,0 +1,38 @@
+WITH video_data AS (
+  -- One row per entry of the $.videos.nodes array on a root page, for both crawl dates
+  SELECT
+    date,
+    client,
+    LOWER(
+      IFNULL(JSON_EXTRACT_SCALAR(video_node, '$.preload'), '(preload not used)')
+    ) AS preload_value
+  FROM
+    `httparchive.crawl.pages`
+  CROSS JOIN
+    UNNEST(
+      JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.videos.nodes')
+    ) AS video_node
+  WHERE
+    is_root_page AND
+    date IN ('2025-06-01', '2024-07-01')
+)
+
+SELECT
+  date,
+  client,
+  CASE WHEN preload_value = '' THEN '(empty)' ELSE preload_value END AS preload_value,
+  COUNT(0) AS preload_value_count,
+  -- Share among all rows of the same date and client (the window runs before the QUALIFY filter)
+  SAFE_DIVIDE(COUNT(0), SUM(COUNT(0)) OVER (PARTITION BY date, client)) AS preload_value_pct
+FROM
+  video_data
+GROUP BY
+  date,
+  client,
+  preload_value
+QUALIFY
+  preload_value_count > 10
+ORDER BY
+  date ASC,
+  client ASC,
+  preload_value_count DESC