Skip to content
Draft
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions sql/2025/sustainability/cache_header_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#standardSQL
# The distribution of cache header adoption across requests, by client.
#
# header_flags reduces every (client, url) pair to one row of booleans saying
# which caching headers appeared on its responses; the final SELECT turns those
# flags into per-client counts and shares.
#
# Notes:
# * Because header_flags groups by url, total_requests counts distinct
#   (client, url) pairs rather than raw request rows.
# * The comma join with UNNEST(response_headers) yields no rows for a request
#   whose header array is empty, so such requests are not counted at all.
# * Output column names are unchanged (note the and/nor spelling difference
#   between the total_ and pct_ "neither" columns).

WITH header_flags AS (
  SELECT
    client,
    url,
    -- HTTP header field names are case-insensitive, so match on the lowercased name.
    -- A header only counts as "used" when its value is present and non-blank.
    LOGICAL_OR(LOWER(header.name) = 'expires' AND header.value IS NOT NULL AND TRIM(header.value) != '') AS uses_expires,
    LOGICAL_OR(LOWER(header.name) = 'cache-control' AND header.value IS NOT NULL AND TRIM(header.value) != '') AS uses_cache_control,
    -- max-age only counts when it carries a numeric value, e.g. "max-age=3600".
    LOGICAL_OR(LOWER(header.name) = 'cache-control' AND REGEXP_CONTAINS(header.value, r'(?i)max-age\s*=\s*[0-9]+')) AS uses_max_age
  FROM
    `httparchive.crawl.requests`,
    UNNEST(response_headers) AS header
  WHERE
    date = '2025-07-01'
  GROUP BY
    client,
    url
)

SELECT
  client,
  COUNT(0) AS total_requests,

  -- Absolute counts
  COUNTIF(uses_cache_control) AS total_using_cache_control,
  COUNTIF(uses_max_age) AS total_using_max_age,
  COUNTIF(uses_expires) AS total_using_expires,
  COUNTIF(uses_max_age AND uses_expires) AS total_using_max_age_and_expires,
  COUNTIF(uses_cache_control AND uses_expires) AS total_using_both_cc_and_expires,
  COUNTIF(NOT uses_cache_control AND NOT uses_expires) AS total_using_neither_cc_and_expires,
  COUNTIF(uses_cache_control AND NOT uses_expires) AS total_using_only_cache_control,
  COUNTIF(NOT uses_cache_control AND uses_expires) AS total_using_only_expires,

  -- Shares of total_requests (fractions between 0 and 1, not percentages)
  COUNTIF(uses_cache_control) / COUNT(0) AS pct_cache_control,
  COUNTIF(uses_max_age) / COUNT(0) AS pct_using_max_age,
  COUNTIF(uses_expires) / COUNT(0) AS pct_using_expires,
  COUNTIF(uses_max_age AND uses_expires) / COUNT(0) AS pct_using_max_age_and_expires,
  COUNTIF(uses_cache_control AND uses_expires) / COUNT(0) AS pct_using_both_cc_and_expires,
  COUNTIF(NOT uses_cache_control AND NOT uses_expires) / COUNT(0) AS pct_using_neither_cc_nor_expires,
  COUNTIF(uses_cache_control AND NOT uses_expires) / COUNT(0) AS pct_using_only_cache_control,
  COUNTIF(NOT uses_cache_control AND uses_expires) / COUNT(0) AS pct_using_only_expires
FROM
  header_flags
GROUP BY
  client
ORDER BY
  client;
33 changes: 33 additions & 0 deletions sql/2025/sustainability/cdn_adoption.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#standardSQL
# The distribution of CDN adoption on websites by client.
#
# Output: one row per (client, cdn) with
#   total - number of root pages crawled for the client,
#   freq  - number of CDN entries with that name across those pages,
#   pct   - freq / total (a fraction, not a percentage).
# A page whose summary lists several CDNs (separated by ", ") contributes one
# entry per CDN, so pct values for a client can add up to more than 1.

WITH root_pages AS (
  SELECT
    client,
    JSON_EXTRACT_SCALAR(summary, '$.cdn') AS cdn_csv
  FROM
    `httparchive.crawl.pages`
  WHERE
    -- NOTE(review): some sibling 2025 queries (e.g. cache_header_usage.sql)
    -- filter on 2025-07-01; confirm which crawl is intended here.
    date = '2025-06-01' AND
    is_root_page = TRUE
),

totals AS (
  -- Denominator: every root page, including those with no CDN value at all.
  SELECT
    client,
    COUNT(0) AS total
  FROM
    root_pages
  GROUP BY
    client
)

SELECT
  client,
  total,
  -- An empty string means the crawl detected no CDN for the page.
  IF(cdn_name = '', 'No CDN', cdn_name) AS cdn,
  COUNT(0) AS freq,
  COUNT(0) / total AS pct
FROM
  root_pages
-- Unnest per page instead of concatenating every page's list into a single
-- per-client array. Pages whose cdn value is NULL produce no rows here.
CROSS JOIN
  UNNEST(SPLIT(cdn_csv, ', ')) AS cdn_name
INNER JOIN
  totals
USING (client)
GROUP BY
  client,
  cdn,
  total
ORDER BY
  pct DESC,
  client ASC,
  cdn ASC;
161 changes: 161 additions & 0 deletions sql/2025/sustainability/cms_bytes_per_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#standardSQL
# Median resource weights and estimated carbon emissions by CMS
#
# For every root page and each CMS detected on it, the page weight (total and
# per resource type) is read from the page summary and converted into
# estimated emissions; the final SELECT reports per-(client, cms) medians.

# Declare variables to calculate the carbon emissions of one byte
# Source: https://sustainablewebdesign.org/calculating-digital-emissions/
# NOTE(review): presumably grid_intensity is in gCO2e/kWh and the six segment
# factors are in kWh/GB, as in the linked model - confirm against the source.

DECLARE grid_intensity NUMERIC DEFAULT 494;
DECLARE embodied_emissions_data_centers NUMERIC DEFAULT 0.012;
DECLARE embodied_emissions_network NUMERIC DEFAULT 0.013;
DECLARE embodied_emissions_user_devices NUMERIC DEFAULT 0.081;
DECLARE operational_emissions_data_centers NUMERIC DEFAULT 0.055;
DECLARE operational_emissions_network NUMERIC DEFAULT 0.059;
DECLARE operational_emissions_user_devices NUMERIC DEFAULT 0.080;

WITH cms_bytes AS (
  -- One row per (root page, detected CMS); each byte counter is parsed once.
  SELECT
    client,
    tech.technology AS cms,
    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytes_total,
    CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64) AS bytes_html,
    CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytes_js,
    CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64) AS bytes_css,
    CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytes_img,
    CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64) AS bytes_font
  FROM
    `httparchive.crawl.pages`,
    UNNEST(technologies) AS tech
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    'CMS' IN UNNEST(tech.categories)
),

cms_page AS (
  SELECT
    client,
    cms,
    bytes_total,
    bytes_html,
    bytes_js,
    bytes_css,
    bytes_img,
    bytes_font,
    -- Page weight in GB, where 1 GB is taken as 1024^3 bytes.
    bytes_total / 1024 / 1024 / 1024 AS total_gb,
    -- Whole-page emissions across all six segments; split by resource type below.
    (bytes_total / 1024 / 1024 / 1024) * (
      operational_emissions_data_centers * grid_intensity +
      operational_emissions_network * grid_intensity +
      operational_emissions_user_devices * grid_intensity +
      embodied_emissions_data_centers * grid_intensity +
      embodied_emissions_network * grid_intensity +
      embodied_emissions_user_devices * grid_intensity
    ) AS page_emissions
  FROM
    cms_bytes
),

cms_data AS (
  SELECT
    client,
    cms,
    bytes_total / 1024 AS total_kb,

    -- Total emissions (operational, embodied, and both combined)
    (
      total_gb * operational_emissions_data_centers * grid_intensity +
      total_gb * operational_emissions_network * grid_intensity +
      total_gb * operational_emissions_user_devices * grid_intensity
    ) AS total_operational_emissions,

    (
      total_gb * embodied_emissions_data_centers * grid_intensity +
      total_gb * embodied_emissions_network * grid_intensity +
      total_gb * embodied_emissions_user_devices * grid_intensity
    ) AS total_embodied_emissions,

    (
      total_gb * operational_emissions_data_centers * grid_intensity +
      total_gb * operational_emissions_network * grid_intensity +
      total_gb * operational_emissions_user_devices * grid_intensity +
      total_gb * embodied_emissions_data_centers * grid_intensity +
      total_gb * embodied_emissions_network * grid_intensity +
      total_gb * embodied_emissions_user_devices * grid_intensity
    ) AS total_emissions,

    -- Resource-specific emissions: the resource's share of the page weight
    -- applied to the whole-page emissions. SAFE_DIVIDE yields NULL (ignored by
    -- APPROX_QUANTILES) instead of an error when bytes_total is 0.
    SAFE_DIVIDE(bytes_html, bytes_total) * page_emissions AS total_html_emissions,
    SAFE_DIVIDE(bytes_js, bytes_total) * page_emissions AS total_js_emissions,
    SAFE_DIVIDE(bytes_css, bytes_total) * page_emissions AS total_css_emissions,
    SAFE_DIVIDE(bytes_img, bytes_total) * page_emissions AS total_img_emissions,
    SAFE_DIVIDE(bytes_font, bytes_total) * page_emissions AS total_font_emissions,

    -- Resource-specific size in KB
    bytes_html / 1024 AS html_kb,
    bytes_js / 1024 AS js_kb,
    bytes_css / 1024 AS css_kb,
    bytes_img / 1024 AS img_kb,
    bytes_font / 1024 AS font_kb
  FROM
    cms_page
)

SELECT
  client,
  cms,
  COUNT(0) AS pages,
  -- Median resource weights and emissions
  -- (element 500 of 1000 approximate quantile boundaries)
  APPROX_QUANTILES(total_kb, 1000)[OFFSET(500)] AS median_total_kb,
  APPROX_QUANTILES(total_operational_emissions, 1000)[OFFSET(500)] AS median_operational_emissions,
  APPROX_QUANTILES(total_embodied_emissions, 1000)[OFFSET(500)] AS median_embodied_emissions,
  APPROX_QUANTILES(total_emissions, 1000)[OFFSET(500)] AS median_total_emissions,

  -- Resource-specific medians
  APPROX_QUANTILES(html_kb, 1000)[OFFSET(500)] AS median_html_kb,
  APPROX_QUANTILES(total_html_emissions, 1000)[OFFSET(500)] AS median_total_html_emissions,
  APPROX_QUANTILES(js_kb, 1000)[OFFSET(500)] AS median_js_kb,
  APPROX_QUANTILES(total_js_emissions, 1000)[OFFSET(500)] AS median_total_js_emissions,
  APPROX_QUANTILES(css_kb, 1000)[OFFSET(500)] AS median_css_kb,
  APPROX_QUANTILES(total_css_emissions, 1000)[OFFSET(500)] AS median_total_css_emissions,
  APPROX_QUANTILES(img_kb, 1000)[OFFSET(500)] AS median_img_kb,
  APPROX_QUANTILES(total_img_emissions, 1000)[OFFSET(500)] AS median_total_img_emissions,
  APPROX_QUANTILES(font_kb, 1000)[OFFSET(500)] AS median_font_kb,
  APPROX_QUANTILES(total_font_emissions, 1000)[OFFSET(500)] AS median_total_font_emissions
FROM
  cms_data
GROUP BY
  client,
  cms
ORDER BY
  pages DESC,
  cms,
  client;
74 changes: 74 additions & 0 deletions sql/2025/sustainability/content-visibility.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#standardSQL
-- Counts `content-visibility` declarations in one parsed stylesheet.
--
-- css:     the stylesheet AST as a JSON value; it reaches the JavaScript body
--          already parsed, so no JSON.parse call is needed.
-- returns: one {property, freq} struct per matching property name, or an empty
--          array when nothing matches or the AST cannot be walked.
--
-- walkDeclarations, matches and incrementByKey are not defined here; they are
-- presumably supplied by the css-utils.js library loaded in OPTIONS.
CREATE TEMPORARY FUNCTION HASCONTENTVISIBILITY(css JSON)
RETURNS ARRAY<STRUCT<property STRING, freq INT64>>
LANGUAGE js
OPTIONS (library = "gs://httparchive/lib/css-utils.js")
AS '''
try {
  let ret = {};

  walkDeclarations(css, ({property}) => {
    // Normalize to lowercase first, then strip hacks like *property, _property etc.
    // (Stripping before lowercasing would delete uppercase letters instead of
    // normalizing them, e.g. "Content-Visibility" -> "ontent-isibility".)
    property = property.toLowerCase().replace(/[^a-z-]/g, "");

    if (matches(property, 'content-visibility')) {
      incrementByKey(ret, property);
    }
  });

  return Object.entries(ret).map(([property, freq]) => {
    return {property, freq};
  });
} catch (e) {
  // Best effort: an unreadable stylesheet counts as "no content-visibility".
  return [];
}
''';

-- Share of root pages, per client, whose stylesheets declare content-visibility.
-- Both page counts come from a single aggregation over parsed_css: every root
-- page with at least one stylesheet row counts towards total_pages, and a page
-- counts towards pages_with_content_visibility when HASCONTENTVISIBILITY
-- returns a non-empty array for any of its stylesheets.
-- NOTE(review): this filters on the 2025-06-01 crawl; confirm that is the
-- intended crawl date.
WITH page_counts AS (
  SELECT
    client,
    COUNT(DISTINCT root_page) AS total_pages,
    COUNT(DISTINCT
      IF(ARRAY_LENGTH(HASCONTENTVISIBILITY(css)) > 0, root_page, NULL)
    ) AS pages_with_content_visibility
  FROM
    `httparchive.crawl.parsed_css`
  WHERE
    date = '2025-06-01' AND
    is_root_page
  GROUP BY
    client
)

SELECT
  client,
  total_pages,
  pages_with_content_visibility,
  -- Percentage (0-100) rounded to two decimals.
  ROUND(pages_with_content_visibility * 100.0 / total_pages, 2) AS pct_pages
FROM
  page_counts
ORDER BY
  client
Loading