diff --git a/crates/common/src/html_processor.rs b/crates/common/src/html_processor.rs index 9473e8e..5325a1e 100644 --- a/crates/common/src/html_processor.rs +++ b/crates/common/src/html_processor.rs @@ -4,7 +4,8 @@ use std::cell::Cell; use std::rc::Rc; -use lol_html::{element, html_content::ContentType, Settings as RewriterSettings}; +use lol_html::{element, html_content::ContentType, text, Settings as RewriterSettings}; +use regex::Regex; use crate::settings::Settings; use crate::streaming_processor::{HtmlRewriterAdapter, StreamProcessor}; @@ -17,6 +18,8 @@ pub struct HtmlProcessorConfig { pub request_host: String, pub request_scheme: String, pub enable_prebid: bool, + pub nextjs_enabled: bool, + pub nextjs_attributes: Vec<String>, } impl HtmlProcessorConfig { @@ -32,6 +35,8 @@ impl HtmlProcessorConfig { request_host: request_host.to_string(), request_scheme: request_scheme.to_string(), enable_prebid: settings.prebid.auto_configure, + nextjs_enabled: settings.publisher.nextjs.enabled, + nextjs_attributes: settings.publisher.nextjs.rewrite_attributes.clone(), } } } @@ -65,6 +70,39 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso fn protocol_relative_replacement(&self) -> String { format!("//{}", self.request_host) } + + fn rewrite_nextjs_values(&self, content: &str, attributes: &[String]) -> Option<String> { + let mut rewritten = content.to_string(); + let mut changed = false; + let escaped_origin = regex::escape(&self.origin_host); + for attribute in attributes { + let escaped_attr = regex::escape(attribute); + let pattern = format!( + r#"(?P<prefix>(?:\\*")?{attr}(?:\\*")?:\\*")(?P<scheme>https?://|//){origin}"#, + attr = escaped_attr, + origin = escaped_origin + ); + let regex = Regex::new(&pattern).expect("valid Next.js rewrite regex"); + let new_value = regex.replace_all(&rewritten, |caps: &regex::Captures| { + let scheme = &caps["scheme"]; + let replacement = if scheme == "//" { + format!("//{}", self.request_host) + } else { + self.replacement_url() + }; + 
format!("{}{}", &caps["prefix"], replacement) + }); + if new_value != rewritten { + changed = true; + rewritten = new_value.into_owned(); + } + } + if changed { + Some(rewritten) + } else { + None + } + } } let patterns = Rc::new(UrlPatterns { @@ -73,6 +111,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso request_scheme: config.request_scheme.clone(), }); + let nextjs_attributes = Rc::new(config.nextjs_attributes.clone()); + let injected_tsjs = Rc::new(Cell::new(false)); fn is_prebid_script_url(url: &str) -> bool { @@ -85,119 +125,150 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso ) } - let rewriter_settings = RewriterSettings { - element_content_handlers: vec![ - // Inject tsjs once at the start of - element!("head", { - let injected_tsjs = injected_tsjs.clone(); - move |el| { - if !injected_tsjs.get() { - let loader = tsjs::core_script_tag(); - el.prepend(&loader, ContentType::Html); - injected_tsjs.set(true); - } - Ok(()) - } - }), - // Replace URLs in href attributes - element!("[href]", { - let patterns = patterns.clone(); - let rewrite_prebid = config.enable_prebid; - move |el| { - if let Some(href) = el.get_attribute("href") { - // If Prebid auto-config is enabled and this looks like a Prebid script href, rewrite to our extension - if rewrite_prebid && is_prebid_script_url(&href) { - let ext_src = tsjs::ext_script_src(); - el.set_attribute("href", &ext_src)?; - } else { - let new_href = href - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_href != href { - el.set_attribute("href", &new_href)?; - } - } - } - Ok(()) + let mut element_content_handlers = vec![ + // Inject tsjs once at the start of + element!("head", { + let injected_tsjs = injected_tsjs.clone(); + move |el| { + if !injected_tsjs.get() { + let loader = tsjs::core_script_tag(); + el.prepend(&loader, ContentType::Html); + 
injected_tsjs.set(true); } - }), - // Replace URLs in src attributes - element!("[src]", { - let patterns = patterns.clone(); - let rewrite_prebid = config.enable_prebid; - move |el| { - if let Some(src) = el.get_attribute("src") { - if rewrite_prebid && is_prebid_script_url(&src) { - let ext_src = tsjs::ext_script_src(); - el.set_attribute("src", &ext_src)?; - } else { - let new_src = src - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_src != src { - el.set_attribute("src", &new_src)?; - } + Ok(()) + } + }), + // Replace URLs in href attributes + element!("[href]", { + let patterns = patterns.clone(); + let rewrite_prebid = config.enable_prebid; + move |el| { + if let Some(href) = el.get_attribute("href") { + // If Prebid auto-config is enabled and this looks like a Prebid script href, rewrite to our extension + if rewrite_prebid && is_prebid_script_url(&href) { + let ext_src = tsjs::ext_script_src(); + el.set_attribute("href", &ext_src)?; + } else { + let new_href = href + .replace(&patterns.https_origin(), &patterns.replacement_url()) + .replace(&patterns.http_origin(), &patterns.replacement_url()); + if new_href != href { + el.set_attribute("href", &new_href)?; } } - Ok(()) } - }), - // Replace URLs in action attributes - element!("[action]", { - let patterns = patterns.clone(); - move |el| { - if let Some(action) = el.get_attribute("action") { - let new_action = action + Ok(()) + } + }), + // Replace URLs in src attributes + element!("[src]", { + let patterns = patterns.clone(); + let rewrite_prebid = config.enable_prebid; + move |el| { + if let Some(src) = el.get_attribute("src") { + if rewrite_prebid && is_prebid_script_url(&src) { + let ext_src = tsjs::ext_script_src(); + el.set_attribute("src", &ext_src)?; + } else { + let new_src = src .replace(&patterns.https_origin(), &patterns.replacement_url()) .replace(&patterns.http_origin(), &patterns.replacement_url()); - 
if new_action != action { - el.set_attribute("action", &new_action)?; + if new_src != src { + el.set_attribute("src", &new_src)?; } } - Ok(()) } - }), - // Replace URLs in srcset attributes (for responsive images) - element!("[srcset]", { - let patterns = patterns.clone(); - move |el| { - if let Some(srcset) = el.get_attribute("srcset") { - let new_srcset = srcset - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()) - .replace( - &patterns.protocol_relative_origin(), - &patterns.protocol_relative_replacement(), - ) - .replace(&patterns.origin_host, &patterns.request_host); - - if new_srcset != srcset { - el.set_attribute("srcset", &new_srcset)?; - } + Ok(()) + } + }), + // Replace URLs in action attributes + element!("[action]", { + let patterns = patterns.clone(); + move |el| { + if let Some(action) = el.get_attribute("action") { + let new_action = action + .replace(&patterns.https_origin(), &patterns.replacement_url()) + .replace(&patterns.http_origin(), &patterns.replacement_url()); + if new_action != action { + el.set_attribute("action", &new_action)?; } - Ok(()) } - }), - // Replace URLs in imagesrcset attributes (for link preload) - element!("[imagesrcset]", { - let patterns = patterns.clone(); - move |el| { - if let Some(imagesrcset) = el.get_attribute("imagesrcset") { - let new_imagesrcset = imagesrcset - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()) - .replace( - &patterns.protocol_relative_origin(), - &patterns.protocol_relative_replacement(), - ); - if new_imagesrcset != imagesrcset { - el.set_attribute("imagesrcset", &new_imagesrcset)?; - } + Ok(()) + } + }), + // Replace URLs in srcset attributes (for responsive images) + element!("[srcset]", { + let patterns = patterns.clone(); + move |el| { + if let Some(srcset) = el.get_attribute("srcset") { + let new_srcset = srcset + 
.replace(&patterns.https_origin(), &patterns.replacement_url()) + .replace(&patterns.http_origin(), &patterns.replacement_url()) + .replace( + &patterns.protocol_relative_origin(), + &patterns.protocol_relative_replacement(), + ) + .replace(&patterns.origin_host, &patterns.request_host); + + if new_srcset != srcset { + el.set_attribute("srcset", &new_srcset)?; + } + } + Ok(()) + } + }), + // Replace URLs in imagesrcset attributes (for link preload) + element!("[imagesrcset]", { + let patterns = patterns.clone(); + move |el| { + if let Some(imagesrcset) = el.get_attribute("imagesrcset") { + let new_imagesrcset = imagesrcset + .replace(&patterns.https_origin(), &patterns.replacement_url()) + .replace(&patterns.http_origin(), &patterns.replacement_url()) + .replace( + &patterns.protocol_relative_origin(), + &patterns.protocol_relative_replacement(), + ); + if new_imagesrcset != imagesrcset { + el.set_attribute("imagesrcset", &new_imagesrcset)?; } - Ok(()) } - }), - ], + Ok(()) + } + }), + ]; + + if config.nextjs_enabled && !nextjs_attributes.is_empty() { + element_content_handlers.push(text!("script#__NEXT_DATA__", { + let patterns = patterns.clone(); + let attributes = nextjs_attributes.clone(); + move |text| { + let content = text.as_str(); + if let Some(rewritten) = patterns.rewrite_nextjs_values(content, &attributes) { + text.replace(&rewritten, ContentType::Text); + } + Ok(()) + } + })); + + element_content_handlers.push(text!("script", { + let patterns = patterns.clone(); + let attributes = nextjs_attributes.clone(); + move |text| { + let content = text.as_str(); + if !content.contains("self.__next_f") { + return Ok(()); + } + if let Some(rewritten) = patterns.rewrite_nextjs_values(content, &attributes) { + text.replace(&rewritten, ContentType::Text); + } + Ok(()) + } + })); + } + + let rewriter_settings = RewriterSettings { + element_content_handlers, // TODO: Consider adding text content replacement if needed with settings // // Replace URLs in text content @@ 
-238,6 +309,8 @@ mod tests { request_host: "test.example.com".to_string(), request_scheme: "https".to_string(), enable_prebid: false, + nextjs_enabled: false, + nextjs_attributes: vec!["href".to_string(), "link".to_string(), "url".to_string()], } } @@ -318,6 +391,137 @@ mod tests { assert!(processed.contains("/static/tsjs=tsjs-core.min.js")); } + #[test] + fn test_rewrites_nextjs_script_when_enabled() { + let html = r#" + + "#; + + let mut config = create_test_config(); + config.nextjs_enabled = true; + config.nextjs_attributes = vec!["href".to_string(), "link".to_string(), "url".to_string()]; + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let processed = String::from_utf8_lossy(&output); + println!("processed={processed}"); + println!("processed stream payload: {}", processed); + println!("processed stream payload: {}", processed); + + assert!( + processed.contains(r#""href":"https://test.example.com/reviews""#), + "Should rewrite https Next.js href values" + ); + assert!( + processed.contains(r#""href":"https://test.example.com/sign-in""#), + "Should rewrite http Next.js href values" + ); + assert!( + processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), + "Should leave other fields untouched" + ); + assert!( + processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), + "Should not rewrite non-href keys" + ); + assert!( + !processed.contains("\"href\":\"https://origin.example.com/reviews\""), + "Should remove origin https href" + ); + assert!( + !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), + "Should remove origin http href" + ); + } + + #[test] + fn 
test_rewrites_nextjs_stream_payload() { + let html = r#" + + "#; + + let mut config = create_test_config(); + config.nextjs_enabled = true; + config.nextjs_attributes = vec!["href".to_string(), "link".to_string(), "url".to_string()]; + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let processed = String::from_utf8_lossy(&output); + let normalized = processed.replace('\\', ""); + assert!( + normalized.contains("\"href\":\"https://test.example.com/dashboard\""), + "Should rewrite escaped href sequences inside streamed payloads. Content: {}", + normalized + ); + assert!( + normalized.contains("\"href\":\"https://test.example.com/secondary\""), + "Should rewrite plain href attributes inside streamed payloads" + ); + assert!( + normalized.contains("\"link\":\"https://test.example.com/api-test\""), + "Should rewrite additional configured attributes like link" + ); + assert!( + processed.contains("\"dataHost\":\"https://origin.example.com/api\""), + "Should leave non-href properties untouched" + ); + } + + #[test] + fn test_nextjs_rewrite_respects_flag() { + let html = r#" + + "#; + + let config = create_test_config(); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let processed = String::from_utf8_lossy(&output); + + assert!( + processed.contains("origin.example.com"), + "Should leave Next.js data untouched when 
disabled" + ); + assert!( + !processed.contains("test.example.com/reviews"), + "Should not rewrite Next.js data when flag is off" + ); + } + #[test] fn test_create_html_processor_url_replacement() { let config = create_test_config(); @@ -365,6 +569,15 @@ mod tests { assert_eq!(config.request_host, "proxy.example.com"); assert_eq!(config.request_scheme, "https"); assert!(config.enable_prebid); // Uses default true + assert!( + !config.nextjs_enabled, + "Next.js rewrites should default to disabled" + ); + assert_eq!( + config.nextjs_attributes, + vec!["href".to_string(), "link".to_string(), "url".to_string()], + "Should default to rewriting href/link/url attributes" + ); } #[test] diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index b46ba1c..2c7940d 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -3,7 +3,10 @@ use core::str; use config::{Config, Environment, File, FileFormat}; use error_stack::{Report, ResultExt}; use regex::Regex; -use serde::{de::DeserializeOwned, Deserialize, Deserializer, Serialize}; +use serde::{ + de::{DeserializeOwned, IntoDeserializer}, + Deserialize, Deserializer, Serialize, +}; use serde_json::Value as JsonValue; use std::collections::HashMap; use std::sync::OnceLock; @@ -23,6 +26,9 @@ pub struct Publisher { /// Secret used to encrypt/decrypt proxied URLs in `/first-party/proxy`. /// Keep this secret stable to allow existing links to decode. 
pub proxy_secret: String, + #[serde(default)] + #[validate(nested)] + pub nextjs: NextJs, } impl Publisher { @@ -37,6 +43,7 @@ impl Publisher { /// cookie_domain: ".example.com".to_string(), /// origin_url: "https://origin.example.com:8080".to_string(), /// proxy_secret: "proxy-secret".to_string(), + /// nextjs: Default::default(), /// }; /// assert_eq!(publisher.origin_host(), "origin.example.com:8080"); /// ``` @@ -79,6 +86,42 @@ fn default_auto_configure() -> bool { true } +#[derive(Debug, Deserialize, Serialize, Validate)] +pub struct NextJs { + #[serde(default)] + pub enabled: bool, + #[serde( + default = "default_nextjs_attributes", + deserialize_with = "deserialize_nextjs_attributes" + )] + pub rewrite_attributes: Vec<String>, +} + +fn default_nextjs_attributes() -> Vec<String> { + vec!["href".to_string(), "link".to_string(), "url".to_string()] +} + +impl Default for NextJs { + fn default() -> Self { + Self { + enabled: false, + rewrite_attributes: default_nextjs_attributes(), + } + } +} + +fn deserialize_nextjs_attributes<'de, D>(deserializer: D) -> Result<Vec<String>, D::Error> +where + D: Deserializer<'de>, +{ + let value = Option::<JsonValue>::deserialize(deserializer)?; + match value { + Some(json) => vec_from_seq_or_map(json.into_deserializer()) + .map_err(<D::Error as serde::de::Error>::custom), + None => Ok(default_nextjs_attributes()), + } +} + #[allow(unused)] #[derive(Debug, Default, Deserialize, Serialize, Validate)] pub struct Synthetic { @@ -130,6 +173,7 @@ pub struct Settings { pub publisher: Publisher, #[validate(nested)] pub prebid: Prebid, + #[serde(default)] #[validate(nested)] pub synthetic: Synthetic, #[serde(default, deserialize_with = "vec_from_seq_or_map")] @@ -291,6 +335,15 @@ mod tests { assert!(!settings.publisher.origin_url.is_empty()); assert!(!settings.prebid.server_url.is_empty()); + assert!( + !settings.publisher.nextjs.enabled, + "Next.js URL rewriting should default to disabled" + ); + assert_eq!( + settings.publisher.nextjs.rewrite_attributes, + vec!["href".to_string(), "link".to_string(), 
"url".to_string()], + "Next.js rewrite attributes should default to href/link/url" + ); assert!(!settings.synthetic.counter_store.is_empty()); assert!(!settings.synthetic.opid_store.is_empty()); @@ -310,6 +363,15 @@ mod tests { settings.prebid.server_url, "https://test-prebid.com/openrtb2/auction" ); + assert!( + !settings.publisher.nextjs.enabled, + "Next.js URL rewriting should default to disabled" + ); + assert_eq!( + settings.publisher.nextjs.rewrite_attributes, + vec!["href".to_string(), "link".to_string(), "url".to_string()], + "Next.js rewrite attributes should default to href/link/url" + ); assert_eq!(settings.publisher.domain, "test-publisher.com"); assert_eq!(settings.publisher.cookie_domain, ".test-publisher.com"); assert_eq!( @@ -568,6 +630,7 @@ mod tests { cookie_domain: ".example.com".to_string(), origin_url: "https://origin.example.com:8080".to_string(), proxy_secret: "test-secret".to_string(), + nextjs: NextJs::default(), }; assert_eq!(publisher.origin_host(), "origin.example.com:8080"); @@ -577,6 +640,7 @@ mod tests { cookie_domain: ".example.com".to_string(), origin_url: "https://origin.example.com".to_string(), proxy_secret: "test-secret".to_string(), + nextjs: NextJs::default(), }; assert_eq!(publisher.origin_host(), "origin.example.com"); @@ -586,6 +650,7 @@ mod tests { cookie_domain: ".example.com".to_string(), origin_url: "http://localhost:9090".to_string(), proxy_secret: "test-secret".to_string(), + nextjs: NextJs::default(), }; assert_eq!(publisher.origin_host(), "localhost:9090"); @@ -595,6 +660,7 @@ mod tests { cookie_domain: ".example.com".to_string(), origin_url: "localhost:9090".to_string(), proxy_secret: "test-secret".to_string(), + nextjs: NextJs::default(), }; assert_eq!(publisher.origin_host(), "localhost:9090"); @@ -604,6 +670,7 @@ mod tests { cookie_domain: ".example.com".to_string(), origin_url: "http://192.168.1.1:8080".to_string(), proxy_secret: "test-secret".to_string(), + nextjs: NextJs::default(), }; 
assert_eq!(publisher.origin_host(), "192.168.1.1:8080"); @@ -613,6 +680,7 @@ mod tests { cookie_domain: ".example.com".to_string(), origin_url: "http://[::1]:8080".to_string(), proxy_secret: "test-secret".to_string(), + nextjs: NextJs::default(), }; assert_eq!(publisher.origin_host(), "[::1]:8080"); } diff --git a/trusted-server.toml b/trusted-server.toml index 50ae25a..6ec6eae 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -9,6 +9,10 @@ cookie_domain = ".test-publisher.com" origin_url = "https://origin.test-publisher.com" proxy_secret = "change-me-proxy-secret" +[publisher.nextjs] +enabled = false +rewrite_attributes = ["href", "link", "url"] + [prebid] server_url = "http://68.183.113.79:8000" timeout_ms = 1000