Skip to content

Commit 86a807b

Browse files
committed
Another refactor
1 parent aeea7dc commit 86a807b

File tree

5 files changed

+229
-41
lines changed

5 files changed

+229
-41
lines changed

crates/common/src/integrations/nextjs/html_post_process.rs

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,16 +84,19 @@ fn find_rsc_push_scripts(html: &str) -> Vec<RscPushScriptRange> {
8484
let mut i = payload_start;
8585
let bytes = html.as_bytes();
8686
while i < bytes.len() {
87-
if bytes[i] == b'\\' {
88-
i += 2;
87+
if bytes[i] == b'\\' && i + 1 < bytes.len() {
88+
i += 2; // Skip escape sequence (safe: we checked i+1 exists)
89+
} else if bytes[i] == b'\\' {
90+
// Trailing backslash at end of content - malformed
91+
break;
8992
} else if bytes[i] == quote as u8 {
9093
break;
9194
} else {
9295
i += 1;
9396
}
9497
}
9598

96-
if i >= bytes.len() {
99+
if i >= bytes.len() || bytes[i] != quote as u8 {
97100
search_pos = payload_start;
98101
continue;
99102
}
@@ -330,4 +333,54 @@ mod tests {
330333
"Origin URL should be removed. Got: {rewritten}"
331334
);
332335
}
336+
337+
#[test]
338+
fn handles_trailing_backslash_gracefully() {
339+
// Malformed content with trailing backslash should not panic
340+
let html = r#"<html><body>
341+
<script>self.__next_f.push([1,"content with trailing backslash\"])</script>
342+
<script>self.__next_f.push([1,"valid https://origin.example.com/page"])</script>
343+
</body></html>"#;
344+
345+
let scripts = find_rsc_push_scripts(html);
346+
// The first script is malformed (trailing backslash escapes the quote),
347+
// so it won't be detected as valid. The second one should be found.
348+
assert!(
349+
scripts.len() >= 1,
350+
"Should find at least the valid script. Found: {}",
351+
scripts.len()
352+
);
353+
354+
// Should not panic during processing
355+
let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https");
356+
assert!(
357+
result.contains("test.example.com") || result.contains("origin.example.com"),
358+
"Processing should complete without panic"
359+
);
360+
}
361+
362+
#[test]
363+
fn handles_unterminated_string_gracefully() {
364+
// Content where string never closes - should not hang or panic
365+
let html = r#"<html><body>
366+
<script>self.__next_f.push([1,"content without closing quote
367+
</body></html>"#;
368+
369+
let scripts = find_rsc_push_scripts(html);
370+
assert_eq!(
371+
scripts.len(),
372+
0,
373+
"Should not find scripts with unterminated strings"
374+
);
375+
}
376+
377+
#[test]
378+
fn no_origin_returns_unchanged() {
379+
let html = r#"<html><body>
380+
<script>self.__next_f.push([1,"content without origin URLs"])</script>
381+
</body></html>"#;
382+
383+
let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https");
384+
assert_eq!(result, html, "HTML without origin should be unchanged");
385+
}
333386
}

crates/common/src/integrations/nextjs/rsc.rs

Lines changed: 110 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ static TCHUNK_PATTERN: Lazy<Regex> =
1010
/// Marker used to track script boundaries when combining RSC content.
1111
pub(crate) const RSC_MARKER: &str = "\x00SPLIT\x00";
1212

13+
/// Maximum combined payload size for cross-script processing (10 MB).
14+
/// Payloads exceeding this limit are processed individually without cross-script T-chunk handling.
15+
const MAX_COMBINED_PAYLOAD_SIZE: usize = 10 * 1024 * 1024;
16+
1317
// =============================================================================
1418
// Escape Sequence Parsing
1519
// =============================================================================
@@ -299,6 +303,7 @@ impl RscUrlRewriter {
299303
return Cow::Borrowed(input);
300304
}
301305

306+
// Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.)
302307
let replaced = self
303308
.pattern
304309
.replace_all(input, |caps: &regex::Captures<'_>| {
@@ -310,18 +315,20 @@ impl RscUrlRewriter {
310315
}
311316
});
312317

313-
let still_contains_origin = match &replaced {
314-
Cow::Borrowed(s) => s.contains(&self.origin_host),
315-
Cow::Owned(s) => s.contains(&self.origin_host),
318+
// Phase 2: Handle bare host occurrences not matched by the URL regex
319+
// (e.g., `siteProductionDomain`). Only check if regex made no changes,
320+
// because if it did, we already know origin_host was present.
321+
let text = match &replaced {
322+
Cow::Borrowed(s) => *s,
323+
Cow::Owned(s) => s.as_str(),
316324
};
317325

318-
if !still_contains_origin {
326+
if !text.contains(&self.origin_host) {
319327
return replaced;
320328
}
321329

322-
// Also rewrite bare host occurrences inside RSC payloads (e.g. `siteProductionDomain`).
323-
let owned = replaced.into_owned();
324-
Cow::Owned(owned.replace(&self.origin_host, &self.request_host))
330+
// Bare host replacement needed
331+
Cow::Owned(text.replace(&self.origin_host, &self.request_host))
325332
}
326333

327334
pub(crate) fn rewrite_to_string(&self, input: &str) -> String {
@@ -398,7 +405,26 @@ pub fn rewrite_rsc_scripts_combined(
398405
return vec![rewrite_rsc_tchunks_with_rewriter(payloads[0], &rewriter)];
399406
}
400407

401-
let mut combined = payloads[0].to_string();
408+
// Check total size before allocating combined buffer
409+
let total_size: usize =
410+
payloads.iter().map(|p| p.len()).sum::<usize>() + (payloads.len() - 1) * RSC_MARKER.len();
411+
412+
if total_size > MAX_COMBINED_PAYLOAD_SIZE {
413+
// Fall back to individual processing if combined size is too large.
414+
// This sacrifices cross-script T-chunk correctness for memory safety.
415+
log::warn!(
416+
"RSC combined payload size {} exceeds limit {}, processing individually",
417+
total_size,
418+
MAX_COMBINED_PAYLOAD_SIZE
419+
);
420+
return payloads
421+
.iter()
422+
.map(|p| rewrite_rsc_tchunks_with_rewriter(p, &rewriter))
423+
.collect();
424+
}
425+
426+
let mut combined = String::with_capacity(total_size);
427+
combined.push_str(payloads[0]);
402428
for payload in &payloads[1..] {
403429
combined.push_str(RSC_MARKER);
404430
combined.push_str(payload);
@@ -591,4 +617,80 @@ mod tests {
591617
"Bare host should be rewritten inside RSC payload. Got: {rewritten}"
592618
);
593619
}
620+
621+
#[test]
622+
fn single_payload_bypasses_combining() {
623+
// When there's only one payload, we should process it directly without combining
624+
// Content: {"url":"https://origin.example.com/x"} = 37 bytes = 0x25 hex
625+
let payload = r#"1a:T25,{"url":"https://origin.example.com/x"}"#;
626+
let payloads: Vec<&str> = vec![payload];
627+
628+
let results = rewrite_rsc_scripts_combined(
629+
&payloads,
630+
"origin.example.com",
631+
"test.example.com",
632+
"https",
633+
);
634+
635+
assert_eq!(results.len(), 1);
636+
assert!(
637+
results[0].contains("test.example.com"),
638+
"Single payload should be rewritten. Got: {}",
639+
results[0]
640+
);
641+
// The length should be updated for the rewritten URL
642+
// {"url":"https://test.example.com/x"} = 35 bytes = 0x23 hex
643+
assert!(
644+
results[0].contains(":T23,"),
645+
"T-chunk length should be updated. Got: {}",
646+
results[0]
647+
);
648+
}
649+
650+
#[test]
651+
fn empty_payloads_returns_empty() {
652+
let payloads: Vec<&str> = vec![];
653+
let results = rewrite_rsc_scripts_combined(
654+
&payloads,
655+
"origin.example.com",
656+
"test.example.com",
657+
"https",
658+
);
659+
assert!(results.is_empty());
660+
}
661+
662+
#[test]
663+
fn no_origin_in_payloads_returns_unchanged() {
664+
let payloads: Vec<&str> = vec![r#"1a:T10,{"key":"value"}"#, r#"1b:T10,{"foo":"bar"}"#];
665+
666+
let results = rewrite_rsc_scripts_combined(
667+
&payloads,
668+
"origin.example.com",
669+
"test.example.com",
670+
"https",
671+
);
672+
673+
assert_eq!(results.len(), 2);
674+
// Content should be identical - note that T-chunk lengths may be recalculated
675+
// even if content is unchanged (due to how the algorithm works)
676+
assert!(
677+
!results[0].contains("origin.example.com") && !results[0].contains("test.example.com"),
678+
"No host should be present in payload without URLs"
679+
);
680+
assert!(
681+
!results[1].contains("origin.example.com") && !results[1].contains("test.example.com"),
682+
"No host should be present in payload without URLs"
683+
);
684+
// The content after T-chunk header should be preserved
685+
assert!(
686+
results[0].contains(r#"{"key":"value"}"#),
687+
"Content should be preserved. Got: {}",
688+
results[0]
689+
);
690+
assert!(
691+
results[1].contains(r#"{"foo":"bar"}"#),
692+
"Content should be preserved. Got: {}",
693+
results[1]
694+
);
695+
}
594696
}

crates/common/src/integrations/nextjs/script_rewriter.rs

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,22 @@ impl NextJsScriptRewriter {
3636
content: &str,
3737
ctx: &IntegrationScriptContext<'_>,
3838
) -> ScriptRewriteAction {
39-
if let Some(rewritten) = rewrite_nextjs_values(
40-
content,
39+
if ctx.origin_host.is_empty()
40+
|| ctx.request_host.is_empty()
41+
|| self.config.rewrite_attributes.is_empty()
42+
{
43+
return ScriptRewriteAction::keep();
44+
}
45+
46+
let rewriter = UrlRewriter::new(
4147
ctx.origin_host,
4248
ctx.request_host,
4349
ctx.request_scheme,
4450
&self.config.rewrite_attributes,
45-
false,
46-
) {
51+
false, // preserve_length not used for structured payloads
52+
);
53+
54+
if let Some(rewritten) = rewrite_nextjs_values_with_rewriter(content, &rewriter) {
4755
ScriptRewriteAction::replace(rewritten)
4856
} else {
4957
ScriptRewriteAction::keep()
@@ -151,6 +159,11 @@ impl IntegrationScriptRewriter for NextJsScriptRewriter {
151159
}
152160
}
153161

162+
fn rewrite_nextjs_values_with_rewriter(content: &str, rewriter: &UrlRewriter) -> Option<String> {
163+
rewriter.rewrite_embedded(content)
164+
}
165+
166+
#[cfg(test)]
154167
fn rewrite_nextjs_values(
155168
content: &str,
156169
origin_host: &str,
@@ -171,15 +184,26 @@ fn rewrite_nextjs_values(
171184
preserve_length,
172185
);
173186

174-
rewriter.rewrite_embedded(content)
187+
rewrite_nextjs_values_with_rewriter(content, &rewriter)
175188
}
176189

190+
/// Rewrites URLs in structured Next.js JSON payloads (e.g., `__NEXT_DATA__`).
191+
///
192+
/// This rewriter uses attribute-specific regex patterns to find and replace URLs
193+
/// in JSON content. It handles full URLs, protocol-relative URLs, and bare hostnames.
194+
///
195+
/// The `preserve_length` option adds whitespace padding to maintain byte length,
196+
/// which was an early attempt at RSC compatibility. This is no longer needed for
197+
/// RSC payloads (T-chunk lengths are recalculated instead), but is kept for
198+
/// potential future use cases where length preservation is required.
177199
struct UrlRewriter {
178200
origin_host: String,
179201
request_host: String,
180202
request_scheme: String,
181203
embedded_patterns: Vec<Regex>,
182204
bare_host_patterns: Vec<Regex>,
205+
/// When true, adds whitespace padding to maintain original byte length.
206+
/// Currently unused in production (always false).
183207
preserve_length: bool,
184208
}
185209

crates/common/src/rsc_flight.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@ enum RowState {
2121
///
2222
/// For `T` rows, the length prefix is the UTF-8 byte length of the content bytes. If we rewrite
2323
/// URLs inside the content, we must recompute the length and rewrite the header.
24+
///
25+
/// ## Limitations
26+
///
27+
/// This rewriter performs simple string replacement and does NOT handle JSON escape sequences.
28+
/// URLs like `\/\/origin.example.com` (JSON-escaped slashes) will not be rewritten. This is
29+
/// acceptable because Flight responses from client-side navigation typically contain plain URLs,
30+
/// not doubly-escaped JSON-in-JS content. For inlined `__next_f` data in HTML (which can have
31+
/// escape sequences), the HTML post-processor in `integrations/nextjs/` handles those cases.
2432
pub struct RscFlightUrlRewriter {
2533
origin_url: String,
2634
origin_http_url: Option<String>,

0 commit comments

Comments
 (0)