@@ -21,6 +21,7 @@ import { rewriteCss } from "@rewriters/css";
2121import { rewriteWorkers } from "@rewriters/worker" ;
2222import { ScramjetConfig } from "@/types" ;
2323import DomHandler from "domhandler" ;
24+ import { sniffEncoding } from "@/shared/sniffEncoding" ;
2425
2526export interface ScramjetFetchRequest {
2627 rawUrl : URL ;
@@ -87,6 +88,19 @@ export class ScramjetFetchHandler extends EventTarget {
8788 }
8889}
8990
/**
 * Rewrites the `content-type` header of an HTML page response so that it
 * declares UTF-8.
 *
 * The rewritten HTML body is handed back as a JS string, which is serialized
 * as UTF-8; leaving the upstream charset in the header would make the browser
 * decode those UTF-8 bytes with the wrong encoding.
 *
 * The header is only touched when both of the following hold:
 *  - `request.destination` is `"document"` or `"iframe"`, and
 *  - the current `content-type` value starts with `"text/html"`.
 * In every other case `headers` is left exactly as it was.
 *
 * @param request - Request being served; only its `destination` is read.
 * @param headers - Response headers, mutated in place when the rule applies.
 */
function normalizeContentType(
	request: ScramjetFetchRequest,
	headers: ScramjetHeaders
): void {
	if (request.destination !== "document" && request.destination !== "iframe")
		return;

	const ct = headers.get("content-type");
	if (!ct) return;
	// Deliberately the same case-sensitive prefix test the HTML rewrite path
	// applies before decoding the body: the charset must only be relabelled for
	// responses whose body was actually re-encoded.
	if (!ct.startsWith("text/html")) return;

	headers.set("content-type", "text/html; charset=utf-8");
}
90104async function doHandleFetch (
91105 handler : ScramjetFetchHandler ,
92106 request : ScramjetFetchRequest
@@ -184,6 +198,11 @@ async function doHandleFetch(
184198
185199 if ( response . body && ! isRedirect ( response ) ) {
186200 responseBody = await rewriteBody ( handler , request , parsed , response ) ;
201+
202+ // After rewriting HTML, the body is a JS string which will be encoded as
203+ // UTF-8 by the Response constructor. Normalize the Content-Type charset so
204+ // the browser doesn't try to decode UTF-8 bytes with the original encoding.
205+ normalizeContentType ( request , responseHeaders ) ;
187206 }
188207
189208 // Clean up tracker if not a redirect
@@ -436,6 +455,10 @@ async function handleBlobOrDataUrlFetch(
436455 ) ;
437456 }
438457 const headers = ScramjetHeaders . fromRawHeaders ( response . rawHeaders ) ;
458+
459+ // blob urls actually *can* set charsets, so we need to normalize them if it goes down the html path
460+ normalizeContentType ( request , headers ) ;
461+
439462 if ( handler . crossOriginIsolated ) {
440463 headers . set ( "Cross-Origin-Opener-Policy" , "same-origin" ) ;
441464 headers . set ( "Cross-Origin-Embedder-Policy" , "require-corp" ) ;
@@ -663,22 +686,16 @@ async function rewriteBody(
663686 case "iframe" :
664687 case "document" :
665688 if ( response . headers . get ( "content-type" ) ?. startsWith ( "text/html" ) ) {
666- // note from percs: i think this has the potential to be slow asf, but for right now its fine (we should probably look for a better solution)
667- // another note from percs: regex seems to be broken, gonna comment this out
668- /*
669- const buf = await response.arrayBuffer();
670- const decode = new TextDecoder("utf-8").decode(buf);
671- const charsetHeader = response.headers.get("content-type");
672- const charset =
673- charsetHeader?.split("charset=")[1] ||
674- decode.match(/charset=([^"]+)/)?.[1] ||
675- "utf-8";
676- const htmlContent = charset
677- ? new TextDecoder(charset).decode(buf)
678- : decode;
679- */
689+ const buf = await response . arrayBuffer ( ) ;
690+ const bytes = new Uint8Array ( buf ) ;
691+ const encoding = sniffEncoding (
692+ bytes ,
693+ response . headers . get ( "content-type" )
694+ ) ;
695+ const htmlContent = new TextDecoder ( encoding ) . decode ( bytes ) ;
696+
680697 return rewriteHtml (
681- await response . text ( ) ,
698+ htmlContent ,
682699 handler . context ,
683700 parsed . meta ,
684701 true ,
0 commit comments