Skip to content

Commit 3340623

Browse files
committed
[scramjet/core] restore charset sniffing logic following the official spec
1 parent b2c24c2 commit 3340623

File tree

3 files changed

+1440
-15
lines changed

3 files changed

+1440
-15
lines changed

packages/scramjet/packages/core/src/fetch/index.ts

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { rewriteCss } from "@rewriters/css";
2121
import { rewriteWorkers } from "@rewriters/worker";
2222
import { ScramjetConfig } from "@/types";
2323
import DomHandler from "domhandler";
24+
import { sniffEncoding } from "@/shared/sniffEncoding";
2425

2526
export interface ScramjetFetchRequest {
2627
rawUrl: URL;
@@ -87,6 +88,19 @@ export class ScramjetFetchHandler extends EventTarget {
8788
}
8889
}
8990

91+
/**
 * Forces the Content-Type of an HTML navigation response to
 * `text/html; charset=utf-8`.
 *
 * The header is only touched when `request.destination` is "document" or
 * "iframe" AND an existing `content-type` header is present whose value
 * starts with the exact (case-sensitive) prefix "text/html". In every other
 * case the function returns without modifying anything.
 *
 * The header value is overwritten wholesale, so any parameters that were on
 * the original value (including its original charset) are discarded.
 *
 * @param request - The fetch request; only its `destination` is read.
 * @param headers - The response headers; mutated in place via `set`.
 */
function normalizeContentType(
92+
request: ScramjetFetchRequest,
93+
headers: ScramjetHeaders
94+
) {
95+
if (request.destination !== "document" && request.destination !== "iframe")
96+
return;
97+
98+
const ct = headers.get("content-type");
99+
if (!ct) return;
100+
if (!ct.startsWith("text/html")) return;
101+
102+
headers.set("content-type", "text/html; charset=utf-8");
103+
}
90104
async function doHandleFetch(
91105
handler: ScramjetFetchHandler,
92106
request: ScramjetFetchRequest
@@ -184,6 +198,11 @@ async function doHandleFetch(
184198

185199
if (response.body && !isRedirect(response)) {
186200
responseBody = await rewriteBody(handler, request, parsed, response);
201+
202+
// After rewriting HTML, the body is a JS string which will be encoded as
203+
// UTF-8 by the Response constructor. Normalize the Content-Type charset so
204+
// the browser doesn't try to decode UTF-8 bytes with the original encoding.
205+
normalizeContentType(request, responseHeaders);
187206
}
188207

189208
// Clean up tracker if not a redirect
@@ -436,6 +455,10 @@ async function handleBlobOrDataUrlFetch(
436455
);
437456
}
438457
const headers = ScramjetHeaders.fromRawHeaders(response.rawHeaders);
458+
459+
// Blob URLs actually *can* carry a charset in their Content-Type, so the header needs normalizing here too when the response goes down the HTML path.
460+
normalizeContentType(request, headers);
461+
439462
if (handler.crossOriginIsolated) {
440463
headers.set("Cross-Origin-Opener-Policy", "same-origin");
441464
headers.set("Cross-Origin-Embedder-Policy", "require-corp");
@@ -663,22 +686,16 @@ async function rewriteBody(
663686
case "iframe":
664687
case "document":
665688
if (response.headers.get("content-type")?.startsWith("text/html")) {
666-
// note from percs: i think this has the potential to be slow asf, but for right now its fine (we should probably look for a better solution)
667-
// another note from percs: regex seems to be broken, gonna comment this out
668-
/*
669-
const buf = await response.arrayBuffer();
670-
const decode = new TextDecoder("utf-8").decode(buf);
671-
const charsetHeader = response.headers.get("content-type");
672-
const charset =
673-
charsetHeader?.split("charset=")[1] ||
674-
decode.match(/charset=([^"]+)/)?.[1] ||
675-
"utf-8";
676-
const htmlContent = charset
677-
? new TextDecoder(charset).decode(buf)
678-
: decode;
679-
*/
689+
const buf = await response.arrayBuffer();
690+
const bytes = new Uint8Array(buf);
691+
const encoding = sniffEncoding(
692+
bytes,
693+
response.headers.get("content-type")
694+
);
695+
const htmlContent = new TextDecoder(encoding).decode(bytes);
696+
680697
return rewriteHtml(
681-
await response.text(),
698+
htmlContent,
682699
handler.context,
683700
parsed.meta,
684701
true,

0 commit comments

Comments
 (0)