Skip to content
This repository was archived by the owner on Mar 13, 2025. It is now read-only.

Commit 39cc299

Browse files
committed
Improve conformance of HTMLRewriter to runtime
- Require `ArrayBuffer`/`ArrayBufferView` chunks - Don't start rewriting until transformed `Response` body read - Don't run `end()` handlers on `null` bodied `Response`s
1 parent 7d6c352 commit 39cc299

File tree

2 files changed

+144
-65
lines changed

2 files changed

+144
-65
lines changed

packages/html-rewriter/src/rewriter.ts

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,25 @@
1-
import { ReadableStream } from "stream/web";
2-
import { TextEncoder } from "util";
1+
import { ReadableStream, TransformStream } from "stream/web";
32
import { Response } from "@miniflare/core";
4-
import type { DocumentHandlers, ElementHandlers } from "html-rewriter-wasm";
3+
import type {
4+
HTMLRewriter as BaseHTMLRewriter,
5+
DocumentHandlers,
6+
ElementHandlers,
7+
} from "html-rewriter-wasm";
58
import { Response as BaseResponse } from "undici";
69

7-
// TODO: remove this function and these conversions, Workers only allows byte streams for Responses
8-
// Based on https://developer.mozilla.org/en-US/docs/Web/API/TransformStream#anything-to-uint8array_stream
9-
const encoder = new TextEncoder();
10-
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
11-
function transformToArray(chunk: any): Uint8Array {
10+
function transformToArray(chunk: ArrayBuffer | ArrayBufferView): Uint8Array {
1211
if (chunk instanceof Uint8Array) {
1312
return chunk;
14-
} else if (ArrayBuffer.isView(chunk)) {
15-
return new Uint8Array(chunk.buffer, chunk.byteOffset, chunk.byteLength);
1613
} else if (chunk instanceof ArrayBuffer) {
1714
return new Uint8Array(chunk);
18-
} else if (
19-
Array.isArray(chunk) &&
20-
chunk.every((value) => typeof value === "number")
21-
) {
22-
return new Uint8Array(chunk);
23-
} else if (typeof chunk === "number") {
24-
return new Uint8Array([chunk]);
25-
} else if (chunk === null || chunk === undefined) {
26-
throw new TypeError("Chunk must be defined");
2715
} else {
28-
return encoder.encode(String(chunk));
16+
return new Uint8Array(chunk.buffer, chunk.byteOffset, chunk.byteLength);
2917
}
3018
}
3119

3220
type SelectorElementHandlers = [selector: string, handlers: ElementHandlers];
3321

22+
// noinspection SuspiciousTypeOfGuard
3423
export class HTMLRewriter {
3524
readonly #elementHandlers: SelectorElementHandlers[] = [];
3625
readonly #documentHandlers: DocumentHandlers[] = [];
@@ -46,8 +35,11 @@ export class HTMLRewriter {
4635
}
4736

4837
transform(response: BaseResponse | Response): Response {
49-
const transformedStream = new ReadableStream<Uint8Array>({
50-
type: "bytes",
38+
let rewriter: BaseHTMLRewriter;
39+
const transformStream = new TransformStream<
40+
ArrayBuffer | ArrayBufferView,
41+
Uint8Array
42+
>({
5143
start: async (controller) => {
5244
// Create a rewriter instance for this transformation that writes its
5345
// output to the transformed response's stream. Note that each
@@ -57,7 +49,7 @@ export class HTMLRewriter {
5749
const { HTMLRewriter: BaseHTMLRewriter } = await import(
5850
"html-rewriter-wasm"
5951
);
60-
const rewriter = new BaseHTMLRewriter((output: Uint8Array) => {
52+
rewriter = new BaseHTMLRewriter((output) => {
6153
// enqueue will throw on empty chunks
6254
if (output.length !== 0) controller.enqueue(output);
6355
});
@@ -68,29 +60,52 @@ export class HTMLRewriter {
6860
for (const handlers of this.#documentHandlers) {
6961
rewriter.onDocument(handlers);
7062
}
71-
72-
try {
73-
// Transform the response body (may be null if empty)
74-
if (response.body) {
75-
for await (const chunk of response.body) {
76-
await rewriter.write(transformToArray(chunk));
77-
}
63+
},
64+
transform: async (chunk) => {
65+
if (chunk instanceof ArrayBuffer || ArrayBuffer.isView(chunk)) {
66+
try {
67+
// Make sure we're passing a Uint8Array to Rust glue
68+
return await rewriter.write(transformToArray(chunk));
69+
} catch (e) {
70+
// Make sure the rewriter is always freed (if transformer
71+
// transform() throws an error, transformer flush() won't be called)
72+
rewriter.free();
73+
throw e;
7874
}
79-
await rewriter.end();
80-
} catch (e) {
81-
controller.error(e);
75+
} else if (typeof chunk === "string") {
76+
throw new TypeError(
77+
"This TransformStream is being used as a byte stream, " +
78+
"but received a string on its writable side. " +
79+
"If you wish to write a string, you'll probably want to " +
80+
"explicitly UTF-8-encode it with TextEncoder."
81+
);
82+
} else {
83+
throw new TypeError(
84+
"This TransformStream is being used as a byte stream, " +
85+
"but received an object of non-ArrayBuffer/ArrayBufferView " +
86+
"type on its writable side."
87+
);
88+
}
89+
},
90+
flush: async () => {
91+
try {
92+
// Runs document end handlers
93+
return await rewriter.end();
8294
} finally {
83-
// Make sure the rewriter/controller are always freed/closed
95+
// Make sure the rewriter is always freed, regardless of whether
96+
// rewriter.end() throws
8497
rewriter.free();
85-
controller.close();
8698
}
8799
},
88100
});
89101

90102
// Return a response with the transformed body, copying over headers, etc,
91103
// returning a @miniflare/core Response so we don't need to convert
92-
// BaseResponse to one when dispatching fetch events
93-
const res = new Response(transformedStream, response);
104+
// BaseResponse to one when dispatching fetch events.
105+
const body = response.body as ReadableStream<Uint8Array> | null;
106+
const res = new Response(body?.pipeThrough(transformStream), response);
107+
// If Content-Length is set, it's probably going to be wrong, since we're
108+
// rewriting content, so remove it
94109
res.headers.delete("Content-Length");
95110
return res;
96111
}

packages/html-rewriter/test/rewriter.spec.ts

Lines changed: 92 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import assert from "assert";
22
import { ReadableStream } from "stream/web";
33
import { setTimeout } from "timers/promises";
4-
import { URLSearchParams } from "url";
54
import { TextDecoder, TextEncoder } from "util";
65
import { Response } from "@miniflare/core";
76
import { HTMLRewriter, HTMLRewriterPlugin } from "@miniflare/html-rewriter";
@@ -20,6 +19,8 @@ import {
2019

2120
// TODO (someday): debug why removing .serial breaks some of these async tests
2221

22+
const encoder = new TextEncoder();
23+
2324
// region: ELEMENT HANDLERS
2425

2526
const mutationsMacro: Macro<
@@ -556,7 +557,7 @@ test("HTMLRewriter: handles streaming responses", async (t) => {
556557
"</html>",
557558
];
558559
for (const chunk of chunks) {
559-
controller.enqueue(chunk);
560+
controller.enqueue(encoder.encode(chunk));
560561
await setTimeout(50);
561562
}
562563
controller.close();
@@ -583,43 +584,106 @@ test("HTMLRewriter: handles streaming responses", async (t) => {
583584
t.true(chunks.length >= 2);
584585
t.is(chunks.join(""), '<html lang="en"><body><p>test</p></body></html>');
585586
});
586-
test("HTMLRewriter: handles streaming response with various chunk types", async (t) => {
587-
const encoder = new TextEncoder();
588-
const chunks: [input: any, output: Uint8Array][] = [
589-
[new Uint8Array([1, 2, 3]), new Uint8Array([1, 2, 3])],
590-
[new Int8Array([1, 2, 3]), new Uint8Array([1, 2, 3])],
591-
[encoder.encode("test").buffer, encoder.encode("test")],
592-
[[1, 2, 3], new Uint8Array([1, 2, 3])],
593-
[1, new Uint8Array([1])],
594-
["test", encoder.encode("test")],
595-
[
596-
new URLSearchParams({ a: "1", b: "2", c: "3" }),
597-
encoder.encode("a=1&b=2&c=3"),
598-
],
599-
[{}, encoder.encode("[object Object]")],
600-
];
601-
587+
test("HTMLRewriter: handles ArrayBuffer and ArrayBufferView chunks", async (t) => {
588+
t.plan(2);
602589
const inputStream = new ReadableStream({
603-
async start(controller) {
604-
for (const [input] of chunks) controller.enqueue(input);
590+
start(controller) {
591+
const buffer = encoder.encode("<p>").buffer;
592+
let array = encoder.encode("test");
593+
const view1 = new Uint16Array(
594+
array.buffer,
595+
array.byteOffset,
596+
array.byteLength / Uint16Array.BYTES_PER_ELEMENT
597+
);
598+
array = encoder.encode("</p>");
599+
const view2 = new DataView(
600+
array.buffer,
601+
array.byteOffset,
602+
array.byteLength
603+
);
604+
controller.enqueue(buffer);
605+
controller.enqueue(view1);
606+
controller.enqueue(view2);
607+
controller.close();
608+
},
609+
});
610+
const res = new HTMLRewriter()
611+
.on("p", {
612+
text(text) {
613+
if (text.text) t.is(text.text, "test");
614+
},
615+
})
616+
.transform(new Response(inputStream));
617+
t.is(await res.text(), "<p>test</p>");
618+
});
619+
test("HTMLRewriter: throws on string chunks", async (t) => {
620+
const inputStream = new ReadableStream({
621+
start(controller) {
622+
controller.enqueue("I'm a string");
605623
controller.close();
606624
},
607625
});
608626
const res = new HTMLRewriter().transform(new Response(inputStream));
609-
assert(res.body);
610-
for await (const chunk of res.body) {
611-
const expected = chunks.shift();
612-
t.deepEqual(chunk, expected?.[1]);
613-
}
627+
await t.throwsAsync(res.text(), {
628+
instanceOf: TypeError,
629+
message:
630+
"This TransformStream is being used as a byte stream, " +
631+
"but received a string on its writable side. " +
632+
"If you wish to write a string, you'll probably want to " +
633+
"explicitly UTF-8-encode it with TextEncoder.",
634+
});
635+
});
636+
test("HTMLRewriter: throws on non-ArrayBuffer/ArrayBufferView chunks", async (t) => {
637+
const inputStream = new ReadableStream({
638+
start(controller) {
639+
controller.enqueue(42);
640+
controller.close();
641+
},
642+
});
643+
const res = new HTMLRewriter().transform(new Response(inputStream));
644+
await t.throwsAsync(res.text(), {
645+
instanceOf: TypeError,
646+
message:
647+
"This TransformStream is being used as a byte stream, " +
648+
"but received an object of non-ArrayBuffer/ArrayBufferView " +
649+
"type on its writable side.",
650+
});
614651
});
615652
test("HTMLRewriter: handles empty response", async (t) => {
616653
// Shouldn't call BaseHTMLRewriter.write, just BaseHTMLRewriter.end
617-
const res = new HTMLRewriter().transform(new Response());
654+
const res = new HTMLRewriter()
655+
.onDocument({
656+
end(end) {
657+
end.append("end");
658+
},
659+
})
660+
.transform(new Response());
661+
// Workers don't run the end() handler on null responses
618662
t.is(await res.text(), "");
619663
});
620664
test("HTMLRewriter: handles empty string response", async (t) => {
621-
const res = new HTMLRewriter().transform(new Response(""));
622-
t.is(await res.text(), "");
665+
const res = new HTMLRewriter()
666+
.onDocument({
667+
end(end) {
668+
end.append("end");
669+
},
670+
})
671+
.transform(new Response(""));
672+
t.is(await res.text(), "end");
673+
});
674+
test("HTNLRewriter: doesn't transform response until needed", async (t) => {
675+
const chunks: string[] = [];
676+
const res = new HTMLRewriter()
677+
.on("p", {
678+
text(text) {
679+
if (text.text) chunks.push(text.text);
680+
},
681+
})
682+
.transform(new Response("<p>1</p><p>2</p><p>3</p>"));
683+
await setTimeout(50);
684+
t.deepEqual(chunks, []);
685+
await res.arrayBuffer();
686+
t.deepEqual(chunks, ["1", "2", "3"]);
623687
});
624688
test("HTMLRewriter: copies response status and headers", async (t) => {
625689
const res = new HTMLRewriter().transform(

0 commit comments

Comments
 (0)