Skip to content
This repository was archived by the owner on Mar 13, 2025. It is now read-only.

Commit 983fefe

Browse files
committed
Switch to html-rewriter-wasm package for HTMLRewriter
1 parent cdd6eab commit 983fefe

File tree

14 files changed

+82
-1355
lines changed

14 files changed

+82
-1355
lines changed

package-lock.json

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
"dotenv": "^8.2.0",
4646
"env-paths": "^2.2.1",
4747
"formdata-node": "^2.4.0",
48+
"html-rewriter-wasm": "^0.3.0",
4849
"http-cache-semantics": "^4.1.0",
4950
"ioredis": "^4.27.6",
5051
"micromatch": "^4.0.4",

src/modules/index.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
export { DurableObject } from "./do";
22
export { FetchEvent, ScheduledEvent, ResponseWaitUntil } from "./events";
3-
export { HTMLRewriter, UnsafeHTMLRewriter } from "./rewriter";
3+
export {
4+
HTMLRewriter,
5+
Element,
6+
Comment,
7+
TextChunk,
8+
Doctype,
9+
DocumentEnd,
10+
ContentTypeOptions,
11+
} from "./rewriter";
412
export {
513
URL,
614
URLSearchParams,

src/modules/rewriter.ts

Lines changed: 39 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,19 @@
1-
/// <reference types="@cloudflare/workers-types" />
21
import { TextEncoder } from "util";
32
import { Response } from "@mrbbot/node-fetch";
4-
import { ReadableStream } from "web-streams-polyfill/ponyfill/es6";
5-
// This import relies on dist having the same structure as src
63
import {
7-
HTMLRewriter as LOLHTMLRewriter,
8-
registerPromise,
9-
} from "../../vendor/lol-html";
10-
import { Mutex } from "../kv/helpers";
11-
import { ProcessedOptions } from "../options";
4+
HTMLRewriter as BaseHTMLRewriter,
5+
Comment,
6+
ContentTypeOptions,
7+
Doctype,
8+
DocumentEnd,
9+
DocumentHandlers,
10+
Element,
11+
ElementHandlers,
12+
TextChunk,
13+
} from "html-rewriter-wasm";
14+
import { ReadableStream } from "web-streams-polyfill/ponyfill/es6";
1215
import { Context, Module } from "./module";
1316

14-
function wrapHandler<T>(handler?: (arg: T) => void | Promise<void>) {
15-
if (handler === undefined) return undefined;
16-
return function (arg: T) {
17-
const result = handler(arg);
18-
// If this handler is async and returns a promise, register it and return
19-
// its handle so it can be awaited later in WebAssembly
20-
if (typeof result === "object" && typeof result.then === "function") {
21-
return registerPromise(result);
22-
}
23-
// Otherwise, return 0 to signal there's nothing to await
24-
return 0;
25-
};
26-
}
27-
2817
// Based on https://developer.mozilla.org/en-US/docs/Web/API/TransformStream#anything-to-uint8array_stream
2918
const encoder = new TextEncoder();
3019
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
@@ -49,54 +38,28 @@ export function transformToArray(chunk: any): Uint8Array {
4938
}
5039
}
5140

52-
/* The WebAssembly version of lol-html used by Miniflare uses asyncify for
53-
* async handlers. When a handler returns a promise, the WebAssembly stack is
54-
* stored in temporary storage, the promise is awaited, then the stack is
55-
* restored and WebAssembly execution continues where it left off. This
56-
* temporary storage is currently per module instance and we only have a single
57-
* instance because of how wasm-pack generates package code for NodeJS.
58-
* TODO: ideally, we would allocate each transform call its own temporary
59-
* space for the saved stack.
60-
*
61-
* This means if you have multiple concurrent transforms in progress, the saved
62-
* stacks will be overwritten and lol-html will be unhappy. Therefore, to be
63-
* "safe", we need to make sure only one transform operation is in-progress at
64-
* any time, hence the mutex.
65-
*
66-
* However, this problem only occurs when using async handlers with concurrent
67-
* transforms. If just using sync handlers, or not doing multiple rewrites
68-
* concurrently (very likely), there's no need for the mutex, so we can use the
69-
* "unsafe" version. The "safe" version is the default just so people don't see
70-
* confusing errors. See the docs for concrete examples of where the "unsafe"
71-
* version can be used.
72-
*/
73-
74-
const wasmModuleMutex = new Mutex();
75-
// Symbol gives us "protected" method only accessible/overridable by subclass
76-
const runCriticalSectionSymbol = Symbol("HTMLRewriter runCriticalSection");
77-
78-
export class UnsafeHTMLRewriter {
41+
export class HTMLRewriter {
7942
#elementHandlers: [selector: string, handlers: any][] = [];
8043
#documentHandlers: any[] = [];
8144

82-
on(selector: string, handlers: Partial<ElementHandler>): this {
45+
on(selector: string, handlers: ElementHandlers): this {
8346
// Ensure handlers register returned promises, and `this` is bound correctly
8447
const wrappedHandlers = {
85-
element: wrapHandler(handlers.element?.bind(handlers)),
86-
comments: wrapHandler(handlers.comments?.bind(handlers)),
87-
text: wrapHandler(handlers.text?.bind(handlers)),
48+
element: handlers.element?.bind(handlers),
49+
comments: handlers.comments?.bind(handlers),
50+
text: handlers.text?.bind(handlers),
8851
};
8952
this.#elementHandlers.push([selector, wrappedHandlers]);
9053
return this;
9154
}
9255

93-
onDocument(handlers: Partial<DocumentHandler>): this {
56+
onDocument(handlers: DocumentHandlers): this {
9457
// Ensure handlers register returned promises, and `this` is bound correctly
9558
const wrappedHandlers = {
96-
doctype: wrapHandler(handlers.doctype?.bind(handlers)),
97-
comments: wrapHandler(handlers.comments?.bind(handlers)),
98-
text: wrapHandler(handlers.text?.bind(handlers)),
99-
end: wrapHandler(handlers.end?.bind(handlers)),
59+
doctype: handlers.doctype?.bind(handlers),
60+
comments: handlers.comments?.bind(handlers),
61+
text: handlers.text?.bind(handlers),
62+
end: handlers.end?.bind(handlers),
10063
};
10164
this.#documentHandlers.push(wrappedHandlers);
10265
return this;
@@ -108,7 +71,7 @@ export class UnsafeHTMLRewriter {
10871
start: async (controller) => {
10972
// Create a rewriter instance for this transformation that writes its
11073
// output to the transformed response's stream
111-
const rewriter = new LOLHTMLRewriter((output: Uint8Array) => {
74+
const rewriter = new BaseHTMLRewriter((output: Uint8Array) => {
11275
if (output.length === 0) {
11376
// Free the rewriter once it's finished doing its thing
11477
queueMicrotask(() => rewriter.free());
@@ -125,41 +88,32 @@ export class UnsafeHTMLRewriter {
12588
rewriter.onDocument(handlers);
12689
}
12790

128-
await this[runCriticalSectionSymbol](async () => {
129-
// Transform the response body (may be null if empty)
130-
if (response.body) {
131-
for await (const chunk of response.body) {
132-
await rewriter.write(transformToArray(chunk));
133-
}
91+
// Transform the response body (may be null if empty)
92+
if (response.body) {
93+
for await (const chunk of response.body) {
94+
await rewriter.write(transformToArray(chunk));
13495
}
135-
await rewriter.end();
136-
});
96+
}
97+
await rewriter.end();
13798
},
13899
});
139100

140101
// Return a response with the transformed body, copying over headers, etc
141102
return new Response(transformedStream, response);
142103
}
143-
144-
[runCriticalSectionSymbol](closure: () => Promise<void>): Promise<void> {
145-
return closure();
146-
}
147-
}
148-
149-
// See big comment above for what this does and why it's needed. It's possible
150-
// we'll remove this distinction in the future.
151-
export class HTMLRewriter extends UnsafeHTMLRewriter {
152-
[runCriticalSectionSymbol](closure: () => Promise<void>): Promise<void> {
153-
return wasmModuleMutex.run(closure);
154-
}
155104
}
156105

157106
export class HTMLRewriterModule extends Module {
158-
buildSandbox(options: ProcessedOptions): Context {
159-
return {
160-
HTMLRewriter: options.htmlRewriterUnsafe
161-
? UnsafeHTMLRewriter
162-
: HTMLRewriter,
163-
};
107+
buildSandbox(): Context {
108+
return { HTMLRewriter };
164109
}
165110
}
111+
112+
export {
113+
Element,
114+
Comment,
115+
TextChunk,
116+
Doctype,
117+
DocumentEnd,
118+
ContentTypeOptions,
119+
};

src/options/index.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ export interface Options {
9494
envPath?: string;
9595
bindings?: Record<string, any>;
9696
wasmBindings?: Record<string, string>;
97-
98-
htmlRewriterUnsafe?: boolean;
9997
}
10098

10199
export interface ProcessedOptions extends Options {
@@ -158,7 +156,6 @@ export function logOptions(log: Log, options: ProcessedOptions): void {
158156
: options.https === true
159157
? "Self-Signed"
160158
: `Self-Signed: ${options.https}`,
161-
"Unsafe HTMLRewriter": options.htmlRewriterUnsafe,
162159
};
163160
log.debug("Options:");
164161
for (const [key, value] of Object.entries(entries)) {

test/modules/rewriter.spec.ts

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,24 @@ import { URLSearchParams } from "url";
22
import { TextDecoder, TextEncoder } from "util";
33
import test, { Macro, ThrowsExpectation } from "ava";
44
import { ReadableStream } from "web-streams-polyfill/ponyfill/es6";
5-
// Import UnsafeHTMLRewriter as HTMLRewriter since we're using it much more
6-
// (we want to be able to run tests in parallel) and typing "Unsafe" each time
7-
// is annoying.
85
import {
9-
UnsafeHTMLRewriter as HTMLRewriter,
6+
Comment,
7+
Doctype,
8+
DocumentEnd,
9+
Element,
10+
HTMLRewriter,
1011
NoOpLog,
1112
Response,
12-
HTMLRewriter as SafeHTMLRewriter,
13+
TextChunk,
1314
} from "../../src";
1415
import {
1516
HTMLRewriterModule,
1617
transformToArray,
1718
} from "../../src/modules/rewriter";
1819
import { getObjectProperties, wait } from "../helpers";
1920

21+
// TODO: remove most of these tests, they're now in html-rewriter-wasm
22+
2023
// region: Uint8ArrayTransformStream
2124
const encoder = new TextEncoder();
2225
test("transformToArray: passes through Uint8Array", (t) => {
@@ -63,7 +66,7 @@ const mutationsMacro: Macro<
6366
[
6467
(
6568
rw: HTMLRewriter,
66-
handler: (token: Element | Text | Comment) => void
69+
handler: (token: Element | TextChunk | Comment) => void
6770
) => HTMLRewriter,
6871
string,
6972
{
@@ -338,7 +341,7 @@ const textMutationsExpected = {
338341
};
339342

340343
const textPropertiesMacro: Macro<
341-
[(rw: HTMLRewriter, text: (text: Text) => void) => HTMLRewriter]
344+
[(rw: HTMLRewriter, text: (text: TextChunk) => void) => HTMLRewriter]
342345
> = async (t, func) => {
343346
t.plan(6);
344347
const res = func(new HTMLRewriter(), (text) => {
@@ -364,7 +367,7 @@ test(
364367
textMutationsExpected
365368
);
366369
const textAsyncHandlerMacro: Macro<
367-
[(rw: HTMLRewriter, text: (t: Text) => Promise<void>) => HTMLRewriter]
370+
[(rw: HTMLRewriter, text: (t: TextChunk) => Promise<void>) => HTMLRewriter]
368371
> = async (t, func) => {
369372
const res = func(new HTMLRewriter(), async (text) => {
370373
if (text.text === "t") {
@@ -380,11 +383,16 @@ test.serial(
380383
(rw, text) => rw.on("p", { text })
381384
);
382385
const textClassHandlerMacro: Macro<
383-
[(rw: HTMLRewriter, handler: { text: (text: Text) => void }) => HTMLRewriter]
386+
[
387+
(
388+
rw: HTMLRewriter,
389+
handler: { text: (text: TextChunk) => void }
390+
) => HTMLRewriter
391+
]
384392
> = async (t, func) => {
385393
class Handler {
386394
constructor(private content: string) {}
387-
text(text: Text) {
395+
text(text: TextChunk) {
388396
if (text.text === "t") text.after(this.content);
389397
}
390398
}
@@ -615,7 +623,7 @@ test("HTMLRewriter: handles streaming responses", async (t) => {
615623
t.is(chunks.join(""), '<html lang="en"><body><p>test</p></body></html>');
616624
});
617625
test("HTMLRewriter: handles empty response", async (t) => {
618-
// Shouldn't call LOLHTMLRewriter.write, just LOLHTMLRewriter.end
626+
// Shouldn't call BaseHTMLRewriter.write, just BaseHTMLRewriter.end
619627
const res = new HTMLRewriter().transform(new Response());
620628
t.is(await res.text(), "");
621629
});
@@ -663,7 +671,7 @@ test.serial(
663671
// Note this test requires the "safe" HTMLRewriter, see comments in
664672
// src/modules/rewriter.ts for more details
665673
const rewriter = (i: number) =>
666-
new SafeHTMLRewriter()
674+
new HTMLRewriter()
667675
.on("p", {
668676
async element(element) {
669677
await wait(50);
@@ -858,17 +866,9 @@ test("HTMLRewriter: throws error on unsupported selector", async (t) => {
858866

859867
// region: MODULE
860868

861-
test("HTMLRewriter: defaults to safe implementation", async (t) => {
862-
const module = new HTMLRewriterModule(new NoOpLog());
863-
const { HTMLRewriter: SandboxHTMLRewriter } = module.buildSandbox({});
864-
t.is(SandboxHTMLRewriter, SafeHTMLRewriter);
865-
});
866-
test("HTMLRewriter: allows unsafe implementation", async (t) => {
869+
test("HTMLRewriter: included in sandbox", async (t) => {
867870
const module = new HTMLRewriterModule(new NoOpLog());
868-
const { HTMLRewriter: SandboxHTMLRewriter } = module.buildSandbox({
869-
htmlRewriterUnsafe: true,
870-
});
871-
// UnsafeHTMLRewriter has been imported as HTMLRewriter
871+
const { HTMLRewriter: SandboxHTMLRewriter } = module.buildSandbox();
872872
t.is(SandboxHTMLRewriter, HTMLRewriter);
873873
});
874874

test/options/index.spec.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ test("logOptions: logs all options", (t) => {
6363
durableObjectsPersist: true,
6464
bindings: { KEY: "value" },
6565
https: true,
66-
htmlRewriterUnsafe: true,
6766
});
6867

6968
t.deepEqual(log.debugs, [
@@ -84,7 +83,6 @@ test("logOptions: logs all options", (t) => {
8483
"- Durable Objects Persistence: true",
8584
"- Bindings: KEY",
8685
"- HTTPS: Self-Signed",
87-
"- Unsafe HTMLRewriter: true",
8886
]);
8987
});
9088

0 commit comments

Comments
 (0)