Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ jobs:
fail-fast: false
matrix:
node-version:
- 18
# Explicitly test minimum Node.js versions. Keep in sync with package.json.
- 20.19.0
- 20
- latest
- 22.12.0
- 22
- 24.0.0
- lts/* # currently 24
- latest # currently 25
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ const sniffedEncoding = htmlEncodingSniffer(htmlBytes);

The passed bytes are given as a `Uint8Array`; the Node.js `Buffer` subclass of `Uint8Array` will also work, as shown above.

The returned value will be a canonical [encoding name](https://encoding.spec.whatwg.org/#names-and-labels) (not a label). You might then combine this with the [whatwg-encoding](https://github.com/jsdom/whatwg-encoding) package to decode the result:
The returned value will be an [encoding label](https://encoding.spec.whatwg.org/#names-and-labels), and in particular, the label which is a lowercased version of the encoding's name. You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result:

```js
const whatwgEncoding = require("whatwg-encoding");
const htmlString = whatwgEncoding.decode(htmlBytes, sniffedEncoding);
const { TextDecoder } = require("@exodus/bytes");
const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes);
```

## Options
Expand Down
16 changes: 8 additions & 8 deletions lib/html-encoding-sniffer.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"use strict";
const whatwgEncoding = require("whatwg-encoding");
const { getBOMEncoding, normalizeEncoding: labelToName } = require("@exodus/bytes/encoding-lite.js");

// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
let encoding = whatwgEncoding.getBOMEncoding(uint8Array);
let encoding = getBOMEncoding(uint8Array);

if (encoding === null && transportLayerEncodingLabel !== undefined) {
encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
encoding = labelToName(transportLayerEncodingLabel);
}

if (encoding === null) {
Expand Down Expand Up @@ -69,7 +69,7 @@ function prescanMetaCharset(uint8Array) {
needPragma = true;
}
} else if (attrRes.attr.name === "charset") {
charset = whatwgEncoding.labelToName(attrRes.attr.value);
charset = labelToName(attrRes.attr.value);
needPragma = false;
}
}
Expand All @@ -86,8 +86,8 @@ function prescanMetaCharset(uint8Array) {
continue;
}

if (charset === "UTF-16LE" || charset === "UTF-16BE") {
charset = "UTF-8";
if (charset === "utf-16le" || charset === "utf-16be") {
charset = "utf-8";
}
if (charset === "x-user-defined") {
charset = "windows-1252";
Expand Down Expand Up @@ -271,7 +271,7 @@ function extractCharacterEncodingFromMeta(string) {
const nextIndex = string.indexOf(string[position], position + 1);

if (nextIndex !== -1) {
return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
return labelToName(string.substring(position + 1, nextIndex));
}

// It is an unmatched quotation mark
Expand All @@ -287,7 +287,7 @@ function extractCharacterEncodingFromMeta(string) {
string.length :
position + indexOfASCIIWhitespaceOrSemicolon + 1;

return whatwgEncoding.labelToName(string.substring(position, end));
return labelToName(string.substring(position, end));
}

function isSpaceCharacter(c) {
Expand Down
48 changes: 19 additions & 29 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
"lint": "eslint ."
},
"dependencies": {
"whatwg-encoding": "^3.1.1"
"@exodus/bytes": "^1.0.0"
},
"devDependencies": {
"@domenic/eslint-config": "^3.0.0",
"eslint": "^8.53.0"
},
"engines": {
"node": ">=18"
"node": "^20.19.0 || ^22.12.0 || >=24.0.0"
}
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
20 changes: 10 additions & 10 deletions test/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) {
it(`should sniff as ${desiredEncoding}, given overriding options`, () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1252",
defaultEncoding: "UTF-16LE"
defaultEncoding: "utf-16le"
});

assert.strictEqual(sniffedEncoding, desiredEncoding);
Expand All @@ -47,7 +47,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) {
it("should sniff as the transport layer encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1251",
defaultEncoding: "ISO-8859-16"
defaultEncoding: "iso-8859-16"
});

assert.strictEqual(sniffedEncoding, "windows-1251");
Expand All @@ -56,7 +56,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) {

it(`should sniff as ${desiredEncoding}, given only a default encoding`, () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
defaultEncoding: "ISO-8859-16"
defaultEncoding: "iso-8859-16"
});

assert.strictEqual(sniffedEncoding, desiredEncoding);
Expand All @@ -78,7 +78,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result"))
it("should sniff as the transport layer encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1251",
defaultEncoding: "ISO-8859-16"
defaultEncoding: "iso-8859-16"
});

assert.strictEqual(sniffedEncoding, "windows-1251");
Expand All @@ -87,10 +87,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result"))

it("should sniff as the default encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
defaultEncoding: "ISO-8859-16"
defaultEncoding: "iso-8859-16"
});

assert.strictEqual(sniffedEncoding, "ISO-8859-16");
assert.strictEqual(sniffedEncoding, "iso-8859-16");
});
});
}
Expand All @@ -102,13 +102,13 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) {
it("should sniff as UTF-8, given no options", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer);

assert.strictEqual(sniffedEncoding, "UTF-8");
assert.strictEqual(sniffedEncoding, "utf-8");
});

it("should sniff as the transport layer encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1251",
defaultEncoding: "ISO-8859-16"
defaultEncoding: "iso-8859-16"
});

assert.strictEqual(sniffedEncoding, "windows-1251");
Expand All @@ -117,10 +117,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) {

it("should sniff as UTF-8, given only a default encoding", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
defaultEncoding: "ISO-8859-16"
defaultEncoding: "iso-8859-16"
});

assert.strictEqual(sniffedEncoding, "UTF-8");
assert.strictEqual(sniffedEncoding, "utf-8");
});
});
}