Skip to content

Commit 8d8d867

Browse files
committed
src: implement Windows-1252 encoding support and update related tests
1 parent 768f3ba commit 8d8d867

File tree

6 files changed

+152
-7
lines changed

6 files changed

+152
-7
lines changed

lib/internal/encoding.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const kEncoding = Symbol('encoding');
2828
const kDecoder = Symbol('decoder');
2929
const kFatal = Symbol('kFatal');
3030
const kUTF8FastPath = Symbol('kUTF8FastPath');
31-
const kLatin1FastPath = Symbol('kLatin1FastPath');
31+
const kWindows1252FastPath = Symbol('kWindows1252FastPath');
3232
const kIgnoreBOM = Symbol('kIgnoreBOM');
3333

3434
const {
@@ -55,7 +55,7 @@ const {
5555
encodeIntoResults,
5656
encodeUtf8String,
5757
decodeUTF8,
58-
decodeLatin1,
58+
decodeWindows1252,
5959
} = binding;
6060

6161
const { Buffer } = require('buffer');
@@ -420,10 +420,10 @@ function makeTextDecoderICU() {
420420
this[kFatal] = Boolean(options?.fatal);
421421
// Only support fast path for UTF-8.
422422
this[kUTF8FastPath] = enc === 'utf-8';
423-
this[kLatin1FastPath] = enc === 'windows-1252';
423+
this[kWindows1252FastPath] = enc === 'windows-1252';
424424
this[kHandle] = undefined;
425425

426-
if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
426+
if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) {
427427
this.#prepareConverter();
428428
}
429429
}
@@ -440,14 +440,14 @@ function makeTextDecoderICU() {
440440
validateDecoder(this);
441441

442442
this[kUTF8FastPath] &&= !(options?.stream);
443-
this[kLatin1FastPath] &&= !(options?.stream);
443+
this[kWindows1252FastPath] &&= !(options?.stream);
444444

445445
if (this[kUTF8FastPath]) {
446446
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
447447
}
448448

449-
if (this[kLatin1FastPath]) {
450-
return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
449+
if (this[kWindows1252FastPath]) {
450+
return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]);
451451
}
452452

453453
this.#prepareConverter();

src/encoding_binding.cc

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
222222
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
223223
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
224224
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
225+
SetMethodNoSideEffect(
226+
isolate, target, "decodeWindows1252", DecodeWindows1252);
225227
}
226228

227229
void BindingData::CreatePerContextProperties(Local<Object> target,
@@ -240,6 +242,7 @@ void BindingData::RegisterTimerExternalReferences(
240242
registry->Register(ToASCII);
241243
registry->Register(ToUnicode);
242244
registry->Register(DecodeLatin1);
245+
registry->Register(DecodeWindows1252);
243246
}
244247

245248
void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
@@ -288,6 +291,77 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
288291
}
289292
}
290293

294+
void BindingData::DecodeWindows1252(const FunctionCallbackInfo<Value>& args) {
295+
Environment* env = Environment::GetCurrent(args);
296+
297+
CHECK_GE(args.Length(), 1);
298+
if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
299+
args[0]->IsArrayBufferView())) {
300+
return node::THROW_ERR_INVALID_ARG_TYPE(
301+
env->isolate(),
302+
"The \"input\" argument must be an instance of ArrayBuffer, "
303+
"SharedArrayBuffer, or ArrayBufferView.");
304+
}
305+
306+
bool ignore_bom = args[1]->IsTrue();
307+
bool has_fatal = args[2]->IsTrue();
308+
309+
ArrayBufferViewContents<uint8_t> buffer(args[0]);
310+
const uint8_t* data = buffer.data();
311+
size_t length = buffer.length();
312+
313+
if (ignore_bom && length > 0 && data[0] == 0xFF) {
314+
data++;
315+
length--;
316+
}
317+
318+
if (length == 0) {
319+
return args.GetReturnValue().SetEmptyString();
320+
}
321+
322+
// Windows-1252 specific mapping for bytes 128-159
323+
// These differ from Latin-1/ISO-8859-1
324+
static const uint16_t windows1252_mapping[32] = {
325+
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
326+
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
327+
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
328+
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
329+
};
330+
331+
std::string result;
332+
result.reserve(length * 3); // Reserve space for UTF-8 output
333+
334+
for (size_t i = 0; i < length; i++) {
335+
uint8_t byte = data[i];
336+
uint32_t codepoint;
337+
338+
// Check if byte is in the special Windows-1252 range (128-159)
339+
if (byte >= 0x80 && byte <= 0x9F) {
340+
codepoint = windows1252_mapping[byte - 0x80];
341+
} else {
342+
// For all other bytes, Windows-1252 is identical to Latin-1
343+
codepoint = byte;
344+
}
345+
346+
// Convert codepoint to UTF-8
347+
if (codepoint < 0x80) {
348+
result.push_back(static_cast<char>(codepoint));
349+
} else if (codepoint < 0x800) {
350+
result.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
351+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
352+
} else {
353+
result.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
354+
result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
355+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
356+
}
357+
}
358+
359+
Local<Value> ret;
360+
if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) {
361+
args.GetReturnValue().Set(ret);
362+
}
363+
}
364+
291365
} // namespace encoding_binding
292366
} // namespace node
293367

src/encoding_binding.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ class BindingData : public SnapshotableObject {
3232
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
3333
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
3434
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);
35+
static void DecodeWindows1252(
36+
const v8::FunctionCallbackInfo<v8::Value>& args);
3537

3638
static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
3739
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);

test/parallel/test-internal-encoding-binding.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,25 @@ const binding = internalBinding('encoding_binding');
4646
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
4747
assert.strictEqual(binding.decodeLatin1(buf, true, true), 'þÿÁéó');
4848
}
49+
50+
// Windows-1252 specific tests
51+
{
  // Bytes in 0x80-0x9F are where Windows-1252 departs from Latin-1; each
  // pair below is [input byte, expected decoded character].
  const specialCases = [
    [0x80, '€'],
    [0x82, '‚'],
    [0x83, 'ƒ'],
    [0x9F, 'Ÿ'],
  ];

  for (const [byte, expected] of specialCases) {
    const decoded =
      binding.decodeWindows1252(Uint8Array.of(byte), false, false);
    assert.strictEqual(decoded, expected);
  }
}

{
  // Bytes outside 0x80-0x9F decode exactly as they do in Latin-1.
  const bytes = new Uint8Array([0xC1, 0xE9, 0xF3]);
  const decoded = binding.decodeWindows1252(bytes, false, false);
  assert.strictEqual(decoded, 'Áéó');
}

{
  // A zero-length input decodes to the empty string.
  const empty = new Uint8Array(0);
  const decoded = binding.decodeWindows1252(empty, false, false);
  assert.strictEqual(decoded, '');
}

test/parallel/test-util-text-decoder.js

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,49 @@ test('TextDecoder correctly decodes windows-1252 encoded data', { skip: !common.
1515

1616
assert.strictEqual(decodedString, expectedString);
1717
});
18+
19+
// Test for the difference between Latin1 and Windows-1252 in the 128-159
20+
// range
21+
// Ref: https://github.com/nodejs/node/issues/60888
22+
test('TextDecoder correctly decodes windows-1252 special characters in ' +
     '128-159 range', { skip: !common.hasIntl }, () => {
  const decoder = new TextDecoder('windows-1252');

  // Decodes a single byte and returns the code point of the first character.
  const codePointOf = (byte) =>
    decoder.decode(Uint8Array.of(byte)).codePointAt(0);

  // Spot checks for bytes whose Windows-1252 meaning differs from Latin1.
  assert.strictEqual(codePointOf(128), 8364);  // € Euro sign
  assert.strictEqual(codePointOf(130), 8218);  // ‚ Single low-9 quotation mark
  assert.strictEqual(codePointOf(131), 402);   // ƒ Latin small letter f with hook
  assert.strictEqual(codePointOf(159), 376);   // Ÿ Latin capital Y with diaeresis

  // Expected code points for bytes 128 through 159, in byte order, so that
  // index i corresponds to byte 128 + i. Checking the whole range ensures no
  // byte falls back to the plain Latin1 interpretation by mistake.
  const firstByte = 128;
  const expectedCodePoints = [
    8364, 129, 8218, 402, 8222, 8230, 8224, 8225,   // 128-135
    710, 8240, 352, 8249, 338, 141, 381, 143,       // 136-143
    144, 8216, 8217, 8220, 8221, 8226, 8211, 8212,  // 144-151
    732, 8482, 353, 8250, 339, 157, 382, 376,       // 152-159
  ];

  expectedCodePoints.forEach((expectedCodePoint, offset) => {
    const byte = firstByte + offset;
    const actualCodePoint = codePointOf(byte);
    const message = `Byte 0x${byte.toString(16)} should decode to ` +
                    `U+${expectedCodePoint.toString(16)} but got ` +
                    `U+${actualCodePoint.toString(16)}`;
    assert.strictEqual(actualCodePoint, expectedCodePoint, message);
  });
});

typings/internalBinding/encoding_binding.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ export interface EncodingBinding {
55
toASCII(input: string): string;
66
toUnicode(input: string): string;
77
decodeLatin1(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
8+
decodeWindows1252(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
89
}

0 commit comments

Comments
 (0)