Skip to content

Commit 8e110a7

Browse files
committed
Utf8Tools: add truncateToUtf8ByteLength
1 parent 6c7d5e5 commit 8e110a7

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed

src/utf8-tools/Utf8Tools.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,50 @@ export class Utf8Tools {
208208
// If the while loop was broken early, i is smaller and the array is not valid UTF-8.
209209
return i === bytes.length;
210210
}
211+
212+
/**
 * Shortens a string, or a byte array holding UTF-8 data, so that its UTF-8 encoding fits into a byte budget.
 *
 * If the input already fits, it is returned unchanged. Otherwise the bytes are cut at the budget and then
 * trimmed further, one byte at a time, until `Utf8Tools.isValidUtf8` accepts the remaining prefix, such that
 * no partially cut multi-byte sequence is left at the end. Note that byte input which `isValidUtf8` rejects
 * somewhere in the middle is therefore cut back to the prefix in front of the rejected position.
 *
 * @param input - The string or UTF-8 bytes to shorten. The input is never modified.
 * @param length - Maximum number of bytes of the result. Must not be negative or NaN.
 * @param applyEllipsis - Whether to end a truncated result with an ellipsis ('…', 3 bytes in UTF-8). The
 *     ellipsis counts towards `length`. It is omitted if `length` is smaller than 3 or if nothing had to be
 *     truncated.
 * @returns The result as string (`truncatedString`) and as bytes (`truncatedBytes`), and whether anything was
 *     cut off (`didTruncate`). For byte array input, `truncatedBytes` is the input itself if nothing was
 *     truncated and a `subarray` view sharing the input's memory if truncated without ellipsis; copy it
 *     before modifying it.
 * @throws Error if `length` is negative or NaN.
 */
public static truncateToUtf8ByteLength(input: string | Uint8Array, length: number, applyEllipsis: boolean = true)
    : { truncatedString: string, truncatedBytes: Uint8Array, didTruncate: boolean } {
    // Written as a negated comparison on purpose: this rejects negative values and also NaN, for which every
    // comparison is false. A NaN budget would otherwise pass all checks below and yield a 3 byte ellipsis.
    if (!(length >= 0)) {
        throw new Error('Invalid byte length');
    }

    const inputBytes = typeof input === 'string' ? Utf8Tools.stringToUtf8ByteArray(input) : input;

    if (inputBytes.length <= length) {
        // Already within the budget, nothing to do.
        return {
            truncatedString: typeof input === 'string' ? input : Utf8Tools.utf8ByteArrayToString(input),
            truncatedBytes: inputBytes,
            didTruncate: false,
        };
    }

    const ellipsisBytes = [226, 128, 166]; // UTF-8 encoding of '…' (U+2026)
    // A budget too small to hold the ellipsis gets none.
    const withEllipsis = applyEllipsis && length >= ellipsisBytes.length;

    // Cut at the budget, reserving room for the ellipsis if it will be appended.
    let bytes = inputBytes.subarray(0, length - (withEllipsis ? ellipsisBytes.length : 0));

    // Cut off the last byte until the byte array is valid UTF-8. The length check is purely defensive, it
    // keeps the loop from spinning on an empty array.
    while (bytes.length > 0 && !Utf8Tools.isValidUtf8(bytes)) {
        bytes = bytes.subarray(0, bytes.length - 1);
    }

    if (withEllipsis) {
        // Assemble prefix and ellipsis in a fresh array such that the input's memory is never written to.
        const bytesWithEllipsis = new Uint8Array(bytes.length + ellipsisBytes.length);
        bytesWithEllipsis.set(bytes);
        bytesWithEllipsis.set(ellipsisBytes, bytes.length);
        bytes = bytesWithEllipsis;
    }

    return {
        truncatedString: Utf8Tools.utf8ByteArrayToString(bytes),
        truncatedBytes: bytes,
        didTruncate: true,
    };
}
211257
}

tests/Utf8Tools.spec.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,46 @@ describe('Utf8Tools', () => {
7878

7979
it('can validate utf-8 bytes.', testIsValidUtf8);
8080
});
81+
82+
it('can truncate to utf8 byte lengths', () => {
    // Builds the result expected when truncation leaves `text` behind.
    const truncatedTo = (text: string) => ({
        didTruncate: true,
        truncatedString: text,
        truncatedBytes: Utf8Tools.stringToUtf8ByteArray(text),
    });

    // A negative byte budget is rejected.
    expect(() => Utf8Tools.truncateToUtf8ByteLength(asciiString, -1)).toThrow();

    // A budget of exactly the encoded length or more leaves the input untouched, with or without ellipsis.
    const untouched = {
        didTruncate: false,
        truncatedString: hanziString,
        truncatedBytes: hanziBytes,
    };
    for (const budget of [hanziBytes.length, hanziBytes.length + 1]) {
        for (const applyEllipsis of [true, false]) {
            expect(Utf8Tools.truncateToUtf8ByteLength(hanziString, budget, applyEllipsis)).toEqual(untouched);
        }
    }

    // ASCII input with a budget one byte short loses its last character, for string and byte input alike.
    // NOTE(review): the same result is expected with applyEllipsis set, i.e. no ellipsis gets appended. That
    // presumably relies on the budget being too small for the ellipsis; confirm against the asciiString
    // fixture.
    const asciiCut = truncatedTo(asciiString.substring(0, asciiString.length - 1));
    for (const applyEllipsis of [false, true]) {
        for (const input of [asciiString, asciiBytes]) {
            expect(Utf8Tools.truncateToUtf8ByteLength(input, asciiBytes.length - 1, applyEllipsis))
                .toEqual(asciiCut);
        }
    }

    // Hanzi input with a budget one byte short and no ellipsis: the last character is dropped.
    const hanziCut = truncatedTo(hanziString.substring(0, hanziString.length - 1));
    for (const input of [hanziString, hanziBytes]) {
        expect(Utf8Tools.truncateToUtf8ByteLength(input, hanziBytes.length - 1, false)).toEqual(hanziCut);
    }

    // Same budget with ellipsis: the last two characters are dropped and the ellipsis takes their place.
    const hanziCutWithEllipsis = truncatedTo(hanziString.substring(0, hanziString.length - 2) + '…');
    for (const input of [hanziString, hanziBytes]) {
        expect(Utf8Tools.truncateToUtf8ByteLength(input, hanziBytes.length - 1, true))
            .toEqual(hanziCutWithEllipsis);
    }

    // Astral input with a budget one byte short and no ellipsis: nothing is left.
    const astralCut = truncatedTo('');
    for (const input of [astralString, astralBytes]) {
        expect(Utf8Tools.truncateToUtf8ByteLength(input, astralBytes.length - 1, false)).toEqual(astralCut);
    }

    // Same budget with ellipsis: only the ellipsis is left.
    const astralCutWithEllipsis = truncatedTo('…');
    for (const input of [astralString, astralBytes]) {
        expect(Utf8Tools.truncateToUtf8ByteLength(input, astralBytes.length - 1, true))
            .toEqual(astralCutWithEllipsis);
    }
});
81123
});

0 commit comments

Comments
 (0)