Skip to content

Commit eb66232

Browse files
authored
Improve utf8 text split and add unit test (#1414)
1 parent 3b4362e commit eb66232

File tree

4 files changed

+71
-11
lines changed

4 files changed

+71
-11
lines changed

.changeset/grumpy-zebras-itch.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"livekit-client": patch
3+
---
4+
5+
Improve utf8 text split and add unit test

src/room/participant/LocalParticipant.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,10 +1608,10 @@ export default class LocalParticipant extends Participant {
16081608
const writableStream = new WritableStream<string>({
16091609
// Implement the sink
16101610
async write(text) {
1611-
for (const textChunk of splitUtf8(text, STREAM_CHUNK_SIZE)) {
1611+
for (const textByteChunk of splitUtf8(text, STREAM_CHUNK_SIZE)) {
16121612
await localP.engine.waitForBufferStatusLow(DataPacket_Kind.RELIABLE);
16131613
const chunk = new DataStream_Chunk({
1614-
content: new TextEncoder().encode(textChunk),
1614+
content: textByteChunk,
16151615
streamId,
16161616
chunkIndex: numberToBigInt(chunkId),
16171617
});

src/room/utils.test.ts

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, expect, it } from 'vitest';
2-
import { toWebsocketUrl } from './utils';
2+
import { splitUtf8, toWebsocketUrl } from './utils';
33

44
describe('toWebsocketUrl', () => {
55
it('leaves wss urls alone', () => {
@@ -14,3 +14,49 @@ describe('toWebsocketUrl', () => {
1414
expect(toWebsocketUrl('https://httpsmywebsite.com')).toEqual('wss://httpsmywebsite.com');
1515
});
1616
});
17+
18+
describe('splitUtf8', () => {
19+
it('splits a string into chunks of the given size', () => {
20+
expect(splitUtf8('hello world', 5)).toEqual([
21+
new TextEncoder().encode('hello'),
22+
new TextEncoder().encode(' worl'),
23+
new TextEncoder().encode('d'),
24+
]);
25+
});
26+
27+
it('splits a string with special characters into chunks of the given size', () => {
28+
expect(splitUtf8('héllo wörld', 5)).toEqual([
29+
new TextEncoder().encode('héll'),
30+
new TextEncoder().encode('o wö'),
31+
new TextEncoder().encode('rld'),
32+
]);
33+
});
34+
35+
it('splits a string with multi-byte utf8 characters correctly', () => {
36+
expect(splitUtf8('こんにちは世界', 5)).toEqual([
37+
new TextEncoder().encode('こ'),
38+
new TextEncoder().encode('ん'),
39+
new TextEncoder().encode('に'),
40+
new TextEncoder().encode('ち'),
41+
new TextEncoder().encode('は'),
42+
new TextEncoder().encode('世'),
43+
new TextEncoder().encode('界'),
44+
]);
45+
});
46+
47+
it('handles a string with a single multi-byte utf8 character', () => {
48+
expect(splitUtf8('😊', 5)).toEqual([new TextEncoder().encode('😊')]);
49+
});
50+
51+
it('handles a string with mixed single and multi-byte utf8 characters', () => {
52+
expect(splitUtf8('a😊b', 4)).toEqual([
53+
new TextEncoder().encode('a'),
54+
new TextEncoder().encode('😊'),
55+
new TextEncoder().encode('b'),
56+
]);
57+
});
58+
59+
it('handles an empty string', () => {
60+
expect(splitUtf8('', 5)).toEqual([]);
61+
});
62+
});

src/room/utils.ts

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -626,18 +626,27 @@ export function isRemoteParticipant(p: Participant): p is RemoteParticipant {
626626
return !p.isLocal;
627627
}
628628

629-
export function splitUtf8(s: string, n: number): string[] {
629+
export function splitUtf8(s: string, n: number): Uint8Array[] {
630+
if (n < 4) {
631+
throw new Error('n must be at least 4 due to utf8 encoding rules');
632+
}
630633
// adapted from https://stackoverflow.com/a/6043797
631-
const result: string[] = [];
632-
while (s.length > n) {
634+
const result: Uint8Array[] = [];
635+
let encoded = new TextEncoder().encode(s);
636+
while (encoded.length > n) {
633637
let k = n;
634-
// Move back to find the start of a UTF-8 character
635-
while ((s.charCodeAt(k) & 0xc0) === 0x80) {
638+
while (k > 0) {
639+
const byte = encoded[k];
640+
if (byte !== undefined && (byte & 0xc0) !== 0x80) {
641+
break;
642+
}
636643
k--;
637644
}
638-
result.push(s.slice(0, k));
639-
s = s.slice(k);
645+
result.push(encoded.slice(0, k));
646+
encoded = encoded.slice(k);
647+
}
648+
if (encoded.length > 0) {
649+
result.push(encoded);
640650
}
641-
result.push(s);
642651
return result;
643652
}

0 commit comments

Comments
 (0)