Skip to content

Commit 14de126

Browse files
committed
improve utf8 split handling
1 parent 4bda9fc commit 14de126

File tree

2 files changed

+14
-8
lines changed

2 files changed

+14
-8
lines changed

livekit-rtc/livekit/rtc/_utils.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from collections import deque
1818
import ctypes
1919
import random
20-
from typing import Callable, Generic, List, TypeVar
20+
from typing import Callable, Generator, Generic, List, TypeVar
2121

2222
logger = logging.getLogger("livekit")
2323

@@ -133,12 +133,13 @@ def generate_random_base62(length=12):
133133

134134

135135
# adapted from https://stackoverflow.com/a/6043797
136-
def split_utf8(s: str, n: int):
136+
def split_utf8(s: str, n: int) -> Generator[bytes, None, None]:
137137
"""Split UTF-8 s into chunks of maximum length n."""
138-
while len(s) > n:
138+
encoded = s.encode()
139+
while len(encoded) > n:
139140
k = n
140-
while (ord(s[k]) & 0xC0) == 0x80:
141+
while (encoded[k] & 0xC0) == 0x80:
141142
k -= 1
142-
yield s[:k]
143-
s = s[k:]
144-
yield s
143+
yield encoded[:k]
144+
encoded = encoded[k:]
145+
yield encoded

livekit-rtc/livekit/rtc/data_stream.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from __future__ import annotations
1616

1717
import asyncio
18+
import logging
1819
import uuid
1920
import datetime
2021
from collections.abc import Callable
@@ -27,11 +28,14 @@
2728
from ._utils import split_utf8
2829
from typing import TYPE_CHECKING
2930

31+
3032
if TYPE_CHECKING:
3133
from .participant import LocalParticipant
3234

3335
STREAM_CHUNK_SIZE = 15_000
3436

37+
logger = logging.getLogger("livekit")
38+
3539

3640
@dataclass
3741
class BaseStreamInfo:
@@ -66,6 +70,7 @@ def __init__(
6670
self._queue: asyncio.Queue[proto_DataStream.Chunk | None] = asyncio.Queue()
6771

6872
async def _on_chunk_update(self, chunk: proto_DataStream.Chunk):
73+
logger.info("Received text chunk: %s", chunk.content)
6974
await self._queue.put(chunk)
7075

7176
async def _on_stream_close(self, trailer: proto_DataStream.Trailer):
@@ -280,7 +285,7 @@ def __init__(
280285
async def write(self, text: str):
281286
async with self._write_lock:
282287
for chunk in split_utf8(text, STREAM_CHUNK_SIZE):
283-
content = chunk.encode()
288+
content = chunk
284289
chunk_index = self._next_chunk_index
285290
self._next_chunk_index += 1
286291
chunk_msg = proto_DataStream.Chunk(

0 commit comments

Comments
 (0)