Skip to content

Commit 92dd39b

Browse files
committed
fix: handle UTF-8 boundary issues in streaming responses
- Add UTF8StreamDecoder utility to properly handle multi-byte UTF-8 characters split across chunk boundaries
- Integrate decoder into OpenAI and BaseOpenAiCompatibleProvider to fix garbled output with large files
- Add comprehensive tests for UTF-8 boundary handling
- Fixes issue #8787 where vLLM outputs showed garbled characters with large file outputs
1 parent 8187a8e commit 92dd39b

File tree

4 files changed

+433
-11
lines changed

4 files changed

+433
-11
lines changed

src/api/providers/base-openai-compatible-provider.ts

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import type { ModelInfo } from "@roo-code/types"
66
import type { ApiHandlerOptions } from "../../shared/api"
77
import { ApiStream } from "../transform/stream"
88
import { convertToOpenAiMessages } from "../transform/openai-format"
9+
import { UTF8StreamDecoder } from "../utils/utf8-stream-decoder"
910

1011
import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
1112
import { DEFAULT_HEADERS } from "./constants"
@@ -99,13 +100,20 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
99100
): ApiStream {
100101
const stream = await this.createStream(systemPrompt, messages, metadata)
101102

103+
// Create UTF-8 decoder for handling large outputs properly
104+
const utf8Decoder = new UTF8StreamDecoder()
105+
102106
for await (const chunk of stream) {
103107
const delta = chunk.choices[0]?.delta
104108

105109
if (delta?.content) {
106-
yield {
107-
type: "text",
108-
text: delta.content,
110+
// Decode the content properly to handle UTF-8 boundary issues
111+
const decodedContent = utf8Decoder.decode(delta.content)
112+
if (decodedContent) {
113+
yield {
114+
type: "text",
115+
text: decodedContent,
116+
}
109117
}
110118
}
111119

@@ -117,6 +125,15 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
117125
}
118126
}
119127
}
128+
129+
// Finalize any remaining buffered content
130+
const finalContent = utf8Decoder.finalize()
131+
if (finalContent) {
132+
yield {
133+
type: "text",
134+
text: finalContent,
135+
}
136+
}
120137
}
121138

122139
async completePrompt(prompt: string): Promise<string> {

src/api/providers/openai.ts

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
import type { ApiHandlerOptions } from "../../shared/api"
1414

1515
import { XmlMatcher } from "../../utils/xml-matcher"
16+
import { UTF8StreamDecoder } from "../utils/utf8-stream-decoder"
1617

1718
import { convertToOpenAiMessages } from "../transform/openai-format"
1819
import { convertToR1Format } from "../transform/r1-format"
@@ -188,28 +189,47 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
188189
}) as const,
189190
)
190191

192+
// Create UTF-8 decoder for handling large outputs properly
193+
const utf8Decoder = new UTF8StreamDecoder()
194+
191195
let lastUsage
192196

193197
for await (const chunk of stream) {
194198
const delta = chunk.choices[0]?.delta ?? {}
195199

196200
if (delta.content) {
197-
for (const chunk of matcher.update(delta.content)) {
198-
yield chunk
201+
// Decode the content properly to handle UTF-8 boundary issues
202+
const decodedContent = utf8Decoder.decode(delta.content)
203+
if (decodedContent) {
204+
for (const chunk of matcher.update(decodedContent)) {
205+
yield chunk
206+
}
199207
}
200208
}
201209

202210
if ("reasoning_content" in delta && delta.reasoning_content) {
203-
yield {
204-
type: "reasoning",
205-
text: (delta.reasoning_content as string | undefined) || "",
211+
// Also decode reasoning content properly
212+
const decodedReasoning = utf8Decoder.decode(delta.reasoning_content as string)
213+
if (decodedReasoning) {
214+
yield {
215+
type: "reasoning",
216+
text: decodedReasoning,
217+
}
206218
}
207219
}
208220
if (chunk.usage) {
209221
lastUsage = chunk.usage
210222
}
211223
}
212224

225+
// Finalize any remaining buffered content
226+
const finalContent = utf8Decoder.finalize()
227+
if (finalContent) {
228+
for (const chunk of matcher.update(finalContent)) {
229+
yield chunk
230+
}
231+
}
232+
213233
for (const chunk of matcher.final()) {
214234
yield chunk
215235
}
@@ -386,12 +406,19 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
386406
}
387407

388408
private async *handleStreamResponse(stream: AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>): ApiStream {
409+
// Create UTF-8 decoder for handling large outputs properly
410+
const utf8Decoder = new UTF8StreamDecoder()
411+
389412
for await (const chunk of stream) {
390413
const delta = chunk.choices[0]?.delta
391414
if (delta?.content) {
392-
yield {
393-
type: "text",
394-
text: delta.content,
415+
// Decode the content properly to handle UTF-8 boundary issues
416+
const decodedContent = utf8Decoder.decode(delta.content)
417+
if (decodedContent) {
418+
yield {
419+
type: "text",
420+
text: decodedContent,
421+
}
395422
}
396423
}
397424

@@ -403,6 +430,15 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
403430
}
404431
}
405432
}
433+
434+
// Finalize any remaining buffered content
435+
const finalContent = utf8Decoder.finalize()
436+
if (finalContent) {
437+
yield {
438+
type: "text",
439+
text: finalContent,
440+
}
441+
}
406442
}
407443

408444
private _getUrlHost(baseUrl?: string): string {
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
import { describe, it, expect, beforeEach } from "vitest"
2+
import { UTF8StreamDecoder } from "../utf8-stream-decoder"
3+
4+
describe("UTF8StreamDecoder", () => {
5+
let decoder: UTF8StreamDecoder
6+
7+
beforeEach(() => {
8+
decoder = new UTF8StreamDecoder()
9+
})
10+
11+
describe("decode", () => {
12+
it("should handle complete ASCII strings", () => {
13+
const result = decoder.decode("Hello World")
14+
expect(result).toBe("Hello World")
15+
})
16+
17+
it("should handle complete UTF-8 strings", () => {
18+
const result = decoder.decode("Hello 世界 🌍")
19+
expect(result).toBe("Hello 世界 🌍")
20+
})
21+
22+
it("should handle multi-byte UTF-8 characters split across chunks", () => {
23+
// "世" (U+4E16) in UTF-8 is 0xE4 0xB8 0x96
24+
// Split it across two chunks
25+
const chunk1 = new Uint8Array([0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4]) // "Hello " + first byte of "世"
26+
const chunk2 = new Uint8Array([0xb8, 0x96]) // remaining bytes of "世"
27+
28+
const result1 = decoder.decode(chunk1)
29+
expect(result1).toBe("Hello ") // Should only decode complete characters
30+
31+
const result2 = decoder.decode(chunk2)
32+
expect(result2).toBe("世") // Should complete the character
33+
})
34+
35+
it("should handle 4-byte emoji split across chunks", () => {
36+
// "🌍" (U+1F30D) in UTF-8 is 0xF0 0x9F 0x8C 0x8D
37+
// Split it across multiple chunks
38+
const chunk1 = new Uint8Array([0x48, 0x69, 0x20, 0xf0]) // "Hi " + first byte
39+
const chunk2 = new Uint8Array([0x9f, 0x8c]) // middle bytes
40+
const chunk3 = new Uint8Array([0x8d, 0x21]) // last byte + "!"
41+
42+
const result1 = decoder.decode(chunk1)
43+
expect(result1).toBe("Hi ") // Should only decode complete characters
44+
45+
const result2 = decoder.decode(chunk2)
46+
expect(result2).toBe("") // Still incomplete
47+
48+
const result3 = decoder.decode(chunk3)
49+
expect(result3).toBe("🌍!") // Should complete the emoji and include the exclamation
50+
})
51+
52+
it("should handle string chunks with potential partial sequences", () => {
53+
// Two already-decoded string chunks, split between whole characters (each chunk is a complete string on its own)
54+
const chunk1 = "Hello 世"
55+
const chunk2 = "界 World"
56+
57+
const result1 = decoder.decode(chunk1)
58+
const result2 = decoder.decode(chunk2)
59+
60+
expect(result1 + result2).toBe("Hello 世界 World")
61+
})
62+
63+
it("should handle replacement characters properly", () => {
64+
// Test with actual replacement characters (U+FFFD)
65+
const chunk = "Hello \uFFFD World"
66+
const result = decoder.decode(chunk)
67+
expect(result).toBe("Hello \uFFFD World")
68+
})
69+
70+
it("should handle replacement characters in the middle of text", () => {
71+
// Replacement characters in the middle should be preserved
72+
const chunk = "Hello \uFFFD World"
73+
const result = decoder.decode(chunk)
74+
expect(result).toBe("Hello \uFFFD World")
75+
})
76+
77+
it("should handle multiple replacement characters", () => {
78+
// Multiple replacement characters might indicate encoding issues
79+
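// but we should preserve them as they might be intentional.
// NOTE(review): neither chunk below contains U+FFFD as shown here — confirm the literals were not stripped.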
80+
const chunk1 = "Hello World"
81+
const chunk2 = " Test"
82+
83+
const result1 = decoder.decode(chunk1)
84+
expect(result1).toBe("Hello World")
85+
86+
const result2 = decoder.decode(chunk2)
87+
expect(result2).toBe(" Test")
88+
})
89+
90+
it("should handle empty chunks", () => {
91+
const result = decoder.decode("")
92+
expect(result).toBe("")
93+
})
94+
95+
it("should handle Uint8Array empty chunks", () => {
96+
const result = decoder.decode(new Uint8Array(0))
97+
expect(result).toBe("")
98+
})
99+
})
100+
101+
describe("finalize", () => {
102+
it("should return empty string when no buffered content", () => {
103+
const result = decoder.finalize()
104+
expect(result).toBe("")
105+
})
106+
107+
it("should decode remaining buffered content", () => {
108+
// Send an incomplete sequence
109+
const chunk = new Uint8Array([0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8]) // "Hello " + partial "世"
110+
decoder.decode(chunk)
111+
112+
// Finalize should attempt to decode what's left (may produce replacement character)
113+
const result = decoder.finalize()
114+
expect(result.length).toBeGreaterThan(0) // Should produce something (likely with replacement char)
115+
})
116+
117+
it("should clear buffer after finalize", () => {
118+
const chunk = new Uint8Array([0xe4]) // Partial character
119+
decoder.decode(chunk)
120+
121+
decoder.finalize()
122+
const secondFinalize = decoder.finalize()
123+
expect(secondFinalize).toBe("") // Buffer should be empty
124+
})
125+
})
126+
127+
describe("reset", () => {
128+
it("should clear the buffer", () => {
129+
// Add some partial data
130+
const chunk = new Uint8Array([0xe4, 0xb8]) // Partial character
131+
decoder.decode(chunk)
132+
133+
// Reset
134+
decoder.reset()
135+
136+
// Should start fresh
137+
const result = decoder.decode("Hello")
138+
expect(result).toBe("Hello")
139+
140+
// Finalize should return nothing
141+
const final = decoder.finalize()
142+
expect(final).toBe("")
143+
})
144+
})
145+
146+
describe("large file handling", () => {
147+
it("should handle large text with many UTF-8 characters", () => {
148+
// Simulate a large file with mixed content
149+
const largeText = "初めまして、私は人工知能です。" + "世界は美しい。".repeat(100) + "🌍🌎🌏"
150+
151+
// Split into fixed-size chunks to simulate streaming
152+
const chunkSize = 17 // Chunk size in UTF-16 code units; slice() can only split a character inside a surrogate pair (e.g. the emoji), never inside a BMP character
153+
const chunks: string[] = []
154+
for (let i = 0; i < largeText.length; i += chunkSize) {
155+
chunks.push(largeText.slice(i, i + chunkSize))
156+
}
157+
158+
// Decode all chunks
159+
let result = ""
160+
for (const chunk of chunks) {
161+
result += decoder.decode(chunk)
162+
}
163+
result += decoder.finalize()
164+
165+
// Should reconstruct the original text
166+
expect(result).toBe(largeText)
167+
})
168+
169+
it("should handle simulated vLLM output with potential garbling", () => {
170+
// Simulate what might come from vLLM with large outputs
171+
const chunks = [
172+
"def process_data(items):\n",
173+
' """Process a list of items',
174+
" with special handling for UTF-8",
175+
" characters like 你好", // Chinese characters might be split
176+
'世界"""\n result = []\n',
177+
" for item in items:\n",
178+
" # Handle special chars: €£¥",
179+
"🔧🔨\n",
180+
" result.append(transform(item))\n",
181+
" return result",
182+
]
183+
184+
let decoded = ""
185+
for (const chunk of chunks) {
186+
decoded += decoder.decode(chunk)
187+
}
188+
decoded += decoder.finalize()
189+
190+
// Should contain all the expected content without garbling
191+
expect(decoded).toContain("你好世界")
192+
expect(decoded).toContain("€£¥")
193+
expect(decoded).toContain("🔧🔨")
194+
expect(decoded).not.toContain("\uFFFD") // Should not have replacement characters
195+
})
196+
})
197+
})

0 commit comments

Comments
 (0)