|
| 1 | +import { describe, it, expect, beforeEach } from "vitest" |
| 2 | +import { UTF8StreamDecoder } from "../utf8-stream-decoder" |
| 3 | + |
/**
 * Unit tests for UTF8StreamDecoder.
 *
 * The suite drives the decoder through `decode` (with both string and
 * Uint8Array chunks), `finalize` and `reset`. The expectations encode the
 * contract under test: an incomplete multi-byte UTF-8 sequence at the end of
 * a chunk is held back and only emitted once the bytes that complete it
 * arrive, or when `finalize` flushes whatever is left.
 */
describe("UTF8StreamDecoder", () => {
  let decoder: UTF8StreamDecoder

  // A fresh decoder per test so buffered bytes never leak between cases.
  beforeEach(() => {
    decoder = new UTF8StreamDecoder()
  })

  describe("decode", () => {
    it("should handle complete ASCII strings", () => {
      const result = decoder.decode("Hello World")
      expect(result).toBe("Hello World")
    })

    it("should handle complete UTF-8 strings", () => {
      const result = decoder.decode("Hello 世界 🌍")
      expect(result).toBe("Hello 世界 🌍")
    })

    it("should handle multi-byte UTF-8 characters split across chunks", () => {
      // "世" (U+4E16) in UTF-8 is 0xE4 0xB8 0x96; cut it after the lead byte.
      const chunk1 = new Uint8Array([0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4]) // "Hello " + first byte of "世"
      const chunk2 = new Uint8Array([0xb8, 0x96]) // remaining bytes of "世"

      const result1 = decoder.decode(chunk1)
      expect(result1).toBe("Hello ") // Only the complete characters are emitted

      const result2 = decoder.decode(chunk2)
      expect(result2).toBe("世") // The held-back byte is completed here
    })

    it("should handle 4-byte emoji split across chunks", () => {
      // "🌍" (U+1F30D) in UTF-8 is 0xF0 0x9F 0x8C 0x8D; spread it over three chunks.
      const chunk1 = new Uint8Array([0x48, 0x69, 0x20, 0xf0]) // "Hi " + first byte
      const chunk2 = new Uint8Array([0x9f, 0x8c]) // middle bytes
      const chunk3 = new Uint8Array([0x8d, 0x21]) // last byte + "!"

      const result1 = decoder.decode(chunk1)
      expect(result1).toBe("Hi ") // Only the complete characters are emitted

      const result2 = decoder.decode(chunk2)
      expect(result2).toBe("") // Still incomplete, nothing to emit yet

      const result3 = decoder.decode(chunk3)
      expect(result3).toBe("🌍!") // Emoji completes and the trailing "!" follows
    })

    it("should handle string chunks with potential partial sequences", () => {
      // Two string chunks whose boundary falls between multi-byte characters.
      // Each JS string already holds whole characters, so only the
      // concatenated output is asserted, not the per-chunk split.
      const chunk1 = "Hello 世"
      const chunk2 = "界 World"

      const result1 = decoder.decode(chunk1)
      const result2 = decoder.decode(chunk2)

      expect(result1 + result2).toBe("Hello 世界 World")
    })

    it("should handle replacement characters in the middle of text", () => {
      // A literal U+FFFD surrounded by text must be passed through unchanged.
      const chunk = "Hello \uFFFD World"
      const result = decoder.decode(chunk)
      expect(result).toBe("Hello \uFFFD World")
    })

    it("should handle multiple replacement characters", () => {
      // Multiple replacement characters might indicate encoding issues
      // but we should preserve them as they might be intentional.
      // Written as \uFFFD escapes so the input cannot be silently stripped
      // by tooling that drops the raw character.
      const chunk1 = "Hello \uFFFD\uFFFD World"
      const chunk2 = " Test"

      const result1 = decoder.decode(chunk1)
      expect(result1).toBe("Hello \uFFFD\uFFFD World")

      const result2 = decoder.decode(chunk2)
      expect(result2).toBe(" Test")
    })

    it("should handle empty chunks", () => {
      const result = decoder.decode("")
      expect(result).toBe("")
    })

    it("should handle Uint8Array empty chunks", () => {
      const result = decoder.decode(new Uint8Array(0))
      expect(result).toBe("")
    })
  })

  describe("finalize", () => {
    it("should return empty string when no buffered content", () => {
      const result = decoder.finalize()
      expect(result).toBe("")
    })

    it("should decode remaining buffered content", () => {
      // Feed a chunk that ends two bytes into the three-byte sequence for "世".
      const chunk = new Uint8Array([0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8]) // "Hello " + partial "世"
      decoder.decode(chunk)

      // Finalize should attempt to decode what's left (may produce replacement character)
      const result = decoder.finalize()
      expect(result.length).toBeGreaterThan(0) // Should produce something (likely with replacement char)
    })

    it("should clear buffer after finalize", () => {
      const chunk = new Uint8Array([0xe4]) // Partial character
      decoder.decode(chunk)

      decoder.finalize()
      const secondFinalize = decoder.finalize()
      expect(secondFinalize).toBe("") // Nothing left after the first flush
    })
  })

  describe("reset", () => {
    it("should clear the buffer", () => {
      // Leave a partial character buffered
      const chunk = new Uint8Array([0xe4, 0xb8]) // Partial character
      decoder.decode(chunk)

      decoder.reset()

      // Decoding starts fresh: the stale bytes must not affect new input
      const result = decoder.decode("Hello")
      expect(result).toBe("Hello")

      // ...and nothing is left over to flush
      const final = decoder.finalize()
      expect(final).toBe("")
    })
  })

  describe("large file handling", () => {
    // Mixed content: 3-byte CJK characters followed by 4-byte emoji.
    const largeText = "初めまして、私は人工知能です。" + "世界は美しい。".repeat(100) + "🌍🌎🌏"

    it("should handle large text with many UTF-8 characters", () => {
      // Split into fixed-size string chunks to simulate streaming. Slicing a
      // JS string cuts between UTF-16 code units, so this exercises the
      // string input path; it cannot cut a UTF-8 byte sequence in half.
      const chunkSize = 17
      const chunks: string[] = []
      for (let i = 0; i < largeText.length; i += chunkSize) {
        chunks.push(largeText.slice(i, i + chunkSize))
      }

      // Decode all chunks
      let result = ""
      for (const chunk of chunks) {
        result += decoder.decode(chunk)
      }
      result += decoder.finalize()

      // Should reconstruct the original text
      expect(result).toBe(largeText)
    })

    it("should handle large text split at arbitrary byte offsets", () => {
      // Same text, but chunked at the byte level. 17 is not a multiple of 3
      // or 4, so cuts land inside the multi-byte sequences.
      const bytes = new TextEncoder().encode(largeText)
      const chunkSize = 17

      let result = ""
      for (let offset = 0; offset < bytes.length; offset += chunkSize) {
        // slice() copies, so each chunk is an independent buffer
        result += decoder.decode(bytes.slice(offset, offset + chunkSize))
      }
      result += decoder.finalize()

      // Should reconstruct the original text
      expect(result).toBe(largeText)
    })

    it("should handle simulated vLLM output with potential garbling", () => {
      // Simulate what might come from vLLM with large outputs
      const chunks = [
        "def process_data(items):\n",
        ' """Process a list of items',
        " with special handling for UTF-8",
        " characters like 你好", // chunk boundary falls between Chinese characters
        '世界"""\n result = []\n',
        " for item in items:\n",
        " # Handle special chars: €£¥",
        "🔧🔨\n",
        " result.append(transform(item))\n",
        " return result",
      ]

      let decoded = ""
      for (const chunk of chunks) {
        decoded += decoder.decode(chunk)
      }
      decoded += decoder.finalize()

      // Should contain all the expected content without garbling
      expect(decoded).toContain("你好世界")
      expect(decoded).toContain("€£¥")
      expect(decoded).toContain("🔧🔨")
      expect(decoded).not.toContain("\uFFFD") // Should not have replacement characters
    })
  })
})
0 commit comments