Skip to content

Commit 96394f5

Browse files
author
Dimitar Grigorov
committed
Add UTF-16/UTF-32 BOM detection and UTF-16 encoding support
1 parent 3313661 commit 96394f5

File tree

6 files changed

+158
-183
lines changed

6 files changed

+158
-183
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ Provides 19 tools for file operations with automatic encoding conversion:
2727
- [`move_file`](TOOLS.md#move_file) - Move or rename files and directories
2828
- [`list_allowed_directories`](TOOLS.md#list_allowed_directories) - Show accessible directories
2929

30-
**Supported encodings (20 total):**
30+
**Supported encodings (22 total):**
31+
- **Unicode:** UTF-8, UTF-16 LE, UTF-16 BE (with BOM detection for UTF-8, UTF-16, and UTF-32)
3132
- **Cyrillic:** Windows-1251, KOI8-R, KOI8-U, CP866, ISO-8859-5
3233
- **Western European:** Windows-1252, ISO-8859-1, ISO-8859-15
3334
- **Central European:** Windows-1250, ISO-8859-2

TOOLS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ Detect line ending style (CRLF/LF/mixed) and find lines with inconsistent ending
410410

411411
### list_encodings
412412

413-
Returns all 20 supported encodings with name, aliases, and description.
413+
Returns all 22 supported encodings with name, aliases, and description.
414414

415415
### list_allowed_directories
416416

@@ -421,6 +421,8 @@ Returns directories the server is allowed to access. If empty, add paths as args
421421
| Name | Aliases | Description |
422422
|------|---------|-------------|
423423
| utf-8 | utf8, ascii | Unicode, no conversion |
424+
| utf-16-le | utf16le, utf-16le | Unicode UTF-16 Little Endian |
425+
| utf-16-be | utf16be, utf-16be | Unicode UTF-16 Big Endian |
424426
| windows-1251 | cp1251 | Windows Cyrillic |
425427
| koi8-r | koi8r | Russian Cyrillic (Unix/Linux) |
426428
| koi8-u | koi8u | Ukrainian Cyrillic (Unix/Linux) |

internal/encoding/detect.go

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,38 @@ type DetectionResult struct {
2525
HasBOM bool
2626
}
2727

28+
// detectBOM checks for Unicode BOMs and returns a result if found.
29+
// Order matters: UTF-32 BOMs must be checked before UTF-16 since they share prefixes.
30+
func detectBOM(data []byte) (DetectionResult, bool) {
31+
if len(data) >= 4 {
32+
// UTF-32 BE: 00 00 FE FF
33+
if data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF {
34+
return DetectionResult{Charset: "utf-32-be", Confidence: 100, HasBOM: true}, true
35+
}
36+
// UTF-32 LE: FF FE 00 00
37+
if data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00 {
38+
return DetectionResult{Charset: "utf-32-le", Confidence: 100, HasBOM: true}, true
39+
}
40+
}
41+
if len(data) >= 3 {
42+
// UTF-8 BOM: EF BB BF
43+
if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
44+
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}, true
45+
}
46+
}
47+
if len(data) >= 2 {
48+
// UTF-16 BE: FE FF
49+
if data[0] == 0xFE && data[1] == 0xFF {
50+
return DetectionResult{Charset: "utf-16-be", Confidence: 100, HasBOM: true}, true
51+
}
52+
// UTF-16 LE: FF FE
53+
if data[0] == 0xFF && data[1] == 0xFE {
54+
return DetectionResult{Charset: "utf-16-le", Confidence: 100, HasBOM: true}, true
55+
}
56+
}
57+
return DetectionResult{}, false
58+
}
59+
2860
// --- Primary API (file-based, streaming) ---
2961

3062
// DetectFromFile detects encoding from a file path using streaming I/O.
@@ -46,9 +78,8 @@ func DetectFromFile(path string, mode string) (DetectionResult, error) {
4678

4779
// Detect detects encoding from a byte slice.
4880
func Detect(data []byte) DetectionResult {
49-
// Check UTF-8 BOM
50-
if len(data) >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
51-
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}
81+
if result, ok := detectBOM(data); ok {
82+
return result
5283
}
5384

5485
detected := chardet.Detect(data)
@@ -138,9 +169,8 @@ func detectSampleFromReader(r io.ReaderAt, size int64) (DetectionResult, error)
138169
}
139170
beginChunk = beginChunk[:n]
140171

141-
// Check for BOM
142-
if len(beginChunk) >= 3 && beginChunk[0] == 0xEF && beginChunk[1] == 0xBB && beginChunk[2] == 0xBF {
143-
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}, nil
172+
if result, ok := detectBOM(beginChunk); ok {
173+
return result, nil
144174
}
145175

146176
// Check beginning chunk - if high confidence, return early
@@ -187,11 +217,11 @@ func detectChunkedFromReader(r io.ReaderAt, size int64) (DetectionResult, error)
187217
return Detect(data), nil
188218
}
189219

190-
// Check for BOM
191-
bomCheck := make([]byte, 3)
192-
if n, _ := r.ReadAt(bomCheck, 0); n >= 3 {
193-
if bomCheck[0] == 0xEF && bomCheck[1] == 0xBB && bomCheck[2] == 0xBF {
194-
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}, nil
220+
// Check for BOM (need 4 bytes for UTF-32)
221+
bomCheck := make([]byte, 4)
222+
if n, _ := r.ReadAt(bomCheck, 0); n >= 2 {
223+
if result, ok := detectBOM(bomCheck[:n]); ok {
224+
return result, nil
195225
}
196226
}
197227

internal/encoding/detect_test.go

Lines changed: 71 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,31 @@ func isASCIICompatible(charset string) bool {
1212
return charset == "utf-8" || charset == "ascii"
1313
}
1414

15-
func TestDetect_UTF8BOM(t *testing.T) {
16-
// UTF-8 BOM: EF BB BF
17-
data := []byte{0xEF, 0xBB, 0xBF, 'H', 'e', 'l', 'l', 'o'}
18-
result := Detect(data)
19-
20-
if result.Charset != "utf-8" {
21-
t.Errorf("Charset = %q, want utf-8", result.Charset)
22-
}
23-
if result.Confidence != 100 {
24-
t.Errorf("Confidence = %d, want 100", result.Confidence)
25-
}
26-
if !result.HasBOM {
27-
t.Error("HasBOM = false, want true")
15+
func TestDetect_BOMs(t *testing.T) {
16+
tests := []struct {
17+
name string
18+
data []byte
19+
wantCharset string
20+
}{
21+
{"UTF-8 BOM", []byte{0xEF, 0xBB, 0xBF, 'H', 'i'}, "utf-8"},
22+
{"UTF-16 LE BOM", []byte{0xFF, 0xFE, 'H', 0x00}, "utf-16-le"},
23+
{"UTF-16 BE BOM", []byte{0xFE, 0xFF, 0x00, 'H'}, "utf-16-be"},
24+
{"UTF-32 LE BOM", []byte{0xFF, 0xFE, 0x00, 0x00, 'H', 0x00, 0x00, 0x00}, "utf-32-le"},
25+
{"UTF-32 BE BOM", []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 'H'}, "utf-32-be"},
26+
}
27+
for _, tt := range tests {
28+
t.Run(tt.name, func(t *testing.T) {
29+
result := Detect(tt.data)
30+
if result.Charset != tt.wantCharset {
31+
t.Errorf("Charset = %q, want %q", result.Charset, tt.wantCharset)
32+
}
33+
if result.Confidence != 100 {
34+
t.Errorf("Confidence = %d, want 100", result.Confidence)
35+
}
36+
if !result.HasBOM {
37+
t.Error("HasBOM = false, want true")
38+
}
39+
})
2840
}
2941
}
3042

@@ -284,6 +296,52 @@ func TestDetectFromFile_SampleMode_LargeFileWithBOM(t *testing.T) {
284296
}
285297
}
286298

299+
func TestDetectFromFile_UTF16LE_WithBOM(t *testing.T) {
300+
tempDir := t.TempDir()
301+
path := filepath.Join(tempDir, "utf16le.txt")
302+
// UTF-16 LE BOM + "Hi" encoded as UTF-16 LE
303+
content := []byte{0xFF, 0xFE, 'H', 0x00, 'i', 0x00}
304+
if err := os.WriteFile(path, content, 0644); err != nil {
305+
t.Fatal(err)
306+
}
307+
308+
for _, mode := range []string{"sample", "chunked", "full"} {
309+
t.Run(mode, func(t *testing.T) {
310+
result, err := DetectFromFile(path, mode)
311+
if err != nil {
312+
t.Fatalf("unexpected error: %v", err)
313+
}
314+
if result.Charset != "utf-16-le" {
315+
t.Errorf("Charset = %q, want utf-16-le", result.Charset)
316+
}
317+
if !result.HasBOM {
318+
t.Error("HasBOM = false, want true")
319+
}
320+
})
321+
}
322+
}
323+
324+
func TestDetectFromFile_UTF16BE_WithBOM(t *testing.T) {
325+
tempDir := t.TempDir()
326+
path := filepath.Join(tempDir, "utf16be.txt")
327+
// UTF-16 BE BOM + "Hi" encoded as UTF-16 BE
328+
content := []byte{0xFE, 0xFF, 0x00, 'H', 0x00, 'i'}
329+
if err := os.WriteFile(path, content, 0644); err != nil {
330+
t.Fatal(err)
331+
}
332+
333+
result, err := DetectFromFile(path, "sample")
334+
if err != nil {
335+
t.Fatalf("unexpected error: %v", err)
336+
}
337+
if result.Charset != "utf-16-be" {
338+
t.Errorf("Charset = %q, want utf-16-be", result.Charset)
339+
}
340+
if !result.HasBOM {
341+
t.Error("HasBOM = false, want true")
342+
}
343+
}
344+
287345
func TestDetect_NoEncoding(t *testing.T) {
288346
// Random binary data that might not have a clear encoding
289347
data := []byte{0x00, 0x01, 0x02, 0x03, 0x04}

0 commit comments

Comments
 (0)