Skip to content

Commit 96394f5

Browse files
author
Dimitar Grigorov
committed
Add UTF-16/UTF-32 BOM detection and UTF-16 encoding support
1 parent 3313661 commit 96394f5

File tree

6 files changed

+158
-183
lines changed

6 files changed

+158
-183
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ Provides 19 tools for file operations with automatic encoding conversion:
2727
- [`move_file`](TOOLS.md#move_file) - Move or rename files and directories
2828
- [`list_allowed_directories`](TOOLS.md#list_allowed_directories) - Show accessible directories
2929

30-
**Supported encodings (20 total):**
30+
**Supported encodings (22 total):**
31+
- **Unicode:** UTF-8, UTF-16 LE, UTF-16 BE (with BOM detection for UTF-8, UTF-16, and UTF-32)
3132
- **Cyrillic:** Windows-1251, KOI8-R, KOI8-U, CP866, ISO-8859-5
3233
- **Western European:** Windows-1252, ISO-8859-1, ISO-8859-15
3334
- **Central European:** Windows-1250, ISO-8859-2

TOOLS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ Detect line ending style (CRLF/LF/mixed) and find lines with inconsistent ending
410410

411411
### list_encodings
412412

413-
Returns all 20 supported encodings with name, aliases, and description.
413+
Returns all 22 supported encodings with name, aliases, and description.
414414

415415
### list_allowed_directories
416416

@@ -421,6 +421,8 @@ Returns directories the server is allowed to access. If empty, add paths as args
421421
| Name | Aliases | Description |
422422
|------|---------|-------------|
423423
| utf-8 | utf8, ascii | Unicode, no conversion |
424+
| utf-16-le | utf16le, utf-16le | Unicode UTF-16 Little Endian |
425+
| utf-16-be | utf16be, utf-16be | Unicode UTF-16 Big Endian |
424426
| windows-1251 | cp1251 | Windows Cyrillic |
425427
| koi8-r | koi8r | Russian Cyrillic (Unix/Linux) |
426428
| koi8-u | koi8u | Ukrainian Cyrillic (Unix/Linux) |

internal/encoding/detect.go

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,38 @@ type DetectionResult struct {
2525
HasBOM bool
2626
}
2727

28+
// detectBOM checks for Unicode BOMs and returns a result if found.
29+
// Order matters: UTF-32 BOMs must be checked before UTF-16 since they share prefixes.
30+
func detectBOM(data []byte) (DetectionResult, bool) {
31+
if len(data) >= 4 {
32+
// UTF-32 BE: 00 00 FE FF
33+
if data[0] == 0x00 && data[1] == 0x00 && data[2] == 0xFE && data[3] == 0xFF {
34+
return DetectionResult{Charset: "utf-32-be", Confidence: 100, HasBOM: true}, true
35+
}
36+
// UTF-32 LE: FF FE 00 00
37+
if data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00 {
38+
return DetectionResult{Charset: "utf-32-le", Confidence: 100, HasBOM: true}, true
39+
}
40+
}
41+
if len(data) >= 3 {
42+
// UTF-8 BOM: EF BB BF
43+
if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
44+
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}, true
45+
}
46+
}
47+
if len(data) >= 2 {
48+
// UTF-16 BE: FE FF
49+
if data[0] == 0xFE && data[1] == 0xFF {
50+
return DetectionResult{Charset: "utf-16-be", Confidence: 100, HasBOM: true}, true
51+
}
52+
// UTF-16 LE: FF FE
53+
if data[0] == 0xFF && data[1] == 0xFE {
54+
return DetectionResult{Charset: "utf-16-le", Confidence: 100, HasBOM: true}, true
55+
}
56+
}
57+
return DetectionResult{}, false
58+
}
59+
2860
// --- Primary API (file-based, streaming) ---
2961

3062
// DetectFromFile detects encoding from a file path using streaming I/O.
@@ -46,9 +78,8 @@ func DetectFromFile(path string, mode string) (DetectionResult, error) {
4678

4779
// Detect detects encoding from a byte slice.
4880
func Detect(data []byte) DetectionResult {
49-
// Check UTF-8 BOM
50-
if len(data) >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
51-
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}
81+
if result, ok := detectBOM(data); ok {
82+
return result
5283
}
5384

5485
detected := chardet.Detect(data)
@@ -138,9 +169,8 @@ func detectSampleFromReader(r io.ReaderAt, size int64) (DetectionResult, error)
138169
}
139170
beginChunk = beginChunk[:n]
140171

141-
// Check for BOM
142-
if len(beginChunk) >= 3 && beginChunk[0] == 0xEF && beginChunk[1] == 0xBB && beginChunk[2] == 0xBF {
143-
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}, nil
172+
if result, ok := detectBOM(beginChunk); ok {
173+
return result, nil
144174
}
145175

146176
// Check beginning chunk - if high confidence, return early
@@ -187,11 +217,11 @@ func detectChunkedFromReader(r io.ReaderAt, size int64) (DetectionResult, error)
187217
return Detect(data), nil
188218
}
189219

190-
// Check for BOM
191-
bomCheck := make([]byte, 3)
192-
if n, _ := r.ReadAt(bomCheck, 0); n >= 3 {
193-
if bomCheck[0] == 0xEF && bomCheck[1] == 0xBB && bomCheck[2] == 0xBF {
194-
return DetectionResult{Charset: "utf-8", Confidence: 100, HasBOM: true}, nil
220+
// Check for BOM (need 4 bytes for UTF-32)
221+
bomCheck := make([]byte, 4)
222+
if n, _ := r.ReadAt(bomCheck, 0); n >= 2 {
223+
if result, ok := detectBOM(bomCheck[:n]); ok {
224+
return result, nil
195225
}
196226
}
197227

internal/encoding/detect_test.go

Lines changed: 71 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,31 @@ func isASCIICompatible(charset string) bool {
1212
return charset == "utf-8" || charset == "ascii"
1313
}
1414

15-
func TestDetect_UTF8BOM(t *testing.T) {
16-
// UTF-8 BOM: EF BB BF
17-
data := []byte{0xEF, 0xBB, 0xBF, 'H', 'e', 'l', 'l', 'o'}
18-
result := Detect(data)
19-
20-
if result.Charset != "utf-8" {
21-
t.Errorf("Charset = %q, want utf-8", result.Charset)
22-
}
23-
if result.Confidence != 100 {
24-
t.Errorf("Confidence = %d, want 100", result.Confidence)
25-
}
26-
if !result.HasBOM {
27-
t.Error("HasBOM = false, want true")
15+
func TestDetect_BOMs(t *testing.T) {
16+
tests := []struct {
17+
name string
18+
data []byte
19+
wantCharset string
20+
}{
21+
{"UTF-8 BOM", []byte{0xEF, 0xBB, 0xBF, 'H', 'i'}, "utf-8"},
22+
{"UTF-16 LE BOM", []byte{0xFF, 0xFE, 'H', 0x00}, "utf-16-le"},
23+
{"UTF-16 BE BOM", []byte{0xFE, 0xFF, 0x00, 'H'}, "utf-16-be"},
24+
{"UTF-32 LE BOM", []byte{0xFF, 0xFE, 0x00, 0x00, 'H', 0x00, 0x00, 0x00}, "utf-32-le"},
25+
{"UTF-32 BE BOM", []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 'H'}, "utf-32-be"},
26+
}
27+
for _, tt := range tests {
28+
t.Run(tt.name, func(t *testing.T) {
29+
result := Detect(tt.data)
30+
if result.Charset != tt.wantCharset {
31+
t.Errorf("Charset = %q, want %q", result.Charset, tt.wantCharset)
32+
}
33+
if result.Confidence != 100 {
34+
t.Errorf("Confidence = %d, want 100", result.Confidence)
35+
}
36+
if !result.HasBOM {
37+
t.Error("HasBOM = false, want true")
38+
}
39+
})
2840
}
2941
}
3042

@@ -284,6 +296,52 @@ func TestDetectFromFile_SampleMode_LargeFileWithBOM(t *testing.T) {
284296
}
285297
}
286298

299+
func TestDetectFromFile_UTF16LE_WithBOM(t *testing.T) {
300+
tempDir := t.TempDir()
301+
path := filepath.Join(tempDir, "utf16le.txt")
302+
// UTF-16 LE BOM + "Hi" encoded as UTF-16 LE
303+
content := []byte{0xFF, 0xFE, 'H', 0x00, 'i', 0x00}
304+
if err := os.WriteFile(path, content, 0644); err != nil {
305+
t.Fatal(err)
306+
}
307+
308+
for _, mode := range []string{"sample", "chunked", "full"} {
309+
t.Run(mode, func(t *testing.T) {
310+
result, err := DetectFromFile(path, mode)
311+
if err != nil {
312+
t.Fatalf("unexpected error: %v", err)
313+
}
314+
if result.Charset != "utf-16-le" {
315+
t.Errorf("Charset = %q, want utf-16-le", result.Charset)
316+
}
317+
if !result.HasBOM {
318+
t.Error("HasBOM = false, want true")
319+
}
320+
})
321+
}
322+
}
323+
324+
func TestDetectFromFile_UTF16BE_WithBOM(t *testing.T) {
325+
tempDir := t.TempDir()
326+
path := filepath.Join(tempDir, "utf16be.txt")
327+
// UTF-16 BE BOM + "Hi" encoded as UTF-16 BE
328+
content := []byte{0xFE, 0xFF, 0x00, 'H', 0x00, 'i'}
329+
if err := os.WriteFile(path, content, 0644); err != nil {
330+
t.Fatal(err)
331+
}
332+
333+
result, err := DetectFromFile(path, "sample")
334+
if err != nil {
335+
t.Fatalf("unexpected error: %v", err)
336+
}
337+
if result.Charset != "utf-16-be" {
338+
t.Errorf("Charset = %q, want utf-16-be", result.Charset)
339+
}
340+
if !result.HasBOM {
341+
t.Error("HasBOM = false, want true")
342+
}
343+
}
344+
287345
func TestDetect_NoEncoding(t *testing.T) {
288346
// Random binary data that might not have a clear encoding
289347
data := []byte{0x00, 0x01, 0x02, 0x03, 0x04}

0 commit comments

Comments
 (0)