Skip to content

Commit f180ac0

Browse files
committed
Add utf-16/32 recognizers
1 parent 6cd3d9e commit f180ac0

File tree

2 files changed

+99
-0
lines changed

2 files changed

+99
-0
lines changed

detector.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ type Detector struct {
1818
// List of charset recognizers
1919
var recognizers = []recognizer{
2020
new(recognizerUtf8),
21+
new(recognizerUtf16be),
22+
new(recognizerUtf16le),
23+
newRecognizerUtf32be(),
24+
newRecognizerUtf32le(),
2125
}
2226

2327
func NewDetector() *Detector {

unicode.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package chardet
2+
3+
import (
4+
"bytes"
5+
)
6+
7+
// Byte order marks (BOMs) that the Unicode recognizers look for at the very
// start of the input.
var (
	utf16beBom = []byte{0xFE, 0xFF}
	utf16leBom = []byte{0xFF, 0xFE}
	utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
	// The UTF-32LE mark starts with the same two bytes as the UTF-16LE mark,
	// so a UTF-16LE prefix match alone cannot tell the two apart.
	utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
)
13+
14+
type recognizerUtf16be struct {
15+
}
16+
17+
func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
18+
output = recognizerOutput{
19+
Charset: "UTF-16BE",
20+
}
21+
if bytes.HasPrefix(input.raw, utf16beBom) {
22+
output.Confidence = 100
23+
}
24+
return
25+
}
26+
27+
type recognizerUtf16le struct {
28+
}
29+
30+
func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
31+
output = recognizerOutput{
32+
Charset: "UTF-16LE",
33+
}
34+
if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
35+
output.Confidence = 100
36+
}
37+
return
38+
}
39+
40+
// recognizerUtf32 recognizes UTF-32 input in one specific byte order. The
// byte-order-specific parts (name, byte order mark, decoder) are fields so a
// single Match implementation serves both variants.
//
// The field order is relied upon by positional composite literals; do not
// reorder.
type recognizerUtf32 struct {
	// name is the charset name reported in the match output.
	name string
	// bom is the byte order mark looked for at the start of the input.
	bom []byte
	// decodeChar decodes the first four bytes of input as one UTF-32 code
	// unit. Match only calls it with at least four bytes available.
	decodeChar func(input []byte) rune
}
45+
46+
// decodeUtf32be decodes the first four bytes of input as one big-endian
// 32-bit value and returns it as a rune. Values with the top bit set come back
// negative. It panics if input holds fewer than four bytes.
func decodeUtf32be(input []byte) rune {
	_ = input[3] // one up-front bounds check for the four reads below
	// Widen each byte before shifting: shifting a byte left by 8 or more bits
	// discards every bit and yields 0.
	return rune(uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3]))
}
49+
50+
// decodeUtf32le decodes the first four bytes of input as one little-endian
// 32-bit value and returns it as a rune. Values with the top bit set come back
// negative. It panics if input holds fewer than four bytes.
func decodeUtf32le(input []byte) rune {
	_ = input[3] // one up-front bounds check for the four reads below
	// Widen each byte before shifting: shifting a byte left by 8 or more bits
	// discards every bit and yields 0.
	return rune(uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0]))
}
53+
54+
func newRecognizerUtf32be() *recognizerUtf32 {
55+
return &recognizerUtf32{
56+
"UTF-32BE",
57+
utf32beBom,
58+
decodeUtf32be,
59+
}
60+
}
61+
62+
func newRecognizerUtf32le() *recognizerUtf32 {
63+
return &recognizerUtf32{
64+
"UTF-32LE",
65+
utf32leBom,
66+
decodeUtf32le,
67+
}
68+
}
69+
70+
func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
71+
output = recognizerOutput {
72+
Charset: r.name,
73+
}
74+
hasBom := bytes.HasPrefix(input.raw, r.bom)
75+
var numValid, numInvalid uint32
76+
for b := input.raw; len(b) >= 4; b = b[4:] {
77+
if c := r.decodeChar(b); c < 0 || c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
78+
numInvalid++
79+
} else {
80+
numValid++
81+
}
82+
}
83+
if hasBom && numInvalid == 0 {
84+
output.Confidence = 100
85+
} else if hasBom && numValid > numInvalid*10 {
86+
output.Confidence = 80
87+
} else if numValid > 3 && numInvalid == 0 {
88+
output.Confidence = 100
89+
} else if numValid > 0 && numInvalid == 0 {
90+
output.Confidence = 80
91+
} else if numValid > numInvalid*10 {
92+
output.Confidence = 25
93+
}
94+
return
95+
}

0 commit comments

Comments
 (0)