Skip to content

Commit a3955fb

Browse files
committed
Add Utf8 recognizer
1 parent 2381dc9 commit a3955fb

File tree

3 files changed

+86
-4
lines changed

3 files changed

+86
-4
lines changed

detector.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@ type Result struct {
1111
}
1212

1313
// Detector is the receiver type for the detection methods DetectBest and
// DetectAll. It has no fields.
type Detector struct{}
15+
16+
// List of charset recognizers
17+
var recognizers = []recognizer {
18+
new(recognizerUtf8),
1519
}
1620

1721
// NewDetector returns a pointer to a newly allocated, zero-valued Detector.
func NewDetector() *Detector {
	return new(Detector)
}
2124

@@ -31,7 +34,19 @@ func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (
3134
return
3235
}
3336

37+
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
38+
outputChan <- r.Match(input)
39+
}
40+
3441
func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
35-
_ = newRecognizerInput(b, stripTag, declaredCharset)
42+
input := newRecognizerInput(b, stripTag, declaredCharset)
43+
outputChan := make(chan recognizerOutput)
44+
for _, r := range recognizers {
45+
go matchHelper(r, input, outputChan)
46+
}
47+
outputs := make([]recognizerOutput, 0, len(recognizers))
48+
for i := 0; i < len(recognizers); i++ {
49+
outputs = append(outputs, <-outputChan)
50+
}
3651
return nil, NotDetectedError
3752
}

recognizer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package chardet
22

33
type recognizer interface {
4-
Match(recognizerInput) recognizerOutput
4+
Match(*recognizerInput) recognizerOutput
55
}
66

77
type recognizerOutput struct {

utf8.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package chardet
2+
3+
import (
4+
"bytes"
5+
)
6+
7+
// utf8Bom is the three-byte UTF-8 byte order mark (EF BB BF).
var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
8+
9+
type recognizerUtf8 struct {
10+
}
11+
12+
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
13+
output = recognizerOutput {
14+
Charset: "UTF-8",
15+
}
16+
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
17+
inputLen := len(input.raw)
18+
var numValid, numInvalid uint32
19+
var trailBytes uint8
20+
for i := 0; i < inputLen; i++ {
21+
c := input.raw[i]
22+
if c & 0x80 == 0 {
23+
continue
24+
}
25+
if c & 0xE0 == 0xC0 {
26+
trailBytes = 1
27+
} else if c & 0xF0 == 0xE0 {
28+
trailBytes = 2
29+
} else if c & 0xF8 == 0xF0 {
30+
trailBytes = 3
31+
} else {
32+
numInvalid++
33+
if numInvalid > 5 {
34+
break
35+
}
36+
trailBytes = 0
37+
}
38+
39+
for i++; i < inputLen; i++ {
40+
c = input.raw[i]
41+
if c & 0xC0 != 0x80 {
42+
numInvalid++
43+
break
44+
}
45+
if trailBytes--; trailBytes == 0 {
46+
numValid++
47+
break
48+
}
49+
}
50+
}
51+
52+
if hasBom && numInvalid == 0 {
53+
output.Confidence = 100
54+
} else if hasBom && numValid > numInvalid * 10 {
55+
output.Confidence = 80
56+
} else if numValid > 3 && numInvalid == 0 {
57+
output.Confidence = 100
58+
} else if numValid > 0 && numInvalid == 0 {
59+
output.Confidence = 80
60+
} else if numValid == 0 && numInvalid == 0 {
61+
// Plain ASCII
62+
output.Confidence = 10
63+
} else if numValid > numInvalid * 10 {
64+
output.Confidence = 25
65+
}
66+
return
67+
}

0 commit comments

Comments
 (0)