Skip to content

Commit 67b9aa7

Browse files
committed
Add test for utf8. Fix bugs found by the test. Apply go fmt
1 parent 7bb0e42 commit 67b9aa7

File tree

6 files changed

+185
-91
lines changed

6 files changed

+185
-91
lines changed

base_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package chardet
2+
3+
import (
4+
"testing"
5+
)
6+
7+
type chardetTester struct {
8+
d *Detector
9+
}
10+
11+
func newChardetTester(r ...recognizer) *chardetTester {
12+
if len(r) == 0 {
13+
return &chardetTester{NewDetector()}
14+
}
15+
return &chardetTester{&Detector{r}}
16+
}
17+
18+
func (this *chardetTester) ExpectBest(b []byte, charset string, lang string, t *testing.T) bool {
19+
r, err := this.d.DetectBest(b, true, "")
20+
if err != nil {
21+
t.Error(err)
22+
return false
23+
}
24+
if r.Charset != charset || r.Language != lang {
25+
t.Errorf("Expect %#v, actual %#v", Result{charset, lang, 0}, *r)
26+
return false
27+
}
28+
return true
29+
}
30+
31+
func (this *chardetTester) ExpectUnknown(b []byte, t *testing.T) bool {
32+
r, err := this.d.DetectBest(b, true, "")
33+
if err == nil {
34+
t.Errorf("Expect unknown, actual %#v", *r)
35+
return false
36+
}
37+
return true
38+
}

data_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package chardet
2+
3+
// file0 is the sample registered under "big5.txt" in embeddedfiles: the
// 8-byte pattern 193, 99, 197, 233, 164, 164, 164, 229 repeated.
// NOTE(review): presumably Big5-encoded text, judging by the key only - confirm.
var file0 = [...]byte{
4+
193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
5+
193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
6+
193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
7+
193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
8+
193, 99, 197, 233, 164, 164, 164, 229,
9+
}
10+
11+
// file1 is the sample registered under "gb18030.txt" in embeddedfiles: the
// 8-byte pattern 188, 242, 204, 229, 214, 208, 206, 196 repeated.
// NOTE(review): presumably GB18030-encoded text, judging by the key only - confirm.
var file1 = [...]byte{
12+
188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196,
13+
188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196,
14+
188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196,
15+
}
16+
17+
// file2 is the sample registered under "shift_jis.txt" in embeddedfiles: the
// 6-byte pattern 147, 250, 150, 123, 140, 234 repeated, terminated by a
// single line feed (10).
// NOTE(review): presumably Shift_JIS-encoded text, judging by the key only - confirm.
var file2 = [...]byte{
18+
147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250,
19+
150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123,
20+
140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234,
21+
147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 10,
22+
}
23+
24+
// file3 is the sample registered under "utf8.txt" in embeddedfiles. The bytes
// form eleven three-byte sequences (lead byte 0xE4-0xE8 followed by two
// continuation bytes in 0x80-0xBF) and a final line feed (10), i.e. they are
// structurally well-formed UTF-8.
var file3 = [...]byte{
25+
230, 177, 137, 229, 173, 151, 230, 188, 162, 229, 173, 151, 231, 181, 177, 228, 184, 128, 231, 183, 168, 231, 162, 188, 232, 144, 172, 229, 156, 139, 231, 162,
26+
188, 10,
27+
}
28+
29+
// embeddedfiles maps a sample file name to its content. Each value is a slice
// over the whole of the corresponding fileN array (fileN[0:]), so it shares
// that array's storage rather than copying it.
var embeddedfiles = map[string][]byte{
30+
"big5.txt": file0[0:],
31+
"gb18030.txt": file1[0:],
32+
"shift_jis.txt": file2[0:],
33+
"utf8.txt": file3[0:],
34+
}

detector.go

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,66 +12,68 @@ type Result struct {
1212
}
1313

1414
type Detector struct {
15+
recognizers []recognizer
1516
}
1617

1718
// List of charset recognizers
18-
var recognizers = []recognizer {
19-
new(recognizerUtf8),
19+
var recognizers = []recognizer{
20+
new(recognizerUtf8),
2021
}
2122

2223
func NewDetector() *Detector {
23-
return &Detector{}
24+
return &Detector{recognizers}
2425
}
2526

2627
// NotDetectedError is returned by DetectAll (and therefore DetectBest) when
// no recognizer reports a confidence above zero.
var NotDetectedError = errors.New("Charset not detected.")
2930

3031
func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (r *Result, err error) {
3132
var all []Result
32-
if all, err = d.DetectAll(b, stripTag, declaredCharset); err != nil {
33+
if all, err = d.DetectAll(b, stripTag, declaredCharset); err == nil {
3334
r = &all[0]
3435
}
3536
return
3637
}
3738

3839
func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
39-
input := newRecognizerInput(b, stripTag, declaredCharset)
40-
outputChan := make(chan recognizerOutput)
41-
for _, r := range recognizers {
42-
go matchHelper(r, input, outputChan)
43-
}
44-
outputs := make([]recognizerOutput, 0, len(recognizers))
45-
for i := 0; i < len(recognizers); i++ {
46-
o := <-outputChan
47-
if o.Confidence > 0 {
48-
outputs = append(outputs, <-outputChan)
49-
}
50-
}
51-
if len(outputs) == 0 {
52-
return nil, NotDetectedError
53-
}
40+
input := newRecognizerInput(b, stripTag, declaredCharset)
41+
outputChan := make(chan recognizerOutput)
42+
for _, r := range d.recognizers {
43+
go matchHelper(r, input, outputChan)
44+
}
45+
outputs := make([]recognizerOutput, 0, len(d.recognizers))
46+
for i := 0; i < len(d.recognizers); i++ {
47+
o := <-outputChan
48+
if o.Confidence > 0 {
49+
outputs = append(outputs, o)
50+
}
51+
}
52+
if len(outputs) == 0 {
53+
return nil, NotDetectedError
54+
}
5455

55-
sort.Sort(recognizerOutputs(outputs))
56-
dedupOutputs := make([]Result, 0, len(outputs))
57-
foundCharsets := make(map[string]struct{}, len(outputs))
58-
for _, o := range outputs {
59-
if _, found := foundCharsets[o.Charset]; !found {
60-
dedupOutputs = append(dedupOutputs, Result(o))
61-
foundCharsets[o.Charset] = struct{}{}
62-
}
63-
}
64-
if len(dedupOutputs) == 0 {
65-
return nil, NotDetectedError
66-
}
67-
return dedupOutputs, nil
56+
sort.Sort(recognizerOutputs(outputs))
57+
dedupOutputs := make([]Result, 0, len(outputs))
58+
foundCharsets := make(map[string]struct{}, len(outputs))
59+
for _, o := range outputs {
60+
if _, found := foundCharsets[o.Charset]; !found {
61+
dedupOutputs = append(dedupOutputs, Result(o))
62+
foundCharsets[o.Charset] = struct{}{}
63+
}
64+
}
65+
if len(dedupOutputs) == 0 {
66+
return nil, NotDetectedError
67+
}
68+
return dedupOutputs, nil
6869
}
6970

7071
// matchHelper runs a single recognizer against input and delivers its output
// on outputChan. DetectAll starts it as a goroutine, one per recognizer.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
7374

7475
type recognizerOutputs []recognizerOutput
75-
func (r recognizerOutputs) Len() int { return len(r) }
76+
77+
func (r recognizerOutputs) Len() int { return len(r) }
7678
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
77-
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
79+
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }

recognizer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package chardet
22

33
type recognizer interface {
4-
Match(*recognizerInput) recognizerOutput
4+
Match(*recognizerInput) recognizerOutput
55
}
66

77
// recognizerOutput is what a recognizer's Match returns. It shares Result's
// underlying type, so the two convert into each other directly.
type recognizerOutput Result

utf8.go

Lines changed: 54 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,67 @@
11
package chardet
22

33
import (
4-
"bytes"
4+
"bytes"
55
)
66

7-
// utf8Bom is the three-byte UTF-8 byte order mark (EF BB BF).
var utf8Bom = []byte("\xEF\xBB\xBF")
88

99
// recognizerUtf8 is the stateless recognizer for UTF-8; see its Match method.
type recognizerUtf8 struct{}
1111

1212
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
13-
output = recognizerOutput {
14-
Charset: "UTF-8",
15-
}
16-
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
17-
inputLen := len(input.raw)
18-
var numValid, numInvalid uint32
19-
var trailBytes uint8
20-
for i := 0; i < inputLen; i++ {
21-
c := input.raw[i]
22-
if c & 0x80 == 0 {
23-
continue
24-
}
25-
if c & 0xE0 == 0xC0 {
26-
trailBytes = 1
27-
} else if c & 0xF0 == 0xE0 {
28-
trailBytes = 2
29-
} else if c & 0xF8 == 0xF0 {
30-
trailBytes = 3
31-
} else {
32-
numInvalid++
33-
if numInvalid > 5 {
34-
break
35-
}
36-
trailBytes = 0
37-
}
13+
output = recognizerOutput{
14+
Charset: "UTF-8",
15+
}
16+
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
17+
inputLen := len(input.raw)
18+
var numValid, numInvalid uint32
19+
var trailBytes uint8
20+
for i := 0; i < inputLen; i++ {
21+
c := input.raw[i]
22+
if c&0x80 == 0 {
23+
continue
24+
}
25+
if c&0xE0 == 0xC0 {
26+
trailBytes = 1
27+
} else if c&0xF0 == 0xE0 {
28+
trailBytes = 2
29+
} else if c&0xF8 == 0xF0 {
30+
trailBytes = 3
31+
} else {
32+
numInvalid++
33+
if numInvalid > 5 {
34+
break
35+
}
36+
trailBytes = 0
37+
}
3838

39-
for i++; i < inputLen; i++ {
40-
c = input.raw[i]
41-
if c & 0xC0 != 0x80 {
42-
numInvalid++
43-
break
44-
}
45-
if trailBytes--; trailBytes == 0 {
46-
numValid++
47-
break
48-
}
49-
}
50-
}
39+
for i++; i < inputLen; i++ {
40+
c = input.raw[i]
41+
if c&0xC0 != 0x80 {
42+
numInvalid++
43+
break
44+
}
45+
if trailBytes--; trailBytes == 0 {
46+
numValid++
47+
break
48+
}
49+
}
50+
}
5151

52-
if hasBom && numInvalid == 0 {
53-
output.Confidence = 100
54-
} else if hasBom && numValid > numInvalid * 10 {
55-
output.Confidence = 80
56-
} else if numValid > 3 && numInvalid == 0 {
57-
output.Confidence = 100
58-
} else if numValid > 0 && numInvalid == 0 {
59-
output.Confidence = 80
60-
} else if numValid == 0 && numInvalid == 0 {
61-
// Plain ASCII
62-
output.Confidence = 10
63-
} else if numValid > numInvalid * 10 {
64-
output.Confidence = 25
65-
}
66-
return
52+
if hasBom && numInvalid == 0 {
53+
output.Confidence = 100
54+
} else if hasBom && numValid > numInvalid*10 {
55+
output.Confidence = 80
56+
} else if numValid > 3 && numInvalid == 0 {
57+
output.Confidence = 100
58+
} else if numValid > 0 && numInvalid == 0 {
59+
output.Confidence = 80
60+
} else if numValid == 0 && numInvalid == 0 {
61+
// Plain ASCII
62+
output.Confidence = 10
63+
} else if numValid > numInvalid*10 {
64+
output.Confidence = 25
65+
}
66+
return
6767
}

utf8_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package chardet
2+
3+
import (
4+
"testing"
5+
)
6+
7+
var utf8Recognizers = []recognizer{
8+
new(recognizerUtf8),
9+
}
10+
11+
func TestUtf8(t *testing.T) {
12+
ct := newChardetTester(new(recognizerUtf8))
13+
for name, content := range embeddedfiles {
14+
if name == "utf8.txt" {
15+
ct.ExpectBest(content, "UTF-8", "", t)
16+
} else {
17+
ct.ExpectUnknown(content, t)
18+
}
19+
}
20+
}

0 commit comments

Comments
 (0)