Skip to content

Commit b1ae293

Browse files
committed
Add detector_test and fix bugs found by it
1 parent f39cc11 commit b1ae293

File tree

3 files changed

+29
-7
lines changed

3 files changed

+29
-7
lines changed

detector_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package chardet
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestDetector(t *testing.T) {
8+
type file_charset_language struct {
9+
File, Charset, Language string
10+
}
11+
var data = []file_charset_language{
12+
{"utf8.txt", "UTF-8", ""},
13+
{"big5.txt", "Big5", "zh"},
14+
{"shift_jis.txt", "Shift_JIS", "ja"},
15+
{"gb18030.txt", "GB-18030", "zh"},
16+
}
17+
18+
ct := newChardetTester()
19+
for _, d := range data {
20+
ct.ExpectBest(embeddedfiles[d.File], d.Charset, d.Language, t)
21+
}
22+
}

multi_byte.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
7878

7979
func binarySearch(l []uint16, c uint16) bool {
8080
start := 0
81-
end := len(l)
81+
end := len(l) - 1
8282
for start <= end {
8383
mid := (start + end) / 2
8484
if c == l[mid] {

unicode.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,15 @@ func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput
4848
// recognizerUtf32 bundles the byte-order-specific pieces used by its Match
// method.
//
// bom is the byte sequence Match looks for at the start of the raw input.
// decodeChar turns the first four bytes of its argument into one code unit.
// It returns uint32 rather than rune so that values with the top bit set stay
// large positive numbers and are rejected by the upper-bound check in Match
// instead of turning negative.
type recognizerUtf32 struct {
	name       string
	bom        []byte
	decodeChar func(input []byte) uint32
}
5353

54-
// decodeUtf32be assembles the first four bytes of input into a 32-bit value,
// most significant byte first. input must hold at least four bytes; shorter
// slices cause an index-out-of-range panic.
//
// Each byte is widened to uint32 before shifting: shifting a byte left by 8
// or more bits discards every bit, so shifting first would always yield zero
// for the three upper bytes.
func decodeUtf32be(input []byte) uint32 {
	return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
}
5757

58-
// decodeUtf32le assembles the first four bytes of input into a 32-bit value,
// least significant byte first. input must hold at least four bytes; shorter
// slices cause an index-out-of-range panic.
//
// Each byte is widened to uint32 before shifting: shifting a byte left by 8
// or more bits discards every bit, so shifting first would always yield zero
// for the three upper bytes.
func decodeUtf32le(input []byte) uint32 {
	return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
}
6161

6262
func newRecognizer_utf32be() *recognizerUtf32 {
@@ -82,7 +82,7 @@ func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput
8282
hasBom := bytes.HasPrefix(input.raw, r.bom)
8383
var numValid, numInvalid uint32
8484
for b := input.raw; len(b) >= 4; b = b[4:] {
85-
if c := r.decodeChar(b); c < 0 || c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
85+
if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
8686
numInvalid++
8787
} else {
8888
numValid++

0 commit comments

Comments
 (0)