Skip to content

Commit a27af3e

Browse files
committed
Change detector_test to be something more real. Add a bunch of test pages.
1 parent ff68a64 commit a27af3e

14 files changed

+17523
-8
lines changed

detector_test.go

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,60 @@
1-
package chardet
1+
package chardet_test
22

33
import (
4+
"github.com/saintfish/chardet"
5+
"io"
6+
"os"
7+
"path/filepath"
48
"testing"
59
)
610

711
func TestDetector(t *testing.T) {
812
type file_charset_language struct {
9-
File, Charset, Language string
13+
File string
14+
IsHtml bool
15+
Charset string
16+
Language string
1017
}
1118
var data = []file_charset_language{
12-
{"utf8.txt", "UTF-8", ""},
13-
{"big5.txt", "Big5", "zh"},
14-
{"shift_jis.txt", "Shift_JIS", "ja"},
15-
{"gb18030.txt", "GB-18030", "zh"},
19+
{"utf8.html", true, "UTF-8", ""},
20+
{"utf8_bom.html", true, "UTF-8", ""},
21+
{"8859_1_en.html", true, "ISO-8859-1", "en"},
22+
{"8859_1_da.html", true, "ISO-8859-1", "da"},
23+
{"8859_1_de.html", true, "ISO-8859-1", "de"},
24+
{"8859_1_es.html", true, "ISO-8859-1", "es"},
25+
{"8859_1_fr.html", true, "ISO-8859-1", "fr"},
26+
{"8859_1_pt.html", true, "ISO-8859-1", "pt"},
27+
{"shift_jis.html", true, "Shift_JIS", "ja"},
28+
{"gb18030.html", true, "GB-18030", "zh"},
29+
{"euc_jp.html", true, "EUC-JP", "ja"},
30+
{"euc_kr.html", true, "EUC-KR", "ko"},
31+
{"big5.html", true, "Big5", "zh"},
1632
}
1733

18-
ct := newChardetTester()
34+
textDetector := chardet.NewTextDetector()
35+
htmlDetector := chardet.NewHtmlDetector()
36+
buffer := make([]byte, 32<<10)
1937
for _, d := range data {
20-
ct.ExpectBest(embeddedfiles[d.File], d.Charset, d.Language, t)
38+
f, err := os.Open(filepath.Join("testdata", d.File))
39+
if err != nil {
40+
t.Fatal(err)
41+
}
42+
defer f.Close()
43+
size, _ := io.ReadFull(f, buffer)
44+
input := buffer[:size]
45+
var detector = textDetector
46+
if d.IsHtml {
47+
detector = htmlDetector
48+
}
49+
result, err := detector.DetectBest(input)
50+
if err != nil {
51+
t.Fatal(err)
52+
}
53+
if result.Charset != d.Charset {
54+
t.Errorf("Expected charset %s, actual %s", d.Charset, result.Charset)
55+
}
56+
if result.Language != d.Language {
57+
t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
58+
}
2159
}
2260
}

testdata/8859_1_da.html

Lines changed: 391 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)