|
1 |
| -package chardet |
| 1 | +package chardet_test |
2 | 2 |
|
3 | 3 | import (
|
| 4 | + "github.com/saintfish/chardet" |
| 5 | + "io" |
| 6 | + "os" |
| 7 | + "path/filepath" |
4 | 8 | "testing"
|
5 | 9 | )
|
6 | 10 |
|
7 | 11 | func TestDetector(t *testing.T) {
|
8 | 12 | type file_charset_language struct {
|
9 |
| - File, Charset, Language string |
| 13 | + File string |
| 14 | + IsHtml bool |
| 15 | + Charset string |
| 16 | + Language string |
10 | 17 | }
|
11 | 18 | var data = []file_charset_language{
|
12 |
| - {"utf8.txt", "UTF-8", ""}, |
13 |
| - {"big5.txt", "Big5", "zh"}, |
14 |
| - {"shift_jis.txt", "Shift_JIS", "ja"}, |
15 |
| - {"gb18030.txt", "GB-18030", "zh"}, |
| 19 | + {"utf8.html", true, "UTF-8", ""}, |
| 20 | + {"utf8_bom.html", true, "UTF-8", ""}, |
| 21 | + {"8859_1_en.html", true, "ISO-8859-1", "en"}, |
| 22 | + {"8859_1_da.html", true, "ISO-8859-1", "da"}, |
| 23 | + {"8859_1_de.html", true, "ISO-8859-1", "de"}, |
| 24 | + {"8859_1_es.html", true, "ISO-8859-1", "es"}, |
| 25 | + {"8859_1_fr.html", true, "ISO-8859-1", "fr"}, |
| 26 | + {"8859_1_pt.html", true, "ISO-8859-1", "pt"}, |
| 27 | + {"shift_jis.html", true, "Shift_JIS", "ja"}, |
| 28 | + {"gb18030.html", true, "GB-18030", "zh"}, |
| 29 | + {"euc_jp.html", true, "EUC-JP", "ja"}, |
| 30 | + {"euc_kr.html", true, "EUC-KR", "ko"}, |
| 31 | + {"big5.html", true, "Big5", "zh"}, |
16 | 32 | }
|
17 | 33 |
|
18 |
| - ct := newChardetTester() |
| 34 | + textDetector := chardet.NewTextDetector() |
| 35 | + htmlDetector := chardet.NewHtmlDetector() |
| 36 | + buffer := make([]byte, 32<<10) |
19 | 37 | for _, d := range data {
|
20 |
| - ct.ExpectBest(embeddedfiles[d.File], d.Charset, d.Language, t) |
| 38 | + f, err := os.Open(filepath.Join("testdata", d.File)) |
| 39 | + if err != nil { |
| 40 | + t.Fatal(err) |
| 41 | + } |
| 42 | + defer f.Close() |
| 43 | + size, _ := io.ReadFull(f, buffer) |
| 44 | + input := buffer[:size] |
| 45 | + var detector = textDetector |
| 46 | + if d.IsHtml { |
| 47 | + detector = htmlDetector |
| 48 | + } |
| 49 | + result, err := detector.DetectBest(input) |
| 50 | + if err != nil { |
| 51 | + t.Fatal(err) |
| 52 | + } |
| 53 | + if result.Charset != d.Charset { |
| 54 | + t.Errorf("Expected charset %s, actual %s", d.Charset, result.Charset) |
| 55 | + } |
| 56 | + if result.Language != d.Language { |
| 57 | + t.Errorf("Expected language %s, actual %s", d.Language, result.Language) |
| 58 | + } |
21 | 59 | }
|
22 | 60 | }
|
0 commit comments