Add test for utf8. Fix bugs found by the test. Apply go fmt

saintfish · saintfish · commit 67b9aa7d5706 · 2012-08-12T17:08:29.000-07:00
diff --git a/base_test.go b/base_test.go
@@ -0,0 +1,38 @@
+package chardet
+
+import (
+	"testing"
+)
+
+type chardetTester struct { // test helper wrapping the Detector under test
+	d *Detector // detector whose DetectBest the Expect* methods call
+}
+
+func newChardetTester(r ...recognizer) *chardetTester { // no args: wrap NewDetector(); otherwise a Detector limited to r
+	if len(r) > 0 {
+		return &chardetTester{d: &Detector{recognizers: r}}
+	}
+	return &chardetTester{d: NewDetector()}
+}
+
+func (ct *chardetTester) ExpectBest(b []byte, charset string, lang string, t *testing.T) bool { // true when the best result matches charset and lang; failures are reported on t
+	best, err := ct.d.DetectBest(b, true, "")
+	if err != nil {
+		t.Error(err)
+		return false
+	}
+	if best.Charset == charset && best.Language == lang {
+		return true
+	}
+	t.Errorf("Expect %#v, actual %#v", Result{charset, lang, 0}, *best)
+	return false
+}
+
+func (ct *chardetTester) ExpectUnknown(b []byte, t *testing.T) bool { // true when DetectBest returns an error; a successful detection is reported on t
+	got, err := ct.d.DetectBest(b, true, "")
+	if err != nil {
+		return true
+	}
+	t.Errorf("Expect unknown, actual %#v", *got)
+	return false
+}
diff --git a/data_test.go b/data_test.go
@@ -0,0 +1,34 @@
+package chardet
+
+var file0 = [...]byte{ // sample bytes exposed as "big5.txt" through embeddedfiles
+	193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
+	193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
+	193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
+	193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229, 193, 99, 197, 233, 164, 164, 164, 229,
+	193, 99, 197, 233, 164, 164, 164, 229,
+}
+
+var file1 = [...]byte{ // sample bytes exposed as "gb18030.txt" through embeddedfiles
+	188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196,
+	188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196,
+	188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196, 188, 242, 204, 229, 214, 208, 206, 196,
+}
+
+var file2 = [...]byte{ // sample bytes exposed as "shift_jis.txt" through embeddedfiles
+	147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250,
+	150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123,
+	140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234,
+	147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 147, 250, 150, 123, 140, 234, 10,
+}
+
+var file3 = [...]byte{ // sample bytes exposed as "utf8.txt" through embeddedfiles
+	230, 177, 137, 229, 173, 151, 230, 188, 162, 229, 173, 151, 231, 181, 177, 228, 184, 128, 231, 183, 168, 231, 162, 188, 232, 144, 172, 229, 156, 139, 231, 162,
+	188, 10,
+}
+
+var embeddedfiles = map[string][]byte{ // sample file name -> slice over the matching fileN array
+	"big5.txt":      file0[0:],
+	"gb18030.txt":   file1[0:],
+	"shift_jis.txt": file2[0:],
+	"utf8.txt":      file3[0:],
+}
diff --git a/detector.go b/detector.go
@@ -12,66 +12,68 @@ type Result struct {
 }
 
 type Detector struct {
+	recognizers []recognizer
 }
 
 // List of charset recognizers
-var recognizers = []recognizer {
-    new(recognizerUtf8),
+var recognizers = []recognizer{
+	new(recognizerUtf8),
 }
 
 func NewDetector() *Detector {
-    return &Detector{}
+	return &Detector{recognizers}
 }
 
 var (
-    NotDetectedError = errors.New("Charset not detected.")
+	NotDetectedError = errors.New("Charset not detected.")
 )
 
 func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (r *Result, err error) {
 	var all []Result
-	if all, err = d.DetectAll(b, stripTag, declaredCharset); err != nil {
+	if all, err = d.DetectAll(b, stripTag, declaredCharset); err == nil {
 		r = &all[0]
 	}
 	return
 }
 
 func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
-    input := newRecognizerInput(b, stripTag, declaredCharset)
-    outputChan := make(chan recognizerOutput)
-    for _, r := range recognizers {
-        go matchHelper(r, input, outputChan)
-    }
-    outputs := make([]recognizerOutput, 0, len(recognizers))
-    for i := 0; i < len(recognizers); i++ {
-        o := <-outputChan
-        if o.Confidence > 0 {
-            outputs = append(outputs, <-outputChan)
-        }
-    }
-    if len(outputs) == 0 {
-        return nil, NotDetectedError
-    }
+	input := newRecognizerInput(b, stripTag, declaredCharset)
+	outputChan := make(chan recognizerOutput)
+	for _, r := range d.recognizers {
+		go matchHelper(r, input, outputChan)
+	}
+	outputs := make([]recognizerOutput, 0, len(d.recognizers))
+	for i := 0; i < len(d.recognizers); i++ {
+		o := <-outputChan
+		if o.Confidence > 0 {
+			outputs = append(outputs, o)
+		}
+	}
+	if len(outputs) == 0 {
+		return nil, NotDetectedError
+	}
 
-    sort.Sort(recognizerOutputs(outputs))
-    dedupOutputs := make([]Result, 0, len(outputs))
-    foundCharsets := make(map[string]struct{}, len(outputs))
-    for _, o := range outputs {
-        if _, found := foundCharsets[o.Charset]; !found {
-            dedupOutputs = append(dedupOutputs, Result(o))
-            foundCharsets[o.Charset] = struct{}{}
-        }
-    }
-    if len(dedupOutputs) == 0 {
-        return nil, NotDetectedError
-    }
-    return dedupOutputs, nil
+	sort.Sort(recognizerOutputs(outputs))
+	dedupOutputs := make([]Result, 0, len(outputs))
+	foundCharsets := make(map[string]struct{}, len(outputs))
+	for _, o := range outputs {
+		if _, found := foundCharsets[o.Charset]; !found {
+			dedupOutputs = append(dedupOutputs, Result(o))
+			foundCharsets[o.Charset] = struct{}{}
+		}
+	}
+	if len(dedupOutputs) == 0 {
+		return nil, NotDetectedError
+	}
+	return dedupOutputs, nil
 }
 
 func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
-    outputChan <- r.Match(input)
+	outputChan <- r.Match(input)
 }
 
 type recognizerOutputs []recognizerOutput
-func (r recognizerOutputs) Len() int { return len(r) }
+
+func (r recognizerOutputs) Len() int           { return len(r) }
 func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
-func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
+func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
diff --git a/recognizer.go b/recognizer.go
@@ -1,7 +1,7 @@
 package chardet
 
 type recognizer interface {
-    Match(*recognizerInput) recognizerOutput
+	Match(*recognizerInput) recognizerOutput
 }
 
 type recognizerOutput Result
diff --git a/utf8.go b/utf8.go
@@ -1,67 +1,67 @@
 package chardet
 
 import (
-    "bytes"
+	"bytes"
 )
 
-var utf8Bom = []byte { 0xEF, 0xBB, 0xBF }
+var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
 
 type recognizerUtf8 struct {
 }
 
 func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
-    output = recognizerOutput {
-        Charset: "UTF-8",
-    }
-    hasBom := bytes.HasPrefix(input.raw, utf8Bom)
-    inputLen := len(input.raw)
-    var numValid, numInvalid uint32
-    var trailBytes uint8
-    for i := 0; i < inputLen; i++ {
-        c := input.raw[i]
-        if c & 0x80 == 0 {
-            continue
-        }
-        if c & 0xE0 == 0xC0 {
-            trailBytes = 1
-        } else if c & 0xF0 == 0xE0 {
-            trailBytes = 2
-        } else if c & 0xF8 == 0xF0 {
-            trailBytes = 3
-        } else {
-            numInvalid++
-            if numInvalid > 5 {
-                break
-            }
-            trailBytes = 0
-        }
+	output = recognizerOutput{
+		Charset: "UTF-8",
+	}
+	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
+	inputLen := len(input.raw)
+	var numValid, numInvalid uint32
+	var trailBytes uint8
+	for i := 0; i < inputLen; i++ {
+		c := input.raw[i]
+		if c&0x80 == 0 {
+			continue
+		}
+		if c&0xE0 == 0xC0 {
+			trailBytes = 1
+		} else if c&0xF0 == 0xE0 {
+			trailBytes = 2
+		} else if c&0xF8 == 0xF0 {
+			trailBytes = 3
+		} else {
+			numInvalid++
+			if numInvalid > 5 {
+				break
+			}
+			trailBytes = 0
+		}
 
-        for i++; i < inputLen; i++ {
-            c = input.raw[i]
-            if c & 0xC0 != 0x80 {
-                numInvalid++
-                break
-            }
-            if trailBytes--; trailBytes == 0 {
-                numValid++
-                break
-            }
-        }
-    }
+		for i++; i < inputLen; i++ {
+			c = input.raw[i]
+			if c&0xC0 != 0x80 {
+				numInvalid++
+				break
+			}
+			if trailBytes--; trailBytes == 0 {
+				numValid++
+				break
+			}
+		}
+	}
 
-    if hasBom && numInvalid == 0 {
-        output.Confidence = 100
-    } else if hasBom && numValid > numInvalid * 10 {
-        output.Confidence = 80
-    } else if numValid > 3 && numInvalid == 0 {
-        output.Confidence = 100
-    } else if numValid > 0 && numInvalid == 0 {
-        output.Confidence = 80
-    } else if numValid == 0 && numInvalid == 0 {
-        // Plain ASCII
-        output.Confidence = 10
-    } else if numValid > numInvalid * 10 {
-        output.Confidence = 25
-    }
-    return
+	if hasBom && numInvalid == 0 {
+		output.Confidence = 100
+	} else if hasBom && numValid > numInvalid*10 {
+		output.Confidence = 80
+	} else if numValid > 3 && numInvalid == 0 {
+		output.Confidence = 100
+	} else if numValid > 0 && numInvalid == 0 {
+		output.Confidence = 80
+	} else if numValid == 0 && numInvalid == 0 {
+		// Plain ASCII
+		output.Confidence = 10
+	} else if numValid > numInvalid*10 {
+		output.Confidence = 25
+	}
+	return
 }
diff --git a/utf8_test.go b/utf8_test.go
@@ -0,0 +1,20 @@
+package chardet
+
+import (
+	"testing"
+)
+
+var utf8Recognizers = []recognizer{ // recognizer set holding only the UTF-8 recognizer
+	new(recognizerUtf8),
+}
+
+func TestUtf8(t *testing.T) { // with only the UTF-8 recognizer, utf8.txt must detect as UTF-8 and every other sample must fail
+	ct := newChardetTester(utf8Recognizers...)
+	for name, content := range embeddedfiles {
+		if name == "utf8.txt" {
+			ct.ExpectBest(content, "UTF-8", "", t)
+		} else {
+			ct.ExpectUnknown(content, t)
+		}
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`package chardet`
`2`	`2`
`3`	`3`	`type recognizer interface {`
`4`		`- Match(*recognizerInput) recognizerOutput`
	`4`	`+ Match(*recognizerInput) recognizerOutput`
`5`	`5`	`}`
`6`	`6`
`7`	`7`	`type recognizerOutput Result`