Skip to content

Commit 7bb0e42

Browse files
committed
Add logic to sort and dedup recognizer outputs
1 parent a3955fb commit 7bb0e42

File tree

2 files changed

+32
-11
lines changed

2 files changed

+32
-11
lines changed

detector.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package chardet
22

33
import (
44
"errors"
5+
"sort"
56
)
67

78
type Result struct {
@@ -34,10 +35,6 @@ func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (
3435
return
3536
}
3637

37-
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
38-
outputChan <- r.Match(input)
39-
}
40-
4138
func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
4239
input := newRecognizerInput(b, stripTag, declaredCharset)
4340
outputChan := make(chan recognizerOutput)
@@ -46,7 +43,35 @@ func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([
4643
}
4744
outputs := make([]recognizerOutput, 0, len(recognizers))
4845
for i := 0; i < len(recognizers); i++ {
49-
outputs = append(outputs, <-outputChan)
46+
o := <-outputChan
47+
if o.Confidence > 0 {
48+
outputs = append(outputs, <-outputChan)
49+
}
50+
}
51+
if len(outputs) == 0 {
52+
return nil, NotDetectedError
53+
}
54+
55+
sort.Sort(recognizerOutputs(outputs))
56+
dedupOutputs := make([]Result, 0, len(outputs))
57+
foundCharsets := make(map[string]struct{}, len(outputs))
58+
for _, o := range outputs {
59+
if _, found := foundCharsets[o.Charset]; !found {
60+
dedupOutputs = append(dedupOutputs, Result(o))
61+
foundCharsets[o.Charset] = struct{}{}
62+
}
5063
}
51-
return nil, NotDetectedError
64+
if len(dedupOutputs) == 0 {
65+
return nil, NotDetectedError
66+
}
67+
return dedupOutputs, nil
68+
}
69+
70+
// matchHelper runs a single recognizer against input and sends the
// resulting output on outputChan.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
73+
74+
type recognizerOutputs []recognizerOutput
75+
func (r recognizerOutputs) Len() int { return len(r) }
76+
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
77+
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }

recognizer.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,7 @@ type recognizer interface {
44
Match(*recognizerInput) recognizerOutput
55
}
66

7-
type recognizerOutput struct {
8-
Charset string
9-
Language string
10-
Confidence uint32
11-
}
7+
// recognizerOutput is the value a recognizer's Match method returns. It is
// defined with Result as its underlying type, so a recognizerOutput can be
// converted directly with Result(o).
type recognizerOutput Result
128

139
type recognizerInput struct {
1410
raw []byte

0 commit comments

Comments
 (0)