Skip to content

Commit e495a8d

Browse files
committed
Define interface of chardet.Detector. Also implement the input processing logic.
0 parents  commit e495a8d

File tree

3 files changed

+120
-0
lines changed

3 files changed

+120
-0
lines changed

detector.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package chardet
2+
3+
import (
4+
"errors"
5+
)
6+
7+
// Result describes one candidate produced by charset detection.
type Result struct {
	// Charset and Language are the names reported for the candidate.
	Charset, Language string
	// Confidence is the score attached to the candidate. Its scale is not
	// defined in this file — presumably 0-100; TODO confirm.
	Confidence int
}
12+
13+
type Detector struct {
14+
recognizers []recognizer
15+
}
16+
17+
func NewDetector() *Detector {
18+
// Init recognizer
19+
return &Detector{}
20+
}
21+
22+
// NotDetectedError is the sentinel error returned when no charset could be
// determined for the given input.
var NotDetectedError = errors.New("Charset not detected.")
25+
26+
func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (r *Result, err error) {
27+
var all []Result
28+
if all, err = d.DetectAll(b, stripTag, declaredCharset); err != nil {
29+
r = &all[0]
30+
}
31+
return
32+
}
33+
34+
func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
35+
_ = newRecognizerInput(b, stripTag, declaredCharset)
36+
return nil, NotDetectedError
37+
}

input.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package chardet
2+
3+
// recognizerInput bundles the caller's data with the values derived from it
// by newRecognizerInput.
type recognizerInput struct {
	// raw is the byte slice exactly as supplied by the caller.
	raw []byte
	// input is the (possibly tag-stripped, possibly truncated) data produced
	// by mayStripInput.
	input []byte
	// tagStripped reports whether input is the tag-stripped form of raw.
	tagStripped bool
	// declaredCharset is the charset name supplied by the caller.
	declaredCharset string
	// byteStats holds, for each byte value 0-255, its occurrence count in input.
	byteStats []int
	// hasC1Bytes is true when input contains any byte in 0x80-0x9F.
	hasC1Bytes bool
}
11+
12+
func newRecognizerInput(raw []byte, stripTag bool, declaredCharset string) *recognizerInput {
13+
input, stripped := mayStripInput(raw, stripTag)
14+
byteStats := computeByteStats(input)
15+
return &recognizerInput{
16+
raw: raw,
17+
input: input,
18+
tagStripped: stripped,
19+
declaredCharset: declaredCharset,
20+
byteStats: byteStats,
21+
hasC1Bytes: computeHasC1Bytes(byteStats),
22+
}
23+
}
24+
25+
// mayStripInput returns the bytes that should be analysed for raw, along with
// a flag telling whether markup was stripped from them.
//
// When stripTag is true, everything from a '<' up to and including the next
// '>' is dropped and the remaining bytes are collected, stopping once 8192
// bytes have been gathered. The stripped result is discarded in favor of a
// copy of the first 8192 bytes of raw (with stripped == false) when any of the
// following holds:
//   - fewer than 5 '<' were seen (always the case when stripTag is false);
//   - the number of '<' seen while already inside a tag exceeds one fifth of
//     all '<' seen (integer division);
//   - fewer than 100 bytes were collected although raw is longer than 600.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	var (
		openTags, badTags int32
		inTag             bool
	)
	out = make([]byte, 0, inputBufferSize)
	stripped = stripTag

	if stripTag {
	scan:
		for _, c := range raw {
			switch {
			case c == '<':
				// A '<' while a tag is still open marks malformed markup.
				if inTag {
					badTags++
				}
				inTag = true
				openTags++
			case inTag:
				// Bytes inside a tag are dropped; '>' closes the tag.
				if c == '>' {
					inTag = false
				}
			default:
				out = append(out, c)
				if len(out) >= inputBufferSize {
					break scan
				}
			}
		}
	}

	tooFewTags := openTags < 5
	tooManyBadTags := openTags/5 < badTags
	tooLittleText := len(out) < 100 && len(raw) > 600
	if tooFewTags || tooManyBadTags || tooLittleText {
		// Stripping was not requested or does not look trustworthy: fall back
		// to a bounded copy of the raw data.
		head := raw
		if len(head) > inputBufferSize {
			head = head[:inputBufferSize]
		}
		out = make([]byte, len(head))
		copy(out, head)
		return out, false
	}
	return out, stripped
}
63+
64+
// computeByteStats returns a 256-entry histogram of input: element i holds the
// number of times byte value i occurs in input.
func computeByteStats(input []byte) []int {
	counts := make([]int, 256)
	for i := 0; i < len(input); i++ {
		counts[input[i]]++
	}
	return counts
}
71+
72+
// computeHasC1Bytes reports whether byteStats records at least one occurrence
// of a byte value in the range 0x80-0x9F (inclusive).
//
// byteStats is indexed by byte value and must be large enough to be sliced up
// to index 0x9F; a shorter slice makes the slice expression panic.
func computeHasC1Bytes(byteStats []int) bool {
	c1 := byteStats[0x80:0xA0]
	for i := range c1 {
		if c1[i] > 0 {
			return true
		}
	}
	return false
}

recognizer.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
package chardet
2+
3+
// recognizer is the interface charset recognizers will implement. It declares
// no methods yet, so every type currently satisfies it.
type recognizer interface{}

0 commit comments

Comments
 (0)