Skip to content

Commit 2381dc9

Browse files
committed
Merge input and output to recognizer.go. Define interface of recognizer.
1 parent e495a8d commit 2381dc9

File tree

2 files changed

+85
-79
lines changed

2 files changed

+85
-79
lines changed

input.go

Lines changed: 0 additions & 79 deletions
This file was deleted.

recognizer.go

Lines changed: 85 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,89 @@
11
package chardet
22

33
type recognizer interface {
4+
Match(recognizerInput) recognizerOutput
5+
}
6+
7+
// recognizerOutput is the value returned by recognizer.Match.
type recognizerOutput struct {
	// Charset is the charset reported by the recognizer.
	Charset string
	// Language is the language reported by the recognizer.
	Language string
	// Confidence is the recognizer's confidence in the result.
	// NOTE(review): the scale is not defined in this file — confirm
	// against the recognizer implementations.
	Confidence uint32
}
12+
13+
// recognizerInput bundles the data handed to recognizer.Match. Values are
// built by newRecognizerInput.
type recognizerInput struct {
	// raw is the byte slice exactly as passed to newRecognizerInput.
	raw []byte
	// input is the prepared copy produced by mayStripInput from raw.
	input []byte
	// tagStripped is the "stripped" flag returned by mayStripInput.
	tagStripped bool
	// declaredCharset is the charset name passed to newRecognizerInput.
	declaredCharset string
	// byteStats holds the per-byte-value counts computed from input by
	// computeByteStats.
	byteStats []int
	// hasC1Bytes is the result of computeHasC1Bytes on byteStats.
	hasC1Bytes bool
}
21+
22+
func newRecognizerInput(raw []byte, stripTag bool, declaredCharset string) *recognizerInput {
23+
input, stripped := mayStripInput(raw, stripTag)
24+
byteStats := computeByteStats(input)
25+
return &recognizerInput{
26+
raw: raw,
27+
input: input,
28+
tagStripped: stripped,
29+
declaredCharset: declaredCharset,
30+
byteStats: byteStats,
31+
hasC1Bytes: computeHasC1Bytes(byteStats),
32+
}
33+
}
34+
35+
// mayStripInput returns a freshly allocated copy of raw, limited to 8192
// bytes, optionally with markup removed.
//
// When stripTag is true, every byte from a '<' up to and including the next
// '>' is dropped and the remaining bytes are collected until 8192 of them
// have been gathered. The stripped result is discarded, and the first 8192
// bytes of raw are returned unmodified instead, when any of these holds:
//
//   - fewer than 5 '<' bytes were seen (this is always the case when
//     stripTag is false);
//   - the number of "bad" tags (a '<' seen while already inside markup)
//     exceeds one fifth of the '<' count (integer division);
//   - fewer than 100 bytes survived stripping although raw is longer than
//     600 bytes.
//
// stripped reports whether out is the tag-stripped text. out never aliases
// raw.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	var openTags, badTags int
	if stripTag {
		// The scratch buffer is only needed on the stripping path; without
		// stripTag the fallback below always produces the result.
		out = make([]byte, 0, inputBufferSize)
		inMarkup := false
		for _, c := range raw {
			if c == '<' {
				if inMarkup {
					// A new '<' before the previous tag was closed.
					badTags++
				}
				inMarkup = true
				openTags++
			}
			if !inMarkup {
				out = append(out, c)
				if len(out) >= inputBufferSize {
					break
				}
			}
			// Checked after the append so the closing '>' itself is dropped.
			if c == '>' {
				inMarkup = false
			}
		}
		stripped = true
	}

	// Not enough (or too malformed) markup, or stripping left almost
	// nothing: fall back to the leading bytes of the raw input.
	if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) {
		limit := len(raw)
		if limit > inputBufferSize {
			limit = inputBufferSize
		}
		out = make([]byte, limit)
		copy(out, raw[:limit])
		stripped = false
	}
	return out, stripped
}
73+
74+
// computeByteStats returns a 256-element slice in which element b holds the
// number of times byte value b occurs in input. A nil or empty input yields
// 256 zeros.
func computeByteStats(input []byte) []int {
	counts := make([]int, 256)
	for i := range input {
		counts[input[i]]++
	}
	return counts
}
81+
82+
// computeHasC1Bytes reports whether byteStats records a positive count for
// any byte value in the range 0x80 through 0x9F inclusive. byteStats is
// indexed by byte value and must be large enough to be sliced up to 0xA0.
func computeHasC1Bytes(byteStats []int) bool {
	const (
		c1First = 0x80
		c1Last  = 0x9F
	)
	c1Counts := byteStats[c1First : c1Last+1]
	for i := 0; i < len(c1Counts); i++ {
		if c1Counts[i] > 0 {
			return true
		}
	}
	return false
}

0 commit comments

Comments
 (0)