|
1 | 1 | package chardet
|
2 | 2 |
|
3 | 3 | type recognizer interface {
|
| 4 | + Match(recognizerInput) recognizerOutput |
| 5 | +} |
| 6 | + |
// recognizerOutput is the value a recognizer returns from Match.
type recognizerOutput struct {
	// Charset and Language name what the recognizer matched.
	Charset, Language string

	// Confidence is the recognizer's score for this match. The scale is
	// chosen by the recognizers and is not defined here.
	Confidence uint32
}
| 12 | + |
// recognizerInput bundles the data passed to recognizer.Match: the bytes to
// analyse plus values that newRecognizerInput derives from them once.
type recognizerInput struct {
	// raw is the data exactly as supplied by the caller. input is the copy
	// produced by mayStripInput (length-capped, optionally without markup).
	raw, input []byte

	// tagStripped reports whether input is the markup-stripped text.
	tagStripped bool

	// declaredCharset is the charset name supplied by the caller, stored
	// unchanged.
	declaredCharset string

	// byteStats is the per-byte-value occurrence count of input, as
	// returned by computeByteStats.
	byteStats []int

	// hasC1Bytes caches computeHasC1Bytes(byteStats).
	hasC1Bytes bool
}
| 21 | + |
| 22 | +func newRecognizerInput(raw []byte, stripTag bool, declaredCharset string) *recognizerInput { |
| 23 | + input, stripped := mayStripInput(raw, stripTag) |
| 24 | + byteStats := computeByteStats(input) |
| 25 | + return &recognizerInput{ |
| 26 | + raw: raw, |
| 27 | + input: input, |
| 28 | + tagStripped: stripped, |
| 29 | + declaredCharset: declaredCharset, |
| 30 | + byteStats: byteStats, |
| 31 | + hasC1Bytes: computeHasC1Bytes(byteStats), |
| 32 | + } |
| 33 | +} |
| 34 | + |
// mayStripInput returns the bytes that should be analysed: at most
// inputBufferSize (8192) bytes taken from raw, optionally with markup removed.
//
// When stripTag is true, every byte from a '<' up to and including the next
// '>' is dropped, and scanning stops once inputBufferSize bytes have been
// kept. The stripped text is only used if the input looks like real markup.
// The result falls back to a plain copy of the leading bytes of raw when
//   - fewer than 5 tags were opened, or
//   - the number of bad tags (a '<' seen while already inside a tag) exceeds
//     one fifth of the opened tags, or
//   - fewer than 100 bytes survived although raw is longer than 600 bytes.
//
// out is always a freshly allocated slice that does not alias raw. stripped
// is true only when out holds the markup-stripped text.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	if stripTag {
		// The scratch buffer is only needed on this path; allocating it
		// unconditionally wasted 8 KiB per call when stripping was disabled.
		out = make([]byte, 0, inputBufferSize)

		// Plain int counters: a 32-bit counter could wrap on inputs with
		// more than 2^31 '<' bytes.
		var badTags, openTags int
		inMarkup := false
		for _, c := range raw {
			if c == '<' {
				if inMarkup {
					badTags++
				}
				inMarkup = true
				openTags++
			}
			if !inMarkup {
				out = append(out, c)
				if len(out) >= inputBufferSize {
					break
				}
			}
			// Checked after the append so the closing '>' itself is dropped.
			if c == '>' {
				inMarkup = false
			}
		}

		looksLikeMarkup := openTags >= 5 && openTags/5 >= badTags
		keptEnough := len(out) >= 100 || len(raw) <= 600
		if looksLikeMarkup && keptEnough {
			return out, true
		}
	}

	// Stripping was disabled or judged unreliable: use the raw prefix.
	limit := len(raw)
	if limit > inputBufferSize {
		limit = inputBufferSize
	}
	out = make([]byte, limit)
	copy(out, raw[:limit])
	return out, false
}
| 73 | + |
// computeByteStats returns a 256-element histogram of input: element b holds
// the number of times the byte value b occurs. The slice is freshly
// allocated and all zero when input is empty.
func computeByteStats(input []byte) []int {
	histogram := make([]int, 256)
	for i := 0; i < len(input); i++ {
		histogram[input[i]]++
	}
	return histogram
}
| 81 | + |
// computeHasC1Bytes reports whether byteStats records at least one
// occurrence of a byte value in 0x80..0x9F inclusive (the C1 control range).
//
// byteStats is indexed by byte value and must be able to cover index 0x9F;
// the slice expression below panics otherwise.
func computeHasC1Bytes(byteStats []int) bool {
	c1 := byteStats[0x80:0xA0]
	for i := 0; i < len(c1); i++ {
		if c1[i] > 0 {
			return true
		}
	}
	return false
}
|
0 commit comments