|
| 1 | +package chardet |
| 2 | + |
| 3 | +import ( |
| 4 | + "errors" |
| 5 | + "math" |
| 6 | +) |
| 7 | + |
// recognizerMultiByte is a charset recognizer for multi-byte encodings. It
// pairs a charset/language label with the decoder used to split raw input into
// character codes and with a table of codes that are scored as "common".
type recognizerMultiByte struct {
	// charset is the charset name reported by Match.
	charset string
	// language is the language code reported by Match.
	language string
	// decoder splits the raw input into individual character codes.
	decoder charDecoder
	// commonChars is looked up with binarySearch, so it has to be sorted in
	// ascending order. When it is nil, matchConfidence falls back to a
	// scoring formula that does not use common-character counts.
	commonChars []uint16
}
| 14 | + |
// charDecoder decodes a buffer in some multi-byte encoding one character at a
// time.
type charDecoder interface {
	// DecodeOneChar reads the character at the start of the given buffer and
	// returns its code c, the bytes that follow it in remain, and a non-nil
	// err when the buffer is exhausted or the character is malformed.
	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}
| 18 | + |
| 19 | +func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) { |
| 20 | + return recognizerOutput{ |
| 21 | + Charset: r.charset, |
| 22 | + Language: r.language, |
| 23 | + Confidence: r.matchConfidence(input), |
| 24 | + } |
| 25 | +} |
| 26 | + |
| 27 | +func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int { |
| 28 | + raw := input.raw |
| 29 | + var c uint16 |
| 30 | + var err error |
| 31 | + var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int |
| 32 | + for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) { |
| 33 | + totalCharCount++ |
| 34 | + if err != nil { |
| 35 | + badCharCount++ |
| 36 | + } else if c <= 0xFF { |
| 37 | + singleByteCharCount++ |
| 38 | + } else { |
| 39 | + doubleByteCharCount++ |
| 40 | + if r.commonChars != nil && binarySearch(r.commonChars, c) { |
| 41 | + commonCharCount++ |
| 42 | + } |
| 43 | + } |
| 44 | + if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount { |
| 45 | + return 0 |
| 46 | + } |
| 47 | + } |
| 48 | + |
| 49 | + if doubleByteCharCount <= 10 && badCharCount == 0 { |
| 50 | + if doubleByteCharCount == 0 && totalCharCount < 10 { |
| 51 | + return 0 |
| 52 | + } else { |
| 53 | + return 10 |
| 54 | + } |
| 55 | + } |
| 56 | + |
| 57 | + if doubleByteCharCount < 20*badCharCount { |
| 58 | + return 0 |
| 59 | + } |
| 60 | + if r.commonChars == nil { |
| 61 | + confidence := 30 + doubleByteCharCount - 20*badCharCount |
| 62 | + if confidence > 100 { |
| 63 | + confidence = 100 |
| 64 | + } |
| 65 | + return confidence |
| 66 | + } |
| 67 | + maxVal := math.Log(float64(doubleByteCharCount) / 4) |
| 68 | + scaleFactor := 90 / maxVal |
| 69 | + confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10) |
| 70 | + if confidence > 100 { |
| 71 | + confidence = 100 |
| 72 | + } |
| 73 | + if confidence < 0 { |
| 74 | + confidence = 0 |
| 75 | + } |
| 76 | + return confidence |
| 77 | +} |
| 78 | + |
// binarySearch reports whether c is present in l, which must be sorted in
// ascending order. An empty or nil slice never contains c.
func binarySearch(l []uint16, c uint16) bool {
	// The search window is the inclusive index range [lo, hi]. hi starts at
	// the last valid index so that l[mid] can never read past the slice end.
	lo, hi := 0, len(l)-1
	for lo <= hi {
		mid := lo + (hi-lo)/2
		switch v := l[mid]; {
		case v == c:
			return true
		case c < v:
			hi = mid - 1
		default:
			lo = mid + 1
		}
	}
	return false
}
| 94 | + |
// Sentinel errors returned by the DecodeOneChar implementations in this file.
var (
	// eobError is returned when the input is empty or ends before a
	// multi-byte character is complete.
	eobError = errors.New("End of input buffer")
	// badCharError is returned when the decoded bytes do not form a valid
	// character in the decoder's encoding.
	badCharError = errors.New("Decode a bad char")
)
| 97 | + |
| 98 | +type charDecoder_sjis struct { |
| 99 | +} |
| 100 | + |
| 101 | +func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { |
| 102 | + if len(input) == 0 { |
| 103 | + return 0, nil, eobError |
| 104 | + } |
| 105 | + first := input[0] |
| 106 | + c = uint16(first) |
| 107 | + remain = input[1:] |
| 108 | + if first <= 0x7F || (first > 0xA0 && first <= 0xDF) { |
| 109 | + return |
| 110 | + } |
| 111 | + if len(remain) == 0 { |
| 112 | + return c, remain, badCharError |
| 113 | + } |
| 114 | + second := remain[0] |
| 115 | + remain = remain[1:] |
| 116 | + c = c<<8 | uint16(second) |
| 117 | + if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) { |
| 118 | + } else { |
| 119 | + err = badCharError |
| 120 | + } |
| 121 | + return |
| 122 | +} |
| 123 | + |
// commonChars_sjis is the table of two-byte codes that the Shift_JIS
// recognizer scores as common characters. It is searched with binarySearch,
// so the entries must remain in ascending order.
var commonChars_sjis = []uint16{
	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
}
| 132 | + |
| 133 | +func newRecognizer_sjis() *recognizerMultiByte { |
| 134 | + return &recognizerMultiByte{ |
| 135 | + "Shift_JIS", |
| 136 | + "ja", |
| 137 | + charDecoder_sjis{}, |
| 138 | + commonChars_sjis, |
| 139 | + } |
| 140 | +} |
| 141 | + |
| 142 | +type charDecoder_euc struct { |
| 143 | +} |
| 144 | + |
| 145 | +func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { |
| 146 | + if len(input) == 0 { |
| 147 | + return 0, nil, eobError |
| 148 | + } |
| 149 | + first := input[0] |
| 150 | + remain = input[1:] |
| 151 | + c = uint16(first) |
| 152 | + if first <= 0x8D { |
| 153 | + return uint16(first), remain, nil |
| 154 | + } |
| 155 | + if len(remain) == 0 { |
| 156 | + return 0, nil, eobError |
| 157 | + } |
| 158 | + second := remain[0] |
| 159 | + remain = remain[1:] |
| 160 | + c = c<<8 | uint16(second) |
| 161 | + if first >= 0xA1 && first <= 0xFE { |
| 162 | + if second < 0xA1 { |
| 163 | + err = badCharError |
| 164 | + } |
| 165 | + return |
| 166 | + } |
| 167 | + if first == 0x8E { |
| 168 | + if second < 0xA1 { |
| 169 | + err = badCharError |
| 170 | + } |
| 171 | + return |
| 172 | + } |
| 173 | + if first == 0x8F { |
| 174 | + if len(remain) == 0 { |
| 175 | + return 0, nil, eobError |
| 176 | + } |
| 177 | + third := remain[0] |
| 178 | + remain = remain[1:] |
| 179 | + c = c<<0 | uint16(third) |
| 180 | + if third < 0xa1 { |
| 181 | + err = badCharError |
| 182 | + } |
| 183 | + } |
| 184 | + return |
| 185 | +} |
| 186 | + |
// commonChars_euc_jp is the table of two-byte codes that the EUC-JP
// recognizer scores as common characters. It is searched with binarySearch,
// so the entries must remain in ascending order.
var commonChars_euc_jp = []uint16{
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
}
| 199 | + |
// commonChars_euc_kr is the table of two-byte codes that the EUC-KR
// recognizer scores as common characters. It is searched with binarySearch,
// so the entries must remain in ascending order.
var commonChars_euc_kr = []uint16{
	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
}
| 212 | + |
| 213 | +func newRecognizer_euc_jp() *recognizerMultiByte { |
| 214 | + return &recognizerMultiByte{ |
| 215 | + "EUC-JP", |
| 216 | + "ja", |
| 217 | + charDecoder_euc{}, |
| 218 | + commonChars_euc_jp, |
| 219 | + } |
| 220 | +} |
| 221 | + |
| 222 | +func newRecognizer_euc_kr() *recognizerMultiByte { |
| 223 | + return &recognizerMultiByte{ |
| 224 | + "EUC-KR", |
| 225 | + "ko", |
| 226 | + charDecoder_euc{}, |
| 227 | + commonChars_euc_kr, |
| 228 | + } |
| 229 | +} |
| 230 | + |
| 231 | +type charDecoder_big5 struct { |
| 232 | +} |
| 233 | + |
| 234 | +func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { |
| 235 | + if len(input) == 0 { |
| 236 | + return 0, nil, eobError |
| 237 | + } |
| 238 | + first := input[0] |
| 239 | + remain = input[1:] |
| 240 | + c = uint16(first) |
| 241 | + if first <= 0x7F || first == 0xFF { |
| 242 | + return |
| 243 | + } |
| 244 | + if len(remain) == 0 { |
| 245 | + return c, nil, eobError |
| 246 | + } |
| 247 | + second := remain[0] |
| 248 | + remain = remain[1:] |
| 249 | + c = c<<8 | uint16(second) |
| 250 | + if second < 0x40 || second == 0x7F || second == 0xFF { |
| 251 | + err = badCharError |
| 252 | + } |
| 253 | + return |
| 254 | +} |
| 255 | + |
// commonChars_big5 is the table of two-byte codes that the Big5 recognizer
// scores as common characters. It is searched with binarySearch, so the
// entries must remain in ascending order.
var commonChars_big5 = []uint16{
	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
}
| 268 | + |
| 269 | +func newRecognizer_big5() *recognizerMultiByte { |
| 270 | + return &recognizerMultiByte{ |
| 271 | + "Big5", |
| 272 | + "zh", |
| 273 | + charDecoder_big5{}, |
| 274 | + commonChars_big5, |
| 275 | + } |
| 276 | +} |
| 277 | + |
| 278 | +type charDecoder_gb_18030 struct { |
| 279 | +} |
| 280 | + |
| 281 | +func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { |
| 282 | + if len(input) == 0 { |
| 283 | + return 0, nil, eobError |
| 284 | + } |
| 285 | + first := input[0] |
| 286 | + remain = input[1:] |
| 287 | + c = uint16(first) |
| 288 | + if first <= 0x80 { |
| 289 | + return |
| 290 | + } |
| 291 | + if len(remain) == 0 { |
| 292 | + return 0, nil, eobError |
| 293 | + } |
| 294 | + second := remain[0] |
| 295 | + remain = remain[1:] |
| 296 | + c = c<<8 | uint16(second) |
| 297 | + if first >= 0x81 && first <= 0xFE { |
| 298 | + if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) { |
| 299 | + return |
| 300 | + } |
| 301 | + |
| 302 | + if second >= 0x30 && second <= 0x39 { |
| 303 | + if len(remain) == 0 { |
| 304 | + return 0, nil, eobError |
| 305 | + } |
| 306 | + third := remain[0] |
| 307 | + remain = remain[1:] |
| 308 | + if third >= 0x81 && third <= 0xFE { |
| 309 | + if len(remain) == 0 { |
| 310 | + return 0, nil, eobError |
| 311 | + } |
| 312 | + fourth := remain[0] |
| 313 | + remain = remain[1:] |
| 314 | + if fourth >= 0x30 && fourth <= 0x39 { |
| 315 | + c = c<<16 | uint16(third)<<8 | uint16(fourth) |
| 316 | + return |
| 317 | + } |
| 318 | + } |
| 319 | + } |
| 320 | + err = badCharError |
| 321 | + } |
| 322 | + return |
| 323 | +} |
| 324 | + |
// commonChars_gb_18030 is the table of two-byte codes that the GB-18030
// recognizer scores as common characters. It is searched with binarySearch,
// so the entries must remain in ascending order.
var commonChars_gb_18030 = []uint16{
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
}
| 337 | + |
| 338 | +func newRecognizer_gb_18030() *recognizerMultiByte { |
| 339 | + return &recognizerMultiByte{ |
| 340 | + "GB-18030", |
| 341 | + "zh", |
| 342 | + charDecoder_gb_18030{}, |
| 343 | + commonChars_gb_18030, |
| 344 | + } |
| 345 | +} |
0 commit comments