25
25
# 02110-1301 USA
26
26
######################### END LICENSE BLOCK #########################
27
27
28
+ from typing import Tuple , Union
29
+
28
30
from .big5freq import (
29
31
BIG5_CHAR_TO_FREQ_ORDER ,
30
32
BIG5_TABLE_SIZE ,
@@ -59,22 +61,22 @@ class CharDistributionAnalysis:
59
61
SURE_NO = 0.01
60
62
MINIMUM_DATA_THRESHOLD = 3
61
63
62
- def __init__ (self ):
64
+ def __init__ (self ) -> None :
63
65
# Mapping table to get frequency order from char order (get from
64
66
# GetOrder())
65
- self ._char_to_freq_order = tuple ()
66
- self ._table_size = None # Size of above table
67
+ self ._char_to_freq_order : Tuple [ int , ...] = tuple ()
68
+ self ._table_size = 0 # Size of above table
67
69
# This is a constant value which varies from language to language,
68
70
# used in calculating confidence. See
69
71
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
70
72
# for further detail.
71
- self .typical_distribution_ratio = None
72
- self ._done = None
73
- self ._total_chars = None
74
- self ._freq_chars = None
73
+ self .typical_distribution_ratio = 0.0
74
+ self ._done = False
75
+ self ._total_chars = 0
76
+ self ._freq_chars = 0
75
77
self .reset ()
76
78
77
- def reset (self ):
79
+ def reset (self ) -> None :
78
80
"""reset analyser, clear any state"""
79
81
# If this flag is set to True, detection is done and a conclusion has
80
82
# been made
@@ -83,7 +85,7 @@ def reset(self):
83
85
# The number of characters whose frequency order is less than 512
84
86
self ._freq_chars = 0
85
87
86
- def feed (self , char , char_len ) :
88
+ def feed (self , char : Union [ bytes , bytearray ], char_len : int ) -> None :
87
89
"""feed a character with known length"""
88
90
if char_len == 2 :
89
91
# we only care about 2-byte characters in our distribution analysis
@@ -97,7 +99,7 @@ def feed(self, char, char_len):
97
99
if 512 > self ._char_to_freq_order [order ]:
98
100
self ._freq_chars += 1
99
101
100
- def get_confidence (self ):
102
+ def get_confidence (self ) -> float :
101
103
"""return confidence based on existing data"""
102
104
# if we didn't receive any character in our consideration range,
103
105
# return negative answer
@@ -114,12 +116,12 @@ def get_confidence(self):
114
116
# normalize confidence (we don't want to be 100% sure)
115
117
return self .SURE_YES
116
118
117
- def got_enough_data (self ):
119
+ def got_enough_data (self ) -> bool :
118
120
# It is not necessary to receive all data to draw a conclusion.
119
121
# For charset detection, a certain amount of data is enough
120
122
return self ._total_chars > self .ENOUGH_DATA_THRESHOLD
121
123
122
- def get_order (self , _ ) :
124
+ def get_order (self , _ : Union [ bytes , bytearray ]) -> int :
123
125
# We do not handle characters based on the original encoding string,
124
126
# but convert this encoding string to a number, here called order.
125
127
# This allows multiple encodings of a language to share one frequency
@@ -128,13 +130,13 @@ def get_order(self, _):
128
130
129
131
130
132
class EUCTWDistributionAnalysis (CharDistributionAnalysis ):
131
- def __init__ (self ):
133
+ def __init__ (self ) -> None :
132
134
super ().__init__ ()
133
135
self ._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
134
136
self ._table_size = EUCTW_TABLE_SIZE
135
137
self .typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
136
138
137
- def get_order (self , byte_str ) :
139
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
138
140
# for euc-TW encoding, we are interested
139
141
# first byte range: 0xc4 -- 0xfe
140
142
# second byte range: 0xa1 -- 0xfe
@@ -146,13 +148,13 @@ def get_order(self, byte_str):
146
148
147
149
148
150
class EUCKRDistributionAnalysis (CharDistributionAnalysis ):
149
- def __init__ (self ):
151
+ def __init__ (self ) -> None :
150
152
super ().__init__ ()
151
153
self ._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
152
154
self ._table_size = EUCKR_TABLE_SIZE
153
155
self .typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
154
156
155
- def get_order (self , byte_str ) :
157
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
156
158
# for euc-KR encoding, we are interested
157
159
# first byte range: 0xb0 -- 0xfe
158
160
# second byte range: 0xa1 -- 0xfe
@@ -164,13 +166,13 @@ def get_order(self, byte_str):
164
166
165
167
166
168
class JOHABDistributionAnalysis (CharDistributionAnalysis ):
167
- def __init__ (self ):
169
+ def __init__ (self ) -> None :
168
170
super ().__init__ ()
169
171
self ._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
170
172
self ._table_size = EUCKR_TABLE_SIZE
171
173
self .typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
172
174
173
- def get_order (self , byte_str ) :
175
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
174
176
first_char = byte_str [0 ]
175
177
if 0x88 <= first_char < 0xD4 :
176
178
code = first_char * 256 + byte_str [1 ]
@@ -179,13 +181,13 @@ def get_order(self, byte_str):
179
181
180
182
181
183
class GB2312DistributionAnalysis (CharDistributionAnalysis ):
182
- def __init__ (self ):
184
+ def __init__ (self ) -> None :
183
185
super ().__init__ ()
184
186
self ._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
185
187
self ._table_size = GB2312_TABLE_SIZE
186
188
self .typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
187
189
188
- def get_order (self , byte_str ) :
190
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
189
191
# for GB2312 encoding, we are interested
190
192
# first byte range: 0xb0 -- 0xfe
191
193
# second byte range: 0xa1 -- 0xfe
@@ -197,13 +199,13 @@ def get_order(self, byte_str):
197
199
198
200
199
201
class Big5DistributionAnalysis (CharDistributionAnalysis ):
200
- def __init__ (self ):
202
+ def __init__ (self ) -> None :
201
203
super ().__init__ ()
202
204
self ._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
203
205
self ._table_size = BIG5_TABLE_SIZE
204
206
self .typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
205
207
206
- def get_order (self , byte_str ) :
208
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
207
209
# for big5 encoding, we are interested
208
210
# first byte range: 0xa4 -- 0xfe
209
211
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@@ -217,13 +219,13 @@ def get_order(self, byte_str):
217
219
218
220
219
221
class SJISDistributionAnalysis (CharDistributionAnalysis ):
220
- def __init__ (self ):
222
+ def __init__ (self ) -> None :
221
223
super ().__init__ ()
222
224
self ._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
223
225
self ._table_size = JIS_TABLE_SIZE
224
226
self .typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
225
227
226
- def get_order (self , byte_str ) :
228
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
227
229
# for sjis encoding, we are interested
228
230
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
229
231
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
@@ -242,13 +244,13 @@ def get_order(self, byte_str):
242
244
243
245
244
246
class EUCJPDistributionAnalysis (CharDistributionAnalysis ):
245
- def __init__ (self ):
247
+ def __init__ (self ) -> None :
246
248
super ().__init__ ()
247
249
self ._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
248
250
self ._table_size = JIS_TABLE_SIZE
249
251
self .typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
250
252
251
- def get_order (self , byte_str ) :
253
+ def get_order (self , byte_str : Union [ bytes , bytearray ]) -> int :
252
254
# for euc-JP encoding, we are interested
253
255
# first byte range: 0xa0 -- 0xfe
254
256
# second byte range: 0xa1 -- 0xfe
0 commit comments