@@ -103,7 +103,7 @@ def translate(translation_file, i18ns):
103
103
def frequent_ngrams(corpus, sz, n):
    """Return the ``n`` most common length-``sz`` substrings of ``corpus``.

    Every contiguous window of ``sz`` characters in ``corpus`` is counted,
    including the final one that ends at the last character.

    Args:
        corpus: The string to scan.
        sz: Window (n-gram) length in characters.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(ngram, count)`` tuples ordered from most to least
        frequent, as produced by ``collections.Counter.most_common``. The
        list is empty when ``corpus`` is shorter than ``sz``.
    """
    # The last full window starts at index len(corpus) - sz, so the range
    # bound needs the + 1 to include it; without it the trailing n-gram is
    # never counted.
    windows = (corpus[i:i + sz] for i in range(len(corpus) - sz + 1))
    return collections.Counter(windows).most_common(n)
105
105
106
- def ngrams_to_pua (translation , ngrams ):
106
+ def encode_ngrams (translation , ngrams ):
107
107
if len (ngrams ) > 32 :
108
108
start = 0xe000
109
109
else :
@@ -112,7 +112,7 @@ def ngrams_to_pua(translation, ngrams):
112
112
translation = translation .replace (g , chr (start + i ))
113
113
return translation
114
114
115
- def pua_to_ngrams (compressed , ngrams ):
115
+ def decode_ngrams (compressed , ngrams ):
116
116
if len (ngrams ) > 32 :
117
117
start , end = 0xe000 , 0xf8ff
118
118
else :
@@ -123,7 +123,7 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
123
123
all_strings = [x [1 ] for x in translations ]
124
124
all_strings_concat = "" .join (all_strings )
125
125
ngrams = [i [0 ] for i in frequent_ngrams (all_strings_concat , 2 , 32 )]
126
- all_strings_concat = ngrams_to_pua (all_strings_concat , ngrams )
126
+ all_strings_concat = encode_ngrams (all_strings_concat , ngrams )
127
127
counts = collections .Counter (all_strings_concat )
128
128
cb = huffman .codebook (counts .items ())
129
129
values = []
@@ -211,7 +211,7 @@ def decompress(encoding_table, encoded, encoded_length_bits):
211
211
searched_length += lengths [bit_length ]
212
212
213
213
v = values [searched_length + bits - max_code ]
214
- v = pua_to_ngrams (v , ngrams )
214
+ v = decode_ngrams (v , ngrams )
215
215
i += len (v .encode ('utf-8' ))
216
216
dec .append (v )
217
217
return '' .join (dec )
@@ -220,7 +220,7 @@ def compress(encoding_table, decompressed, encoded_length_bits, len_translation_
220
220
if not isinstance (decompressed , str ):
221
221
raise TypeError ()
222
222
values , lengths , ngrams = encoding_table
223
- decompressed = ngrams_to_pua (decompressed , ngrams )
223
+ decompressed = encode_ngrams (decompressed , ngrams )
224
224
enc = bytearray (len (decompressed ) * 3 )
225
225
#print(decompressed)
226
226
#print(lengths)
0 commit comments