@@ -174,7 +174,7 @@ class EncodingTable:
174
174
qstrs_inv : object
175
175
176
176
177
- def compute_huffman_coding (qstrs , translation_name , translations , f ):
177
+ def compute_huffman_coding (qstrs , translation_name , translations , f , compression_level ):
178
178
# possible future improvement: some languages do better when considering len(k) > 2. try both?
179
179
qstrs = dict ((k , v ) for k , v in qstrs .items () if len (k ) > 3 )
180
180
qstr_strs = list (qstrs .keys ())
@@ -209,6 +209,8 @@ def remove_offset(c):
209
209
if 0x80 <= ord_c < 0xFF :
210
210
end_unused = min (ord_c , end_unused )
211
211
max_words = end_unused - 0x80
212
+ if compression_level < 5 :
213
+ max_words = 0
212
214
213
215
bits_per_codepoint = 16 if max_ord > 255 else 8
214
216
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
@@ -298,8 +300,12 @@ def est_net_savings(s, occ):
298
300
word = scores [0 ][0 ]
299
301
words .append (word )
300
302
303
+ splitters = words [:]
304
+ if compression_level > 3 :
305
+ splitters .extend (qstr_strs )
306
+
301
307
words .sort (key = len )
302
- extractor = TextSplitter (words + qstr_strs )
308
+ extractor = TextSplitter (splitters )
303
309
counter = collections .Counter ()
304
310
used_qstr = 0
305
311
for t in texts :
@@ -356,8 +362,8 @@ def est_net_savings(s, occ):
356
362
len (translation .encode ("utf-8" )) for (original , translation ) in translations
357
363
)
358
364
359
- maxlen = len (words [- 1 ])
360
- minlen = len (words [0 ])
365
+ maxlen = len (words [- 1 ]) if words else 0
366
+ minlen = len (words [0 ]) if words else 0
361
367
wlencount = [len ([None for w in words if len (w ) == l ]) for l in range (minlen , maxlen + 1 )]
362
368
363
369
translation_qstr_bits = used_qstr .bit_length ()
@@ -596,6 +602,12 @@ def output_translation_data(encoding_table, i18ns, out):
596
602
parser .add_argument (
597
603
"--translation" , default = None , type = str , help = "translations for i18n() items"
598
604
)
605
# Command-line knob for how aggressively translations are compressed.
# The help text mirrors the checks made in compute_huffman_coding():
#   * max_words is forced to 0 when the level is below 5, so the word
#     dictionary is only built for levels >= 5;
#   * qstrs are added to the text splitters only when the level is above 3.
parser.add_argument(
    "--compression_level",
    type=int,
    default=9,
    help="degree of compression (>=5: construct dictionary; >3: use qstrs)",
)
599
611
parser .add_argument (
600
612
"--compression_filename" ,
601
613
type = argparse .FileType ("w" , encoding = "UTF-8" ),
@@ -619,6 +631,6 @@ def output_translation_data(encoding_table, i18ns, out):
619
631
i18ns = sorted (i18ns )
620
632
translations = translate (args .translation , i18ns )
621
633
encoding_table = compute_huffman_coding (
622
- qstrs , args .translation , translations , args .compression_filename
634
+ qstrs , args .translation , translations , args .compression_filename , args . compression_level
623
635
)
624
636
output_translation_data (encoding_table , translations , args .translation_filename )
0 commit comments