makeqstrdata: correct range of low code points to 0x80..0x9f inclusive

jepler · jepler · commit c34cb82ecb26 · 2020-09-02T15:52:02.000-05:00
The previous range (0x80..0xbf) was unintentionally large: it overlapped some
characters we'd like to use (and also 0xa0, which we don't intentionally use).
diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
@@ -116,7 +116,7 @@ def pua_to_ngrams(compressed, ngrams):
     if len(ngrams) > 32:
         start, end = 0xe000, 0xf8ff
     else:
-        start, end = 0x80, 0xbf
+        start, end = 0x80, 0x9f
     return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
 
 def compute_huffman_coding(translations, qstrs, compression_filename):
@@ -146,6 +146,7 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         last_l = l
     lengths = bytearray()
     print("// length count", length_count)
+    print("// bigrams", ngrams)
     for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
     print("// values", values, "lengths", len(lengths), lengths)