Skip to content

Commit bdb07ad

Browse files
committed
translations: Make decompression clearer
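The bigram code-point range is now written to the generated compression file as the `bigram_start` and `bigram_end` defines, which get filled in with values such as 128 (0x80) and 159 (0x9f).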
1 parent cbfd38d commit bdb07ad

File tree

2 files changed

+15
-9
lines changed

2 files changed

+15
-9
lines changed

py/makeqstrdata.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,14 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
159159
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
160160
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
161161
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
162-
f.write("const {} ngrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
162+
f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
163+
if len(ngrams) > 32:
164+
bigram_start = 0xe000
165+
else:
166+
bigram_start = 0x80
167+
bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
168+
f.write("#define bigram_start {}\n".format(bigram_start))
169+
f.write("#define bigram_end {}\n".format(bigram_end))
163170
return values, lengths, ngrams
164171

165172
def decompress(encoding_table, encoded, encoded_length_bits):

supervisor/shared/translate.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,19 +47,18 @@ STATIC int put_utf8(char *buf, int u) {
4747
if(u <= 0x7f) {
4848
*buf = u;
4949
return 1;
50-
} else if(MP_ARRAY_SIZE(ngrams) <= 64 && u <= 0xbf) {
50+
} else if(bigram_start <= u && u <= bigram_end) {
5151
int n = (u - 0x80) * 2;
52-
int ret = put_utf8(buf, ngrams[n]);
53-
return ret + put_utf8(buf + ret, ngrams[n+1]);
52+
// (note that at present, entries in the bigrams table are
53+
// guaranteed not to represent bigrams themselves, so this adds
54+
// at most 1 level of recursive call)
55+
int ret = put_utf8(buf, bigrams[n]);
56+
return ret + put_utf8(buf + ret, bigrams[n+1]);
5457
} else if(u <= 0x07ff) {
5558
*buf++ = 0b11000000 | (u >> 6);
5659
*buf = 0b10000000 | (u & 0b00111111);
5760
return 2;
58-
} else if(MP_ARRAY_SIZE(ngrams) > 64 && u >= 0xe000 && u <= 0xf8ff) {
59-
int n = (u - 0xe000) * 2;
60-
int ret = put_utf8(buf, ngrams[n]);
61-
return ret + put_utf8(buf + ret, ngrams[n+1]);
62-
} else { // u <= 0xffff)
61+
} else { // u <= 0xffff
6362
*buf++ = 0b11000000 | (u >> 12);
6463
*buf = 0b10000000 | ((u >> 6) & 0b00111111);
6564
*buf = 0b10000000 | (u & 0b00111111);

0 commit comments

Comments
 (0)