Skip to content

Commit 1ba28b3

Browse files
authored
Merge pull request #3370 from jepler/compression-bigrams
add bigram compression to makeqstrdata (save ~100 bytes on trinket m0 de_DE)
2 parents 683462c + 0eee937 commit 1ba28b3

File tree

4 files changed

+54
-13
lines changed

4 files changed

+54
-13
lines changed

locale/cs.po

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,15 @@ msgstr ""
4444

4545
#: py/obj.c
4646
msgid " File \"%q\""
47-
msgstr "  Soubor \"%q\""
47+
msgstr " Soubor \"%q\""
4848

4949
#: py/obj.c
5050
msgid " File \"%q\", line %d"
51-
msgstr "  Soubor \"%q\", řádek %d"
51+
msgstr " Soubor \"%q\", řádek %d"
5252

5353
#: main.c
5454
msgid " output:\n"
55-
msgstr " výstup:\n"
55+
msgstr " výstup:\n"
5656

5757
#: py/objstr.c
5858
#, c-format

locale/pl.po

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1971,7 +1971,7 @@ msgstr "wartość kalibracji poza zakresem +/-127"
19711971

19721972
#: py/emitinlinethumb.c
19731973
msgid "can only have up to 4 parameters to Thumb assembly"
1974-
msgstr "asembler Thumb może przyjąć do 4 parameterów"
1974+
msgstr "asembler Thumb może przyjąć do 4 parameterów"
19751975

19761976
#: py/emitinlinextensa.c
19771977
msgid "can only have up to 4 parameters to Xtensa assembly"
@@ -3562,7 +3562,7 @@ msgstr ""
35623562
#~ msgstr "Nie udało się odkryć serwisów"
35633563

35643564
#~ msgid "Failed to get local address"
3565-
#~ msgstr "Nie udało się uzyskać lokalnego adresu"
3565+
#~ msgstr "Nie udało się uzyskać lokalnego adresu"
35663566

35673567
#~ msgid "Failed to get softdevice state"
35683568
#~ msgstr "Nie udało się odczytać stanu softdevice"
@@ -3610,7 +3610,7 @@ msgstr ""
36103610
#~ msgstr "Nie udało się zapisać gatts, błąd 0x%04x"
36113611

36123612
#~ msgid "Flash erase failed"
3613-
#~ msgstr "Nie udało się skasować flash"
3613+
#~ msgstr "Nie udało się skasować flash"
36143614

36153615
#~ msgid "Flash erase failed to start, err 0x%04x"
36163616
#~ msgstr "Nie udało się rozpocząć kasowania flash, błąd 0x%04x"

py/makeqstrdata.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,30 @@ def translate(translation_file, i18ns):
100100
translations.append((original, translation))
101101
return translations
102102

103+
def frequent_ngrams(corpus, sz, n):
    """Return the ``n`` most common substrings of length ``sz`` in ``corpus``.

    Args:
        corpus: The string to scan.
        sz: Length of each substring (n-gram) to count.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(ngram, count)`` pairs, most frequent first, as produced
        by ``collections.Counter.most_common``. The list is empty when
        ``corpus`` is shorter than ``sz``.
    """
    # The last full window starts at index len(corpus) - sz, so the range
    # must extend one past it; otherwise the final n-gram is never counted.
    windows = (corpus[i:i + sz] for i in range(len(corpus) - sz + 1))
    return collections.Counter(windows).most_common(n)
105+
106+
def encode_ngrams(translation, ngrams):
    """Replace every occurrence of each n-gram with a single placeholder character.

    The placeholder for ``ngrams[k]`` is the code point ``base + k``, where
    ``base`` is 0x80 when there are at most 32 n-grams and 0xe000 otherwise.
    Replacements are applied one n-gram at a time, in list order.

    Args:
        translation: The string to encode.
        ngrams: Sequence of substrings to substitute.

    Returns:
        The string with all listed n-grams replaced by their placeholders.
    """
    base = 0xe000 if len(ngrams) > 32 else 0x80
    encoded = translation
    for code_point, gram in enumerate(ngrams, base):
        encoded = encoded.replace(gram, chr(code_point))
    return encoded
114+
115+
def decode_ngrams(compressed, ngrams):
    """Expand n-gram placeholder characters back into the n-grams they stand for.

    Placeholders occupy the code points ``start .. start + len(ngrams) - 1``,
    where ``start`` is 0x80 when there are at most 32 n-grams and 0xe000
    otherwise. All other characters are copied through unchanged.

    Args:
        compressed: The string containing placeholder characters.
        ngrams: The n-gram table; index ``k`` corresponds to code point
            ``start + k``.

    Returns:
        The string with every placeholder replaced by its n-gram.
    """
    start = 0xe000 if len(ngrams) > 32 else 0x80
    # Bound the placeholder range by the actual table size (end is inclusive)
    # so a character that merely falls in the reserved range, but has no table
    # entry, is passed through instead of raising IndexError.
    end = start + len(ngrams) - 1
    return "".join(
        ngrams[ord(c) - start] if start <= ord(c) <= end else c
        for c in compressed
    )
121+
103122
def compute_huffman_coding(translations, qstrs, compression_filename):
104123
all_strings = [x[1] for x in translations]
105124
all_strings_concat = "".join(all_strings)
125+
ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
126+
all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
106127
counts = collections.Counter(all_strings_concat)
107128
cb = huffman.codebook(counts.items())
108129
values = []
@@ -125,21 +146,31 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
125146
last_l = l
126147
lengths = bytearray()
127148
print("// length count", length_count)
149+
print("// bigrams", ngrams)
128150
for i in range(1, max(length_count) + 2):
129151
lengths.append(length_count.get(i, 0))
130152
print("// values", values, "lengths", len(lengths), lengths)
131-
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
153+
ngramdata = [ord(ni) for i in ngrams for ni in i]
154+
print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
132155
print("//", values, lengths)
133156
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
134157
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
135158
with open(compression_filename, "w") as f:
136159
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
137160
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
138161
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
139-
return values, lengths
162+
f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
163+
if len(ngrams) > 32:
164+
bigram_start = 0xe000
165+
else:
166+
bigram_start = 0x80
167+
bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
168+
f.write("#define bigram_start {}\n".format(bigram_start))
169+
f.write("#define bigram_end {}\n".format(bigram_end))
170+
return values, lengths, ngrams
140171

141172
def decompress(encoding_table, encoded, encoded_length_bits):
142-
values, lengths = encoding_table
173+
values, lengths, ngrams = encoding_table
143174
dec = []
144175
this_byte = 0
145176
this_bit = 7
@@ -187,14 +218,16 @@ def decompress(encoding_table, encoded, encoded_length_bits):
187218
searched_length += lengths[bit_length]
188219

189220
v = values[searched_length + bits - max_code]
221+
v = decode_ngrams(v, ngrams)
190222
i += len(v.encode('utf-8'))
191223
dec.append(v)
192224
return ''.join(dec)
193225

194226
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
195227
if not isinstance(decompressed, str):
196228
raise TypeError()
197-
values, lengths = encoding_table
229+
values, lengths, ngrams = encoding_table
230+
decompressed = encode_ngrams(decompressed, ngrams)
198231
enc = bytearray(len(decompressed) * 3)
199232
#print(decompressed)
200233
#print(lengths)

supervisor/shared/translate.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "genhdr/compression.generated.h"
3535
#endif
3636

37+
#include "py/misc.h"
3738
#include "supervisor/serial.h"
3839

3940
void serial_write_compressed(const compressed_string_t* compressed) {
@@ -46,13 +47,20 @@ STATIC int put_utf8(char *buf, int u) {
4647
if(u <= 0x7f) {
4748
*buf = u;
4849
return 1;
50+
} else if(bigram_start <= u && u <= bigram_end) {
51+
int n = (u - 0x80) * 2;
52+
// (note that at present, entries in the bigrams table are
53+
// guaranteed not to represent bigrams themselves, so this adds
54+
// at most 1 level of recursive call
55+
int ret = put_utf8(buf, bigrams[n]);
56+
return ret + put_utf8(buf + ret, bigrams[n+1]);
4957
} else if(u <= 0x07ff) {
5058
*buf++ = 0b11000000 | (u >> 6);
5159
*buf = 0b10000000 | (u & 0b00111111);
5260
return 2;
53-
} else { // u <= 0xffff)
54-
*buf++ = 0b11000000 | (u >> 12);
55-
*buf = 0b10000000 | ((u >> 6) & 0b00111111);
61+
} else { // u <= 0xffff
62+
*buf++ = 0b11100000 | (u >> 12);
63+
*buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
5664
*buf = 0b10000000 | (u & 0b00111111);
5765
return 3;
5866
}

0 commit comments

Comments
 (0)