Skip to content

Commit fd8fa1c

Browse files
committed
filter-modify: Proof-of-concept integration of UTF8 encoding for string values
Signed-off-by: Nigel Stewart <[email protected]>
1 parent f2987f8 commit fd8fa1c

File tree

1 file changed

+309
-0
lines changed

1 file changed

+309
-0
lines changed

lib/tutf8e/codegen.py

Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
#!/usr/bin/env python3

# Single-byte encodings for which a 256-entry code point table and a set of
# UTF8 encoder functions are generated.  Each name is handed to
# bytes.decode() when the tables are built, and is turned into a C
# identifier by replacing '-' with '_'.
encodings = [
    'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
    'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257',
    'windows-1258',
    'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
    'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-10',
    'iso-8859-11', 'iso-8859-13', 'iso-8859-14', 'iso-8859-15', 'iso-8859-16',
]

# Emit src/tutf8e.c: the table-driven routines shared by every encoding.
# Each routine takes a 256-entry uint16_t table mapping an input byte to a
# code point; 0xffff marks a byte with no mapping and yields ENOENT.
#
# The generated file includes the standard <errno.h> (for ENOENT / E2BIG)
# rather than the platform-specific <sys/errno.h>.
with open('src/tutf8e.c', 'w') as src:

    src.write('''
#include <tutf8e.h>

#include <errno.h>

/* Determine the input length and UTF8 encoded length of NUL-terminated input string */
/* return ENOENT if input character is not convertable */
/* return 0 for success */

int tutf8e_string_length(const uint16_t *table, const char *input, size_t *ilen, size_t *olen)
{
  for (const unsigned char *i = (const unsigned char *) input; *i; ++i, (*ilen)++) {
    const uint16_t c = table[*i];
    if (c<0x80) {
      *olen += 1;
      continue;
    }
    if (c<0x800) {
      *olen += 2;
      continue;
    }
    if (c<0xffff) {
      *olen += 3;
      continue;
    }
    return ENOENT;
  }
  return 0;
}

/* Determine the length of the UTF8 encoding of given input string and table */
/* return ENOENT if input character is not convertable */
/* return 0 for success */

int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, size_t *length)
{
  for (const unsigned char *i = (const unsigned char *) input; ilen; ++i, --ilen) {
    const uint16_t c = table[*i];
    if (c<0x80) {
      ++*length;
      continue;
    }
    if (c<0x800) {
      *length += 2;
      continue;
    }
    if (c<0xffff) {
      *length += 3;
      continue;
    }
    return ENOENT;
  }
  return 0;
}

/* UTF8 encode the given input string and table */
/* olen input is output buffer size, output is encoded length */
/* return E2BIG if output buffer insuficient */
/* return ENOENT if input character is not convertable */
/* return 0 for success */

int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, char *output, size_t *olen)
{
  size_t left = *olen;
  unsigned char *o = (unsigned char *) output;
  for (const unsigned char *i = (const unsigned char *) input; ilen; ++i, --ilen) {
    const uint16_t c = table[*i];
    if (c<0x80) {
      if (left<1) return E2BIG;
      *(o++) = c;
      left -= 1;
      continue;
    }
    if (c<0x800) {
      if (left<2) return E2BIG;
      *(o++) = 0xc0 | (c>>6);
      *(o++) = 0x80 | (c&0x3f);
      left -= 2;
      continue;
    }
    if (c<0xffff) {
      if (left<3) return E2BIG;
      *(o++) = 0xe0 | (c>>12);
      *(o++) = 0x80 | ((c>>6)&0x3f);
      *(o++) = 0x80 | (c&0x3f);
      left -= 3;
      continue;
    }
    return ENOENT;
  }
  *olen -= left;
  return 0;
}
''')

# Emit include/tutf8e.h (the public header) and, for every encoding, a
# src/<name>.c file holding that encoding's lookup table and thin wrappers
# around the shared tutf8e_* routines.
with open('include/tutf8e.h', 'w') as include:

    include.write('''
#ifndef TUTF8E_H
#define TUTF8E_H

#include <stddef.h> /* size_t */
#include <stdint.h> /* uint16_t */

/* Internal API */
extern int tutf8e_string_length(const uint16_t *table, const char *i, size_t *ilen, size_t *olen);
extern int tutf8e_buffer_length(const uint16_t *table, const char *i, size_t ilen, size_t *length);
extern int tutf8e_buffer_encode(const uint16_t *table, const char *i, size_t ilen, char *output, size_t *olen);

/* External API */
''')

    # (encoding, C identifier) pairs in the order they are emitted everywhere.
    targets = [(e, e.replace('-', '_').lower()) for e in sorted(encodings)]

    include.write('\n/* Encode NUL-terminated string to UTF8 */\n')
    for e, name in targets:
        include.write('extern int % -33s(char *output, size_t olen, const char *input);\n'%('tutf8e_string_encode_%s'%(name)))

    include.write('\n/* Encode NUL-terminated string to UTF8, realloc as necessary */\n')
    for e, name in targets:
        include.write('extern char * % -33s(char *input);\n'%('tutf8e_string_encode_%s_realloc'%(name)))

    include.write('\n/* Encode buffer to UTF8 */\n')
    for e, name in targets:
        include.write('extern int % -33s(char *output, size_t *olen, const char *input, size_t ilen);\n'%('tutf8e_buffer_encode_%s'%(name)))

    for e, name in targets:

        print('Encoding: %s'%(e))

        # Code point for each of the 256 byte values.  Bytes the codec leaves
        # undefined raise UnicodeDecodeError and are stored as 0xffff, which
        # the generated C treats as "not convertible".  Any other error (for
        # example an unknown codec name) is allowed to propagate.  The table
        # is built before the output file is opened so that such a failure
        # does not leave a truncated source file behind.
        table = []
        for byte in range(256):
            try:
                table.append(ord(bytes([byte]).decode(e)[0]))
            except UnicodeDecodeError:
                table.append(0xffff)

        with open('src/%s.c'%(name), 'w') as src:

            # Emit code

            src.write('#include <tutf8e.h>\n')
            src.write('\n')
            src.write('#include <string.h> /* strlen */\n')
            src.write('#include <stdlib.h> /* malloc/free */\n')
            src.write('\n')

            # Lookup table, sixteen entries per row
            src.write('static const uint16_t %s_utf8[256] =\n'%(name))
            src.write('{\n')
            for row in range(0, 256, 16):
                src.write('  %s,\n'%(', '.join(['0x%04x'%(cp) for cp in table[row:row+16]])))
            src.write('};\n')

            # NUL-terminated string encoder; the terminator is encoded too
            src.write('\n')
            src.write('int tutf8e_string_encode_%s(char *output, size_t olen, const char *input)\n'%(name))
            src.write('{\n')
            src.write('  size_t len = strlen(input) + 1;\n')
            src.write('  return tutf8e_buffer_encode(%s_utf8, input, len, output, &olen);\n'%(name))
            src.write('}\n')

            # Explicit-length buffer encoder
            src.write('\n')
            src.write('int tutf8e_buffer_encode_%s(char *output, size_t *olen, const char *input, size_t ilen)\n'%(name))
            src.write('{\n')
            src.write('  return tutf8e_buffer_encode(%s_utf8, input, ilen, output, olen);\n'%(name))
            src.write('}\n')

            # Encoder that replaces a heap string with a newly allocated UTF8
            # copy, or hands back the input unchanged when nothing is to be
            # done or a step fails
            src.write('\n')
            src.write('char * tutf8e_string_encode_%s_realloc(char *input)\n'%(name))
            src.write('{\n')
            src.write('  size_t ilen = 0;\n')
            src.write('  size_t olen = 0;\n')
            src.write('  if (input && !tutf8e_string_length(%s_utf8, input, &ilen, &olen) && ilen && olen && ilen!=olen) {\n'%(name))
            src.write('    char * output = malloc(olen + 1);\n')
            src.write('    if (output && !tutf8e_buffer_encode(%s_utf8, input, ilen, output, &olen)) {\n'%(name))
            src.write('      free(input);\n')
            src.write('      output[olen] = 0;\n')
            src.write('      return output;\n')
            src.write('    }\n')
            src.write('    free(output);\n')
            src.write('  }\n')
            src.write('  return input;\n')
            src.write('}\n')

    include.write('\n')
    include.write('#endif\n')

# TESTS

# List of pangrams
# http://clagnut.com/blog/2380/

# Each entry is (C identifier for the test, source encoding, sample text).
# The text is encoded with the named encoding to form the test input and
# with UTF-8 to form the expected output.
tests = [
    ('english',  'iso-8859-1',  'A quick brown fox jumps over the lazy dog'),
    ('czech',    'iso-8859-2',  'Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu, tanga a quickstepu.'),
    ('turkish',  'iso-8859-3',  'Pijamalı hasta yağız şoföre çabucak güvendi.'),
    ('estonian', 'iso-8859-4',  'Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis'),
    ('russian',  'iso-8859-5',  'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!'),
    ('greek',    'iso-8859-7',  'διαφυλάξτε γενικά τη ζωή σας από βαθειά ψυχικά τραύματα'),
    ('hebrew',   'iso-8859-8',  'עטלף אבק נס דרך מזגן שהתפוצץ כי חם'),
    ('turkish2', 'iso-8859-9',  'Pijamalı hasta yağız şoföre çabucak güvendi.'),
    ('swedish',  'iso-8859-10', 'Flygande bäckasiner söka hwila på mjuka tuvor.'),
    ('thai',     'iso-8859-11', 'เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ'),
    ('polish',   'iso-8859-13', 'Jeżu klątw, spłódź Finom część gry hańb!'),
]

def _write_byte_array(out, identifier, payload):
    """Write payload plus a terminating NUL as a C `static const char` array.

    out        -- writable text file receiving the C code
    identifier -- name of the generated C array
    payload    -- bytes to emit, 24 per row, as 0x%02x literals
    """
    data = list(payload) + [0]
    out.write('  static const char %s[] = {\n'%(identifier))
    for start in range(0, len(data), 24):
        out.write('    %s,\n'%(', '.join(['0x%02x'%(b) for b in data[start:start+24]])))
    out.write('  };\n')


# Emit test/test.c: a self-contained program that runs each pangram through
# the string, buffer and realloc encoders and compares against the UTF-8
# bytes produced by Python.
with open('test/test.c', 'w') as test:

    # Only pangrams whose encoding is in the generated set can be tested.
    cases = [(label, enc, text) for (label, enc, text) in tests if enc in encodings]

    test.write('#include <tutf8e.h>\n')
    test.write('\n')
    test.write('#include <stdio.h>\n')
    test.write('#include <string.h>\n')
    test.write('#include <stdlib.h>\n')
    test.write('\n')
    test.write('int main(int argc, char *argv[])\n')
    test.write('{\n')
    test.write('  int pass = 0;\n')
    test.write('  int fail = 0;\n')
    test.write('  int ret;\n')
    test.write('  size_t ilen, olen;\n')
    test.write('  char buffer[1024];\n')
    test.write('  char *encoded;\n')
    test.write('\n')

    # Inputs, in their native encoding
    for label, enc, text in cases:
        _write_byte_array(test, label, text.encode(enc))

    # Expected outputs, in UTF-8
    test.write('\n')
    for label, enc, text in cases:
        _write_byte_array(test, '%sUTF8'%(label), text.encode('utf-8'))

    test.write('\n  /* string encode to UTF8 */\n')
    for label, enc, text in cases:
        name = enc.replace('-', '_').lower()
        test.write('  ret = tutf8e_string_encode_%s(buffer, sizeof(buffer), %s);\n'%(name, label))
        test.write('  if (!ret && !strcmp(buffer, %sUTF8)) {\n'%(label))
        test.write('    printf("%s\\n", buffer);\n')
        test.write('    pass++;\n')
        test.write('  } else {\n')
        test.write('    printf("Failed to encode %s test\\n");\n'%(label))
        test.write('    fail++;\n')
        test.write('  }\n')
        test.write('\n')

    test.write('\n  /* buffer encode to UTF8 */\n')
    for label, enc, text in cases:
        name = enc.replace('-', '_').lower()
        test.write('  ilen = strlen(%s);\n'%(label))
        test.write('  olen = sizeof(buffer);\n')
        test.write('  ret = tutf8e_buffer_encode_%s(buffer, &olen, %s, ilen);\n'%(name, label))
        test.write('  if (!ret && (olen+1)==sizeof(%sUTF8) && !strncmp(buffer, %sUTF8, olen)) {\n'%(label, label))
        test.write('    pass++;\n')
        test.write('  } else {\n')
        test.write('    printf("Failed to encode %s test\\n");\n'%(label))
        test.write('    fail++;\n')
        test.write('  }\n')
        test.write('\n')

    test.write('\n  /* string encode with possible re-allocation to UTF8 */\n')
    for label, enc, text in cases:
        name = enc.replace('-', '_').lower()
        test.write('  encoded = tutf8e_string_encode_%s_realloc(strdup(%s));\n'%(name, label))
        test.write('  if (encoded && !strcmp(encoded, %sUTF8)) {\n'%(label))
        test.write('    printf("%s\\n", encoded);\n')
        test.write('    pass++;\n')
        test.write('  } else {\n')
        test.write('    printf("Failed to encode %s test\\n");\n'%(label))
        test.write('    fail++;\n')
        test.write('  }\n')
        test.write('  free(encoded);\n')
        test.write('\n')

    test.write('  printf("%d passed, %d failed tests\\n", pass, fail);\n')
    # Report failures through the exit status so a build or CI step running
    # the test binary notices them; without this main() always exits 0.
    test.write('  return fail ? 1 : 0;\n')

    test.write('}\n')

0 commit comments

Comments
 (0)