|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
# Single-byte source encodings for which UTF8 encoders are generated:
# windows-1250 .. windows-1258, then iso-8859-1 .. iso-8859-16 without part 12.
encodings = (
    ['windows-%d' % codepage for codepage in range(1250, 1259)] +
    ['iso-8859-%d' % part for part in range(1, 17) if part != 12]
)
| 9 | + |
| 10 | +with open('src/tutf8e.c', 'w') as src: |
| 11 | + |
| 12 | + src.write(''' |
| 13 | +#include <tutf8e.h> |
| 14 | +
|
#include <errno.h> /* ENOENT, E2BIG */

/* Number of bytes in the UTF8 encoding of code point c */
/* return 0 if c is 0xffff, the table value marking a non-convertible input byte */

static size_t tutf8e_encoded_size(uint16_t c)
{
  if (c<0x80) return 1;
  if (c<0x800) return 2;
  if (c<0xffff) return 3;
  return 0;
}

/* Determine the input length and UTF8 encoded length of NUL-terminated input string */
/* lengths are added to *ilen and *olen, the caller is expected to zero them first */
/* return ENOENT if input character is not convertible */
/* return 0 for success */

int tutf8e_string_length(const uint16_t *table, const char *input, size_t *ilen, size_t *olen)
{
  for (const unsigned char *i = (const unsigned char *) input; *i; ++i, (*ilen)++) {
    const size_t n = tutf8e_encoded_size(table[*i]);
    if (!n) return ENOENT;
    *olen += n;
  }
  return 0;
}

/* Determine the length of the UTF8 encoding of given input string and table */
/* the encoded length is added to *length, the caller is expected to zero it first */
/* return ENOENT if input character is not convertible */
/* return 0 for success */

int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, size_t *length)
{
  for (const unsigned char *i = (const unsigned char *) input; ilen; ++i, --ilen) {
    const size_t n = tutf8e_encoded_size(table[*i]);
    if (!n) return ENOENT;
    *length += n;
  }
  return 0;
}

/* UTF8 encode the given input string and table */
/* olen input is output buffer size, output is encoded length */
/* olen is only updated on success, output may be partially written on failure */
/* return E2BIG if output buffer insufficient */
/* return ENOENT if input character is not convertible */
/* return 0 for success */

int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, char *output, size_t *olen)
{
  size_t left = *olen;
  unsigned char *o = (unsigned char *) output;
  for (const unsigned char *i = (const unsigned char *) input; ilen; ++i, --ilen) {
    const uint16_t c = table[*i];
    const size_t n = tutf8e_encoded_size(c);
    if (!n) return ENOENT;
    if (left<n) return E2BIG;
    switch (n) {
      case 1:
        *(o++) = (unsigned char) c;
        break;
      case 2:
        *(o++) = (unsigned char) (0xc0 | (c>>6));
        *(o++) = (unsigned char) (0x80 | (c&0x3f));
        break;
      default:
        *(o++) = (unsigned char) (0xe0 | (c>>12));
        *(o++) = (unsigned char) (0x80 | ((c>>6)&0x3f));
        *(o++) = (unsigned char) (0x80 | (c&0x3f));
        break;
    }
    left -= n;
  }
  /* buffer size minus unused space is the number of bytes written */
  *olen -= left;
  return 0;
}
| 105 | +''') |
| 106 | + |
# Generate the public header include/tutf8e.h and, for every encoding,
# a C source file src/<name>.c holding its lookup table and encoder wrappers.
with open('include/tutf8e.h', 'w') as include:

    # Include guard, standard headers and the table-driven internal API
    include.write('''
#ifndef TUTF8E_H
#define TUTF8E_H

#include <stddef.h> /* size_t */
#include <stdint.h> /* uint16_t */

/* Internal API */
extern int tutf8e_string_length(const uint16_t *table, const char *i, size_t *ilen, size_t *olen);
extern int tutf8e_buffer_length(const uint16_t *table, const char *i, size_t ilen, size_t *length);
extern int tutf8e_buffer_encode(const uint16_t *table, const char *i, size_t ilen, char *output, size_t *olen);

/* External API */
''')

    # Prototypes are grouped by kind; within a group encodings are sorted by name
    include.write('\n/* Encode NUL-terminated string to UTF8 */\n')
    for e in sorted(encodings):
        name = e.replace('-', '_').lower()
        include.write('extern int % -33s(char *output, size_t olen, const char *input);\n'%('tutf8e_string_encode_%s'%(name)))

    include.write('\n/* Encode NUL-terminated string to UTF8, realloc as necessary */\n')
    for e in sorted(encodings):
        name = e.replace('-', '_').lower()
        include.write('extern char * % -33s(char *input);\n'%('tutf8e_string_encode_%s_realloc'%(name)))

    include.write('\n/* Encode buffer to UTF8 */\n')
    for e in sorted(encodings):
        name = e.replace('-', '_').lower()
        include.write('extern int % -33s(char *output, size_t *olen, const char *input, size_t ilen);\n'%('tutf8e_buffer_encode_%s'%(name)))

    for e in sorted(encodings):

        # C identifier fragment for this encoding, e.g. iso-8859-1 -> iso_8859_1
        name = e.replace('-', '_').lower()

        print('Encoding: %s'%(e))

        with open('src/%s.c'%(name), 'w') as src:

            # Emit code

            src.write('#include <tutf8e.h>\n')
            src.write('\n')
            src.write('#include <string.h> /* strlen */\n')
            src.write('#include <stdlib.h> /* malloc/free */\n')
            src.write('\n')

            # Code point for each of the 256 byte values; bytes the codec
            # leaves undefined are stored as 0xffff.  Only UnicodeDecodeError
            # is handled here, so an unknown encoding name (LookupError) stops
            # the generator instead of silently producing an all-0xffff table.
            codepoints = []
            for byte in range(0, 256):
                try:
                    codepoints.append(ord(bytes([byte]).decode(e)[0]))
                except UnicodeDecodeError:
                    codepoints.append(0xffff)

            # Lookup table, 16 entries per row
            src.write('static const uint16_t %s_utf8[256] =\n'%(name))
            src.write('{\n')
            for row in range(0, 256, 16):
                src.write(' %s,\n'%(', '.join(['0x%04x'%(cp) for cp in codepoints[row:row+16]])))
            src.write('};\n')

            # String encoder: strlen + 1 so the terminating NUL is encoded too
            src.write('\n')
            src.write('int tutf8e_string_encode_%s(char *output, size_t olen, const char *input)\n'%(name))
            src.write('{\n')
            src.write(' size_t len = strlen(input) + 1;\n')
            src.write(' return tutf8e_buffer_encode(%s_utf8, input, len, output, &olen);\n'%(name))
            src.write('}\n')

            # Buffer encoder: thin wrapper binding this encoding's table
            src.write('\n')
            src.write('int tutf8e_buffer_encode_%s(char *output, size_t *olen, const char *input, size_t ilen)\n'%(name))
            src.write('{\n')
            src.write(' return tutf8e_buffer_encode(%s_utf8, input, ilen, output, olen);\n'%(name))
            src.write('}\n')

            # Reallocating encoder: returns a freshly malloc'ed UTF8 copy and
            # frees the input, or returns the input itself when it cannot be
            # measured/encoded or its encoded length equals its input length
            src.write('\n')
            src.write('char * tutf8e_string_encode_%s_realloc(char *input)\n'%(name))
            src.write('{\n')
            src.write(' size_t ilen = 0;\n')
            src.write(' size_t olen = 0;\n')
            src.write(' if (input && !tutf8e_string_length(%s_utf8, input, &ilen, &olen) && ilen && olen && ilen!=olen) {\n'%(name))
            src.write(' char * output = malloc(olen + 1);\n')
            src.write(' if (output && !tutf8e_buffer_encode(%s_utf8, input, ilen, output, &olen)) {\n'%(name))
            src.write(' free(input);\n')
            src.write(' output[olen] = 0;\n')
            src.write(' return output;\n')
            src.write(' }\n')
            src.write(' free(output);\n')
            src.write(' }\n')
            src.write(' return input;\n')
            src.write('}\n')

    # Close the include guard
    include.write('\n')
    include.write('#endif\n')
| 208 | + |
| 209 | +# TESTS |
| 210 | + |
| 211 | +# List of pangrams |
| 212 | +# http://clagnut.com/blog/2380/ |
| 213 | + |
# Pangram fixtures for the generated test program.
# Each entry is (C identifier, source encoding, text).
tests = [
    (
        'english',
        'iso-8859-1',
        'A quick brown fox jumps over the lazy dog',
    ),
    (
        'czech',
        'iso-8859-2',
        'Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu, tanga a quickstepu.',
    ),
    (
        'turkish',
        'iso-8859-3',
        'Pijamalı hasta yağız şoföre çabucak güvendi.',
    ),
    (
        'estonian',
        'iso-8859-4',
        'Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis',
    ),
    (
        'russian',
        'iso-8859-5',
        'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!',
    ),
    (
        'greek',
        'iso-8859-7',
        'διαφυλάξτε γενικά τη ζωή σας από βαθειά ψυχικά τραύματα',
    ),
    (
        'hebrew',
        'iso-8859-8',
        'עטלף אבק נס דרך מזגן שהתפוצץ כי חם',
    ),
    (
        'turkish2',
        'iso-8859-9',
        'Pijamalı hasta yağız şoföre çabucak güvendi.',
    ),
    (
        'swedish',
        'iso-8859-10',
        'Flygande bäckasiner söka hwila på mjuka tuvor.',
    ),
    (
        'thai',
        'iso-8859-11',
        'เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ',
    ),
    (
        'polish',
        'iso-8859-13',
        'Jeżu klątw, spłódź Finom część gry hańb!',
    ),
]
| 227 | + |
def _write_c_bytes(out, ident, payload):
    """Write payload plus a terminating 0 as a static const char array named ident."""
    data = list(payload) + [0]
    out.write(' static const char %s[] = {\n'%(ident))
    # 24 byte values per row
    for start in range(0, len(data), 24):
        out.write(' %s,\n'%(', '.join(['0x%02x'%(b) for b in data[start:start+24]])))
    out.write(' };\n')


# Generate test/test.c: for every fixture whose encoding is generated, check the
# string, buffer and reallocating encoders against Python's own UTF8 encoding.
with open('test/test.c', 'w') as test:

    # Only fixtures whose encoding is in the generated set are emitted
    selected = [t for t in tests if t[1] in encodings]

    test.write('#include <tutf8e.h>\n')
    test.write('\n')
    test.write('#include <stdio.h>\n')
    test.write('#include <string.h>\n')
    test.write('#include <stdlib.h>\n')
    test.write('\n')
    test.write('int main(int argc, char *argv[])\n')
    test.write('{\n')
    test.write(' int pass = 0;\n')
    test.write(' int fail = 0;\n')
    test.write(' int ret;\n')
    test.write(' size_t ilen, olen;\n')
    test.write(' char buffer[1024];\n')
    test.write(' char *encoded;\n')
    test.write('\n')

    # Input text in its source encoding
    for label, encoding, text in selected:
        _write_c_bytes(test, label, text.encode(encoding))

    # Expected result: the same text encoded as UTF8
    test.write('\n')
    for label, encoding, text in selected:
        _write_c_bytes(test, '%sUTF8'%(label), text.encode('utf-8'))

    test.write('\n /* string encode to UTF8 */\n')
    for label, encoding, text in selected:
        name = encoding.replace('-', '_').lower()
        test.write(' ret = tutf8e_string_encode_%s(buffer, sizeof(buffer), %s);\n'%(name, label))
        test.write(' if (!ret && !strcmp(buffer, %sUTF8)) {\n'%(label))
        test.write('    printf("%s\\n", buffer);\n')
        test.write(' pass++;\n')
        test.write(' } else {\n')
        test.write('          printf("Failed to encode %s test\\n");\n'%(label))
        test.write(' fail++;\n')
        test.write(' }\n')
        test.write('\n')

    test.write('\n /* buffer encode to UTF8 */\n')
    for label, encoding, text in selected:
        name = encoding.replace('-', '_').lower()
        test.write(' ilen = strlen(%s);\n'%(label))
        test.write(' olen = sizeof(buffer);\n')
        test.write(' ret = tutf8e_buffer_encode_%s(buffer, &olen, %s, ilen);\n'%(name, label))
        test.write(' if (!ret && (olen+1)==sizeof(%sUTF8) && !strncmp(buffer, %sUTF8, olen)) {\n'%(label, label))
        test.write(' pass++;\n')
        test.write(' } else {\n')
        test.write('          printf("Failed to encode %s test\\n");\n'%(label))
        test.write(' fail++;\n')
        test.write(' }\n')
        test.write('\n')

    test.write('\n /* string encode with possible re-allocation to UTF8 */\n')
    for label, encoding, text in selected:
        name = encoding.replace('-', '_').lower()
        test.write(' encoded = tutf8e_string_encode_%s_realloc(strdup(%s));\n'%(name, label))
        test.write(' if (encoded && !strcmp(encoded, %sUTF8)) {\n'%(label))
        test.write('    printf("%s\\n", encoded);\n')
        test.write(' pass++;\n')
        test.write(' } else {\n')
        test.write('          printf("Failed to encode %s test\\n");\n'%(label))
        test.write(' fail++;\n')
        test.write(' }\n')
        test.write(' free(encoded);\n')
        test.write('\n')

    test.write(' printf("%d passed, %d failed tests\\n", pass, fail);\n')

    # Report failures through the exit status so a build or CI run can detect them
    test.write(' return fail ? EXIT_FAILURE : EXIT_SUCCESS;\n')

    test.write('}\n')
0 commit comments