diff --git a/lib/tutf8e/CMakeLists.txt b/lib/tutf8e/CMakeLists.txt index fcf2012af4e..35cbe01adab 100644 --- a/lib/tutf8e/CMakeLists.txt +++ b/lib/tutf8e/CMakeLists.txt @@ -14,4 +14,4 @@ set_property(TARGET tutf8e PROPERTY C_EXTENSIONS OFF) add_executable(tutf8e-test test/test.c) target_link_libraries(tutf8e-test tutf8e) set_property(TARGET tutf8e-test PROPERTY C_STANDARD 99) -set_property(TARGET tutf8e-test PROPERTY C_EXTENSIONS OFF) +set_property(TARGET tutf8e-test PROPERTY C_EXTENSIONS ON) diff --git a/lib/tutf8e/codegen.py b/lib/tutf8e/codegen.py index 27af1c81bd0..8d7c2c635a9 100755 --- a/lib/tutf8e/codegen.py +++ b/lib/tutf8e/codegen.py @@ -19,42 +19,107 @@ #include /* size_t */ #include /* uint16_t */ -/* Internal API */ +/*************** Internal API ***************/ -extern int tutf8e_string_length(const uint16_t *table, const char *i, size_t *ilen, size_t *olen); -extern int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen); +/* NUL-terminated C-string API */ -extern int tutf8e_buffer_length(const uint16_t *table, const char *i, size_t ilen, size_t *olen); -extern int tutf8e_buffer_encode(const uint16_t *table, const char *i, size_t ilen, char *o, size_t *olen); +extern int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length); +extern int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length); -/* Generic API */ +/* Known-length buffer API */ + +extern int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t input_length, const char *invalid, size_t *output_length); +extern int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length); + +/*************** Public API ***************/ + +/* Opaque handle type */ typedef void *TUTF8encoder; +/* Query encoder by name */ + extern 
TUTF8encoder tutf8e_encoder(const char *encoding); #define TUTF8E_OK 0 /* Success */ #define TUTF8E_INVALID 1 /* Invalid input character */ #define TUTF8E_TOOLONG 2 /* Insufficient output buffer */ -static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *i, size_t *ilen, size_t *olen) +/* + * tutf8e_encoder_string_length + * + * Determine the length of input and UTF8 encoded output of NUL-terminated string + * Performance: single pass O(n) + * + * output NUL terminator not counted + * + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t *input_length, size_t *output_length) { - return tutf8e_string_length((const uint16_t *) encoder, i, ilen, olen); + return tutf8e_string_length((const uint16_t *) encoder, input, invalid, input_length, output_length); } -static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *i, char *o, size_t *olen) +/* + * tutf8e_encoder_string_encode + * + * UTF8 encode NUL-terminated string + * Performance: two pass O(n) + * + * output string is NUL terminated + * output_length is output buffer size for input + * output_length is encoded length for output, including NUL + * + * - TUTF8E_TOOLONG if output buffer is insufficient + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *input, const char *invalid, char *output, size_t *output_length) { - return tutf8e_string_encode((const uint16_t *) encoder, i, o, olen); + return tutf8e_string_encode((const uint16_t *) encoder, input, invalid, output, output_length); } -static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *i, size_t ilen, size_t *length) +/* Known-length buffer API */ + +/* + * tutf8e_encoder_buffer_length + 
* + * Determine the length of UTF8 encoded output of known-length input + * Performance: single pass O(n) + * + * output NUL terminator not counted + * + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t input_length, size_t *length) { - return tutf8e_buffer_length((const uint16_t *) encoder, i, ilen, length); + return tutf8e_buffer_length((const uint16_t *) encoder, input, input_length, invalid, length); } -static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *i, size_t ilen, char *o, size_t *olen) +/* + * tutf8e_encoder_buffer_encode + * + * UTF8 encode string + * Performance: two pass O(n) + * + * output string is not NUL terminated + * + * output_length is output buffer size for input + * output_length is encoded length for output + * + * - TUTF8E_TOOLONG if output buffer is insufficient + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length) { - return tutf8e_buffer_encode((const uint16_t *) encoder, i, ilen, o, olen); + return tutf8e_buffer_encode((const uint16_t *) encoder, input, input_length, invalid, output, output_length); } ''') @@ -74,63 +139,65 @@ #include -/* Determine the input length and UTF8 encoded length of NUL-terminated input string */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_string_length(const uint16_t *table, const char *input, size_t *ilen, size_t *olen) +int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length) { + const size_t invalid_length = invalid ? 
strlen(invalid) : 0; + const unsigned char *i; - for (i = (const unsigned char *) input; *i; ++i, (*ilen)++) { + for (i = (const unsigned char *) input; *i; ++i, (*input_length)++) { const uint16_t c = table[*i]; if (c<0x80) { - *olen += 1; + *output_length += 1; continue; } if (c<0x800) { - *olen += 2; + *output_length += 2; continue; } if (c<0xffff) { - *olen += 3; + *output_length += 3; continue; } - return TUTF8E_INVALID; + if (invalid) { + *output_length += invalid_length; + } + else { + return TUTF8E_INVALID; + } } return TUTF8E_OK; } -/* UTF8 encode the given input string and table */ -/* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen) +int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length) { int ret; - size_t ilen = 0; - size_t length = 0; - if (!(ret = tutf8e_string_length(table, i, &ilen, &length))) + size_t input_length = 0; + size_t encoded_length = 0; + if (!(ret = tutf8e_string_length(table, input, invalid, &input_length, &encoded_length))) { - if (length+1 > *olen) return TUTF8E_TOOLONG; - if (!(ret = tutf8e_buffer_encode(table, i, ilen, o, olen))) + if (encoded_length+1 > *output_length) return TUTF8E_TOOLONG; + if (!(ret = tutf8e_buffer_encode(table, input, input_length, invalid, output, output_length))) { - o[length] = 0; + output[encoded_length] = 0; return TUTF8E_OK; } } return ret; } -/* Determine the length of the UTF8 encoding of given input string and table */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, size_t *length) +int tutf8e_buffer_length +( + const uint16_t *table, + 
const char *input, + size_t input_length, + const char *invalid, + size_t *length +) { + const size_t invalid_length = invalid ? strlen(invalid) : 0; + const unsigned char *i; - for (i = (const unsigned char *) input; ilen; ++i, --ilen) { + for (i = (const unsigned char *) input; input_length; ++i, --input_length) { const uint16_t c = table[*i]; if (c<0x80) { ++*length; @@ -144,23 +211,32 @@ *length += 3; continue; } - return TUTF8E_INVALID; + if (invalid) { + *length += invalid_length; + } + else { + return TUTF8E_INVALID; + } } return TUTF8E_OK; } -/* UTF8 encode the given input string and table */ -/* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, char *output, size_t *olen) +int tutf8e_buffer_encode +( + const uint16_t *table, + const char *input, + size_t input_length, + const char *invalid, + char *output, + size_t *output_length +) { - size_t left = *olen; + size_t invalid_length = invalid ? strlen(invalid) : 0; + + size_t left = *output_length; unsigned char *o = (unsigned char *) output; const unsigned char *i; - for (i = (const unsigned char *) input; ilen; ++i, --ilen) { + for (i = (const unsigned char *) input; input_length; ++i, --input_length) { const uint16_t c = table[*i]; if (c<0x80) { if (left<1) return TUTF8E_TOOLONG; @@ -183,9 +259,20 @@ left -= 3; continue; } - return TUTF8E_INVALID; + if (invalid) + { + if (left ? 
*/\n') + for i in tests: + if i[1] in ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']: + name = i[1].replace('-', '_').lower() + test.write(' output_length = sizeof(buffer);\n') + test.write(' copy = strdup(%s);\n'%(i[0])) + test.write(' copy[0] = 255;\n') + test.write(' buffer[0] = 255;\n') + test.write(' ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "?", buffer, &output_length);\n'%(name)) + test.write(' if (!ret && buffer[0]==\'?\') {\n') + test.write(' printf("%s\\n", buffer);\n') + test.write(' pass++;\n') + test.write(' } else {\n') + test.write(' printf("Failed to encode %s test\\n");\n'%(i[0])) + test.write(' fail++;\n') + test.write(' }\n') + test.write(' free(copy);\n') + test.write('\n') + + test.write('\n /* string encode to UTF8, first input character invalid -> [INVALID] */\n') + for i in tests: + if i[1] in ['iso-8859-6', 'iso-8859-7', 'iso-8859-11']: + name = i[1].replace('-', '_').lower() + test.write(' output_length = sizeof(buffer);\n') + test.write(' copy = strdup(%s);\n'%(i[0])) + test.write(' copy[0] = 255;\n') + test.write(' buffer[0] = 255;\n') + test.write(' ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, copy, "[INVALID]", buffer, &output_length);\n'%(name)) + test.write(' if (!ret && !strncmp(buffer, "[INVALID]", 9)) {\n') + test.write(' printf("%s\\n", buffer);\n') test.write(' pass++;\n') test.write(' } else {\n') test.write(' printf("Failed to encode %s test\\n");\n'%(i[0])) test.write(' fail++;\n') test.write(' }\n') + test.write(' free(copy);\n') test.write('\n') test.write(' printf("%d passed, %d failed tests\\n", pass, fail);\n') diff --git a/lib/tutf8e/include/tutf8e.h b/lib/tutf8e/include/tutf8e.h index 7c0befbd74a..478f514b572 100644 --- a/lib/tutf8e/include/tutf8e.h +++ b/lib/tutf8e/include/tutf8e.h @@ -5,42 +5,107 @@ #include /* size_t */ #include /* uint16_t */ -/* Internal API */ +/*************** Internal API ***************/ -extern int tutf8e_string_length(const uint16_t *table, const char *i, size_t 
*ilen, size_t *olen); -extern int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen); +/* NUL-terminated C-string API */ -extern int tutf8e_buffer_length(const uint16_t *table, const char *i, size_t ilen, size_t *olen); -extern int tutf8e_buffer_encode(const uint16_t *table, const char *i, size_t ilen, char *o, size_t *olen); +extern int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length); +extern int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length); -/* Generic API */ +/* Known-length buffer API */ + +extern int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t input_length, const char *invalid, size_t *output_length); +extern int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length); + +/*************** Public API ***************/ + +/* Opaque handle type */ typedef void *TUTF8encoder; +/* Query encoder by name */ + extern TUTF8encoder tutf8e_encoder(const char *encoding); #define TUTF8E_OK 0 /* Success */ #define TUTF8E_INVALID 1 /* Invalid input character */ #define TUTF8E_TOOLONG 2 /* Insufficient output buffer */ -static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *i, size_t *ilen, size_t *olen) +/* + * tutf8e_encoder_string_length + * + * Determine the length of input and UTF8 encoded output of NUL-terminated string + * Performance: single pass O(n) + * + * output NUL terminator not counted + * + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t *input_length, size_t *output_length) { - return tutf8e_string_length((const uint16_t *) encoder, i, ilen, olen); + return 
tutf8e_string_length((const uint16_t *) encoder, input, invalid, input_length, output_length); } -static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *i, char *o, size_t *olen) +/* + * tutf8e_encoder_string_encode + * + * UTF8 encode NUL-terminated string + * Performance: two pass O(n) + * + * output string is NUL terminated + * output_length is output buffer size for input + * output_length is encoded length for output, including NUL + * + * - TUTF8E_TOOLONG if output buffer is insufficient + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *input, const char *invalid, char *output, size_t *output_length) { - return tutf8e_string_encode((const uint16_t *) encoder, i, o, olen); + return tutf8e_string_encode((const uint16_t *) encoder, input, invalid, output, output_length); } -static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *i, size_t ilen, size_t *length) +/* Known-length buffer API */ + +/* + * tutf8e_encoder_buffer_length + * + * Determine the length of UTF8 encoded output of known-length input + * Performance: single pass O(n) + * + * output NUL terminator not counted + * + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *input, const char *invalid, size_t input_length, size_t *length) { - return tutf8e_buffer_length((const uint16_t *) encoder, i, ilen, length); + return tutf8e_buffer_length((const uint16_t *) encoder, input, input_length, invalid, length); } -static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *i, size_t ilen, char *o, size_t *olen) +/* + * tutf8e_encoder_buffer_encode + * + * UTF8 encode string + * Performance: two pass O(n) + * + * output string is not NUL terminated + * + * 
output_length is output buffer size for input + * output_length is encoded length for output + * + * - TUTF8E_TOOLONG if output buffer is insufficient + * - TUTF8E_INVALID if input character is not convertible + * - TUTF8E_OK for success + */ + +static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *input, size_t input_length, const char *invalid, char *output, size_t *output_length) { - return tutf8e_buffer_encode((const uint16_t *) encoder, i, ilen, o, olen); + return tutf8e_buffer_encode((const uint16_t *) encoder, input, input_length, invalid, output, output_length); } /* Supported encoders */ diff --git a/lib/tutf8e/src/tutf8e.c b/lib/tutf8e/src/tutf8e.c index 70d4602b79d..9e169becdb1 100644 --- a/lib/tutf8e/src/tutf8e.c +++ b/lib/tutf8e/src/tutf8e.c @@ -3,63 +3,65 @@ #include -/* Determine the input length and UTF8 encoded length of NUL-terminated input string */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_string_length(const uint16_t *table, const char *input, size_t *ilen, size_t *olen) +int tutf8e_string_length(const uint16_t *table, const char *input, const char *invalid, size_t *input_length, size_t *output_length) { + const size_t invalid_length = invalid ? 
strlen(invalid) : 0; + const unsigned char *i; - for (i = (const unsigned char *) input; *i; ++i, (*ilen)++) { + for (i = (const unsigned char *) input; *i; ++i, (*input_length)++) { const uint16_t c = table[*i]; if (c<0x80) { - *olen += 1; + *output_length += 1; continue; } if (c<0x800) { - *olen += 2; + *output_length += 2; continue; } if (c<0xffff) { - *olen += 3; + *output_length += 3; continue; } - return TUTF8E_INVALID; + if (invalid) { + *output_length += invalid_length; + } + else { + return TUTF8E_INVALID; + } } return TUTF8E_OK; } -/* UTF8 encode the given input string and table */ -/* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen) +int tutf8e_string_encode(const uint16_t *table, const char *input, const char *invalid, char *output, size_t *output_length) { int ret; - size_t ilen = 0; - size_t length = 0; - if (!(ret = tutf8e_string_length(table, i, &ilen, &length))) + size_t input_length = 0; + size_t encoded_length = 0; + if (!(ret = tutf8e_string_length(table, input, invalid, &input_length, &encoded_length))) { - if (length+1 > *olen) return TUTF8E_TOOLONG; - if (!(ret = tutf8e_buffer_encode(table, i, ilen, o, olen))) + if (encoded_length+1 > *output_length) return TUTF8E_TOOLONG; + if (!(ret = tutf8e_buffer_encode(table, input, input_length, invalid, output, output_length))) { - o[length] = 0; + output[encoded_length] = 0; return TUTF8E_OK; } } return ret; } -/* Determine the length of the UTF8 encoding of given input string and table */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, size_t *length) +int tutf8e_buffer_length +( + const uint16_t *table, + 
const char *input, + size_t input_length, + const char *invalid, + size_t *length +) { + const size_t invalid_length = invalid ? strlen(invalid) : 0; + const unsigned char *i; - for (i = (const unsigned char *) input; ilen; ++i, --ilen) { + for (i = (const unsigned char *) input; input_length; ++i, --input_length) { const uint16_t c = table[*i]; if (c<0x80) { ++*length; @@ -73,23 +75,32 @@ int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, *length += 3; continue; } - return TUTF8E_INVALID; + if (invalid) { + *length += invalid_length; + } + else { + return TUTF8E_INVALID; + } } return TUTF8E_OK; } -/* UTF8 encode the given input string and table */ -/* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ - -int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, char *output, size_t *olen) +int tutf8e_buffer_encode +( + const uint16_t *table, + const char *input, + size_t input_length, + const char *invalid, + char *output, + size_t *output_length +) { - size_t left = *olen; + size_t invalid_length = invalid ? strlen(invalid) : 0; + + size_t left = *output_length; unsigned char *o = (unsigned char *) output; const unsigned char *i; - for (i = (const unsigned char *) input; ilen; ++i, --ilen) { + for (i = (const unsigned char *) input; input_length; ++i, --input_length) { const uint16_t c = table[*i]; if (c<0x80) { if (left<1) return TUTF8E_TOOLONG; @@ -112,9 +123,20 @@ int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, left -= 3; continue; } - return TUTF8E_INVALID; + if (invalid) + { + if (left ? 
*/ + output_length = sizeof(buffer); + copy = strdup(greek); + copy[0] = 255; + buffer[0] = 255; + ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_7, copy, "?", buffer, &output_length); + if (!ret && buffer[0]=='?') { + printf("%s\n", buffer); + pass++; + } else { + printf("Failed to encode greek test\n"); + fail++; + } + free(copy); + + output_length = sizeof(buffer); + copy = strdup(thai); + copy[0] = 255; + buffer[0] = 255; + ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_11, copy, "?", buffer, &output_length); + if (!ret && buffer[0]=='?') { + printf("%s\n", buffer); + pass++; + } else { + printf("Failed to encode thai test\n"); + fail++; + } + free(copy); + + + /* string encode to UTF8, first input character invalid -> [INVALID] */ + output_length = sizeof(buffer); + copy = strdup(greek); + copy[0] = 255; + buffer[0] = 255; + ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_7, copy, "[INVALID]", buffer, &output_length); + if (!ret && !strncmp(buffer, "[INVALID]", 9)) { + printf("%s\n", buffer); + pass++; + } else { + printf("Failed to encode greek test\n"); + fail++; + } + free(copy); + + output_length = sizeof(buffer); + copy = strdup(thai); + copy[0] = 255; + buffer[0] = 255; + ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_11, copy, "[INVALID]", buffer, &output_length); + if (!ret && !strncmp(buffer, "[INVALID]", 9)) { + printf("%s\n", buffer); + pass++; + } else { + printf("Failed to encode thai test\n"); + fail++; + } + free(copy); + printf("%d passed, %d failed tests\n", pass, fail); }