diff --git a/lib/tutf8e/CMakeLists.txt b/lib/tutf8e/CMakeLists.txt index 1561063dea0..d5f76023a6c 100644 --- a/lib/tutf8e/CMakeLists.txt +++ b/lib/tutf8e/CMakeLists.txt @@ -4,7 +4,9 @@ project(tutf8e) # Not supported: -std=c90 (lacks support for inline) # Supported: -std=gnu90, -std=c99 or -std=gnu99 -set(CMAKE_C_FLAGS "-Os -Wall") +# set(CMAKE_C_FLAGS "-Os -Wall") + +set(CMAKE_C_FLAGS "-g -Wall") include_directories(include) add_library(tutf8e STATIC src/tutf8e.c) @@ -15,3 +17,14 @@ add_executable(tutf8e-test test/test.c) target_link_libraries(tutf8e-test tutf8e) set_property(TARGET tutf8e-test PROPERTY C_STANDARD 99) set_property(TARGET tutf8e-test PROPERTY C_EXTENSIONS OFF) + +add_executable(tutf8e-test-invalid test/test-invalid.c) +target_link_libraries(tutf8e-test-invalid tutf8e) +set_property(TARGET tutf8e-test-invalid PROPERTY C_STANDARD 99) +set_property(TARGET tutf8e-test-invalid PROPERTY C_EXTENSIONS OFF) + + +add_executable(tutf8e-tester test/tester.c) +target_link_libraries(tutf8e-tester tutf8e) +set_property(TARGET tutf8e-tester PROPERTY C_STANDARD 99) +set_property(TARGET tutf8e-tester PROPERTY C_EXTENSIONS OFF) diff --git a/lib/tutf8e/codegen.py b/lib/tutf8e/codegen.py index 27af1c81bd0..e0d23563034 100755 --- a/lib/tutf8e/codegen.py +++ b/lib/tutf8e/codegen.py @@ -18,44 +18,35 @@ #include /* size_t */ #include /* uint16_t */ - -/* Internal API */ - -extern int tutf8e_string_length(const uint16_t *table, const char *i, size_t *ilen, size_t *olen); -extern int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen); - -extern int tutf8e_buffer_length(const uint16_t *table, const char *i, size_t ilen, size_t *olen); -extern int tutf8e_buffer_encode(const uint16_t *table, const char *i, size_t ilen, char *o, size_t *olen); +#include /* Generic API */ typedef void *TUTF8encoder; +extern int tutf8e_string_length(const TUTF8encoder encoder, const char *input, size_t *ilen, size_t *olen, uint32_t flags); +extern int 
tutf8e_string_encode(const TUTF8encoder encoder, const char *input, char *output, size_t *olen, uint32_t flags); +extern int tutf8e_buffer_length(const TUTF8encoder encoder, const char *input, size_t ilen, size_t *olen, uint32_t flags); +extern int tutf8e_buffer_encode(const TUTF8encoder encoder, const char *input, size_t ilen, char *output, size_t *olen, uint32_t flags); + extern TUTF8encoder tutf8e_encoder(const char *encoding); +extern uint32_t tutf8e_encoder_flag(const char *string_flag); -#define TUTF8E_OK 0 /* Success */ -#define TUTF8E_INVALID 1 /* Invalid input character */ -#define TUTF8E_TOOLONG 2 /* Insufficient output buffer */ +#define TUTF8E_OK 1 /* Success : changed */ +#define TUTF8E_SAME 0 /* Success : no change */ +#define TUTF8E_INVALID -1 /* Invalid input character */ +#define TUTF8E_TOOLONG -2 /* Insufficient output buffer */ -static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *i, size_t *ilen, size_t *olen) -{ - return tutf8e_string_length((const uint16_t *) encoder, i, ilen, olen); -} -static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *i, char *o, size_t *olen) -{ - return tutf8e_string_encode((const uint16_t *) encoder, i, o, olen); -} -static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *i, size_t ilen, size_t *length) -{ - return tutf8e_buffer_length((const uint16_t *) encoder, i, ilen, length); -} +#define TUTF8E_FLAG_INV_KEEP 0 /* illegal char: keep, just use as unicode codepoint */ +#define TUTF8E_FLAG_INV_FAIL 1 /* illegal char: fail on invalid char */ +#define TUTF8E_FLAG_INV_IGNORE 2 /* illegal char: skip/ignore invalid char */ +#define TUTF8E_FLAG_INV_REPLACEMENT 3 /* illegal char: convert to replacement character */ +#define TUTF8E_FLAG_INV_QUESTION 4 /* illegal char: convert to '?' 
*/ +#define TUTF8E_FLAG_INV_COPY 5 /* illegal char: just copy byte */ -static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *i, size_t ilen, char *o, size_t *olen) -{ - return tutf8e_buffer_encode((const uint16_t *) encoder, i, ilen, o, olen); -} +#define TUTF8E_FLAG_INV_MASK 0x07 /* illegal char mask */ ''') @@ -74,120 +65,203 @@ #include + +uint32_t tutf8e_encoder_flag(const char *string_flag) +{ + if(string_flag == NULL || *string_flag == 0) { + return 0; + } + switch(*string_flag) { + case 'f': + if(!strcmp(string_flag,"fail")) { + return TUTF8E_FLAG_INV_FAIL; + } + break; + + case 'i': + if(!strcmp(string_flag,"ignore")) { + return TUTF8E_FLAG_INV_IGNORE; + } + break; + + case 'k': + if(!strcmp(string_flag,"keep")) { + return TUTF8E_FLAG_INV_KEEP; + } + break; + + case 'r': + if(!strcmp(string_flag,"replacement")) { + return TUTF8E_FLAG_INV_REPLACEMENT; + } + break; + + case 'q': + if(!strcmp(string_flag,"question")) { + return TUTF8E_FLAG_INV_QUESTION; + } + break; + default: + break; + } + return (uint32_t)(-1); +} + /* Determine the input length and UTF8 encoded length of NUL-terminated input string */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return TUTF8E_INVALID if input character is not convertable TUTF8E_FLAG_INV_FAIL */ +/* return TUTF8E_SAME if string doesn't need changes */ +/* return TUTF8E_OK if string changes */ + -int tutf8e_string_length(const uint16_t *table, const char *input, size_t *ilen, size_t *olen) +int tutf8e_string_length(const TUTF8encoder encoder, const char *input, size_t *ilen, size_t *olen, uint32_t flags) { + const uint16_t *table = (uint16_t *) encoder; const unsigned char *i; - for (i = (const unsigned char *) input; *i; ++i, (*ilen)++) { + int changed = 0; + int len = 0; + for (i = (const unsigned char *) input; *i; ++i) { const uint16_t c = table[*i]; + if (c<0x80) { - *olen += 1; + len++; + if(c != *i) changed++; continue; } + 
changed++; if (c<0x800) { - *olen += 2; + len += 2; continue; } if (c<0xffff) { - *olen += 3; + len += 3; continue; } - return TUTF8E_INVALID; + switch(flags & TUTF8E_FLAG_INV_MASK) { + case TUTF8E_FLAG_INV_KEEP : len += *i < 0x80 ? 1 : 2; break; + case TUTF8E_FLAG_INV_FAIL : return TUTF8E_INVALID; + case TUTF8E_FLAG_INV_IGNORE : break; + case TUTF8E_FLAG_INV_REPLACEMENT : len += 3; break; + case TUTF8E_FLAG_INV_QUESTION : len++; break; + case TUTF8E_FLAG_INV_COPY : len++; break; + } } - return TUTF8E_OK; + *ilen = (char*)i- (char *)input; + *olen = len; + return changed ? TUTF8E_OK : TUTF8E_SAME; } + /* UTF8 encode the given input string and table */ /* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ + /* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return TUTF8E_OK for success, TUTF8E_TOOLONG if output buffer insufficient */ -int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen) +int tutf8e_string_encode(const TUTF8encoder encoder, const char *input, char *output, size_t *olen, uint32_t flags) { int ret; size_t ilen = 0; size_t length = 0; - if (!(ret = tutf8e_string_length(table, i, &ilen, &length))) - { - if (length+1 > *olen) return TUTF8E_TOOLONG; - if (!(ret = tutf8e_buffer_encode(table, i, ilen, o, olen))) - { - o[length] = 0; - return TUTF8E_OK; - } + + if ((ret = tutf8e_string_length(encoder, input, &ilen, &length, flags)) < 0) { + return ret; } - return ret; + + if (length+1 > *olen) return TUTF8E_TOOLONG; + + if ((ret = tutf8e_buffer_encode(encoder, input, ilen, output, olen, flags)) < 0) { + return ret; + } + + output[*olen] = 0; + + return TUTF8E_OK; } /* Determine the length of the UTF8 encoding of given input string and table */ /* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return TUTF8E_SAME if string doesn't need change */ +/* return TUTF8E_OK if string changes */ 
-int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, size_t *length) +int tutf8e_buffer_length(const TUTF8encoder encoder, const char *input, size_t ilen, size_t *olen, uint32_t flags) { + const uint16_t *table = (uint16_t *) encoder; const unsigned char *i; + int changed = 0; + int len = 0; for (i = (const unsigned char *) input; ilen; ++i, --ilen) { const uint16_t c = table[*i]; if (c<0x80) { - ++*length; + len++; + if(c != *i) changed++; continue; } + changed++; if (c<0x800) { - *length += 2; + len += 2; continue; } if (c<0xffff) { - *length += 3; + len += 3; continue; } - return TUTF8E_INVALID; + switch(flags & TUTF8E_FLAG_INV_MASK) { + case TUTF8E_FLAG_INV_KEEP : len += *i < 0x80 ? 1 : 2; break; + case TUTF8E_FLAG_INV_FAIL : return TUTF8E_INVALID; + case TUTF8E_FLAG_INV_IGNORE : break; + case TUTF8E_FLAG_INV_REPLACEMENT : len += 3; break; + case TUTF8E_FLAG_INV_QUESTION : len++; break; + case TUTF8E_FLAG_INV_COPY : len++; break; + } } - return TUTF8E_OK; + *olen = len; + return changed ? 
TUTF8E_OK : TUTF8E_SAME; } /* UTF8 encode the given input string and table */ /* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ /* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return >= 0 size of encoded string */ -int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, char *output, size_t *olen) +int tutf8e_buffer_encode(const TUTF8encoder encoder, const char *input, size_t ilen, char *output, size_t *olen, uint32_t flags) { - size_t left = *olen; + const uint16_t *table = (uint16_t *) encoder; unsigned char *o = (unsigned char *) output; const unsigned char *i; + + for (i = (const unsigned char *) input; ilen; ++i, --ilen) { - const uint16_t c = table[*i]; + uint16_t c = table[*i]; + + if(c == 0xffff) { + switch(flags & TUTF8E_FLAG_INV_MASK) { + case TUTF8E_FLAG_INV_KEEP : c = *i; break; + case TUTF8E_FLAG_INV_FAIL : return TUTF8E_INVALID; + case TUTF8E_FLAG_INV_IGNORE : continue; + case TUTF8E_FLAG_INV_REPLACEMENT : c = (uint16_t) 0xFFFD ; break; + case TUTF8E_FLAG_INV_QUESTION : c = (uint16_t) '?' 
; break; + case TUTF8E_FLAG_INV_COPY : *(o++) = *i; continue; + } + } + if (c<0x80) { - if (left<1) return TUTF8E_TOOLONG; *(o++) = c; - left -= 1; continue; } if (c<0x800) { - if (left<2) return TUTF8E_TOOLONG; *(o++) = 0xc0 | (c>>6); *(o++) = 0x80 | (c&0x3f); - left -= 2; continue; } - if (c<0xffff) { - if (left<3) return TUTF8E_TOOLONG; - *(o++) = 0xe0 | (c>>12); - *(o++) = 0x80 | ((c>>6)&0x3f); - *(o++) = 0x80 | (c&0x3f); - left -= 3; - continue; - } - return TUTF8E_INVALID; + *(o++) = 0xe0 | (c>>12); + *(o++) = 0x80 | ((c>>6)&0x3f); + *(o++) = 0x80 | (c&0x3f); } - *olen -= left; + *olen = (char*) o - output; return TUTF8E_OK; } + + ''') for e in sorted(encodings): @@ -220,14 +294,35 @@ src.write(''' TUTF8encoder tutf8e_encoder(const char *encoding) { + char last; + int slen = strlen(encoding); + if(slen == 0) return NULL; + last = encoding[slen-1]; ''') + + lasts = {} + for e in sorted(encodings): name = e.replace('-', '_').lower() - src.write(' if (!strcmp(encoding, "%s")) return tutf8e_encoder_%s;\n'%(e, name)) - src.write(''' - return NULL; -} -''') + lastChar = name[-1] + if not lastChar in lasts: + lasts[lastChar] = [] + lasts[lastChar].append((len(name), e , name)) + + src.write(" switch(last) {\n") + + for lastChar in lasts: + codings = lasts[lastChar] + src.write(" case '%s':\n" % (lastChar)) + for n in codings: + slen, encoding, name = n + src.write(' if (slen == %d && !memcmp(encoding, "%s", %d)) return tutf8e_encoder_%s;\n' % (slen, encoding, slen, name)) + src.write(' break;\n\n') + src.write(" default: break;\n\n") + src.write(" }\n") + src.write(" return NULL;\n") + src.write("}") + for e in sorted(encodings): @@ -245,30 +340,30 @@ '''%(name.upper(), name.upper())) include.write(''' -static inline int tutf8e_%s_string_length(const char *i, size_t *ilen, size_t *olen) +static inline int tutf8e_%s_string_length(const char *i, size_t *ilen, size_t *olen, uint32_t flags) { - return tutf8e_encoder_string_length(tutf8e_encoder_%s, i, ilen, olen); + 
return tutf8e_string_length(tutf8e_encoder_%s, i, ilen, olen, flags); } '''%(name, name)) include.write(''' -static inline int tutf8e_%s_string_encode(const char *i, char *o, size_t *olen) +static inline int tutf8e_%s_string_encode(const char *i, char *o, size_t *olen, uint32_t flags) { - return tutf8e_encoder_string_encode(tutf8e_encoder_%s, i, o, olen); + return tutf8e_string_encode(tutf8e_encoder_%s, i, o, olen, flags); } '''%(name, name)) include.write(''' -static inline int tutf8e_%s_buffer_length(const char *i, size_t ilen, size_t *length) +static inline int tutf8e_%s_buffer_length(const char *i, size_t ilen, size_t *length, uint32_t flags) { - return tutf8e_encoder_buffer_length(tutf8e_encoder_%s, i, ilen, length); + return tutf8e_buffer_length(tutf8e_encoder_%s, i, ilen, length, flags); } '''%(name, name)) include.write(''' -static inline int tutf8e_%s_buffer_encode(const char *i, size_t ilen, char *o, size_t *olen) +static inline int tutf8e_%s_buffer_encode(const char *i, size_t ilen, char *o, size_t *olen, uint32_t flags) { - return tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, i, ilen, o, olen); + return tutf8e_buffer_encode(tutf8e_encoder_%s, i, ilen, o, olen, flags); } '''%(name, name)) @@ -313,7 +408,11 @@ test.write(' int pass = 0;\n') test.write(' int fail = 0;\n') test.write(' int ret;\n') - test.write(' size_t ilen, olen;\n') + test.write(' size_t ilen;\n') + test.write(' size_t olen;\n') + test.write(' size_t xlen;\n') + test.write(' size_t ylen;\n') + test.write(' char buffer[1024];\n') # test.write(' char *encoded;\n') test.write('\n') @@ -335,17 +434,42 @@ test.write(' %s,\n'%(', '.join([ '0x%02x'%(j) for j in data[i:i+24]]))) test.write(' };\n') + + test.write('\n /* test length function */\n') + for i in tests: + if i[1] in encodings: + name = i[1].replace('-', '_').lower() + test.write(' xlen = strlen(%s);\n'%(i[0])) + test.write(' ylen = strlen(%sUTF8);\n'%(i[0])) + test.write(' olen = sizeof(buffer);\n') + test.write(' ret = tutf8e_string_length(tutf8e_encoder_%s, %s, 
&ilen, &olen, 0);\n'%(name, i[0])) + test.write(' if (ret < 0) {\n') + test.write(' printf("(length test) %s : got error %%d\\n", ret);\n'%(i[0])) + test.write(' fail++;\n') + test.write(' } else if(xlen != ilen) {\n'); + test.write(' printf("(length test) %s : ilen wrong %%zu != %%zu\\n", ilen , xlen);\n'%(i[0])) + test.write(' fail++;\n') + test.write(' } else if(ylen != olen){\n'); + test.write(' printf("(length test) %s : olen wrong %%zu != %%zu\\n", olen , ylen);\n'%(i[0])) + test.write(' fail++;\n') + test.write(' } else {\n') + test.write(' printf("(length test) %s : ok\\n");\n'%(i[0])) + test.write(' pass++;\n') + test.write(' }\n') + test.write('\n') + + test.write('\n /* string encode to UTF8 */\n') for i in tests: if i[1] in encodings: name = i[1].replace('-', '_').lower() test.write(' olen = sizeof(buffer);\n') - test.write(' ret = tutf8e_encoder_string_encode(tutf8e_encoder_%s, %s, buffer, &olen);\n'%(name, i[0])) - test.write(' if (!ret && !strcmp(buffer, %sUTF8)) {\n'%(i[0])) - test.write(' printf("%s\\n", buffer);\n') + test.write(' ret = tutf8e_string_encode(tutf8e_encoder_%s, %s, buffer, &olen, 0);\n'%(name, i[0])) + test.write(' if (ret >= 0 && !strcmp(buffer, %sUTF8)) {\n'%(i[0])) + test.write(' printf("(string test) : ok : %s : %%s\\n", buffer);\n' %(i[0])) test.write(' pass++;\n') test.write(' } else {\n') - test.write(' printf("Failed to encode %s test\\n");\n'%(i[0])) + test.write(' printf("(string test) Failed to encode %s test\\n");\n'%(i[0])) test.write(' fail++;\n') test.write(' }\n') test.write('\n') @@ -355,13 +479,20 @@ if i[1] in encodings: name = i[1].replace('-', '_').lower() test.write(' ilen = strlen(%s);\n'%(i[0])) - test.write(' olen = sizeof(buffer);\n') - test.write(' ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_%s, %s, ilen, buffer, &olen);\n'%(name, i[0])) - test.write(' if (!ret && (olen+1)==sizeof(%sUTF8) && !strncmp(buffer, %sUTF8, olen)) {\n'%(i[0], i[0])) - test.write(' pass++;\n') - test.write(' } else {\n') - 
test.write(' printf("Failed to encode %s test\\n");\n'%(i[0])) + test.write(' xlen = strlen(%sUTF8);\n'%(i[0])) + test.write(' ret = tutf8e_buffer_encode(tutf8e_encoder_%s, %s, ilen, buffer, &olen, 0);\n'%(name, i[0])) + test.write(' if (ret < 0) {\n') + test.write(' printf("(buffer test) Failed to encode %s test : ret(%%d)\\n", ret);\n'%(i[0])) + test.write(' fail++;\n') + test.write(' } else if (olen != xlen) {\n') + test.write(' printf("(buffer test) Failed to encode %s test : length diff : %%zu != %%zu\\n", olen, xlen);\n'%(i[0])) + test.write(' fail++;\n') + test.write(' } else if (strncmp(buffer, %sUTF8, olen)) {\n'%(i[0])) + test.write(' printf("(buffer test) Failed to encode %s test : output diffs=(%%s) expect(%%s)\\n", buffer, %sUTF8);\n'%(i[0],i[0])) test.write(' fail++;\n') + test.write(' } else {\n') + test.write(' printf("(buffer test) ok %s\\n");\n'%(i[0])) + test.write(' pass++;\n') test.write(' }\n') test.write('\n') diff --git a/lib/tutf8e/include/tutf8e.h b/lib/tutf8e/include/tutf8e.h index 7c0befbd74a..bb4ed4b02a0 100644 --- a/lib/tutf8e/include/tutf8e.h +++ b/lib/tutf8e/include/tutf8e.h @@ -4,44 +4,35 @@ #include /* size_t */ #include /* uint16_t */ - -/* Internal API */ - -extern int tutf8e_string_length(const uint16_t *table, const char *i, size_t *ilen, size_t *olen); -extern int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen); - -extern int tutf8e_buffer_length(const uint16_t *table, const char *i, size_t ilen, size_t *olen); -extern int tutf8e_buffer_encode(const uint16_t *table, const char *i, size_t ilen, char *o, size_t *olen); +#include /* Generic API */ typedef void *TUTF8encoder; +extern int tutf8e_string_length(const TUTF8encoder encoder, const char *input, size_t *ilen, size_t *olen, uint32_t flags); +extern int tutf8e_string_encode(const TUTF8encoder encoder, const char *input, char *output, size_t *olen, uint32_t flags); +extern int tutf8e_buffer_length(const TUTF8encoder encoder, const char *input, size_t ilen, size_t 
*olen, uint32_t flags); +extern int tutf8e_buffer_encode(const TUTF8encoder encoder, const char *input, size_t ilen, char *output, size_t *olen, uint32_t flags); + extern TUTF8encoder tutf8e_encoder(const char *encoding); +extern uint32_t tutf8e_encoder_flag(const char *string_flag); -#define TUTF8E_OK 0 /* Success */ -#define TUTF8E_INVALID 1 /* Invalid input character */ -#define TUTF8E_TOOLONG 2 /* Insufficient output buffer */ +#define TUTF8E_OK 1 /* Success : changed */ +#define TUTF8E_SAME 0 /* Success : no change */ +#define TUTF8E_INVALID -1 /* Invalid input character */ +#define TUTF8E_TOOLONG -2 /* Insufficient output buffer */ -static inline int tutf8e_encoder_string_length(const TUTF8encoder encoder, const char *i, size_t *ilen, size_t *olen) -{ - return tutf8e_string_length((const uint16_t *) encoder, i, ilen, olen); -} -static inline int tutf8e_encoder_string_encode(const TUTF8encoder encoder, const char *i, char *o, size_t *olen) -{ - return tutf8e_string_encode((const uint16_t *) encoder, i, o, olen); -} -static inline int tutf8e_encoder_buffer_length(const TUTF8encoder encoder, const char *i, size_t ilen, size_t *length) -{ - return tutf8e_buffer_length((const uint16_t *) encoder, i, ilen, length); -} +#define TUTF8E_FLAG_INV_KEEP 0 /* illegal char: keep, just use as unicode codepoint */ +#define TUTF8E_FLAG_INV_FAIL 1 /* illegal char: fail on invalid char */ +#define TUTF8E_FLAG_INV_IGNORE 2 /* illegal char: skip/ignore invalid char */ +#define TUTF8E_FLAG_INV_REPLACEMENT 3 /* illegal char: convert to replacement character */ +#define TUTF8E_FLAG_INV_QUESTION 4 /* illegal char: convert to '?' 
*/ +#define TUTF8E_FLAG_INV_COPY 5 /* illegal char: just copy byte */ -static inline int tutf8e_encoder_buffer_encode(const TUTF8encoder encoder, const char *i, size_t ilen, char *o, size_t *olen) -{ - return tutf8e_buffer_encode((const uint16_t *) encoder, i, ilen, o, olen); -} +#define TUTF8E_FLAG_INV_MASK 0x07 /* illegal char mask */ /* Supported encoders */ diff --git a/lib/tutf8e/src/tutf8e.c b/lib/tutf8e/src/tutf8e.c index 70d4602b79d..bec8cdc5a90 100644 --- a/lib/tutf8e/src/tutf8e.c +++ b/lib/tutf8e/src/tutf8e.c @@ -3,121 +3,204 @@ #include + +uint32_t tutf8e_encoder_flag(const char *string_flag) +{ + if(string_flag == NULL || *string_flag == 0) { + return 0; + } + switch(*string_flag) { + case 'f': + if(!strcmp(string_flag,"fail")) { + return TUTF8E_FLAG_INV_FAIL; + } + break; + + case 'i': + if(!strcmp(string_flag,"ignore")) { + return TUTF8E_FLAG_INV_IGNORE; + } + break; + + case 'k': + if(!strcmp(string_flag,"keep")) { + return TUTF8E_FLAG_INV_KEEP; + } + break; + + case 'r': + if(!strcmp(string_flag,"replacement")) { + return TUTF8E_FLAG_INV_REPLACEMENT; + } + break; + + case 'q': + if(!strcmp(string_flag,"question")) { + return TUTF8E_FLAG_INV_QUESTION; + } + break; + default: + break; + } + return (uint32_t)(-1); +} + /* Determine the input length and UTF8 encoded length of NUL-terminated input string */ -/* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return TUTF8E_INVALID if input character is not convertable TUTF8E_FLAG_INV_FAIL */ +/* return TUTF8E_SAME if string doesn't need changes */ +/* return TUTF8E_OK if string changes */ + -int tutf8e_string_length(const uint16_t *table, const char *input, size_t *ilen, size_t *olen) +int tutf8e_string_length(const TUTF8encoder encoder, const char *input, size_t *ilen, size_t *olen, uint32_t flags) { + const uint16_t *table = (uint16_t *) encoder; const unsigned char *i; - for (i = (const unsigned char *) input; *i; ++i, (*ilen)++) { + int changed = 
0; + int len = 0; + for (i = (const unsigned char *) input; *i; ++i) { const uint16_t c = table[*i]; + if (c<0x80) { - *olen += 1; + len++; + if(c != *i) changed++; continue; } + changed++; if (c<0x800) { - *olen += 2; + len += 2; continue; } if (c<0xffff) { - *olen += 3; + len += 3; continue; } - return TUTF8E_INVALID; + switch(flags & TUTF8E_FLAG_INV_MASK) { + case TUTF8E_FLAG_INV_KEEP : len += *i < 0x80 ? 1 : 2; break; + case TUTF8E_FLAG_INV_FAIL : return TUTF8E_INVALID; + case TUTF8E_FLAG_INV_IGNORE : break; + case TUTF8E_FLAG_INV_REPLACEMENT : len += 3; break; + case TUTF8E_FLAG_INV_QUESTION : len++; break; + case TUTF8E_FLAG_INV_COPY : len++; break; + } } - return TUTF8E_OK; + *ilen = (char*)i- (char *)input; + *olen = len; + return changed ? TUTF8E_OK : TUTF8E_SAME; } + /* UTF8 encode the given input string and table */ /* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ + /* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return TUTF8E_OK for success, TUTF8E_TOOLONG if output buffer insufficient */ -int tutf8e_string_encode(const uint16_t *table, const char *i, char *o, size_t *olen) +int tutf8e_string_encode(const TUTF8encoder encoder, const char *input, char *output, size_t *olen, uint32_t flags) { int ret; size_t ilen = 0; size_t length = 0; - if (!(ret = tutf8e_string_length(table, i, &ilen, &length))) - { - if (length+1 > *olen) return TUTF8E_TOOLONG; - if (!(ret = tutf8e_buffer_encode(table, i, ilen, o, olen))) - { - o[length] = 0; - return TUTF8E_OK; - } + + if ((ret = tutf8e_string_length(encoder, input, &ilen, &length, flags)) < 0) { + return ret; + } + + if (length+1 > *olen) return TUTF8E_TOOLONG; + + if ((ret = tutf8e_buffer_encode(encoder, input, ilen, output, olen, flags)) < 0) { + return ret; } - return ret; + + output[*olen] = 0; + + return TUTF8E_OK; } /* Determine the length of the UTF8 encoding of given input string and table */ /* return 
TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return TUTF8E_SAME if string doesn't need change */ +/* return TUTF8E_OK if string changes */ -int tutf8e_buffer_length(const uint16_t *table, const char *input, size_t ilen, size_t *length) +int tutf8e_buffer_length(const TUTF8encoder encoder, const char *input, size_t ilen, size_t *olen, uint32_t flags) { + const uint16_t *table = (uint16_t *) encoder; const unsigned char *i; + int changed = 0; + int len = 0; for (i = (const unsigned char *) input; ilen; ++i, --ilen) { const uint16_t c = table[*i]; if (c<0x80) { - ++*length; + len++; + if(c != *i) changed++; continue; } + changed++; if (c<0x800) { - *length += 2; + len += 2; continue; } if (c<0xffff) { - *length += 3; + len += 3; continue; } - return TUTF8E_INVALID; + switch(flags & TUTF8E_FLAG_INV_MASK) { + case TUTF8E_FLAG_INV_KEEP : len += *i < 0x80 ? 1 : 2; break; + case TUTF8E_FLAG_INV_FAIL : return TUTF8E_INVALID; + case TUTF8E_FLAG_INV_IGNORE : break; + case TUTF8E_FLAG_INV_REPLACEMENT : len += 3; break; + case TUTF8E_FLAG_INV_QUESTION : len++; break; + case TUTF8E_FLAG_INV_COPY : len++; break; + } } - return TUTF8E_OK; + *olen = len; + return changed ? 
TUTF8E_OK : TUTF8E_SAME; } /* UTF8 encode the given input string and table */ /* olen input is output buffer size, output is encoded length */ -/* return TUTF8E_TOOLONG if output buffer insuficient */ /* return TUTF8E_INVALID if input character is not convertable */ -/* return TUTF8E_OK for success */ +/* return >= 0 size of encoded string */ -int tutf8e_buffer_encode(const uint16_t *table, const char *input, size_t ilen, char *output, size_t *olen) +int tutf8e_buffer_encode(const TUTF8encoder encoder, const char *input, size_t ilen, char *output, size_t *olen, uint32_t flags) { - size_t left = *olen; + const uint16_t *table = (uint16_t *) encoder; unsigned char *o = (unsigned char *) output; const unsigned char *i; + + for (i = (const unsigned char *) input; ilen; ++i, --ilen) { - const uint16_t c = table[*i]; + uint16_t c = table[*i]; + + if(c == 0xffff) { + switch(flags & TUTF8E_FLAG_INV_MASK) { + case TUTF8E_FLAG_INV_KEEP : c = *i; break; + case TUTF8E_FLAG_INV_FAIL : return TUTF8E_INVALID; + case TUTF8E_FLAG_INV_IGNORE : continue; + case TUTF8E_FLAG_INV_REPLACEMENT : c = (uint16_t) 0xFFFD ; break; + case TUTF8E_FLAG_INV_QUESTION : c = (uint16_t) '?' 
; break; + case TUTF8E_FLAG_INV_COPY : *(o++) = *i; continue; + } + } + if (c<0x80) { - if (left<1) return TUTF8E_TOOLONG; *(o++) = c; - left -= 1; continue; } if (c<0x800) { - if (left<2) return TUTF8E_TOOLONG; *(o++) = 0xc0 | (c>>6); *(o++) = 0x80 | (c&0x3f); - left -= 2; continue; } - if (c<0xffff) { - if (left<3) return TUTF8E_TOOLONG; - *(o++) = 0xe0 | (c>>12); - *(o++) = 0x80 | ((c>>6)&0x3f); - *(o++) = 0x80 | (c&0x3f); - left -= 3; - continue; - } - return TUTF8E_INVALID; + *(o++) = 0xe0 | (c>>12); + *(o++) = 0x80 | ((c>>6)&0x3f); + *(o++) = 0x80 | (c&0x3f); } - *olen -= left; + *olen = (char*) o - output; return TUTF8E_OK; } + + const uint16_t tutf8e_iso_8859_1_utf8[256] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, @@ -625,30 +708,67 @@ const TUTF8encoder tutf8e_encoder_windows_1258 = (TUTF8encoder) tutf8e_windows_1 TUTF8encoder tutf8e_encoder(const char *encoding) { - if (!strcmp(encoding, "iso-8859-1")) return tutf8e_encoder_iso_8859_1; - if (!strcmp(encoding, "iso-8859-10")) return tutf8e_encoder_iso_8859_10; - if (!strcmp(encoding, "iso-8859-11")) return tutf8e_encoder_iso_8859_11; - if (!strcmp(encoding, "iso-8859-13")) return tutf8e_encoder_iso_8859_13; - if (!strcmp(encoding, "iso-8859-14")) return tutf8e_encoder_iso_8859_14; - if (!strcmp(encoding, "iso-8859-15")) return tutf8e_encoder_iso_8859_15; - if (!strcmp(encoding, "iso-8859-16")) return tutf8e_encoder_iso_8859_16; - if (!strcmp(encoding, "iso-8859-2")) return tutf8e_encoder_iso_8859_2; - if (!strcmp(encoding, "iso-8859-3")) return tutf8e_encoder_iso_8859_3; - if (!strcmp(encoding, "iso-8859-4")) return tutf8e_encoder_iso_8859_4; - if (!strcmp(encoding, "iso-8859-5")) return tutf8e_encoder_iso_8859_5; - if (!strcmp(encoding, "iso-8859-6")) return tutf8e_encoder_iso_8859_6; - if (!strcmp(encoding, "iso-8859-7")) return tutf8e_encoder_iso_8859_7; - if (!strcmp(encoding, "iso-8859-8")) return 
tutf8e_encoder_iso_8859_8; - if (!strcmp(encoding, "iso-8859-9")) return tutf8e_encoder_iso_8859_9; - if (!strcmp(encoding, "windows-1250")) return tutf8e_encoder_windows_1250; - if (!strcmp(encoding, "windows-1251")) return tutf8e_encoder_windows_1251; - if (!strcmp(encoding, "windows-1252")) return tutf8e_encoder_windows_1252; - if (!strcmp(encoding, "windows-1253")) return tutf8e_encoder_windows_1253; - if (!strcmp(encoding, "windows-1254")) return tutf8e_encoder_windows_1254; - if (!strcmp(encoding, "windows-1255")) return tutf8e_encoder_windows_1255; - if (!strcmp(encoding, "windows-1256")) return tutf8e_encoder_windows_1256; - if (!strcmp(encoding, "windows-1257")) return tutf8e_encoder_windows_1257; - if (!strcmp(encoding, "windows-1258")) return tutf8e_encoder_windows_1258; + char last; + int slen = strlen(encoding); + if(slen == 0) return NULL; + last = encoding[slen-1]; + switch(last) { + case '1': + if (slen == 10 && !memcmp(encoding, "iso-8859-1", 10)) return tutf8e_encoder_iso_8859_1; + if (slen == 11 && !memcmp(encoding, "iso-8859-11", 11)) return tutf8e_encoder_iso_8859_11; + if (slen == 12 && !memcmp(encoding, "windows-1251", 12)) return tutf8e_encoder_windows_1251; + break; + + case '0': + if (slen == 11 && !memcmp(encoding, "iso-8859-10", 11)) return tutf8e_encoder_iso_8859_10; + if (slen == 12 && !memcmp(encoding, "windows-1250", 12)) return tutf8e_encoder_windows_1250; + break; + + case '3': + if (slen == 11 && !memcmp(encoding, "iso-8859-13", 11)) return tutf8e_encoder_iso_8859_13; + if (slen == 10 && !memcmp(encoding, "iso-8859-3", 10)) return tutf8e_encoder_iso_8859_3; + if (slen == 12 && !memcmp(encoding, "windows-1253", 12)) return tutf8e_encoder_windows_1253; + break; + + case '4': + if (slen == 11 && !memcmp(encoding, "iso-8859-14", 11)) return tutf8e_encoder_iso_8859_14; + if (slen == 10 && !memcmp(encoding, "iso-8859-4", 10)) return tutf8e_encoder_iso_8859_4; + if (slen == 12 && !memcmp(encoding, "windows-1254", 12)) return 
tutf8e_encoder_windows_1254; + break; + + case '5': + if (slen == 11 && !memcmp(encoding, "iso-8859-15", 11)) return tutf8e_encoder_iso_8859_15; + if (slen == 10 && !memcmp(encoding, "iso-8859-5", 10)) return tutf8e_encoder_iso_8859_5; + if (slen == 12 && !memcmp(encoding, "windows-1255", 12)) return tutf8e_encoder_windows_1255; + break; + + case '6': + if (slen == 11 && !memcmp(encoding, "iso-8859-16", 11)) return tutf8e_encoder_iso_8859_16; + if (slen == 10 && !memcmp(encoding, "iso-8859-6", 10)) return tutf8e_encoder_iso_8859_6; + if (slen == 12 && !memcmp(encoding, "windows-1256", 12)) return tutf8e_encoder_windows_1256; + break; + + case '2': + if (slen == 10 && !memcmp(encoding, "iso-8859-2", 10)) return tutf8e_encoder_iso_8859_2; + if (slen == 12 && !memcmp(encoding, "windows-1252", 12)) return tutf8e_encoder_windows_1252; + break; + + case '7': + if (slen == 10 && !memcmp(encoding, "iso-8859-7", 10)) return tutf8e_encoder_iso_8859_7; + if (slen == 12 && !memcmp(encoding, "windows-1257", 12)) return tutf8e_encoder_windows_1257; + break; + + case '8': + if (slen == 10 && !memcmp(encoding, "iso-8859-8", 10)) return tutf8e_encoder_iso_8859_8; + if (slen == 12 && !memcmp(encoding, "windows-1258", 12)) return tutf8e_encoder_windows_1258; + break; + + case '9': + if (slen == 10 && !memcmp(encoding, "iso-8859-9", 10)) return tutf8e_encoder_iso_8859_9; + break; + + default: break; + } return NULL; -} +} \ No newline at end of file diff --git a/lib/tutf8e/test/test-invalid.c b/lib/tutf8e/test/test-invalid.c new file mode 100644 index 00000000000..98545b73857 --- /dev/null +++ b/lib/tutf8e/test/test-invalid.c @@ -0,0 +1,350 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/** + * Test invalid flags + * + */ + + +#include + +#include +#include +#include + +#include "acutest.h" + + + +/* + ca1252: + + char = 129 (8x81) => 0xFFFF + unicode: 0xc2 0x81 + + replacement char: + 0xfffd => 0xef 0xbf 0xbd +*/ + +/* + + olen = sizeof(buffer); + 
ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_1, english, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, englishUTF8)) { + printf("%s\n", buffer); + pass++; + } else { + printf("Failed to encode english test\n"); + fail++; + } + +*/ + +#define CHR(x) ((unsigned char)((x) & 0xff)) + + +static void test_basic(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + char ibuf[80]; + char obuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + TEST_CHECK(encoding != NULL); + + ret = tutf8e_string_length(encoding, "", &ilen, &olen, 0); + TEST_CHECK(ret == TUTF8E_SAME); + TEST_CHECK(olen == 0); + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, "", obuf, &olen, 0); + TEST_CHECK_(ret == TUTF8E_OK, "ret=%d", ret); + TEST_CHECK(strcmp("",obuf) == 0); + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, "abc", obuf, &olen, 0); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(strcmp("abc",obuf) == 0); + + ibuf[0] = 'a'; + ibuf[1] = CHR(0xe4); // 'ä' + ibuf[2] = 0; + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, 0); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(olen = 3); +} + + +static void test_toolong(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + char ibuf[80]; + char obuf[80]; + + + encoding = tutf8e_encoder("windows-1252"); + + TEST_CHECK(encoding != NULL); + + + ibuf[0] = 'a'; + ibuf[1] = CHR(0xe4); // 'ä' + ibuf[2] = 0; + + olen = 2; + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, 0); + TEST_CHECK(ret == TUTF8E_TOOLONG); +} + + +static void test_valid(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + char ibuf[80]; + char obuf[80]; + char tbuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + // aä => 0x61 0xc3 0xc4 + + ibuf[0] = 'a'; + ibuf[1] = CHR(0xe4); // 'ä' + ibuf[2] = 0; + + + tbuf[0] = 'a'; + tbuf[1] = CHR(0xc3); + tbuf[2] = CHR(0xa4); + tbuf[3] = 0; + + ret = tutf8e_string_length(encoding, ibuf, &ilen, &olen, 
0); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(ilen == 2); + TEST_CHECK(olen == 3); + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, ibuf, obuf,&olen, 0); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(olen == 3); + TEST_CHECK_(strcmp(obuf,(char*)tbuf) == 0, "(encoding) %s != %s", obuf, tbuf); +} + + +static void test_inv_keep(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + char ibuf[80]; + char obuf[80]; + char tbuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + // a 0x81 / 129 (euro) => 0xffff + // encoding: inevalid. + // char = 129 (8x81) => 0xFFFF + // direct unicode: 0xc2 0x81 + + ibuf[0] = 'a'; + ibuf[1] = CHR(0x81); // 'ä' + ibuf[2] = 0; + + + tbuf[0] = 'a'; + tbuf[1] = CHR(0xc2); + tbuf[2] = CHR(0x81); + tbuf[3] = 0; + + // flags = 0 == KEEP + + ret = tutf8e_string_length(encoding, ibuf, &ilen, &olen, TUTF8E_FLAG_INV_KEEP); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(ilen == 2); + TEST_CHECK(olen == 3); + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, TUTF8E_FLAG_INV_KEEP); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(olen == 3); + TEST_CHECK(strlen(obuf) == 3); + TEST_CHECK_(strcmp(obuf,(char*)tbuf) == 0, "(encoding) %s != %s", obuf, tbuf); +} + + +static void test_inv_ignore(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + char ibuf[80]; + char obuf[80]; + char tbuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + // a 0x81 / 129 => 0xffff + // char = 129 (8x81) => 0xFFFF + // direct unicode: 0xc2 0x81 + + ibuf[0] = 'a'; + ibuf[1] = CHR(0x81); // euro + ibuf[2] = 0; + + + tbuf[0] = 'a'; + tbuf[1] = 0; + + ret = tutf8e_string_length(encoding, ibuf, &ilen, &olen, TUTF8E_FLAG_INV_IGNORE); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(ilen == 2); + TEST_CHECK(olen == 1); + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, TUTF8E_FLAG_INV_IGNORE); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(olen == 1); + 
TEST_CHECK(strlen(obuf) == 1); + TEST_CHECK_(strcmp(obuf,(char*)tbuf) == 0, "() %s != %s", obuf, tbuf); +} + +static void test_inv_fail(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + char ibuf[80]; + char obuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + // a 0x81 / 129 => 0xffff + // char = 129 (8x81) => 0xFFFF + // direct unicode: 0xc2 0x81 + + ibuf[0] = 'a'; + ibuf[1] = CHR(0x81); // euro + ibuf[2] = 0; + + + ret = tutf8e_string_length(encoding, ibuf, &ilen, &olen, TUTF8E_FLAG_INV_FAIL); + TEST_CHECK(ret == TUTF8E_INVALID); + + olen = sizeof(obuf); + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, TUTF8E_FLAG_INV_FAIL); + TEST_CHECK(ret == TUTF8E_INVALID); +} + + + + +static void test_inv_question(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + size_t tlen; + char ibuf[80]; + char obuf[80]; + char tbuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + // a 0x81 / 129 => 0xffff + // char = 129 (8x81) => 0xFFFF + // direct unicode: 0xc2 0x81 + + ibuf[0] = 'a'; + ibuf[1] = CHR(0x81); // euro + ibuf[2] = 0; + + + tbuf[0] = 'a'; + tbuf[1] = '?'; + tbuf[2] = 0; + + tlen = strlen(tbuf); + + + ret = tutf8e_string_length(encoding, ibuf, &ilen, &olen, TUTF8E_FLAG_INV_QUESTION); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(ilen == 2); + TEST_CHECK(olen == 2); + + olen++; /* room for nul */ + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, TUTF8E_FLAG_INV_QUESTION); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(olen == 2); + TEST_CHECK(strlen(obuf) == 2); + TEST_CHECK_(strcmp(obuf,(char*)tbuf) == 0, "() %s != %s", obuf, tbuf); +} + + +/** + * unkown char is converted to 0xffff == 0xef, 0xbf, 0xbd (replacement char) + */ +static void test_inv_replacement(void) { + TUTF8encoder encoding; + int ret; + size_t olen; + size_t ilen; + size_t tlen; + char ibuf[80]; + char obuf[80]; + char tbuf[80]; + + encoding = tutf8e_encoder("windows-1252"); + + // a 0x81 / 129 => 0xffff + // char = 129 
(8x81) => 0xFFFF + // direct unicode: 0xc2 0x81 + // replacement 0xfffd => 0xef 0xbf 0xbd + + ibuf[0] = 'a'; + ibuf[1] = CHR(0x81); // euro + ibuf[2] = 0; + + + tbuf[0] = 'a'; + tbuf[1] = CHR(0xef); + tbuf[2] = CHR(0xbf); + tbuf[3] = CHR(0xbd); + tbuf[4] = 0; + + tlen = strlen(tbuf); + ret = tutf8e_string_length(encoding, ibuf, &ilen, &olen, TUTF8E_FLAG_INV_REPLACEMENT); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(ilen == 2); + TEST_CHECK(olen == 4); + + olen++; + ret = tutf8e_string_encode(encoding, ibuf, obuf, &olen, TUTF8E_FLAG_INV_REPLACEMENT); + TEST_CHECK(ret == TUTF8E_OK); + TEST_CHECK(olen == 4); + TEST_CHECK(strlen(obuf) == 4); + TEST_CHECK_(strcmp(obuf,(char*)tbuf) == 0, "() %s != %s", obuf, tbuf); +} + + + + +TEST_LIST = { + { "test-basic", test_basic }, + { "test-valid", test_valid }, + { "test-toolong", test_toolong }, + { "test-inv-keep", test_inv_keep }, + { "test-inv-ignore", test_inv_ignore }, + { "test-inv-fail", test_inv_fail }, + { "test-inv-replacement", test_inv_replacement }, + { "test-inv-question", test_inv_question }, + { NULL } +}; + diff --git a/lib/tutf8e/test/test.c b/lib/tutf8e/test/test.c index 43c1e3bde88..d9e47b99108 100644 --- a/lib/tutf8e/test/test.c +++ b/lib/tutf8e/test/test.c @@ -9,7 +9,10 @@ int main(int argc, char *argv[]) int pass = 0; int fail = 0; int ret; - size_t ilen, olen; + size_t ilen; + size_t olen; + size_t xlen; + size_t ylen; char buffer[1024]; static const char english[] = { @@ -166,247 +169,537 @@ int main(int argc, char *argv[]) 0x21, 0x00, }; - /* string encode to UTF8 */ + /* test length function */ + xlen = strlen(english); + ylen = strlen(englishUTF8); olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_1, english, buffer, &olen); - if (!ret && !strcmp(buffer, englishUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_1, english, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) english : got error %d\n", ret); + fail++; + } 
else if(xlen != ilen) { + printf("(length test) english : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) english : olen wrong %ld != %ld\n", olen , ylen); + fail++; + } else { + printf("(length test) english : ok\n"); pass++; + } + + xlen = strlen(finnish); + ylen = strlen(finnishUTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_1, finnish, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) finnish : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) finnish : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) finnish : olen wrong %ld != %ld\n", olen , ylen); + fail++; } else { - printf("Failed to encode english test\n"); + printf("(length test) finnish : ok\n"); + pass++; + } + + xlen = strlen(czech); + ylen = strlen(czechUTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_2, czech, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) czech : got error %d\n", ret); fail++; + } else if(xlen != ilen) { + printf("(length test) czech : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) czech : olen wrong %ld != %ld\n", olen , ylen); + fail++; + } else { + printf("(length test) czech : ok\n"); + pass++; } + xlen = strlen(turkish); + ylen = strlen(turkishUTF8); olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_1, finnish, buffer, &olen); - if (!ret && !strcmp(buffer, finnishUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_3, turkish, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) turkish : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) turkish : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) turkish : olen wrong %ld != %ld\n", olen 
, ylen); + fail++; + } else { + printf("(length test) turkish : ok\n"); pass++; + } + + xlen = strlen(estonian); + ylen = strlen(estonianUTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_4, estonian, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) estonian : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) estonian : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) estonian : olen wrong %ld != %ld\n", olen , ylen); + fail++; } else { - printf("Failed to encode finnish test\n"); + printf("(length test) estonian : ok\n"); + pass++; + } + + xlen = strlen(russian); + ylen = strlen(russianUTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_5, russian, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) russian : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) russian : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) russian : olen wrong %ld != %ld\n", olen , ylen); fail++; + } else { + printf("(length test) russian : ok\n"); + pass++; } + xlen = strlen(greek); + ylen = strlen(greekUTF8); olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_2, czech, buffer, &olen); - if (!ret && !strcmp(buffer, czechUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_7, greek, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) greek : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) greek : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) greek : olen wrong %ld != %ld\n", olen , ylen); + fail++; + } else { + printf("(length test) greek : ok\n"); pass++; + } + + xlen = strlen(hebrew); + ylen = strlen(hebrewUTF8); + olen = sizeof(buffer); + ret = 
tutf8e_string_length(tutf8e_encoder_iso_8859_8, hebrew, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) hebrew : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) hebrew : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) hebrew : olen wrong %ld != %ld\n", olen , ylen); + fail++; } else { - printf("Failed to encode czech test\n"); + printf("(length test) hebrew : ok\n"); + pass++; + } + + xlen = strlen(turkish2); + ylen = strlen(turkish2UTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_9, turkish2, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) turkish2 : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) turkish2 : ilen wrong %ld != %ld\n", ilen , xlen); fail++; + } else if(ylen != olen){ + printf("(length test) turkish2 : olen wrong %ld != %ld\n", olen , ylen); + fail++; + } else { + printf("(length test) turkish2 : ok\n"); + pass++; } + xlen = strlen(swedish); + ylen = strlen(swedishUTF8); olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_3, turkish, buffer, &olen); - if (!ret && !strcmp(buffer, turkishUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_10, swedish, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) swedish : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + printf("(length test) swedish : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) swedish : olen wrong %ld != %ld\n", olen , ylen); + fail++; + } else { + printf("(length test) swedish : ok\n"); pass++; + } + + xlen = strlen(thai); + ylen = strlen(thaiUTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_11, thai, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) thai : got error %d\n", ret); + fail++; + } else if(xlen != ilen) { + 
printf("(length test) thai : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) thai : olen wrong %ld != %ld\n", olen , ylen); + fail++; } else { - printf("Failed to encode turkish test\n"); + printf("(length test) thai : ok\n"); + pass++; + } + + xlen = strlen(polish); + ylen = strlen(polishUTF8); + olen = sizeof(buffer); + ret = tutf8e_string_length(tutf8e_encoder_iso_8859_13, polish, &ilen, &olen, 0); + if (ret < 0) { + printf("(length test) polish : got error %d\n", ret); fail++; + } else if(xlen != ilen) { + printf("(length test) polish : ilen wrong %ld != %ld\n", ilen , xlen); + fail++; + } else if(ylen != olen){ + printf("(length test) polish : olen wrong %ld != %ld\n", olen , ylen); + fail++; + } else { + printf("(length test) polish : ok\n"); + pass++; } + + /* string encode to UTF8 */ olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_4, estonian, buffer, &olen); - if (!ret && !strcmp(buffer, estonianUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_1, english, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, englishUTF8)) { + printf("(string test) : ok : english : %s\n", buffer); pass++; } else { - printf("Failed to encode estonian test\n"); + printf("(string test) Failed to encode english test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_5, russian, buffer, &olen); - if (!ret && !strcmp(buffer, russianUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_1, finnish, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, finnishUTF8)) { + printf("(string test) : ok : finnish : %s\n", buffer); pass++; } else { - printf("Failed to encode russian test\n"); + printf("(string test) Failed to encode finnish test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_7, greek, buffer, &olen); - if (!ret && 
!strcmp(buffer, greekUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_2, czech, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, czechUTF8)) { + printf("(string test) : ok : czech : %s\n", buffer); pass++; } else { - printf("Failed to encode greek test\n"); + printf("(string test) Failed to encode czech test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_8, hebrew, buffer, &olen); - if (!ret && !strcmp(buffer, hebrewUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_3, turkish, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, turkishUTF8)) { + printf("(string test) : ok : turkish : %s\n", buffer); pass++; } else { - printf("Failed to encode hebrew test\n"); + printf("(string test) Failed to encode turkish test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_9, turkish2, buffer, &olen); - if (!ret && !strcmp(buffer, turkish2UTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_4, estonian, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, estonianUTF8)) { + printf("(string test) : ok : estonian : %s\n", buffer); pass++; } else { - printf("Failed to encode turkish2 test\n"); + printf("(string test) Failed to encode estonian test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_10, swedish, buffer, &olen); - if (!ret && !strcmp(buffer, swedishUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_5, russian, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, russianUTF8)) { + printf("(string test) : ok : russian : %s\n", buffer); pass++; } else { - printf("Failed to encode swedish test\n"); + printf("(string test) Failed to encode russian test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_11, thai, buffer, 
&olen); - if (!ret && !strcmp(buffer, thaiUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_7, greek, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, greekUTF8)) { + printf("(string test) : ok : greek : %s\n", buffer); pass++; } else { - printf("Failed to encode thai test\n"); + printf("(string test) Failed to encode greek test\n"); fail++; } olen = sizeof(buffer); - ret = tutf8e_encoder_string_encode(tutf8e_encoder_iso_8859_13, polish, buffer, &olen); - if (!ret && !strcmp(buffer, polishUTF8)) { - printf("%s\n", buffer); + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_8, hebrew, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, hebrewUTF8)) { + printf("(string test) : ok : hebrew : %s\n", buffer); pass++; } else { - printf("Failed to encode polish test\n"); + printf("(string test) Failed to encode hebrew test\n"); fail++; } - - /* buffer encode to UTF8 */ - ilen = strlen(english); olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_1, english, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(englishUTF8) && !strncmp(buffer, englishUTF8, olen)) { + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_9, turkish2, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, turkish2UTF8)) { + printf("(string test) : ok : turkish2 : %s\n", buffer); pass++; } else { - printf("Failed to encode english test\n"); + printf("(string test) Failed to encode turkish2 test\n"); fail++; } - ilen = strlen(finnish); olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_1, finnish, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(finnishUTF8) && !strncmp(buffer, finnishUTF8, olen)) { + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_10, swedish, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, swedishUTF8)) { + printf("(string test) : ok : swedish : %s\n", buffer); pass++; } else { - printf("Failed to encode finnish test\n"); + printf("(string test) Failed to 
encode swedish test\n"); fail++; } - ilen = strlen(czech); olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_2, czech, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(czechUTF8) && !strncmp(buffer, czechUTF8, olen)) { + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_11, thai, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, thaiUTF8)) { + printf("(string test) : ok : thai : %s\n", buffer); pass++; } else { - printf("Failed to encode czech test\n"); + printf("(string test) Failed to encode thai test\n"); fail++; } - ilen = strlen(turkish); olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_3, turkish, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(turkishUTF8) && !strncmp(buffer, turkishUTF8, olen)) { + ret = tutf8e_string_encode(tutf8e_encoder_iso_8859_13, polish, buffer, &olen, 0); + if (ret >= 0 && !strcmp(buffer, polishUTF8)) { + printf("(string test) : ok : polish : %s\n", buffer); pass++; } else { - printf("Failed to encode turkish test\n"); + printf("(string test) Failed to encode polish test\n"); fail++; } - ilen = strlen(estonian); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_4, estonian, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(estonianUTF8) && !strncmp(buffer, estonianUTF8, olen)) { + + /* buffer encode to UTF8 */ + ilen = strlen(english); + xlen = strlen(englishUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_1, english, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode english test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode english test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, englishUTF8, olen)) { + printf("(buffer test) Failed to encode english test : output diffs=(%s) expect(%s)\n", buffer, englishUTF8); + fail++; + } else { + printf("(buffer test) ok english\n"); pass++; + } + + 
ilen = strlen(finnish); + xlen = strlen(finnishUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_1, finnish, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode finnish test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode finnish test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, finnishUTF8, olen)) { + printf("(buffer test) Failed to encode finnish test : output diffs=(%s) expect(%s)\n", buffer, finnishUTF8); + fail++; } else { - printf("Failed to encode estonian test\n"); + printf("(buffer test) ok finnish\n"); + pass++; + } + + ilen = strlen(czech); + xlen = strlen(czechUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_2, czech, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode czech test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode czech test : length diff : %ld != %ld\n", olen, xlen); fail++; + } else if (strncmp(buffer, czechUTF8, olen)) { + printf("(buffer test) Failed to encode czech test : output diffs=(%s) expect(%s)\n", buffer, czechUTF8); + fail++; + } else { + printf("(buffer test) ok czech\n"); + pass++; } - ilen = strlen(russian); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_5, russian, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(russianUTF8) && !strncmp(buffer, russianUTF8, olen)) { + ilen = strlen(turkish); + xlen = strlen(turkishUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_3, turkish, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode turkish test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode turkish test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, turkishUTF8, olen)) { + printf("(buffer test) Failed to encode turkish test : output diffs=(%s) expect(%s)\n", buffer, 
turkishUTF8); + fail++; + } else { + printf("(buffer test) ok turkish\n"); pass++; + } + + ilen = strlen(estonian); + xlen = strlen(estonianUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_4, estonian, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode estonian test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode estonian test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, estonianUTF8, olen)) { + printf("(buffer test) Failed to encode estonian test : output diffs=(%s) expect(%s)\n", buffer, estonianUTF8); + fail++; } else { - printf("Failed to encode russian test\n"); + printf("(buffer test) ok estonian\n"); + pass++; + } + + ilen = strlen(russian); + xlen = strlen(russianUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_5, russian, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode russian test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode russian test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, russianUTF8, olen)) { + printf("(buffer test) Failed to encode russian test : output diffs=(%s) expect(%s)\n", buffer, russianUTF8); fail++; + } else { + printf("(buffer test) ok russian\n"); + pass++; } ilen = strlen(greek); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_7, greek, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(greekUTF8) && !strncmp(buffer, greekUTF8, olen)) { - pass++; - } else { - printf("Failed to encode greek test\n"); + xlen = strlen(greekUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_7, greek, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode greek test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode greek test : length diff : %ld != %ld\n", olen, xlen); fail++; + } else if 
(strncmp(buffer, greekUTF8, olen)) { + printf("(buffer test) Failed to encode greek test : output diffs=(%s) expect(%s)\n", buffer, greekUTF8); + fail++; + } else { + printf("(buffer test) ok greek\n"); + pass++; } ilen = strlen(hebrew); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_8, hebrew, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(hebrewUTF8) && !strncmp(buffer, hebrewUTF8, olen)) { - pass++; - } else { - printf("Failed to encode hebrew test\n"); + xlen = strlen(hebrewUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_8, hebrew, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode hebrew test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode hebrew test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, hebrewUTF8, olen)) { + printf("(buffer test) Failed to encode hebrew test : output diffs=(%s) expect(%s)\n", buffer, hebrewUTF8); fail++; + } else { + printf("(buffer test) ok hebrew\n"); + pass++; } ilen = strlen(turkish2); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_9, turkish2, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(turkish2UTF8) && !strncmp(buffer, turkish2UTF8, olen)) { - pass++; - } else { - printf("Failed to encode turkish2 test\n"); + xlen = strlen(turkish2UTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_9, turkish2, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode turkish2 test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode turkish2 test : length diff : %ld != %ld\n", olen, xlen); fail++; + } else if (strncmp(buffer, turkish2UTF8, olen)) { + printf("(buffer test) Failed to encode turkish2 test : output diffs=(%s) expect(%s)\n", buffer, turkish2UTF8); + fail++; + } else { + printf("(buffer test) ok turkish2\n"); + pass++; } ilen = strlen(swedish); - olen 
= sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_10, swedish, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(swedishUTF8) && !strncmp(buffer, swedishUTF8, olen)) { - pass++; - } else { - printf("Failed to encode swedish test\n"); + xlen = strlen(swedishUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_10, swedish, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode swedish test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode swedish test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, swedishUTF8, olen)) { + printf("(buffer test) Failed to encode swedish test : output diffs=(%s) expect(%s)\n", buffer, swedishUTF8); fail++; + } else { + printf("(buffer test) ok swedish\n"); + pass++; } ilen = strlen(thai); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_11, thai, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(thaiUTF8) && !strncmp(buffer, thaiUTF8, olen)) { - pass++; - } else { - printf("Failed to encode thai test\n"); + xlen = strlen(thaiUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_11, thai, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode thai test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode thai test : length diff : %ld != %ld\n", olen, xlen); fail++; + } else if (strncmp(buffer, thaiUTF8, olen)) { + printf("(buffer test) Failed to encode thai test : output diffs=(%s) expect(%s)\n", buffer, thaiUTF8); + fail++; + } else { + printf("(buffer test) ok thai\n"); + pass++; } ilen = strlen(polish); - olen = sizeof(buffer); - ret = tutf8e_encoder_buffer_encode(tutf8e_encoder_iso_8859_13, polish, ilen, buffer, &olen); - if (!ret && (olen+1)==sizeof(polishUTF8) && !strncmp(buffer, polishUTF8, olen)) { - pass++; - } else { - printf("Failed to encode polish test\n"); + xlen = 
strlen(polishUTF8); + ret = tutf8e_buffer_encode(tutf8e_encoder_iso_8859_13, polish, ilen, buffer, &olen, 0); + if (ret < 0) { + printf("(buffer test) Failed to encode polish test : ret(%d)\n", ret); + } else if (olen != xlen) { + printf("(buffer test) Failed to encode polish test : length diff : %ld != %ld\n", olen, xlen); + fail++; + } else if (strncmp(buffer, polishUTF8, olen)) { + printf("(buffer test) Failed to encode polish test : output diffs=(%s) expect(%s)\n", buffer, polishUTF8); fail++; + } else { + printf("(buffer test) ok polish\n"); + pass++; } printf("%d passed, %d failed tests\n", pass, fail); diff --git a/lib/tutf8e/test/tester.c b/lib/tutf8e/test/tester.c new file mode 100644 index 00000000000..725abc23dc0 --- /dev/null +++ b/lib/tutf8e/test/tester.c @@ -0,0 +1,72 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +#include + +#include +#include +#include +#include + +#define TUTF8E_FLAG_INV_KEEP 0 /* illegal char: keep, just use as unicode codepoint */ +#define TUTF8E_FLAG_INV_FAIL 1 /* illegal char: fail on invalid char */ +#define TUTF8E_FLAG_INV_IGNORE 2 /* illegal char: skip/ignore invalid char */ +#define TUTF8E_FLAG_INV_REPLACEMENT 3 /* illegal char: convert to replacement character */ +#define TUTF8E_FLAG_INV_QUESTION 4 /* illegal char: convert to '?' */ +#define TUTF8E_FLAG_INV_COPY 5 /* illegal char: just copy byte */ + + +// 0xfffd => ef bf bd +// '?' 
=> 0x3f + + +#define BUFFER_SIZE (4 * 1024) + +static char ibuffer[BUFFER_SIZE]; +static char obuffer[BUFFER_SIZE]; + +int main(int argc, char *argv[]) +{ + uint32_t flags = 0; + TUTF8encoder encoder; + size_t olen; + int line = 0; + int ch; + int ret; + + const char *encoder_name = "windows-1252"; + + while ((ch = getopt(argc, argv, "kfirqc?e:")) != -1) { + switch (ch) { + case 'k': flags = TUTF8E_FLAG_INV_KEEP; break; + case 'f': flags = TUTF8E_FLAG_INV_FAIL; break; + case 'i': flags = TUTF8E_FLAG_INV_IGNORE; break; + case 'r': flags = TUTF8E_FLAG_INV_REPLACEMENT; break; + case 'q': + case '?': flags = TUTF8E_FLAG_INV_QUESTION; break; + case 'c': flags = TUTF8E_FLAG_INV_COPY; break; + case 'e': + encoder_name = strdup(optarg); + break; + default: + fprintf(stderr,"illegal code: %c", ch); + } + } + argc -= optind; + argv += optind; + + encoder = tutf8e_encoder(encoder_name); + if(!encoder) { + fprintf(stderr,"no such encoder: '%s'\n", encoder_name); + exit(1); + } + + while(fgets(ibuffer,BUFFER_SIZE-1, stdin)) { + line++; + olen = BUFFER_SIZE; + if((ret = tutf8e_string_encode(encoder, ibuffer, obuffer, &olen, flags)) < 0) { + fprintf(stderr, "[%d] failed: %d\n", line, ret); + continue; + } + fputs(obuffer,stdout); + } +}