diff --git a/plugins/in_tail/tail.c b/plugins/in_tail/tail.c index 8ecdbb1fdc5..41ac94a95bc 100644 --- a/plugins/in_tail/tail.c +++ b/plugins/in_tail/tail.c @@ -826,6 +826,15 @@ static struct flb_config_map config_map[] = { "Currently, UTF-16LE, UTF-16BE, auto are supported.", }, #endif + { + FLB_CONFIG_MAP_STR, "generic.encoding", NULL, + 0, FLB_FALSE, 0, + "specify the preferred input encoding for converting to UTF-8. " + "Currently, the following encodings are supported: " + "ShiftJIS, UHC, GBK, GB18030, Big5, " + "Win866, Win874, " + "Win1250, Win1251, Win1252, Win1253, Win1254, Win1255, Win1256", + }, /* EOF */ {0} }; diff --git a/plugins/in_tail/tail_config.c b/plugins/in_tail/tail_config.c index a97a8a63f7c..f06e1c3ec63 100644 --- a/plugins/in_tail/tail_config.c +++ b/plugins/in_tail/tail_config.c @@ -36,9 +36,7 @@ #include "tail_multiline.h" #endif -#ifdef FLB_HAVE_UNICODE_ENCODER #include -#endif static int multiline_load_parsers(struct flb_tail_config *ctx) { @@ -114,6 +112,7 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins, #ifdef FLB_HAVE_UNICODE_ENCODER ctx->preferred_input_encoding = FLB_UNICODE_ENCODING_UNSPECIFIED; #endif + ctx->generic_input_encoding_type = FLB_GENERIC_UNSPECIFIED; /* Default is unspecified */ /* Load the config map */ ret = flb_input_config_map_set(ins, (void *) ctx); @@ -222,6 +221,20 @@ struct flb_tail_config *flb_tail_config_create(struct flb_input_instance *ins, } #endif + tmp = flb_input_get_property("generic.encoding", ins); + if (tmp) { + ret = flb_unicode_generic_select_encoding_type(tmp); + if (ret != FLB_GENERIC_UNSPECIFIED) { + ctx->generic_input_encoding_type = ret; + ctx->generic_input_encoding_name = tmp; + } + else { + flb_plg_error(ctx->ins, "invalid encoding 'generic.encoding' value %s", tmp); + flb_free(ctx); + return NULL; + } + } + #ifdef FLB_HAVE_PARSER /* Config: multi-line support */ if (ctx->multiline == FLB_TRUE) { diff --git a/plugins/in_tail/tail_config.h 
b/plugins/in_tail/tail_config.h index 911eb8cd18c..326dba870a4 100644 --- a/plugins/in_tail/tail_config.h +++ b/plugins/in_tail/tail_config.h @@ -129,6 +129,8 @@ struct flb_tail_config { #ifdef FLB_HAVE_UNICODE_ENCODER int preferred_input_encoding; #endif + int generic_input_encoding_type; + const char *generic_input_encoding_name; /* Multiline */ int multiline; /* multiline enabled ? */ diff --git a/plugins/in_tail/tail_file.c b/plugins/in_tail/tail_file.c index b68bb3fc467..8aa3eb10783 100644 --- a/plugins/in_tail/tail_file.c +++ b/plugins/in_tail/tail_file.c @@ -48,9 +48,7 @@ #include "win32.h" #endif -#ifdef FLB_HAVE_UNICODE_ENCODER #include -#endif #include @@ -445,8 +443,8 @@ static int process_content(struct flb_tail_file *file, size_t *bytes) time_t now = time(NULL); struct flb_time out_time = {0}; struct flb_tail_config *ctx; -#ifdef FLB_HAVE_UNICODE_ENCODER char *decoded = NULL; +#ifdef FLB_HAVE_UNICODE_ENCODER size_t decoded_len; #endif @@ -485,6 +483,20 @@ static int process_content(struct flb_tail_file *file, size_t *bytes) } } #endif + if (ctx->generic_input_encoding_type != FLB_GENERIC_UNSPECIFIED) { + original_len = end - data; + decoded = NULL; + ret = flb_unicode_generic_convert_to_utf8(ctx->generic_input_encoding_name, + (unsigned char*)data, (unsigned char**)&decoded, + end - data); + if (ret > 0) { + data = decoded; + end = data + strlen(decoded); + } + else { /* the '*' precision of "%.*s" must be an int, so narrow the ptrdiff_t explicitly */ + flb_plg_error(ctx->ins, "encoding failed '%.*s' with status %d", (int) (end - data), data, ret); + } + } /* Skip null characters from the head (sometimes introduced by copy-truncate log rotation) */ while (data < end && *data == '\0') { diff --git a/tests/runtime/data/tail/generate_generic_encoder_testing_data.py b/tests/runtime/data/tail/generate_generic_encoder_testing_data.py new file mode 100644 index 00000000000..c1bdf236557 --- /dev/null +++ b/tests/runtime/data/tail/generate_generic_encoder_testing_data.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +import os + +# This script generates 
a set of text files for testing various character encodings. +# Each file contains a curated list of common, neutral words appropriate for the +# target language and encoding. +# +# The word lists specifically exclude: +# - Religious terminology +# - Names of capital cities +# +# To use this script: +# 1. Save it as a Python file (e.g., `generate_generic_encoder_testing_data.py`). +# 2. Run it from your terminal: `python3 generate_generic_encoder_testing_data.py` +# 3. The script writes one `generic_enc_<name>.log` file per encoding into the existing `log/` directory next to it. + +# Dictionary of encodings and their corresponding test data. +# The keys are the encoding names (and will be used in the filenames). +# The values are lists of strings to be written to the files. +ENCODING_DATA = { + # --- East Asian Encodings --- + "sjis": [ + "こんにちは", # Hello + "ありがとう", # Thank you + "さようなら", # Goodbye + "日本", # Japan + "猫", # Cat + "犬", # Dog + "食べる", # To eat + "飲む", # To drink + "空", # Sky + "海", # Sea + "月", # Moon + "花", # Flower + ], + "big5": [ + "你好", # Hello + "謝謝", # Thank you + "再見", # Goodbye + "貓", # Cat + "狗", # Dog + "吃", # To eat + "喝", # To drink + "天", # Sky + "海", # Sea + "月亮", # Moon + "花卉", # Flower + ], + "gbk": [ + "你好", # Hello + "谢谢", # Thank you + "再见", # Goodbye + "中国", # China + "猫", # Cat + "狗", # Dog + "吃", # To eat + "喝", # To drink + "天", # Sky + "海", # Sea + "月亮", # Moon + "花", # Flower + ], + "gb18030": [ # Superset of GBK, can include the same + more + "你好", "谢谢", "再见", "中国", "猫", "狗", "吃", "喝", "天", "海", + "欧元符号€", # Euro symbol to test expanded range + "龘", "龍", # Complex characters + ], + "euc-kr": [ # Often used for Korean, UHC is a Microsoft equivalent + "안녕하세요", # Hello + "감사합니다", # Thank you + "안녕히 가세요",# Goodbye + "한국", # Korea + "고양이", # Cat + "개", # Dog + "먹다", # To eat + "마시다", # To drink + "하늘", # Sky + "바다", # Sea + "달", # Moon + "꽃", # Flower + ], + + # --- Windows Codepage Encodings --- + "cp866": [ # Cyrillic (DOS) + "Привет", # Hello + "Спасибо", # Thank you + "До свидания", # Goodbye + "Компьютер", # Computer 
+ "Информация", # Information + "Программа", # Program + "Файл", # File + ], + "cp874": [ # Thai + "สวัสดี", # Hello + "ขอบคุณ", # Thank you + "ลาก่อน", # Goodbye + "ภาษาไทย", # Thai language + "แมว", # Cat + "สุนัข", # Dog + "กิน", # Eat + "ดื่ม", # Drink + ], + "cp1250": [ # Central European (Polish, Czech, etc.) + "Cześć", "Dziękuję", # Polish + "Ahoj", "Děkuji", # Czech + "Žluťoučký kůň", # Czech phrase with diacritics + "Gęślą jaźń", # Polish phrase with diacritics + "Árvíztűrő tükörfúrógép", # Hungarian + ], + "cp1251": [ # Cyrillic (Windows) + "Привет", "Спасибо", "До свидания", + "Кошка", "Собака", "Небо", "Море", + "Български език", # Bulgarian + "Українська мова",# Ukrainian + "Беларуская мова",# Belarusian + ], + "cp1252": [ # Western European + "Hello", "Thank you", "Goodbye", # English + "Bonjour", "Merci", "Au revoir", # French + "Hallo", "Danke", "Auf Wiedersehen", # German + "Hola", "Gracias", "Adiós", # Spanish + "Crème brûlée", "Piñata", "Fjord", + ], + "cp1253": [ # Greek + "Γειά σου", # Hello + "Ευχαριστώ", # Thank you + "Αντίο", # Goodbye + "Ελληνικά", # Greek + "Γάτα", # Cat + "Σκύλος", # Dog + "Ουρανός", # Sky + "Θάλασσα", # Sea + ], + "cp1254": [ # Turkish + "Merhaba", "Teşekkür ederim", "Hoşça kal", + "Türkiye", "Kedi", "Köpek", + "Yemek", "İçmek", "Gök", "Deniz", + "Öğrenci", "Işık", "Ağaç", # Words with specific Turkish chars + ], + "cp1255": [ # Hebrew + "שלום", # Hello/Peace + "תודה", # Thank you + "להתראות", # Goodbye + "עברית", # Hebrew + "חתול", # Cat + "כלב", # Dog + "שמיים", # Sky + "ים", # Sea + ], + "cp1256": [ # Arabic + "مرحبا", # Hello + "شكرا", # Thank you + "مع السلامة", # Goodbye + "العربية", # Arabic + "قط", # Cat + "كلب", # Dog + "سماء", # Sky + "بحر", # Sea + ], +} + +def generate_files(): + """ + Iterates through the ENCODING_DATA dictionary and creates a file for each entry. + """ + # Get the directory where the script is running to save files there. 
+ output_dir = os.path.dirname(os.path.abspath(__file__)) + print(f"Files will be generated in: {output_dir}\n") + + for encoding, content_list in ENCODING_DATA.items(): + # Sanitize encoding name for use in filename, replacing cp with win + # for clarity as requested. UHC is an alias for euc-kr in this context. + if encoding.startswith("cp"): + filename_prefix = encoding.replace("cp", "win") + elif encoding == "euc-kr": + filename_prefix = "uhc" + else: + filename_prefix = encoding + + file_path = os.path.join(output_dir, "log", f"generic_enc_{filename_prefix}.log") + + try: + # Open the file with the specified encoding + with open(file_path, 'w', encoding=encoding) as f: + # Join the list of words with newline characters + f.write('\n'.join(content_list)) + f.write('\n') + print(f"Successfully created: {os.path.basename(file_path)} (Encoding: {encoding})") + + except UnicodeEncodeError as e: + print(f"Error: Could not encode content for '{encoding}'.") + print(f" - File not created: {os.path.basename(file_path)}") + print(f" - Details: {e}") + except Exception as e: + print(f"An unexpected error occurred for '{encoding}': {e}") + +if __name__ == "__main__": + generate_files() diff --git a/tests/runtime/data/tail/log/generic_enc_big5.log b/tests/runtime/data/tail/log/generic_enc_big5.log new file mode 100644 index 00000000000..46a8e277906 --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_big5.log @@ -0,0 +1,11 @@ +An + +A + + +Y + + + +G +c diff --git a/tests/runtime/data/tail/log/generic_enc_gb18030.log b/tests/runtime/data/tail/log/generic_enc_gb18030.log new file mode 100644 index 00000000000..dc2f533a68d --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_gb18030.log @@ -0,0 +1,13 @@ + +лл +ټ +й +è + + + + + +ŷԪŢ + + diff --git a/tests/runtime/data/tail/log/generic_enc_gbk.log b/tests/runtime/data/tail/log/generic_enc_gbk.log new file mode 100644 index 00000000000..d5862ffacd2 --- /dev/null +++ 
b/tests/runtime/data/tail/log/generic_enc_gbk.log @@ -0,0 +1,12 @@ + +лл +ټ +й +è + + + + + + + diff --git a/tests/runtime/data/tail/log/generic_enc_sjis.log b/tests/runtime/data/tail/log/generic_enc_sjis.log new file mode 100644 index 00000000000..2329939be27 --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_sjis.log @@ -0,0 +1,12 @@ +ɂ +肪Ƃ +悤Ȃ +{ +L + +Hׂ + + +C + + diff --git a/tests/runtime/data/tail/log/generic_enc_uhc.log b/tests/runtime/data/tail/log/generic_enc_uhc.log new file mode 100644 index 00000000000..aa854290a8f --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_uhc.log @@ -0,0 +1,12 @@ +ȳϼ +մϴ +ȳ +ѱ + + +Դ +ô +ϴ +ٴ + + diff --git a/tests/runtime/data/tail/log/generic_enc_win1250.log b/tests/runtime/data/tail/log/generic_enc_win1250.log new file mode 100644 index 00000000000..167592edd03 --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1250.log @@ -0,0 +1,7 @@ +Cze +Dzikuj +Ahoj +Dkuji +luouk k +Gl ja +rvztr tkrfrgp diff --git a/tests/runtime/data/tail/log/generic_enc_win1251.log b/tests/runtime/data/tail/log/generic_enc_win1251.log new file mode 100644 index 00000000000..4dd56096778 --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1251.log @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/tests/runtime/data/tail/log/generic_enc_win1252.log b/tests/runtime/data/tail/log/generic_enc_win1252.log new file mode 100644 index 00000000000..f7c48fcbd8e --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1252.log @@ -0,0 +1,15 @@ +Hello +Thank you +Goodbye +Bonjour +Merci +Au revoir +Hallo +Danke +Auf Wiedersehen +Hola +Gracias +Adis +Crme brle +Piata +Fjord diff --git a/tests/runtime/data/tail/log/generic_enc_win1253.log b/tests/runtime/data/tail/log/generic_enc_win1253.log new file mode 100644 index 00000000000..dbd66e524ef --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1253.log @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/runtime/data/tail/log/generic_enc_win1254.log 
b/tests/runtime/data/tail/log/generic_enc_win1254.log new file mode 100644 index 00000000000..69754a11e08 --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1254.log @@ -0,0 +1,13 @@ +Merhaba +Teekkr ederim +Hoa kal +Trkiye +Kedi +Kpek +Yemek +mek +Gk +Deniz +renci +Ik +Aa diff --git a/tests/runtime/data/tail/log/generic_enc_win1255.log b/tests/runtime/data/tail/log/generic_enc_win1255.log new file mode 100644 index 00000000000..54d94245b6f --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1255.log @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/runtime/data/tail/log/generic_enc_win1256.log b/tests/runtime/data/tail/log/generic_enc_win1256.log new file mode 100644 index 00000000000..794c832b7e7 --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win1256.log @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/runtime/data/tail/log/generic_enc_win866.log b/tests/runtime/data/tail/log/generic_enc_win866.log new file mode 100644 index 00000000000..0ad046657fa --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win866.log @@ -0,0 +1,7 @@ +ਢ +ᨡ + ᢨ + +ଠ +ணࠬ + diff --git a/tests/runtime/data/tail/log/generic_enc_win874.log b/tests/runtime/data/tail/log/generic_enc_win874.log new file mode 100644 index 00000000000..48a1d1bd74c --- /dev/null +++ b/tests/runtime/data/tail/log/generic_enc_win874.log @@ -0,0 +1,8 @@ +ʴ +ͺس +ҡ͹ + + +عѢ +Թ + diff --git a/tests/runtime/data/tail/out/generic_enc_big5.out b/tests/runtime/data/tail/out/generic_enc_big5.out new file mode 100644 index 00000000000..9a53fac380b --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_big5.out @@ -0,0 +1,10 @@ +{"log":"\u4f60\u597d"} +{"log":"\u8b1d\u8b1d"} +{"log":"\u518d\u898b"} +{"log":"\u8c93"} +{"log":"\u72d7"} +{"log":"\u5403"} +{"log":"\u559d"} +{"log":"\u5929"} +{"log":"\u6d77"} +{"log":"\u6708\u4eae"} diff --git a/tests/runtime/data/tail/out/generic_enc_gb18030.out b/tests/runtime/data/tail/out/generic_enc_gb18030.out new file mode 100644 index 
00000000000..58f853d041d --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_gb18030.out @@ -0,0 +1,12 @@ +{"log":"\u4f60\u597d"} +{"log":"\u8c22\u8c22"} +{"log":"\u518d\u89c1"} +{"log":"\u4e2d\u56fd"} +{"log":"\u732b"} +{"log":"\u72d7"} +{"log":"\u5403"} +{"log":"\u559d"} +{"log":"\u5929"} +{"log":"\u6d77"} +{"log":"\u6b27\u5143\u7b26\u53f7\u20ac"} +{"log":"\u9f98"} diff --git a/tests/runtime/data/tail/out/generic_enc_gbk.out b/tests/runtime/data/tail/out/generic_enc_gbk.out new file mode 100644 index 00000000000..3f9e6e0f0ec --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_gbk.out @@ -0,0 +1,11 @@ +{"log":"\u4f60\u597d"} +{"log":"\u8c22\u8c22"} +{"log":"\u518d\u89c1"} +{"log":"\u4e2d\u56fd"} +{"log":"\u732b"} +{"log":"\u72d7"} +{"log":"\u5403"} +{"log":"\u559d"} +{"log":"\u5929"} +{"log":"\u6d77"} +{"log":"\u6708\u4eae"} diff --git a/tests/runtime/data/tail/out/generic_enc_sjis.out b/tests/runtime/data/tail/out/generic_enc_sjis.out new file mode 100644 index 00000000000..9720ce05abc --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_sjis.out @@ -0,0 +1,11 @@ +{"log":"\u3053\u3093\u306b\u3061\u306f"} +{"log":"\u3042\u308a\u304c\u3068\u3046"} +{"log":"\u3055\u3088\u3046\u306a\u3089"} +{"log":"\u65e5\u672c"} +{"log":"\u732b"} +{"log":"\u72ac"} +{"log":"\u98df\u3079\u308b"} +{"log":"\u98f2\u3080"} +{"log":"\u7a7a"} +{"log":"\u6d77"} +{"log":"\u6708"} \ No newline at end of file diff --git a/tests/runtime/data/tail/out/generic_enc_win1250.out b/tests/runtime/data/tail/out/generic_enc_win1250.out new file mode 100644 index 00000000000..13b39f49d9b --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1250.out @@ -0,0 +1,6 @@ +{"log":"Cze\u015b\u0107"} +{"log":"Dzi\u0119kuj\u0119"} +{"log":"Ahoj"} +{"log":"D\u011bkuji"} +{"log":"\u017dlu\u0165ou\u010dk\u00fd k\u016f\u0148"} +{"log":"G\u0119\u015bl\u0105 ja\u017a\u0144"} diff --git a/tests/runtime/data/tail/out/generic_enc_win1251.out 
b/tests/runtime/data/tail/out/generic_enc_win1251.out new file mode 100644 index 00000000000..46ec1327571 --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1251.out @@ -0,0 +1,9 @@ +{"log":"\u041f\u0440\u0438\u0432\u0435\u0442"} +{"log":"\u0421\u043f\u0430\u0441\u0438\u0431\u043e"} +{"log":"\u0414\u043e \u0441\u0432\u0438\u0434\u0430\u043d\u0438\u044f"} +{"log":"\u041a\u043e\u0448\u043a\u0430"} +{"log":"\u0421\u043e\u0431\u0430\u043a\u0430"} +{"log":"\u041d\u0435\u0431\u043e"} +{"log":"\u041c\u043e\u0440\u0435"} +{"log":"\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438 \u0435\u0437\u0438\u043a"} +{"log":"\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430 \u043c\u043e\u0432\u0430"} diff --git a/tests/runtime/data/tail/out/generic_enc_win1252.out b/tests/runtime/data/tail/out/generic_enc_win1252.out new file mode 100644 index 00000000000..b6c52e56e76 --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1252.out @@ -0,0 +1,14 @@ +{"log":"Hello"} +{"log":"Thank you"} +{"log":"Goodbye"} +{"log":"Bonjour"} +{"log":"Merci"} +{"log":"Au revoir"} +{"log":"Hallo"} +{"log":"Danke"} +{"log":"Auf Wiedersehen"} +{"log":"Hola"} +{"log":"Gracias"} +{"log":"Adi\u00f3s"} +{"log":"Cr\u00e8me br\u00fbl\u00e9e"} +{"log":"Pi\u00f1ata"} \ No newline at end of file diff --git a/tests/runtime/data/tail/out/generic_enc_win1253.out b/tests/runtime/data/tail/out/generic_enc_win1253.out new file mode 100644 index 00000000000..6f77ebe6f3f --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1253.out @@ -0,0 +1,8 @@ +{"log":"\u0393\u03b5\u03b9\u03ac \u03c3\u03bf\u03c5"} +{"log":"\u0395\u03c5\u03c7\u03b1\u03c1\u03b9\u03c3\u03c4\u03ce"} +{"log":"\u0391\u03bd\u03c4\u03af\u03bf"} +{"log":"\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac"} +{"log":"\u0393\u03ac\u03c4\u03b1"} +{"log":"\u03a3\u03ba\u03cd\u03bb\u03bf\u03c2"} +{"log":"\u039f\u03c5\u03c1\u03b1\u03bd\u03cc\u03c2"} +{"log":"\u0398\u03ac\u03bb\u03b1\u03c3\u03c3\u03b1"} diff --git 
a/tests/runtime/data/tail/out/generic_enc_win1254.out b/tests/runtime/data/tail/out/generic_enc_win1254.out new file mode 100644 index 00000000000..d9ed2a1e1ec --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1254.out @@ -0,0 +1,13 @@ +{"log":"Merhaba"} +{"log":"Te\u015fekk\u00fcr ederim"} +{"log":"Ho\u015f\u00e7a kal"} +{"log":"T\u00fcrkiye"} +{"log":"Kedi"} +{"log":"K\u00f6pek"} +{"log":"Yemek"} +{"log":"\u0130\u00e7mek"} +{"log":"G\u00f6k"} +{"log":"Deniz"} +{"log":"\u00d6\u011frenci"} +{"log":"I\u015f\u0131k"} +{"log":"A\u011fa\u00e7"} diff --git a/tests/runtime/data/tail/out/generic_enc_win1255.out b/tests/runtime/data/tail/out/generic_enc_win1255.out new file mode 100644 index 00000000000..9b89410af76 --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1255.out @@ -0,0 +1,8 @@ +{"log":"\u05e9\u05dc\u05d5\u05dd"} +{"log":"\u05ea\u05d5\u05d3\u05d4"} +{"log":"\u05dc\u05d4\u05ea\u05e8\u05d0\u05d5\u05ea"} +{"log":"\u05e2\u05d1\u05e8\u05d9\u05ea"} +{"log":"\u05d7\u05ea\u05d5\u05dc"} +{"log":"\u05db\u05dc\u05d1"} +{"log":"\u05e9\u05de\u05d9\u05d9\u05dd"} +{"log":"\u05d9\u05dd"} \ No newline at end of file diff --git a/tests/runtime/data/tail/out/generic_enc_win1256.out b/tests/runtime/data/tail/out/generic_enc_win1256.out new file mode 100644 index 00000000000..fcd0a00e613 --- /dev/null +++ b/tests/runtime/data/tail/out/generic_enc_win1256.out @@ -0,0 +1,8 @@ +{"log":"\u0645\u0631\u062d\u0628\u0627"} +{"log":"\u0634\u0643\u0631\u0627"} +{"log":"\u0645\u0639 \u0627\u0644\u0633\u0644\u0627\u0645\u0629"} +{"log":"\u0627\u0644\u0639\u0631\u0628\u064a\u0629"} +{"log":"\u0642\u0637"} +{"log":"\u0643\u0644\u0628"} +{"log":"\u0633\u0645\u0627\u0621"} +{"log":"\u0628\u062d\u0631"} diff --git a/tests/runtime/in_tail.c b/tests/runtime/in_tail.c index 6708437ece4..e3af2c3ba88 100644 --- a/tests/runtime/in_tail.c +++ b/tests/runtime/in_tail.c @@ -589,6 +589,159 @@ void flb_test_in_tail_dockermode_firstline_detection() NULL); } +void 
do_test_generic_enctype(char *system, const char *target, const char *enc, int tExpected, int nExpected, ...) +{ + int64_t ret; + flb_ctx_t *ctx = NULL; + int in_ffd; + int out_ffd; + va_list va; + char *key; + char *value; + char path[PATH_MAX]; + struct tail_test_result result = {0}; + + result.nMatched = 0; + result.target = target; + + struct flb_lib_out_cb cb; + cb.cb = cb_check_result; + cb.data = &result; + + /* initialize */ + set_result(0); + + ctx = flb_create(); + + ret = flb_service_set(ctx, + "Log_Level", "error", + "Parsers_File", DPATH "/parsers.conf", + NULL); + TEST_CHECK_(ret == 0, "setting service options"); + + in_ffd = flb_input(ctx, (char *) system, NULL); + TEST_CHECK(in_ffd >= 0); + TEST_CHECK(flb_input_set(ctx, in_ffd, "tag", "test", NULL) == 0); + + /* Compose the fixture path DPATH/log/<target>.log and make sure it is readable */ + snprintf(path, sizeof(path) - 1, DPATH "/log/%s.log", target); + TEST_CHECK_(access(path, R_OK) == 0, "accessing log file: %s", path); + + TEST_CHECK(flb_input_set(ctx, in_ffd, + "path" , path, + "generic.encoding", enc, + "read_from_head", "true", + NULL) == 0); + + va_start(va, nExpected); + while ((key = va_arg(va, char *))) { + value = va_arg(va, char *); + TEST_CHECK(value != NULL); + TEST_CHECK(flb_input_set(ctx, in_ffd, key, value, NULL) == 0); + } + va_end(va); + + out_ffd = flb_output(ctx, (char *) "lib", &cb); + TEST_CHECK(out_ffd >= 0); + TEST_CHECK(flb_output_set(ctx, out_ffd, + "match", "test", + "format", "json", + NULL) == 0); + + TEST_CHECK(flb_service_set(ctx, "Flush", "0.5", + "Grace", "1", + NULL) == 0); + + /* Start test */ + /* Start the engine */ + ret = flb_start(ctx); + TEST_CHECK_(ret == 0, "starting engine"); + + /* Poll in 1 ms sleeps, at most tExpected times, until nExpected records matched */ + for (ret = 0; ret < tExpected && result.nMatched < nExpected; ret++) { + usleep(1000); + } + + /* Final bounded wait (timeout argument 5000; presumably ms - confirm) for nExpected results */ + wait_with_timeout(5000, &result, nExpected); + + TEST_CHECK(result.nMatched == nExpected); + TEST_MSG("result.nMatched: 
%i\nnExpected: %i", result.nMatched, nExpected); + + ret = flb_stop(ctx); + TEST_CHECK_(ret == 0, "stopping engine"); + + if (ctx) { + flb_destroy(ctx); + } +} + +void flb_test_in_tail_generic_enc_big5() +{ + do_test_generic_enctype("tail", "generic_enc_big5", "BIG5", + 20000, 10, NULL); +} + +void flb_test_in_tail_generic_enc_gb18030() +{ + do_test_generic_enctype("tail", "generic_enc_gb18030", "GB18030", + 20000, 12, NULL); +} + +void flb_test_in_tail_generic_enc_gbk() +{ + do_test_generic_enctype("tail", "generic_enc_gbk", "GBK", + 20000, 11, NULL); +} + +void flb_test_in_tail_generic_enc_sjis() +{ + do_test_generic_enctype("tail", "generic_enc_sjis", "ShiftJIS", + 20000, 11, NULL); +} + +void flb_test_in_tail_generic_enc_win1250() +{ + do_test_generic_enctype("tail", "generic_enc_win1250", "WIN1250", + 20000, 6, NULL); +} + +void flb_test_in_tail_generic_enc_win1251() +{ + do_test_generic_enctype("tail", "generic_enc_win1251", "WIN1251", + 20000, 9, NULL); +} + +void flb_test_in_tail_generic_enc_win1252() +{ + do_test_generic_enctype("tail", "generic_enc_win1252", "WIN1252", + 20000, 14, NULL); +} + +void flb_test_in_tail_generic_enc_win1253() +{ + do_test_generic_enctype("tail", "generic_enc_win1253", "WIN1253", + 20000, 8, NULL); +} + +void flb_test_in_tail_generic_enc_win1254() +{ + do_test_generic_enctype("tail", "generic_enc_win1254", "WIN1254", + 20000, 13, NULL); +} + +void flb_test_in_tail_generic_enc_win1255() +{ + do_test_generic_enctype("tail", "generic_enc_win1255", "WIN1255", + 20000, 8, NULL); +} + +void flb_test_in_tail_generic_enc_win1256() +{ + do_test_generic_enctype("tail", "generic_enc_win1256", "WIN1256", + 20000, 8, NULL); +} + #ifdef FLB_HAVE_UNICODE_ENCODER void do_test_unicode(char *system, const char *target, int nExpected, ...) 
{ @@ -2227,6 +2380,17 @@ TEST_LIST = { {"in_tail_dockermode_splitted_multiple_lines", flb_test_in_tail_dockermode_splitted_multiple_lines}, {"in_tail_dockermode_firstline_detection", flb_test_in_tail_dockermode_firstline_detection}, {"in_tail_multiline_json_and_regex", flb_test_in_tail_multiline_json_and_regex}, + {"in_tail_generic_enc_big5", flb_test_in_tail_generic_enc_big5}, + {"in_tail_generic_enc_gb18030", flb_test_in_tail_generic_enc_gb18030}, + {"in_tail_generic_enc_gbk", flb_test_in_tail_generic_enc_gbk}, + {"in_tail_generic_enc_sjis", flb_test_in_tail_generic_enc_sjis}, + {"in_tail_generic_enc_win1250", flb_test_in_tail_generic_enc_win1250}, + {"in_tail_generic_enc_win1251", flb_test_in_tail_generic_enc_win1251}, + {"in_tail_generic_enc_win1252", flb_test_in_tail_generic_enc_win1252}, + {"in_tail_generic_enc_win1253", flb_test_in_tail_generic_enc_win1253}, + {"in_tail_generic_enc_win1254", flb_test_in_tail_generic_enc_win1254}, + {"in_tail_generic_enc_win1255", flb_test_in_tail_generic_enc_win1255}, + {"in_tail_generic_enc_win1256", flb_test_in_tail_generic_enc_win1256}, #endif {NULL, NULL} };