Skip to content

Commit b92c6a2

Browse files
committed
unicode: conv: Implement conversion rules for encoding conversion
Note: The rules related to CJK encodings are the main ones included in this converter implementation for Fluent Bit. Signed-off-by: Hiroshi Hatake <[email protected]>
1 parent df9d52b commit b92c6a2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+64910
-1
lines changed

include/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,13 @@ if(FLB_UNICODE_ENCODER)
4343
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)
4444
endif()
4545

46+
# flb_conv: install the Unicode converter headers.
# Every *.h found in fluent-bit/unicode/ is installed under
# ${FLB_INSTALL_INCLUDEDIR}/fluent-bit/unicode/ with mode 0644 as part of the
# "headers" component.
# NOTE: file(GLOB) is evaluated at configure time, so CMake must be re-run
# after a header is added or removed.
47+
file(GLOB headers "fluent-bit/unicode/*.h")
48+
install(FILES ${headers}
49+
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/fluent-bit/unicode/
50+
COMPONENT headers
51+
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)
52+
4653
install(FILES "../lib/monkey/include/monkey/mk_core.h"
4754
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/monkey/
4855
COMPONENT headers-extra

include/fluent-bit/flb_unicode.h

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,28 @@
2121
#define FLB_UNICODE
2222

2323
#include <stddef.h>
24+
#include <fluent-bit/unicode/flb_wchar.h>
25+
#include <fluent-bit/unicode/flb_conv.h>
26+
27+
/*
 * Supported encoding converters.
 * Each enumerator takes the value of the matching flb_enc identifier from
 * <fluent-bit/unicode/flb_wchar.h>; FLB_UTF8 has no counterpart in this list.
 * _FLB_GENERIC_LAST_ENCODING_ is an end marker, not an encoding.
 */
28+
enum flb_unicode_generic_enc_type {
29+
FLB_GENERIC_ASCII = FLB_STR_ASCII, /* ASCII */
30+
FLB_GENERIC_WIN1256 = FLB_WIN1256, /* windows-1256 */
31+
FLB_GENERIC_WIN866 = FLB_WIN866, /* (MS-DOS CP866) */
32+
FLB_GENERIC_WIN874 = FLB_WIN874, /* windows-874 */
33+
FLB_GENERIC_WIN1251 = FLB_WIN1251, /* windows-1251 */
34+
FLB_GENERIC_WIN1252 = FLB_WIN1252, /* windows-1252 */
35+
FLB_GENERIC_WIN1250 = FLB_WIN1250, /* windows-1250 */
36+
FLB_GENERIC_WIN1253 = FLB_WIN1253, /* windows-1253 */
37+
FLB_GENERIC_WIN1254 = FLB_WIN1254, /* windows-1254 */
38+
FLB_GENERIC_WIN1255 = FLB_WIN1255, /* windows-1255 */
39+
FLB_GENERIC_SJIS = FLB_SJIS, /* Shift JIS (Windows-932) */
40+
FLB_GENERIC_BIG5 = FLB_BIG5, /* Big5 (Windows-950) */
41+
FLB_GENERIC_GBK = FLB_GBK, /* GBK (Windows-936) */
42+
FLB_GENERIC_UHC = FLB_UHC, /* UHC (Windows-949) */
43+
FLB_GENERIC_GB18030 = FLB_GB18030, /* GB18030 */
44+
_FLB_GENERIC_LAST_ENCODING_ /* mark only */
45+
};
2446

2547
#ifdef FLB_HAVE_UNICODE_ENCODER
2648
#include <fluent-bit/simdutf/flb_simdutf_connector.h>
@@ -30,7 +52,7 @@
3052
#define FLB_UNICODE_CONVERT_UNSUPPORTED FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED
3153
#define FLB_UNICODE_CONVERT_ERROR FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR
3254

33-
enum flb_unicode_endocing_type {
55+
enum flb_unicode_encoding_type {
3456
FLB_UNICODE_ENCODING_UTF8 = FLB_SIMDUTF_ENCODING_TYPE_UTF8, /* BOM 0xef 0xbb 0xbf */
3557
FLB_UNICODE_ENCODING_UTF16_LE = FLB_SIMDUTF_ENCODING_TYPE_UTF16_LE, /* BOM 0xff 0xfe */
3658
FLB_UNICODE_ENCODING_UTF16_BE = FLB_SIMDUTF_ENCODING_TYPE_UTF16_BE, /* BOM 0xfe 0xff */
@@ -54,4 +76,10 @@ int flb_unicode_convert(int preferred_encoding, const char *input, size_t length
5476
char **output, size_t *out_size);
5577
int flb_unicode_validate(const char *record, size_t size);
5678

79+
/*
 * Name-based conversion API: the encoding is selected by encoding_name.
 * src/flb_unicode.c implements each of these by forwarding to the flb_conv_*
 * function with the same suffix and returning its result unchanged; the two
 * convert functions pass FLB_FALSE as the no_error argument.
 * NOTE(review): ownership of *output and the unit of length (presumably
 * bytes) are not visible here -- confirm against the flb_conv implementation.
 */
int flb_unicode_generic_supported_encoding(const char *encoding_name);
80+
int flb_unicode_generic_convert_to_utf8(const char *encoding_name,
81+
const unsigned char *input, unsigned char **output, size_t length);
82+
int flb_unicode_generic_convert_from_utf8(const char *encoding_name,
83+
const unsigned char *input, unsigned char **output, size_t length);
84+
5785
#endif
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2025 The Fluent Bit Authors
6+
*
7+
* Licensed under the Apache License, Version 2.0 (the "License");
8+
* you may not use this file except in compliance with the License.
9+
* You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
#ifndef FLB_CONV_H
21+
#define FLB_CONV_H
22+
23+
#include <stddef.h>
24+
#include <stdbool.h>
25+
26+
#include <monkey/mk_core.h>
27+
28+
/*
 * Capacity of flb_unicode_converter.aliases, i.e. the maximum number of
 * alias names per converter (a count, despite the _LENGTH suffix).
 */
#define FLB_CONV_MAX_ALIAS_LENGTH 4
29+
30+
/*
 * Status codes of the flb_conv API: zero for success, negative for failure.
 * NOTE(review): presumably returned by flb_conv_convert_to_utf8() and
 * flb_conv_convert_from_utf8() -- confirm against the implementation.
 */
#define FLB_CONV_CONVERT_OK 0
31+
#define FLB_CONV_CONVERTER_NOT_FOUND -1
32+
#define FLB_CONV_ALLOCATION_FAILED -2
33+
#define FLB_CONV_CONVERSION_FAILED -3
34+
35+
/*
 * Descriptor of one encoding converter.
 *
 *  name, aliases - names of the encoding; aliases holds at most
 *                  FLB_CONV_MAX_ALIAS_LENGTH entries
 *  desc          - description string
 *  encoding      - encoding identifier (presumably a flb_enc value -- confirm)
 *  max_width     - see the field comment
 *  cb_to_utf8,
 *  cb_from_utf8  - conversion callbacks; both have the same signature
 *  _head         - struct mk_list link
 *
 * NOTE(review): how unused alias slots are marked and how _head is linked is
 * not visible in this header -- confirm against the converter sources.
 */
struct flb_unicode_converter {
36+
const char *name;
37+
const char *aliases[FLB_CONV_MAX_ALIAS_LENGTH];
38+
const char *desc;
39+
int encoding;
40+
int max_width; /* Maximum width of character from local to UTF-8 */
41+
42+
/* conversion callbacks, one per direction */
43+
int (*cb_to_utf8) (const unsigned char *src, unsigned char **dest,
44+
size_t len, bool no_error, int encoding);
45+
int (*cb_from_utf8) (const unsigned char *src, unsigned char **dest,
46+
size_t len, bool no_error, int encoding);
47+
48+
struct mk_list _head;
49+
};
50+
51+
/*
 * Public entry points of the converter. Every function takes the encoding as
 * a name string; the convert functions additionally take the input (src, len),
 * an output pointer (dest) and a no_error flag.
 * NOTE(review): the following are not visible in this header and should be
 * confirmed against the implementation: whether *dest is allocated by the
 * callee and who frees it, the unit of len (presumably bytes), the return
 * values (presumably the FLB_CONV_* codes) and the exact effect of no_error.
 */
struct flb_unicode_converter *flb_conv_select_converter(const char *encoding_name);
52+
int flb_conv_supported_encoding(const char *encoding_name);
53+
int flb_conv_convert_to_utf8(const char *encoding_name,
54+
const unsigned char *src, unsigned char **dest,
55+
size_t len, bool no_error);
56+
int flb_conv_convert_from_utf8(const char *encoding_name,
57+
const unsigned char *src, unsigned char **dest,
58+
size_t len, bool no_error);
59+
60+
/*
 * Converter instances, declared here and defined elsewhere.
 * NOTE(review): presumably one per supported encoding and the set searched by
 * flb_conv_select_converter() -- confirm against the implementation.
 */
extern struct flb_unicode_converter sjis_converter;
61+
extern struct flb_unicode_converter gb18030_converter;
62+
extern struct flb_unicode_converter uhc_converter;
63+
extern struct flb_unicode_converter big5_converter;
64+
extern struct flb_unicode_converter win866_converter;
65+
extern struct flb_unicode_converter win874_converter;
66+
extern struct flb_unicode_converter win1250_converter;
67+
extern struct flb_unicode_converter win1251_converter;
68+
extern struct flb_unicode_converter win1252_converter;
69+
extern struct flb_unicode_converter win1253_converter;
70+
extern struct flb_unicode_converter win1254_converter;
71+
extern struct flb_unicode_converter win1255_converter;
72+
extern struct flb_unicode_converter win1256_converter;
73+
extern struct flb_unicode_converter gbk_converter;
74+
75+
#endif
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2025 The Fluent Bit Authors
6+
*
7+
* Licensed under the Apache License, Version 2.0 (the "License");
8+
* you may not use this file except in compliance with the License.
9+
* You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
#ifndef FLB_WCHAR_H
21+
#define FLB_WCHAR_H
22+
23+
#include <stddef.h>
24+
#include <stdbool.h>
25+
26+
#include <fluent-bit/flb_log.h>
27+
28+
/*
 * Most significant bit of a byte. IS_HIGHBIT_SET(ch) casts ch to
 * unsigned char and is non-zero when that bit is set.
 */
29+
#define HIGHBIT (0x80)
30+
#define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT)
31+
32+
/*
33+
* The flb_wchar type: one wide character, stored as an unsigned int
34+
*/
35+
typedef unsigned int flb_wchar;
36+
37+
/*
38+
* Maximum byte length of multibyte characters in any backend encoding
39+
*/
40+
#define MAX_MULTIBYTE_CHAR_LEN 4
41+
42+
/*
43+
* SJIS validation macros
* ISSJISHEAD(c) is true when c lies in 0x81-0x9f or 0xe0-0xfc;
* ISSJISTAIL(c) is true when c lies in 0x40-0x7e or 0x80-0xfc.
* Both expand c several times, so pass an expression without side effects.
44+
*/
45+
#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
46+
#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
47+
48+
#include <fluent-bit/flb_macros.h>
49+
50+
/*
50+
* Encoding identifiers
* Values start at 0 and increase by one in declaration order.
* _FLB_LAST_ENCODING_ is an end marker only; FLB_VALID_ENCODING() uses it
* as the exclusive upper bound.
51+
*/
52+
typedef enum flb_enc
53+
{
54+
FLB_STR_ASCII = 0, /* STR/ASCII */
55+
FLB_UTF8, /* Unicode UTF8 */
56+
FLB_WIN1256, /* windows-1256 */
57+
FLB_WIN866, /* (MS-DOS CP866) */
58+
FLB_WIN874, /* windows-874 */
59+
FLB_WIN1251, /* windows-1251 */
60+
FLB_WIN1252, /* windows-1252 */
61+
FLB_WIN1250, /* windows-1250 */
62+
FLB_WIN1253, /* windows-1253 */
63+
FLB_WIN1254, /* windows-1254 */
64+
FLB_WIN1255, /* windows-1255 */
65+
FLB_SJIS, /* Shift JIS (Windows-932) */
66+
FLB_BIG5, /* Big5 (Windows-950) */
67+
FLB_GBK, /* GBK (Windows-936) */
68+
FLB_UHC, /* UHC (Windows-949) */
69+
FLB_GB18030, /* GB18030 */
70+
_FLB_LAST_ENCODING_ /* mark only */
71+
72+
} flb_enc;
74+
75+
/* True when _enc lies in the range [0, _FLB_LAST_ENCODING_). */
#define FLB_VALID_ENCODING(_enc) \
76+
((_enc) >= 0 && (_enc) < _FLB_LAST_ENCODING_)
77+
78+
/* All encodings are valid on the FE side, so this is the same check */
79+
#define FLB_VALID_FE_ENCODING(_enc) FLB_VALID_ENCODING(_enc)
80+
81+
/*
81+
* Function-pointer types for per-encoding routines. All of them except
* mbcharacter_incrementer are used as member types of flb_wchar_tbl.
82+
*/
83+
typedef int (*mb2wchar_with_len_converter) (const unsigned char *from,
84+
flb_wchar *to,
85+
int len);
86+
87+
typedef int (*wchar2mb_with_len_converter) (const flb_wchar *from,
88+
unsigned char *to,
89+
int len);
90+
91+
typedef int (*mblen_converter) (const unsigned char *mbstr);
92+
typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
93+
typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len);
94+
typedef int (*mbchar_verifier) (const unsigned char *mbstr, int len);
95+
typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len);
97+
98+
/* Set of per-encoding routines plus the maximum character length. */
typedef struct
98+
{
99+
mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte
100+
* string to a wchar */
101+
wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string
102+
* to a multibyte */
103+
mblen_converter mblen; /* get byte length of a char */
104+
mbdisplaylen_converter dsplen; /* get display width of a char */
105+
mbchar_verifier mbverifychar; /* verify multibyte character */
106+
mbstr_verifier mbverifystr; /* verify multibyte string */
107+
int maxmblen; /* max bytes for a char in this encoding */
108+
} flb_wchar_tbl;
109+
110+
/* NOTE(review): presumably indexed by flb_enc value -- confirm at the definition. */
extern const flb_wchar_tbl flb_wchar_table[];
112+
113+
/*
114+
* Radix tree for character conversion.
* Holds two value arrays (chars16, chars32) and, for each input length from
* 1 to 4 bytes, a root plus a lower/upper pair for every input byte.
115+
*
* NOTE(review): the lower/upper pairs are presumably the inclusive valid
* range of the corresponding input byte, and only one of chars16/chars32 is
* presumably used per map -- confirm against the lookup code.
* NOTE(review): uint8_t/uint16_t/uint32_t need <stdint.h>, which this header
* does not include directly; presumably it arrives through flb_log.h or
* flb_macros.h -- confirm, or include it explicitly.
116+
*/
117+
typedef struct {
118+
const uint16_t *chars16;
119+
const uint32_t *chars32;
120+
121+
/* Radix tree for 1-byte inputs */
122+
uint32_t b1root;
123+
uint8_t b1_lower;
124+
uint8_t b1_upper;
125+
126+
/* Radix tree for 2-byte inputs */
127+
uint32_t b2root;
128+
uint8_t b2_1_lower;
129+
uint8_t b2_1_upper;
130+
uint8_t b2_2_lower;
131+
uint8_t b2_2_upper;
132+
133+
/* Radix tree for 3-byte inputs */
134+
uint32_t b3root;
135+
uint8_t b3_1_lower;
136+
uint8_t b3_1_upper;
137+
uint8_t b3_2_lower;
138+
uint8_t b3_2_upper;
139+
uint8_t b3_3_lower;
140+
uint8_t b3_3_upper;
141+
142+
/* Radix tree for 4-byte inputs */
143+
uint32_t b4root;
144+
uint8_t b4_1_lower;
145+
uint8_t b4_1_upper;
146+
uint8_t b4_2_lower;
147+
uint8_t b4_2_upper;
148+
uint8_t b4_3_lower;
149+
uint8_t b4_3_upper;
150+
uint8_t b4_4_lower;
151+
uint8_t b4_4_upper;
152+
153+
} flb_mb_radix_tree;
154+
155+
/*
155+
* UTF-8 to local code conversion map (for combined characters)
* NOTE(review): utf1/utf2 are presumably the two parts of the combined
* sequence and code the matching local code -- confirm against the tables.
156+
*/
157+
typedef struct {
158+
uint32_t utf1;
159+
uint32_t utf2;
160+
uint32_t code;
161+
} flb_utf_to_local_combined;
162+
163+
/*
164+
* Local code to UTF-8 conversion map (for combined characters)
* Same three fields as flb_utf_to_local_combined, with code placed first.
165+
*/
166+
typedef struct {
167+
uint32_t code;
168+
uint32_t utf1;
169+
uint32_t utf2;
170+
} flb_local_to_utf_combined;
172+
173+
/*
173+
* @brief Callback for algorithmic encoding conversions (in either direction).
174+
*
175+
* Takes one code and returns the converted code. A return value of zero
* means the callback does not know how to convert that code.
176+
*/
177+
typedef uint32_t (*utf_local_conversion_func) (uint32_t code);
179+
180+
extern void flb_encoding_set_invalid(int encoding, char *dst);
181+
extern int flb_encoding_mblen(int encoding, const char *mbstr);
182+
extern int flb_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
183+
size_t remaining);
184+
extern int flb_encoding_mblen_bounded(int encoding, const char *mbstr);
185+
extern int flb_encoding_dsplen(int encoding, const char *mbstr);
186+
extern int flb_encoding_verifymbchar(int encoding, const char *mbstr, int len);
187+
extern int flb_encoding_verifymbstr(int encoding, const char *mbstr, int len);
188+
extern int flb_encoding_max_length(int encoding);
189+
190+
extern bool flb_utf8_islegal(const unsigned char *source, int length);
191+
extern int flb_utf_mblen(const unsigned char *s);
192+
193+
/*
 * The conversion functions below are internal helpers; they are not part of
 * the public API declared in flb_conv.h.
 *
 * Both take an input buffer with its length, an output buffer, a radix-tree
 * map, a combined-character map with its size (cmap, cmapsize), a conversion
 * callback, the encoding identifier and a noError flag.
 * NOTE(review): which of map, cmap and conv_func may be NULL, how large the
 * output buffer must be, and the meaning of the return value are not visible
 * here -- confirm against the implementation.
 */
193+
extern int flb_convert_to_local_internal(const unsigned char *utf, int len,
194+
unsigned char *iso,
195+
const flb_mb_radix_tree *map,
196+
const flb_utf_to_local_combined *cmap, int cmapsize,
197+
utf_local_conversion_func conv_func,
198+
int encoding, bool noError);
199+
extern int flb_convert_to_utf_internal(const unsigned char *iso, int len,
200+
unsigned char *utf,
201+
const flb_mb_radix_tree *map,
202+
const flb_local_to_utf_combined *cmap, int cmapsize,
203+
utf_local_conversion_func conv_func,
204+
int encoding, bool noError);
206+
207+
extern bool flb_verifymbstr(const char *mbstr, int len, bool noError);
208+
extern bool flb_verify_mbstr(int encoding, const char *mbstr, int len,
209+
bool noError);
210+
extern int flb_verify_mbstr_len(int encoding, const char *mbstr, int len,
211+
bool noError);
212+
213+
extern void flb_report_invalid_encoding(int encoding, const char *mbstr, int len);
214+
extern void flb_report_untranslatable_char(int src_encoding, int dest_encoding,
215+
const char *mbstr, int len);
216+
217+
#endif /* FLB_WCHAR_H */

src/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,13 @@ if(FLB_STREAM_PROCESSOR)
464464
)
465465
endif()
466466

467+
# Unicode Conversions: process the unicode/ subdirectory and append flb-conv
# to the FLB_DEPS list (presumably the library target defined in that
# subdirectory -- confirm against src/unicode/CMakeLists.txt).
468+
add_subdirectory(unicode)
469+
set(FLB_DEPS
470+
${FLB_DEPS}
471+
flb-conv
472+
)
473+
467474
if (MSVC)
468475
set(flb_rc_files
469476
${CMAKE_CURRENT_BINARY_DIR}/version.rc

src/flb_unicode.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020

2121
#include <fluent-bit/flb_unicode.h>
22+
#include <fluent-bit/unicode/flb_conv.h>
2223
#include <stddef.h>
2324

2425
int flb_unicode_convert(int preferred_encoding, const char *input, size_t length,
@@ -40,3 +41,20 @@ int flb_unicode_validate(const char *record, size_t size)
4041
return -1;
4142
#endif
4243
}
44+
45+
/*
 * Thin wrapper: forwards encoding_name to flb_conv_supported_encoding()
 * and returns its result unchanged.
 */
int flb_unicode_generic_supported_encoding(const char *encoding_name)
46+
{
47+
return flb_conv_supported_encoding(encoding_name);
48+
}
49+
50+
/*
 * Thin wrapper around flb_conv_convert_to_utf8(): all arguments are passed
 * through, no_error is fixed to FLB_FALSE, and the callee's return value is
 * returned unchanged.
 */
int flb_unicode_generic_convert_to_utf8(const char *encoding_name,
51+
const unsigned char *input, unsigned char **output, size_t length)
52+
{
53+
return flb_conv_convert_to_utf8(encoding_name, input, output, length, FLB_FALSE);
54+
}
55+
56+
/*
 * Thin wrapper around flb_conv_convert_from_utf8(): all arguments are passed
 * through, no_error is fixed to FLB_FALSE, and the callee's return value is
 * returned unchanged.
 */
int flb_unicode_generic_convert_from_utf8(const char *encoding_name,
57+
const unsigned char *input, unsigned char **output, size_t length)
58+
{
59+
return flb_conv_convert_from_utf8(encoding_name, input, output, length, FLB_FALSE);
60+
}

0 commit comments

Comments
 (0)