Skip to content

Commit 5905ace

Browse files
committed
simdutf_connector: Add C connector for simdutf library
Signed-off-by: Hiroshi Hatake <[email protected]>
1 parent f1544f0 commit 5905ace

File tree

6 files changed

+329
-1
lines changed

6 files changed

+329
-1
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,7 @@ endif()
479479
if(FLB_UNICODE_ENCODER)
480480
enable_language(CXX)
481481
add_subdirectory(${FLB_PATH_LIB_SIMDUTF} EXCLUDE_FROM_ALL)
482+
FLB_DEFINITION(FLB_HAVE_UNICODE_ENCODER)
482483
endif()
483484

484485
# snappy

include/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ install(FILES ${headers}
2828
COMPONENT headers
2929
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)
3030

31+
file(GLOB headers "fluent-bit/simdutf/*.h")
32+
install(FILES ${headers}
33+
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/fluent-bit/simdutf/
34+
COMPONENT headers
35+
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)
36+
3137
install(FILES "../lib/monkey/include/monkey/mk_core.h"
3238
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/monkey/
3339
COMPONENT headers-extra
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2015-2024 The Fluent Bit Authors
6+
*
7+
* Licensed under the Apache License, Version 2.0 (the "License");
8+
* you may not use this file except in compliance with the License.
9+
* You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
#ifndef FLB_SIMDUTF_CONNECTOR_H
21+
#define FLB_SIMDUTF_CONNECTOR_H
22+
23+
#include <uchar.h>
24+
25+
#ifdef __cplusplus
26+
extern "C" {
27+
#endif /* __cplusplus */
28+
29+
#define FLB_SIMDUTF_CONNECTOR_CONVERT_OK 0
30+
#define FLB_SIMDUTF_CONNECTOR_CONVERT_NOP -1
31+
#define FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED -2
32+
#define FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR -3
33+
34+
/* Copied from the amalgamated simdutf.h so the values can be used from C without the C++ namespace */
35+
/*
 * Encoding identifiers understood by the connector. Each concrete encoding
 * occupies its own bit so that several candidates can be combined in a
 * single integer. FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO is the flag that
 * requests automatic detection of the input encoding.
 */
enum flb_simdutf_encoding_type {
    FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED  = 0,
    FLB_SIMDUTF_ENCODING_TYPE_UTF8         = 1 << 0,  /* BOM 0xef 0xbb 0xbf */
    FLB_SIMDUTF_ENCODING_TYPE_UTF16_LE     = 1 << 1,  /* BOM 0xff 0xfe */
    FLB_SIMDUTF_ENCODING_TYPE_UTF16_BE     = 1 << 2,  /* BOM 0xfe 0xff */
    FLB_SIMDUTF_ENCODING_TYPE_UTF32_LE     = 1 << 3,  /* BOM 0xff 0xfe 0x00 0x00 */
    FLB_SIMDUTF_ENCODING_TYPE_UTF32_BE     = 1 << 4,  /* BOM 0x00 0x00 0xfe 0xff */
    FLB_SIMDUTF_ENCODING_TYPE_Latin1       = 1 << 5,
    FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO = 1 << 10  /* automatic detection flag */
};
46+
47+
enum flb_simdutf_error_code {
48+
FLB_SIMDUTF_ERROR_CODE_SUCCESS = FLB_SIMDUTF_CONNECTOR_CONVERT_OK,
49+
FLB_SIMDUTF_ERROR_CODE_HEADER_BITS,
50+
FLB_SIMDUTF_ERROR_CODE_TOO_SHORT,
51+
FLB_SIMDUTF_ERROR_CODE_TOO_LONG,
52+
FLB_SIMDUTF_ERROR_CODE_OVERLONG,
53+
FLB_SIMDUTF_ERROR_CODE_TOO_LARGE,
54+
FLB_SIMDUTF_ERROR_CODE_SURROGATE,
55+
FLB_SIMDUTF_ERROR_CODE_INVALID_BASE64_CHARACTER,
56+
FLB_SIMDUTF_ERROR_CODE_BASE64_INPUT_REMAINDER,
57+
FLB_SIMDUTF_ERROR_CODE_OUTPUT_BUFFER_TOO_SMALL,
58+
FLB_SIMDUTF_ERROR_CODE_OTHER,
59+
};
60+
61+
/*
 * UTF-8 size queries: number of UTF-8 bytes needed to represent `len`
 * UTF-16 code units (little-endian, big-endian, or native byte order).
 * The size computed by simdutf is narrowed to int.
 */
int flb_simdutf_connector_utf8_length_from_utf16le(const char16_t *buf, size_t len);
int flb_simdutf_connector_utf8_length_from_utf16be(const char16_t *buf, size_t len);
int flb_simdutf_connector_utf8_length_from_utf16(const char16_t *buf, size_t len);

/*
 * Validators: return non-zero when the buffer is well-formed in the named
 * encoding and 0 otherwise.
 */
int flb_simdutf_connector_validate_utf8(const char *buf, size_t len);
int flb_simdutf_connector_validate_utf16le(const char16_t *buf, size_t len);
int flb_simdutf_connector_validate_utf16be(const char16_t *buf, size_t len);
int flb_simdutf_connector_validate_utf16(const char16_t *buf, size_t len);

/*
 * UTF-16 -> UTF-8 converters. On success they return
 * FLB_SIMDUTF_ERROR_CODE_SUCCESS, store a heap-allocated buffer in
 * *utf8_output (to be released by the caller with free()) and the number of
 * UTF-8 bytes in *out_size. Otherwise *utf8_output is NULL and *out_size
 * is 0; always check *utf8_output before using the result.
 */
int flb_simdutf_connector_convert_utf16le_to_utf8(const char16_t *buf, size_t len,
                                                  char **utf8_output,
                                                  size_t *out_size);
int flb_simdutf_connector_convert_utf16be_to_utf8(const char16_t *buf, size_t len,
                                                  char **utf8_output,
                                                  size_t *out_size);
int flb_simdutf_connector_convert_utf16_to_utf8(const char16_t *buf, size_t len,
                                                char **utf8_output,
                                                size_t *out_size);

/* Byte-swap `length` UTF-16 code units from `input` into `output`. */
void flb_simdutf_connector_change_endianness_utf16(const char16_t *input,
                                                   size_t length,
                                                   char16_t *output);

/* Return the simdutf bitmask of encodings the buffer could be stored in. */
int flb_simdutf_connector_detect_encodings(const char *input, size_t length);

/*
 * Convert raw bytes to UTF-8 according to `preferred_encoding` (one of
 * enum flb_simdutf_encoding_type). Returns
 * FLB_SIMDUTF_CONNECTOR_CONVERT_NOP when the data is treated as UTF-8,
 * FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED for encodings other than
 * UTF-8/UTF-16LE/UTF-16BE, and the UTF-16 converter result otherwise.
 */
int flb_simdutf_connector_convert_from_unicode(int preferred_encoding,
                                               const char *input, size_t length,
                                               char **output, size_t *out_size);
79+
80+
#ifdef __cplusplus
81+
}
82+
#endif /* __cplusplus */
83+
84+
#endif

src/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,10 @@ if(FLB_WASM)
334334
add_subdirectory(wasm)
335335
endif()
336336

337+
if(FLB_UNICODE_ENCODER)
338+
add_subdirectory(simdutf)
339+
endif()
340+
337341
# WAMRC compiler
338342
if(FLB_WAMRC)
339343
add_subdirectory(wamrc)
@@ -409,11 +413,12 @@ set(FLB_DEPS
409413
)
410414
endif()
411415

412-
# Unicode Encoding
416+
# Unicode Encoding (UTF-16LE, UTF-16BE)
413417
if(FLB_UNICODE_ENCODER)
414418
set(FLB_DEPS
415419
${FLB_DEPS}
416420
simdutf-static
421+
flb-simdutf-connector-static
417422
)
418423
endif()
419424

src/simdutf/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Add the simdutf source directory (path given relative to this directory)
# to the include search path used by the connector sources.
include_directories (../../${FLB_PATH_LIB_SIMDUTF}/src/simdutf)
2+
3+
# Configure-time trace. Note that the printed path omits the "../../" prefix
# used by include_directories() above.
message(STATUS "include: ${FLB_PATH_LIB_SIMDUTF}/src/simdutf")
4+
5+
# Sources of the C connector.
set(src
6+
flb_simdutf_connector.cpp)
7+
8+
# Build the connector as a static library and link it against simdutf-static.
add_library(flb-simdutf-connector-static STATIC ${src})
9+
target_link_libraries(flb-simdutf-connector-static simdutf-static)
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2015-2024 The Fluent Bit Authors
6+
*
7+
* Licensed under the Apache License, Version 2.0 (the "License");
8+
* you may not use this file except in compliance with the License.
9+
* You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
#include <simdutf.h>
#include <uchar.h>
#include <memory.h>
#include <fluent-bit/simdutf/flb_simdutf_connector.h>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>
25+
26+
int flb_simdutf_connector_utf8_length_from_utf16le(const char16_t *buf, size_t len)
27+
{
28+
return simdutf::utf8_length_from_utf16le(buf, len);
29+
}
30+
31+
int flb_simdutf_connector_utf8_length_from_utf16be(const char16_t *buf, size_t len)
32+
{
33+
return simdutf::utf8_length_from_utf16be(buf, len);
34+
}
35+
36+
int flb_simdutf_connector_utf8_length_from_utf16(const char16_t *buf, size_t len)
37+
{
38+
return simdutf::utf8_length_from_utf16(buf, len);
39+
}
40+
41+
int flb_simdutf_connector_validate_utf8(const char *buf, size_t len)
42+
{
43+
return simdutf::validate_utf8(buf, len);
44+
}
45+
46+
int flb_simdutf_connector_validate_utf16le(const char16_t *buf, size_t len)
47+
{
48+
return simdutf::validate_utf16le(buf, len);
49+
}
50+
51+
int flb_simdutf_connector_validate_utf16be(const char16_t *buf, size_t len)
52+
{
53+
return simdutf::validate_utf16be(buf, len);
54+
}
55+
56+
int flb_simdutf_connector_validate_utf16(const char16_t *buf, size_t len)
57+
{
58+
return simdutf::validate_utf16(buf, len);
59+
}
60+
61+
int flb_simdutf_connector_convert_utf16le_to_utf8(const char16_t *buf, size_t len,
62+
char **utf8_output, size_t *out_size)
63+
{
64+
size_t clen = 0;
65+
size_t converted = 0;
66+
simdutf::result result;
67+
68+
clen = simdutf::utf8_length_from_utf16le(buf, len);
69+
/* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */
70+
std::unique_ptr<char[]> output{new char[clen]};
71+
converted = simdutf::convert_utf16le_to_utf8(buf, len, output.get());
72+
result = simdutf::validate_utf8_with_errors(output.get(), clen);
73+
if (result.error == simdutf::error_code::SUCCESS && converted > 0) {
74+
std::string result_string(output.get(), clen);
75+
76+
*utf8_output = strdup(result_string.c_str());
77+
*out_size = converted;
78+
79+
return FLB_SIMDUTF_ERROR_CODE_SUCCESS;
80+
}
81+
else {
82+
*utf8_output = NULL;
83+
*out_size = 0;
84+
85+
return result.error;
86+
}
87+
}
88+
89+
int flb_simdutf_connector_convert_utf16be_to_utf8(const char16_t *buf, size_t len,
90+
char **utf8_output, size_t *out_size)
91+
{
92+
size_t clen = 0;
93+
size_t converted = 0;
94+
simdutf::result result;
95+
96+
clen = simdutf::utf8_length_from_utf16be(buf, len);
97+
/* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */
98+
std::unique_ptr<char[]> output{new char[clen]};
99+
converted = simdutf::convert_utf16be_to_utf8(buf, len, output.get());
100+
result = simdutf::validate_utf8_with_errors(output.get(), clen);
101+
if (result.error == simdutf::error_code::SUCCESS && converted > 0) {
102+
std::string result_string(output.get(), clen);
103+
104+
*utf8_output = strdup(result_string.c_str());
105+
*out_size = converted;
106+
107+
return FLB_SIMDUTF_ERROR_CODE_SUCCESS;
108+
}
109+
else {
110+
*utf8_output = NULL;
111+
*out_size = 0;
112+
113+
return result.error;
114+
}
115+
}
116+
117+
int flb_simdutf_connector_convert_utf16_to_utf8(const char16_t *buf, size_t len,
118+
char **utf8_output, size_t *out_size)
119+
{
120+
size_t clen = 0;
121+
size_t converted = 0;
122+
simdutf::result result;
123+
124+
clen = simdutf::utf8_length_from_utf16(buf, len);
125+
/* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */
126+
std::unique_ptr<char[]> output{new char[clen]};
127+
converted = simdutf::convert_utf16_to_utf8(buf, len, output.get());
128+
result = simdutf::validate_utf8_with_errors(output.get(), clen);
129+
if (result.error == simdutf::error_code::SUCCESS && converted > 0) {
130+
std::string result_string(output.get(), clen);
131+
132+
*utf8_output = strdup(result_string.c_str());
133+
*out_size = converted;
134+
135+
return FLB_SIMDUTF_ERROR_CODE_SUCCESS;
136+
}
137+
else {
138+
*utf8_output = NULL;
139+
*out_size = 0;
140+
141+
return result.error;
142+
}
143+
}
144+
145+
void flb_simdutf_connector_change_endianness_utf16(const char16_t *input, size_t length, char16_t *output)
146+
{
147+
simdutf::change_endianness_utf16(input, length, output);
148+
}
149+
150+
int flb_simdutf_connector_detect_encodings(const char *input, size_t length)
151+
{
152+
return simdutf::detect_encodings(input, length);
153+
}
154+
155+
int flb_simdutf_connector_convert_from_unicode(int preferred_encoding,
156+
const char *input, size_t length,
157+
char **output, size_t *out_size)
158+
{
159+
int len = 0;
160+
size_t clen = 0;
161+
size_t i = 0;
162+
int encoding = 0;
163+
std::u16string str16;
164+
165+
if (preferred_encoding == FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO) {
166+
encoding = simdutf::detect_encodings(input, length);
167+
}
168+
else if (preferred_encoding != FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED) {
169+
encoding = preferred_encoding;
170+
}
171+
else {
172+
/* forcibly nop on this condition */
173+
encoding = FLB_SIMDUTF_ENCODING_TYPE_UTF8;
174+
}
175+
if ((encoding & simdutf::encoding_type::UTF8) == simdutf::encoding_type::UTF8) {
176+
/* Nothing to do! */
177+
return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP;
178+
}
179+
else if ((encoding & simdutf::encoding_type::UTF16_LE) == simdutf::encoding_type::UTF16_LE) {
180+
len = length;
181+
if (len % 2) {
182+
len--;
183+
}
184+
for (i = 0; i < len;) {
185+
if (i + 2 > len) {
186+
break;
187+
}
188+
/* little-endian */
189+
int lo = input[i++] & 0xFF;
190+
int hi = input[i++] & 0xFF;
191+
str16.push_back(hi << 8 | lo);
192+
}
193+
194+
return flb_simdutf_connector_convert_utf16le_to_utf8(str16.c_str(), str16.size(),
195+
output, out_size);
196+
}
197+
else if ((encoding & simdutf::encoding_type::UTF16_BE) == simdutf::encoding_type::UTF16_BE) {
198+
len = length;
199+
if (len % 2) {
200+
len--;
201+
}
202+
for (i = 0; i < len;) {
203+
if (i + 2 > len) {
204+
break;
205+
}
206+
/* big-endian */
207+
int lo = input[i++] & 0xFF;
208+
int hi = input[i++] & 0xFF;
209+
str16.push_back(lo | hi << 8);
210+
}
211+
212+
return flb_simdutf_connector_convert_utf16be_to_utf8(str16.c_str(), str16.size(),
213+
output, out_size);
214+
}
215+
else {
216+
/* Note: UTF-32LE and UTF-32BE are used for internal usages
217+
* nowadays. So, not to be provided for these encodings is reasonable. */
218+
/* When detected unsupported encodings, it will be reached here. */
219+
return FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED;
220+
}
221+
222+
return FLB_SIMDUTF_CONNECTOR_CONVERT_OK;
223+
}

0 commit comments

Comments
 (0)