Skip to content

Commit 5497e2a

Browse files
erikced and edsiper
authored and committed
simdutf_connector: reduce copying
- Do not copy input if data is already aligned. - Only allocate output once. Signed-off-by: Erik Cederberg <[email protected]>
1 parent d4496e6 commit 5497e2a

File tree

1 file changed

+79
-73
lines changed

1 file changed

+79
-73
lines changed

src/simdutf/flb_simdutf_connector.cpp

Lines changed: 79 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,49 @@
1919

2020
#include <simdutf.h>
2121
#include <fluent-bit/simdutf/flb_simdutf_connector.h>
22-
#include <memory.h>
2322
#include <memory>
23+
extern "C"
24+
{
25+
#include <fluent-bit/flb_log.h>
26+
#include <fluent-bit/flb_mem.h>
27+
}
28+
29+
typedef int (*conversion_function)(const char16_t *buf, size_t len,
30+
char **utf8_output, size_t *out_size);
31+
32+
static int convert_from_unicode(conversion_function convert,
33+
const char *input, size_t length,
34+
char **output, size_t *out_size)
35+
{
36+
size_t len;
37+
std::unique_ptr<char16_t, decltype(&flb_free)> temp_buffer(NULL, flb_free);
38+
const char16_t *aligned_input = NULL;
39+
int status;
40+
41+
len = length;
42+
if (len % 2) {
43+
len--;
44+
}
45+
if (len < 2) {
46+
return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP;
47+
}
48+
49+
/* Check alignment to determine whether to copy or not */
50+
if ((uintptr_t) input % 2 == 0) {
51+
aligned_input = (const char16_t *) input;
52+
}
53+
else {
54+
temp_buffer.reset((char16_t *) flb_malloc(len));
55+
if (temp_buffer.get() == NULL) {
56+
flb_errno();
57+
return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR;
58+
}
59+
memcpy(temp_buffer.get(), input, len);
60+
aligned_input = temp_buffer.get();
61+
}
62+
63+
return convert(aligned_input, len / 2, output, out_size);
64+
}
2465

2566
int flb_simdutf_connector_utf8_length_from_utf16le(const char16_t *buf, size_t len)
2667
{
@@ -61,23 +102,24 @@ int flb_simdutf_connector_convert_utf16le_to_utf8(const char16_t *buf, size_t le
61102
char **utf8_output, size_t *out_size)
62103
{
63104
size_t clen = 0;
64-
size_t converted = 0;
65-
simdutf::result result;
105+
simdutf::result result = {};
66106

67107
clen = simdutf::utf8_length_from_utf16le(buf, len);
68-
/* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */
69-
std::unique_ptr<char[]> output{new char[clen]};
70-
converted = simdutf::convert_utf16le_to_utf8(buf, len, output.get());
71-
result = simdutf::validate_utf8_with_errors(output.get(), clen);
72-
if (result.error == simdutf::error_code::SUCCESS && converted > 0) {
73-
std::string result_string(output.get(), clen);
108+
*utf8_output = (char *) flb_malloc(clen + 1);
109+
if (*utf8_output == NULL) {
110+
flb_errno();
111+
return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR;
112+
}
74113

75-
*utf8_output = strdup(result_string.c_str());
76-
*out_size = converted;
114+
result = simdutf::convert_utf16le_to_utf8_with_errors(buf, len, *utf8_output);
115+
if (result.error == simdutf::error_code::SUCCESS && result.count > 0) {
116+
(*utf8_output)[result.count] = '\0';
117+
*out_size = result.count;
77118

78119
return FLB_SIMDUTF_ERROR_CODE_SUCCESS;
79120
}
80121
else {
122+
flb_free(*utf8_output);
81123
*utf8_output = NULL;
82124
*out_size = 0;
83125

@@ -89,23 +131,24 @@ int flb_simdutf_connector_convert_utf16be_to_utf8(const char16_t *buf, size_t le
89131
char **utf8_output, size_t *out_size)
90132
{
91133
size_t clen = 0;
92-
size_t converted = 0;
93-
simdutf::result result;
134+
simdutf::result result = {};
94135

95136
clen = simdutf::utf8_length_from_utf16be(buf, len);
96-
/* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */
97-
std::unique_ptr<char[]> output{new char[clen]};
98-
converted = simdutf::convert_utf16be_to_utf8(buf, len, output.get());
99-
result = simdutf::validate_utf8_with_errors(output.get(), clen);
100-
if (result.error == simdutf::error_code::SUCCESS && converted > 0) {
101-
std::string result_string(output.get(), clen);
137+
*utf8_output = (char *) flb_malloc(clen + 1);
138+
if (*utf8_output == NULL) {
139+
flb_errno();
140+
return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR;
141+
}
102142

103-
*utf8_output = strdup(result_string.c_str());
104-
*out_size = converted;
143+
result = simdutf::convert_utf16be_to_utf8_with_errors(buf, len, *utf8_output);
144+
if (result.error == simdutf::error_code::SUCCESS && result.count > 0) {
145+
(*utf8_output)[result.count] = '\0';
146+
*out_size = result.count;
105147

106148
return FLB_SIMDUTF_ERROR_CODE_SUCCESS;
107149
}
108150
else {
151+
flb_free(*utf8_output);
109152
*utf8_output = NULL;
110153
*out_size = 0;
111154

@@ -117,23 +160,24 @@ int flb_simdutf_connector_convert_utf16_to_utf8(const char16_t *buf, size_t len,
117160
char **utf8_output, size_t *out_size)
118161
{
119162
size_t clen = 0;
120-
size_t converted = 0;
121-
simdutf::result result;
163+
simdutf::result result = {};
122164

123165
clen = simdutf::utf8_length_from_utf16(buf, len);
124-
/* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */
125-
std::unique_ptr<char[]> output{new char[clen]};
126-
converted = simdutf::convert_utf16_to_utf8(buf, len, output.get());
127-
result = simdutf::validate_utf8_with_errors(output.get(), clen);
128-
if (result.error == simdutf::error_code::SUCCESS && converted > 0) {
129-
std::string result_string(output.get(), clen);
166+
*utf8_output = (char *) flb_malloc(clen + 1);
167+
if (*utf8_output == NULL) {
168+
flb_errno();
169+
return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR;
170+
}
130171

131-
*utf8_output = strdup(result_string.c_str());
132-
*out_size = converted;
172+
result = simdutf::convert_utf16_to_utf8_with_errors(buf, len, *utf8_output);
173+
if (result.error == simdutf::error_code::SUCCESS && result.count > 0) {
174+
(*utf8_output)[result.count] = '\0';
175+
*out_size = result.count;
133176

134177
return FLB_SIMDUTF_ERROR_CODE_SUCCESS;
135178
}
136179
else {
180+
flb_free(*utf8_output);
137181
*utf8_output = NULL;
138182
*out_size = 0;
139183

@@ -155,11 +199,7 @@ int flb_simdutf_connector_convert_from_unicode(int preferred_encoding,
155199
const char *input, size_t length,
156200
char **output, size_t *out_size)
157201
{
158-
size_t len = 0;
159-
size_t i = 0;
160202
int encoding = 0;
161-
std::u16string str16;
162-
163203
if (preferred_encoding == FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO) {
164204
encoding = simdutf::detect_encodings(input, length);
165205
}
@@ -175,46 +215,12 @@ int flb_simdutf_connector_convert_from_unicode(int preferred_encoding,
175215
return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP;
176216
}
177217
else if ((encoding & simdutf::encoding_type::UTF16_LE) == simdutf::encoding_type::UTF16_LE) {
178-
len = length;
179-
if (len % 2) {
180-
len--;
181-
}
182-
if (len < 2) {
183-
return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP;
184-
}
185-
for (i = 0 ; i < len;) {
186-
if (i + 2 > len) {
187-
break;
188-
}
189-
/* little-endian */
190-
int lo = input[i++] & 0xFF;
191-
int hi = input[i++] & 0xFF;
192-
str16.push_back(hi << 8 | lo);
193-
}
194-
195-
return flb_simdutf_connector_convert_utf16le_to_utf8(str16.c_str(), str16.size(),
196-
output, out_size);
218+
return convert_from_unicode(flb_simdutf_connector_convert_utf16le_to_utf8,
219+
input, length, output, out_size);
197220
}
198221
else if ((encoding & simdutf::encoding_type::UTF16_BE) == simdutf::encoding_type::UTF16_BE) {
199-
len = length;
200-
if (len % 2) {
201-
len--;
202-
}
203-
if (len < 2) {
204-
return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP;
205-
}
206-
for (i = 0; i < len;) {
207-
if (i + 2 > len) {
208-
break;
209-
}
210-
/* big-endian */
211-
int lo = input[i++] & 0xFF;
212-
int hi = input[i++] & 0xFF;
213-
str16.push_back(lo | hi << 8);
214-
}
215-
216-
return flb_simdutf_connector_convert_utf16be_to_utf8(str16.c_str(), str16.size(),
217-
output, out_size);
222+
return convert_from_unicode(flb_simdutf_connector_convert_utf16be_to_utf8,
223+
input, length, output, out_size);
218224
}
219225
else {
220226
/* Note: UTF-32LE and UTF-32BE are used for internal usages

0 commit comments

Comments
 (0)