Skip to content

Commit 84176d1

Browse files
committed
encoding: add flb_encoding
Add flb_encoding functions for charset encodings to utf8. * Uses lib/tutf8e-library. * Only 8-bit source charsets are supported. * Encoding options (//OPTION) : //IGNORE, //QUESTION, //REPLACEMENT ///<text> * This commit doesn't add support to any input plugin. Signed-off-by: Jukka Pihl <[email protected]>
1 parent 71993e0 commit 84176d1

File tree

4 files changed

+210
-1
lines changed

4 files changed

+210
-1
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ if(FLB_ALL)
198198
# Global
199199
set(FLB_DEBUG 1)
200200
set(FLB_TLS 1)
201+
set(FLB_UTF8_ENCODER 1)
201202

202203
# Input plugins
203204
set(FLB_IN_CPU 1)
@@ -381,7 +382,8 @@ endif()
381382

382383
# tutf8e
383384
if(FLB_UTF8_ENCODER)
384-
add_subdirectory(${FLB_PATH_LIB_TUTF8E} EXCLUDE_FROM_ALL)
385+
add_subdirectory(${FLB_PATH_LIB_TUTF8E})
386+
FLB_DEFINITION(FLB_HAVE_UTF8_ENCODER)
385387
endif()
386388

387389
# xxHash

include/fluent-bit/flb_encoding.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2019 The Fluent Bit Authors
6+
* Copyright (C) 2015-2018 Treasure Data Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
#ifndef FLB_ENCODING_H
22+
#define FLB_ENCODING_H
23+
24+
#include <tutf8e.h>
25+
26+
27+
#define FLB_ENCODING_SUCCESS 0
28+
#define FLB_ENCODING_FAILURE -1
29+
30+
struct flb_encoding {
31+
TUTF8encoder encoder;
32+
const char *invalid;
33+
};
34+
35+
struct flb_encoding *flb_encoding_open(const char *encoding);
36+
37+
int flb_encoding_decode(struct flb_encoding *ec,
38+
char *str, size_t slen,
39+
char **result, size_t *result_len);
40+
41+
void flb_encoding_close(struct flb_encoding *ic);
42+
43+
#endif

src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,10 @@ set(FLB_DEPS
312312
${FLB_DEPS}
313313
tutf8e
314314
)
315+
set(src
316+
${src}
317+
"flb_encoding.c"
318+
)
315319
endif()
316320

317321
# Record Accessor

src/flb_encoding.c

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2019 The Fluent Bit Authors
6+
* Copyright (C) 2015-2018 Treasure Data Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
#include <stdio.h>
22+
23+
#include <string.h>
24+
#include <time.h>
25+
#include <ctype.h>
26+
27+
#include <fluent-bit/flb_macros.h>
28+
#include <fluent-bit/flb_config.h>
29+
#include <fluent-bit/flb_error.h>
30+
#include <fluent-bit/flb_mem.h>
31+
#include <fluent-bit/flb_str.h>
32+
#include <fluent-bit/flb_encoding.h>
33+
34+
35+
#include <tutf8e.h>
36+
37+
/*
38+
*
39+
* flb_encoding_open(encoding):
40+
* iso-8859-1,...
41+
* windows-1251 windows-1252, ..
42+
*
43+
* <charset> - fail if bad chars
44+
* <charset>//IGNORE //I - ignore bad chars
45+
* <charset>//REPLACEMENT //R - use unicode replacement chars for bad chars
46+
* <charset>//QUESTION //Q - use '?' for bad chars
47+
* <charset>///<str> - use <str> for bad chars
48+
*/
49+
50+
51+
static unsigned char replacement_utf8[] = { 0xEF, 0xBF, 0xBD , 0 };
52+
53+
struct flb_encoding *flb_encoding_open(const char *encoding) {
54+
struct flb_encoding *ec;
55+
TUTF8encoder encoder;
56+
const char *invalid;
57+
char *opt;
58+
59+
if ((opt = strstr(encoding,"//")) != NULL) {
60+
*opt = 0;
61+
opt += 2;
62+
if (*opt == '/') {
63+
invalid = opt + 1;
64+
}
65+
else if (!strcmp(opt,"I") || !strcmp(opt,"IGNORE")) {
66+
invalid = "";
67+
}
68+
else if (!strcmp(opt,"R") || !strcmp(opt,"REPLACEMENT")) {
69+
invalid = (const char *) replacement_utf8;
70+
}
71+
else if (!strcmp(opt,"Q") || !strcmp(opt,"QUESTION")) {
72+
invalid = "?";
73+
}
74+
else {
75+
flb_error("[flb_encoding] unknown encodig option: %s", opt);
76+
return NULL;
77+
}
78+
}
79+
else {
80+
invalid = NULL;
81+
}
82+
83+
if ((encoder = tutf8e_encoder(encoding)) == NULL) {
84+
flb_error("[flb_encoding] unknown encoding: %s", encoding);
85+
return NULL;
86+
}
87+
88+
ec = flb_calloc(sizeof(struct flb_encoding),1);
89+
90+
if (!ec) {
91+
flb_errno();
92+
return NULL;
93+
}
94+
95+
if (invalid) {
96+
invalid = flb_strdup(invalid);
97+
if (!invalid) {
98+
flb_errno();
99+
flb_free(ec);
100+
return NULL;
101+
}
102+
}
103+
104+
ec->encoder = encoder;
105+
ec->invalid = invalid;
106+
return ec;
107+
}
108+
109+
110+
int flb_encoding_decode(struct flb_encoding *ec,
111+
char *str, size_t slen,
112+
char **result, size_t *result_len)
113+
{
114+
size_t outlen = 0;
115+
char *outbuf;
116+
int ret;
117+
118+
*result = NULL;
119+
*result_len = 0;
120+
121+
if (slen == 0) {
122+
*result = flb_strdup("");
123+
*result_len = 0;
124+
return FLB_ENCODING_SUCCESS;
125+
}
126+
127+
ret = tutf8e_encoder_buffer_length(ec->encoder, str, ec->invalid, slen, &outlen);
128+
129+
if (ret != TUTF8E_OK) {
130+
return FLB_ENCODING_FAILURE;
131+
}
132+
133+
outbuf = flb_malloc(outlen + 1);
134+
if(outbuf == NULL) {
135+
flb_error("[flb_encoding] out of memory (%zu)", (int) outlen + 1);
136+
return FLB_ENCODING_FAILURE;
137+
}
138+
139+
ret = tutf8e_encoder_buffer_encode(ec->encoder, str, slen, ec->invalid, outbuf, &outlen);
140+
141+
if (ret != TUTF8E_OK) {
142+
flb_free(outbuf);
143+
return FLB_ENCODING_FAILURE;
144+
}
145+
146+
outbuf[outlen] = 0;
147+
*result = outbuf;
148+
*result_len = outlen;
149+
150+
return FLB_ENCODING_SUCCESS;
151+
}
152+
153+
void flb_encoding_close(struct flb_encoding *ec) {
154+
if (ec) {
155+
if (ec->invalid) {
156+
flb_free((char*)ec->invalid);
157+
}
158+
}
159+
}
160+

0 commit comments

Comments
 (0)