Skip to content

Commit d4986e8

Browse files
committed
encoding: add flb_encoding
Add flb_encoding functions for charset encodings to utf8. * Uses lib/tutf8e-library. * Only 8-bit source charsets are supported. * Encoding options (//OPTION) : //IGNORE, //QUESTION, //REPLACEMENT ///<text> * This commit doesn't add support to any input plugin. * Depends on fluent#2326 Signed-off-by: Jukka Pihl <[email protected]>
1 parent a3c018d commit d4986e8

File tree

4 files changed

+188
-1
lines changed

4 files changed

+188
-1
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ if(FLB_ALL)
196196
# Global
197197
set(FLB_DEBUG 1)
198198
set(FLB_TLS 1)
199+
set(FLB_UTF8_ENCODER 1)
199200

200201
# Input plugins
201202
set(FLB_IN_CPU 1)
@@ -353,7 +354,8 @@ add_subdirectory(${FLB_PATH_LIB_MINIZ} EXCLUDE_FROM_ALL)
353354

354355
# tutf8e
355356
if(FLB_UTF8_ENCODER)
356-
add_subdirectory(${FLB_PATH_LIB_TUTF8E} EXCLUDE_FROM_ALL)
357+
add_subdirectory(${FLB_PATH_LIB_TUTF8E})
358+
FLB_DEFINITION(FLB_HAVE_UTF8_ENCODER)
357359
endif()
358360

359361
# Chunk I/O

include/fluent-bit/flb_encoding.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2019 The Fluent Bit Authors
6+
* Copyright (C) 2015-2018 Treasure Data Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
#ifndef FLB_ENCODING_H
22+
#define FLB_ENCODING_H
23+
24+
#include <tutf8e.h>
25+
26+
27+
#define FLB_ENCODING_SUCCESS 0
28+
#define FLB_ENCODING_FAILURE -1
29+
30+
struct flb_encoding {
31+
TUTF8encoder encoder;
32+
const char *invalid;
33+
};
34+
35+
struct flb_encoding *flb_encoding_open(const char *encoding);
36+
37+
int flb_encoding_decode(struct flb_encoding *ec,
38+
char *str, size_t slen,
39+
char **result, size_t *result_len);
40+
41+
void flb_encoding_close(struct flb_encoding *ic);
42+
43+
#endif

src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,10 @@ set(FLB_DEPS
278278
${FLB_DEPS}
279279
tutf8e
280280
)
281+
set(src
282+
${src}
283+
"flb_encoding.c"
284+
)
281285
endif()
282286

283287
# Record Accessor

src/flb_encoding.c

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2+
3+
/* Fluent Bit
4+
* ==========
5+
* Copyright (C) 2019 The Fluent Bit Authors
6+
* Copyright (C) 2015-2018 Treasure Data Inc.
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
#include <stdio.h>
22+
23+
#include <string.h>
24+
#include <time.h>
25+
#include <ctype.h>
26+
27+
#include <fluent-bit/flb_macros.h>
28+
#include <fluent-bit/flb_config.h>
29+
#include <fluent-bit/flb_error.h>
30+
#include <fluent-bit/flb_mem.h>
31+
#include <fluent-bit/flb_str.h>
32+
#include <fluent-bit/flb_encoding.h>
33+
34+
35+
#include <tutf8e.h>
36+
37+
/*
38+
*
39+
* flb_encoding_open(encoding):
40+
* iso-8859-1,...
41+
* windows-1251 windows-1252, ..
42+
*
43+
* <charset> - fail if bad chars
44+
* <charset>//IGNORE //I - ignore bad chars
45+
* <charset>//REPLACEMENT //R - use unicode replacement chars for bad chars
46+
* <charset>//QUESTION //Q - use '?' for bad chars
47+
* <charset>///<str> - use <str> for bad chars
48+
*/
49+
50+
51+
static unsigned char replacement_utf8[] = { 0xEF, 0xBF, 0xBD , 0 };
52+
53+
struct flb_encoding *flb_encoding_open(const char *encoding) {
54+
struct flb_encoding *ec;
55+
TUTF8encoder encoder;
56+
const char *invalid;
57+
char *opt;
58+
59+
if ((opt = strstr(encoding,"//")) != NULL) {
60+
*opt = 0;
61+
opt += 2;
62+
if (*opt == '/') {
63+
invalid = opt + 1;
64+
} else if (!strcmp(opt,"I") || !strcmp(opt,"IGNORE")) {
65+
invalid = "";
66+
} else if (!strcmp(opt,"R") || !strcmp(opt,"REPLACEMENT")) {
67+
invalid = (const char *) replacement_utf8;
68+
} else if (!strcmp(opt,"Q") || !strcmp(opt,"QUESTION")) {
69+
invalid = "?";
70+
} else {
71+
flb_error("[flb_encoding] unknown encodig option: %s", opt);
72+
}
73+
} else {
74+
invalid = NULL;
75+
}
76+
77+
if ((encoder = tutf8e_encoder(encoding)) == NULL) {
78+
flb_error("[flb_encoding] unknown encoding: %s", encoding);
79+
return NULL;
80+
}
81+
ec = flb_calloc(sizeof(struct flb_encoding),1);
82+
ec->encoder = encoder;
83+
ec->invalid = invalid ? flb_strdup(invalid) : NULL;
84+
return ec;
85+
}
86+
87+
88+
int flb_encoding_decode(struct flb_encoding *ec,
89+
char *str, size_t slen,
90+
char **result, size_t *result_len)
91+
{
92+
size_t outlen = 0;
93+
char *outbuf;
94+
int ret;
95+
96+
*result = NULL;
97+
*result_len = 0;
98+
99+
if (slen == 0) {
100+
*result = flb_strdup("");
101+
*result_len = 0;
102+
return FLB_ENCODING_SUCCESS;
103+
}
104+
105+
ret = tutf8e_encoder_buffer_length(ec->encoder, str, ec->invalid, slen, &outlen);
106+
107+
if (ret != TUTF8E_OK) {
108+
return FLB_ENCODING_FAILURE;
109+
}
110+
111+
112+
outbuf = flb_malloc(outlen + 1);
113+
if(outbuf == NULL) {
114+
flb_error("[flb_encoding] out of memory (%zu)", (int) outlen + 1);
115+
return FLB_ENCODING_FAILURE;
116+
}
117+
118+
ret = tutf8e_encoder_buffer_encode(ec->encoder, str, slen, ec->invalid, outbuf, &outlen);
119+
120+
if (ret != TUTF8E_OK) {
121+
flb_free(outbuf);
122+
return FLB_ENCODING_FAILURE;
123+
}
124+
outbuf[outlen] = 0;
125+
*result = outbuf;
126+
*result_len = outlen;
127+
128+
return FLB_ENCODING_SUCCESS;
129+
}
130+
131+
void flb_encoding_close(struct flb_encoding *ec) {
132+
if (ec) {
133+
if (ec->invalid) {
134+
flb_free((char*)ec->invalid);
135+
}
136+
}
137+
}
138+

0 commit comments

Comments
 (0)