Skip to content

Commit 3c9fefc

Browse files
nigels-com authored and edsiper committed
lib: tutf8e: A tiny UTF-8 encoder for C
Signed-off-by: Nigel Stewart <[email protected]>
1 parent 9767919 commit 3c9fefc

File tree

11 files changed

+1644
-0
lines changed

11 files changed

+1644
-0
lines changed

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ option(FLB_SMALL "Optimise for small size" No)
5050
option(FLB_COVERAGE "Build with code-coverage" No)
5151
option(FLB_JEMALLOC "Build with Jemalloc support" No)
5252
option(FLB_REGEX "Build with Regex support" Yes)
53+
option(FLB_ENCODE "Build with UTF8 encoding support" Yes)
5354
option(FLB_PARSER "Build with Parser support" Yes)
5455
option(FLB_TLS "Build with SSL/TLS support" No)
5556
option(FLB_BINARY "Build executable binary" Yes)
@@ -310,6 +311,11 @@ add_subdirectory(${FLB_PATH_LIB_MPACK} EXCLUDE_FROM_ALL)
310311
# Miniz (zip)
311312
add_subdirectory(${FLB_PATH_LIB_MINIZ} EXCLUDE_FROM_ALL)
312313

314+
# tutf8e
315+
if(FLB_ENCODE)
316+
add_subdirectory(${FLB_PATH_LIB_TUTF8E} EXCLUDE_FROM_ALL)
317+
endif()
318+
313319
# Chunk I/O
314320
FLB_OPTION(CIO_LIB_STATIC ON)
315321
FLB_OPTION(CIO_LIB_SHARED OFF)
@@ -558,6 +564,12 @@ if(FLB_REGEX)
558564
FLB_DEFINITION(FLB_HAVE_REGEX)
559565
endif()
560566

567+
# tutf8e (UTF8 Encoding)
568+
# =====================
569+
if(FLB_ENCODE)
570+
FLB_DEFINITION(FLB_HAVE_ENCODE)
571+
endif()
572+
561573
# LuaJIT (Scripting Support)
562574
# ==========================
563575
if(FLB_LUAJIT)

cmake/headers.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ include_directories(
1919
${CMAKE_CURRENT_BINARY_DIR}/include
2020
)
2121

22+
if(FLB_ENCODE)
23+
include_directories(${FLB_PATH_ROOT_SOURCE}/${FLB_PATH_LIB_TUTF8E}/include)
24+
endif()
25+
2226
# On Windows, the core uses libevent
2327
if(CMAKE_SYSTEM_NAME MATCHES "Windows")
2428
include_directories(

cmake/libraries.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ set(FLB_PATH_LIB_SQLITE "lib/sqlite-amalgamation-3240000")
1111
set(FLB_PATH_LIB_ONIGMO "lib/onigmo")
1212
set(FLB_PATH_LIB_MPACK "lib/mpack-amalgamation-1.0")
1313
set(FLB_PATH_LIB_MINIZ "lib/miniz")
14+
set(FLB_PATH_LIB_TUTF8E "lib/tutf8e")

lib/tutf8e/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
cmake_minimum_required(VERSION 2.8)
2+
project(tutf8e)
3+
4+
set(CMAKE_C_FLAGS "-Os -Wall")
5+
6+
include_directories(include)
7+
add_library(tutf8e STATIC src/tutf8e.c)
8+
9+
add_executable(tutf8e-test test/test.c)
10+
target_link_libraries(tutf8e-test tutf8e)

lib/tutf8e/LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2019 Nigel Stewart
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

lib/tutf8e/README.md

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# tutf8e
2+
3+
*Tute Feighty*
4+
5+
A tiny UTF-8 encoder for C.
6+
7+
## Goals
8+
9+
* As small and fast as possible
10+
* Narrowly scoped to one-step UTF-8 encoding in C
11+
* Link only what you need and use
12+
* MIT licence
13+
14+
## Supported Encodings
15+
16+
* [iso-8859-1](https://en.wikipedia.org/wiki/ISO/IEC_8859-1) Latin-1 Western European
17+
* [iso-8859-2](https://en.wikipedia.org/wiki/ISO/IEC_8859-2) Latin-2 East European
18+
* [iso-8859-3](https://en.wikipedia.org/wiki/ISO/IEC_8859-3) Latin-3 South European
19+
* [iso-8859-4](https://en.wikipedia.org/wiki/ISO/IEC_8859-4) Latin-4 North European
20+
* [iso-8859-5](https://en.wikipedia.org/wiki/ISO/IEC_8859-5) Part 5: Latin/Cyrillic
21+
* [iso-8859-6](https://en.wikipedia.org/wiki/ISO/IEC_8859-6) Part 6: Latin/Arabic
22+
* [iso-8859-7](https://en.wikipedia.org/wiki/ISO/IEC_8859-7) Part 7: Latin/Greek
23+
* [iso-8859-8](https://en.wikipedia.org/wiki/ISO/IEC_8859-8) Part 8: Latin/Hebrew
24+
* [iso-8859-9](https://en.wikipedia.org/wiki/ISO/IEC_8859-9) Latin-5 Turkish
25+
* [iso-8859-10](https://en.wikipedia.org/wiki/ISO/IEC_8859-10) Latin-6 Nordic
26+
* [iso-8859-11](https://en.wikipedia.org/wiki/ISO/IEC_8859-11) Part 11: Latin/Thai
27+
* [iso-8859-13](https://en.wikipedia.org/wiki/ISO/IEC_8859-13) Latin-7 Baltic Rim
28+
* [iso-8859-14](https://en.wikipedia.org/wiki/ISO/IEC_8859-14) Latin-8 Celtic
29+
* [iso-8859-15](https://en.wikipedia.org/wiki/ISO/IEC_8859-15) Latin-9 Western European
30+
* [iso-8859-16](https://en.wikipedia.org/wiki/ISO/IEC_8859-16) Latin-10 South-Eastern European
31+
* [windows-1250](https://en.wikipedia.org/wiki/Windows-1250) Central European and Eastern European
32+
* [windows-1251](https://en.wikipedia.org/wiki/Windows-1251) Cyrillic
33+
* [windows-1252](https://en.wikipedia.org/wiki/Windows-1252) Western European
34+
* [windows-1253](https://en.wikipedia.org/wiki/Windows-1253) Greek
35+
* [windows-1254](https://en.wikipedia.org/wiki/Windows-1254) Turkish
36+
* [windows-1255](https://en.wikipedia.org/wiki/Windows-1255) Hebrew
37+
* [windows-1256](https://en.wikipedia.org/wiki/Windows-1256) Arabic
38+
* [windows-1257](https://en.wikipedia.org/wiki/Windows-1257) Baltic
39+
* [windows-1258](https://en.wikipedia.org/wiki/Windows-1258) Vietnamese
40+
41+
## Test Procedure
42+
43+
```
44+
$ ./codegen.py
45+
46+
$ gcc src/* test/test.c -Iinclude
47+
48+
$ ./a.out
49+
A quick brown fox jumps over the lazy dog
50+
Nechť již hříšné saxofony ďáblů rozezvučí síň úděsnými tóny waltzu, tanga a quickstepu.
51+
Pijamalı hasta yağız şoföre çabucak güvendi.
52+
Põdur Zagrebi tšellomängija-följetonist Ciqo külmetas kehvas garaažis
53+
В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
54+
διαφυλάξτε γενικά τη ζωή σας από βαθειά ψυχικά τραύματα
55+
עטלף אבק נס דרך מזגן שהתפוצץ כי חם
56+
Pijamalı hasta yağız şoföre çabucak güvendi.
57+
Flygande bäckasiner söka hwila på mjuka tuvor.
58+
เป็นมนุษย์สุดประเสริฐเลิศคุณค่า กว่าบรรดาฝูงสัตว์เดรัจฉาน จงฝ่าฟันพัฒนาวิชาการ อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า หัดอภัยเหมือนกีฬาอัชฌาสัย ปฏิบัติประพฤติกฎกำหนดใจ พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอยฯ
59+
Jeżu klątw, spłódź Finom część gry hańb!
60+
11 passed, 0 failed tests
61+
```
62+
63+
## How small is it?
64+
65+
512 bytes + overhead per encoding.
66+
67+
```
68+
$ for i in src/*; do gcc -c $i -O1; done
69+
$ du -bhc *.o | grep total
70+
32K total
71+
72+
$ for i in src/*; do gcc -c $i -O3; done
73+
$ du -bhc *.o | grep total
74+
32K total
75+
76+
$ for i in src/*; do gcc -c $i -Os; done
77+
$ du -bhc *.o | grep total
78+
28K total
79+
```
80+
81+
## Related
82+
83+
* [iconv](https://www.gnu.org/software/libiconv/)
84+
* [icu](http://site.icu-project.org/)

0 commit comments

Comments
 (0)