Skip to content

Commit dd5ae2d

Browse files
committed
Add support for GBK/CP936 encoding and conversion
1 parent 6bedcf4 commit dd5ae2d

File tree

7 files changed

+360
-2
lines changed

7 files changed

+360
-2
lines changed

include/openvic-dataloader/detail/Encoding.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ namespace ovdl::detail {
88
Ascii,
99
Utf8,
1010
Windows1251,
11-
Windows1252
11+
Windows1252,
12+
Gbk,
1213
};
1314
}

src/openvic-dataloader/csv/Parser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ struct Parser::ParseHandler final : detail::BasicFileParseHandler<CsvParseState>
4343
case Utf8:
4444
case Windows1251:
4545
case Windows1252:
46+
case Gbk:
4647
return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback());
4748
OVDL_DEFAULT_CASE_UNREACHABLE(Unknown);
4849
}
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#pragma once
2+
3+
#include <cerrno>
4+
#include <cstddef>
5+
#include <cstring>
6+
7+
#include <openvic-dataloader/detail/Encoding.hpp>
8+
9+
#include <lexy/_detail/memory_resource.hpp>
10+
#include <lexy/encoding.hpp>
11+
#include <lexy/input/buffer.hpp>
12+
#include <lexy/input/file.hpp>
13+
14+
#ifdef _WIN32
15+
#define WIN32_LEAN_AND_MEAN
16+
#include <windows.h>
17+
#undef WIN32_LEAN_AND_MEAN
18+
#elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
19+
#include <iconv.h>
20+
#endif
21+
22+
namespace ovdl::convert::gbk {
23+
template<typename Encoding, lexy::encoding_endianness Endian>
24+
struct _make_buffer {
25+
static constexpr size_t small_buffer_size = size_t(4) * 1024;
26+
27+
template<typename MemoryResource = void>
28+
auto operator()(detail::Encoding encoding, const void* _memory, std::size_t size,
29+
MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>()) const {
30+
constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big;
31+
32+
using char_type = typename Encoding::char_type;
33+
LEXY_PRECONDITION(size % sizeof(char_type) == 0);
34+
auto memory = static_cast<const unsigned char*>(_memory);
35+
36+
if constexpr (sizeof(char_type) == 1 || Endian == native_endianness) {
37+
switch (encoding) {
38+
using enum detail::Encoding;
39+
case Ascii:
40+
case Utf8:
41+
return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
42+
default: break;
43+
}
44+
45+
#if defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
46+
iconv_t cd = ::iconv_open("UTF-8", "WINDOWS-936");
47+
if (cd == (iconv_t)-1) {
48+
return lexy::buffer<Encoding, MemoryResource> { resource };
49+
}
50+
#endif
51+
52+
size_t in_size = size;
53+
// While technically illegal, it seems the contract for iconv is wrong, it doesn't modify the content of inbuff
54+
// It only ever does such for convenience
55+
char* in_buffer = const_cast<char*>(static_cast<const char*>(_memory));
56+
57+
if (in_buffer == nullptr) {
58+
return lexy::buffer<Encoding, MemoryResource> { resource };
59+
}
60+
61+
typename lexy::buffer<Encoding, MemoryResource>::builder out_builder(size * 3);
62+
char* out_buffer = out_builder.data();
63+
size_t out_size = out_builder.size();
64+
65+
auto iconv_err_handler = [&]() {
66+
if (errno == EILSEQ && in_buffer && in_size >= 1) {
67+
auto full_width_exclaim = [&] {
68+
// Insert UTF-8 ! (full width exclaimation mark)
69+
*out_buffer++ = '\xEF';
70+
*out_buffer++ = '\xBC';
71+
*out_buffer++ = '\x81';
72+
out_size -= 3;
73+
in_buffer += sizeof(char_type);
74+
--in_size;
75+
};
76+
switch (*in_buffer) {
77+
// Expect non-standard § from Windows-1252, required for color behavior
78+
case '\xA7':
79+
// Insert UTF-8 §
80+
*out_buffer++ = '\xC2';
81+
*out_buffer++ = '\xA7';
82+
out_size -= 2;
83+
in_buffer += sizeof(char_type);
84+
--in_size;
85+
return true;
86+
// Expect non-standard ! (full width exclaimation mark), found in some localizations
87+
case '\xA1':
88+
full_width_exclaim();
89+
return true;
90+
// Expect nothing then non-standard ! (full width exclaimation mark), found in some localizations
91+
case '\xAD':
92+
if (in_size >= 2 && in_buffer + 1 && in_buffer[1] == '\xA1') {
93+
--out_size;
94+
in_buffer += sizeof(char_type);
95+
--in_size;
96+
full_width_exclaim();
97+
}
98+
return true;
99+
// Unexpected error
100+
default: break;
101+
}
102+
}
103+
return false;
104+
};
105+
#if defined(_WIN32)
106+
auto iconv_mimic = [&]() -> int64_t {
107+
static constexpr size_t CP_GBK = 936;
108+
static constexpr size_t MB_CHAR_MAX = 16;
109+
110+
static auto mblen = [](const char* buf, int bufsize) {
111+
int len = 0;
112+
113+
unsigned char c = *buf;
114+
if (c < 0x80) len = 1;
115+
else if ((c & 0xE0) == 0xC0) len = 2;
116+
else if ((c & 0xF0) == 0xE0) len = 3;
117+
else if ((c & 0xF8) == 0xF0) len = 4;
118+
else if ((c & 0xFC) == 0xF8) len = 5;
119+
else if ((c & 0xFE) == 0xFC) len = 6;
120+
121+
if (len == 0) {
122+
errno = EILSEQ;
123+
return -1;
124+
} else if (bufsize < len) {
125+
errno = EINVAL;
126+
return -1;
127+
}
128+
return len;
129+
};
130+
131+
while (in_size != 0) {
132+
unsigned short wbuf[MB_CHAR_MAX]; /* enough room for one character */
133+
size_t wsize = MB_CHAR_MAX;
134+
135+
int insize = IsDBCSLeadByteEx(CP_GBK, *in_buffer) ? 2 : 1;
136+
if (insize == 2 && in_buffer && in_size >= 2) {
137+
// iconv errors on user-defined double byte characters
138+
// MultiByteToWideChar/WideCharToMultiByte does not
139+
unsigned char byte1 = static_cast<unsigned char>(*in_buffer);
140+
unsigned char byte2 = static_cast<unsigned char>(in_buffer[1]);
141+
if (byte1 >= 0xAA && byte1 <= 0xAF && byte2 >= 0xA1 && byte2 <= 0xFE) {
142+
errno = EILSEQ;
143+
return -1;
144+
}
145+
if (byte1 >= 0xF8 && byte1 <= 0xFE && byte2 >= 0xA1 && byte2 <= 0xFE) {
146+
errno = EILSEQ;
147+
return -1;
148+
}
149+
if (byte1 >= 0xA1 && byte1 <= 0xA7 && byte2 >= 0x40 && byte2 <= 0xA0 && byte2 != 0x7F) {
150+
errno = EILSEQ;
151+
return -1;
152+
}
153+
}
154+
wsize = MultiByteToWideChar(CP_GBK, MB_ERR_INVALID_CHARS, in_buffer, insize, (wchar_t*)wbuf, wsize);
155+
if (wsize == 0) {
156+
in_buffer += insize;
157+
in_size -= insize;
158+
continue;
159+
}
160+
161+
if (out_size == 0) {
162+
errno = E2BIG;
163+
return -1;
164+
}
165+
166+
int outsize = WideCharToMultiByte(CP_UTF8, 0, (const wchar_t*)wbuf, wsize, out_buffer, out_size, NULL, NULL);
167+
if (outsize == 0) {
168+
switch (GetLastError()) {
169+
case ERROR_INVALID_FLAGS:
170+
case ERROR_INVALID_PARAMETER:
171+
case ERROR_INSUFFICIENT_BUFFER:
172+
errno = E2BIG;
173+
return -1;
174+
default: break;
175+
}
176+
errno = EILSEQ;
177+
return -1;
178+
} else if (mblen(out_buffer, outsize) != outsize) {
179+
/* validate result */
180+
errno = EILSEQ;
181+
return -1;
182+
}
183+
184+
in_buffer += insize;
185+
out_buffer += outsize;
186+
in_size -= insize;
187+
out_size -= outsize;
188+
}
189+
190+
return 0;
191+
};
192+
193+
const auto end = in_buffer + size;
194+
while (in_size > 0 && out_size > 0 && in_buffer != end) {
195+
if (iconv_mimic() == -1) {
196+
if (!iconv_err_handler()) {
197+
break;
198+
}
199+
}
200+
}
201+
#elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
202+
const auto end = in_buffer + size;
203+
while (in_size > 0 && out_size > 0 && in_buffer != end) {
204+
if (::iconv(cd, &in_buffer, &in_size, &out_buffer, &out_size) == -1) {
205+
if (!iconv_err_handler()) {
206+
break;
207+
}
208+
}
209+
}
210+
::iconv_close(cd);
211+
#else
212+
#error "GBK conversion not supported on this platform"
213+
#endif
214+
return lexy::buffer<Encoding, MemoryResource> { out_builder.data(), static_cast<size_t>(out_buffer - out_builder.data()), resource };
215+
} else {
216+
return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
217+
}
218+
}
219+
};
220+
221+
template<typename Encoding, lexy::encoding_endianness Endianness = lexy::encoding_endianness::bom>
222+
constexpr auto make_buffer_from_raw = _make_buffer<Encoding, Endianness> {};
223+
224+
template<typename Encoding, lexy::encoding_endianness Endian, typename MemoryResource>
225+
struct _read_file_user_data : lexy::_read_file_user_data<Encoding, Endian, MemoryResource> {
226+
using base_type = lexy::_read_file_user_data<Encoding, Endian, MemoryResource>;
227+
228+
detail::Encoding encoding;
229+
230+
_read_file_user_data(detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {}
231+
static auto callback() {
232+
return [](void* _user_data, const char* memory, std::size_t size) {
233+
auto user_data = static_cast<_read_file_user_data*>(_user_data);
234+
235+
user_data->buffer = make_buffer_from_raw<Encoding, Endian>(user_data->encoding, memory, size, user_data->resource);
236+
};
237+
}
238+
};
239+
240+
template<typename Encoding = lexy::default_encoding,
241+
lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
242+
typename MemoryResource = void>
243+
auto read_file(
244+
const char* path,
245+
detail::Encoding encoding,
246+
MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
247+
-> lexy::read_file_result<Encoding, MemoryResource> {
248+
_read_file_user_data<Encoding, Endian, MemoryResource> user_data(encoding, resource);
249+
auto error = lexy::_detail::read_file(path, user_data.callback(), &user_data);
250+
return lexy::read_file_result(error, LEXY_MOV(user_data.buffer));
251+
}
252+
253+
/// Reads stdin into a buffer.
254+
template<typename Encoding = lexy::default_encoding,
255+
lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
256+
typename MemoryResource = void>
257+
auto read_stdin(
258+
detail::Encoding encoding,
259+
MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
260+
-> lexy::read_file_result<Encoding, MemoryResource> {
261+
_read_file_user_data<Encoding, Endian, MemoryResource> user_data(encoding, resource);
262+
auto error = lexy::_detail::read_stdin(user_data.callback(), &user_data);
263+
return lexy::read_file_result(error, LEXY_MOV(user_data.buffer));
264+
}
265+
}

src/openvic-dataloader/detail/Detect.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "detail/Detect.hpp"
22

3+
#include <optional>
4+
35
using namespace ovdl;
46
using namespace ovdl::encoding_detect;
57

@@ -23,6 +25,15 @@ std::optional<int64_t> AsciiCandidate::read(const std::span<const cbyte>& buffer
2325
return std::nullopt;
2426
}
2527

28+
std::optional<int64_t> GbkCandidate::read(const std::span<const cbyte>& buffer) {
29+
auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size());
30+
if (is_gbk(lexy_buffer)) {
31+
return 2;
32+
}
33+
34+
return std::nullopt;
35+
}
36+
2637
std::optional<int64_t> NonLatinCasedCandidate::read(const std::span<const cbyte>& buffer) {
2738
static constexpr cbyte LATIN_LETTER = 1;
2839
static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20;
@@ -351,3 +362,5 @@ std::optional<int64_t> LatinCandidate::read(const std::span<const cbyte>& buffer
351362

352363
template struct ovdl::encoding_detect::DetectUtf8<true>;
353364
template struct ovdl::encoding_detect::DetectUtf8<false>;
365+
template struct ovdl::encoding_detect::DetectGbk<true>;
366+
template struct ovdl::encoding_detect::DetectGbk<false>;

0 commit comments

Comments
 (0)