diff --git a/include/openvic-dataloader/detail/Encoding.hpp b/include/openvic-dataloader/detail/Encoding.hpp index b1a9aed..21e9b26 100644 --- a/include/openvic-dataloader/detail/Encoding.hpp +++ b/include/openvic-dataloader/detail/Encoding.hpp @@ -8,6 +8,7 @@ namespace ovdl::detail { Ascii, Utf8, Windows1251, - Windows1252 + Windows1252, + Gbk, }; } \ No newline at end of file diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp index 5caa57d..ef8e1ce 100644 --- a/src/openvic-dataloader/csv/Parser.cpp +++ b/src/openvic-dataloader/csv/Parser.cpp @@ -43,6 +43,7 @@ struct Parser::ParseHandler final : detail::BasicFileParseHandler case Utf8: case Windows1251: case Windows1252: + case Gbk: return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); OVDL_DEFAULT_CASE_UNREACHABLE(Unknown); } diff --git a/src/openvic-dataloader/detail/ConvertGbk.hpp b/src/openvic-dataloader/detail/ConvertGbk.hpp new file mode 100644 index 0000000..4020792 --- /dev/null +++ b/src/openvic-dataloader/detail/ConvertGbk.hpp @@ -0,0 +1,272 @@ +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#undef WIN32_LEAN_AND_MEAN +#elif defined(__unix__) || defined(__APPLE__) || __has_include() +#include +#endif + +namespace ovdl::convert::gbk { + template + struct _make_buffer { + static constexpr size_t small_buffer_size = size_t(4) * 1024; + + template + auto operator()(detail::Encoding encoding, const void* _memory, std::size_t size, + MemoryResource* resource = lexy::_detail::get_memory_resource()) const { + constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big; + + using char_type = typename Encoding::char_type; + LEXY_PRECONDITION(size % sizeof(char_type) == 0); + auto memory = static_cast(_memory); + + if constexpr (sizeof(char_type) == 1 || Endian == native_endianness) { + switch (encoding) { + using enum detail::Encoding; + case Ascii: + case Utf8: + return lexy::make_buffer_from_raw(_memory, size, resource); + default: break; + } + +#if defined(__unix__) || defined(__APPLE__) || __has_include() + iconv_t cd = ::iconv_open("UTF-8", "WINDOWS-936"); + if (cd == (iconv_t)-1) { + return lexy::buffer { resource }; + } +#endif + + size_t in_size = size; + // While technically illegal, it seems the contract for iconv is wrong, it doesn't modify the content of inbuff + // It only ever does such for convenience + char* in_buffer = const_cast(static_cast(_memory)); + + if (in_buffer == nullptr) { + return lexy::buffer { resource }; + } + + typename lexy::buffer::builder out_builder(size * 3); + char* out_buffer = out_builder.data(); + size_t out_size = out_builder.size(); + + auto iconv_err_handler = [&]() { + if (errno == EILSEQ && in_buffer && in_size >= 1) { + auto full_width_exclaim = [&] { + // Insert UTF-8 ! (full width exclaimation mark) + *out_buffer++ = '\xEF'; + *out_buffer++ = '\xBC'; + *out_buffer++ = '\x81'; + out_size -= 3; + in_buffer += sizeof(char_type); + --in_size; + }; + switch (*in_buffer) { + // Expect non-standard § from Windows-1252, required for color behavior + case '\xA7': + // Insert UTF-8 § + *out_buffer++ = '\xC2'; + *out_buffer++ = '\xA7'; + out_size -= 2; + in_buffer += sizeof(char_type); + --in_size; + return true; + // Expect non-standard ! (full width exclaimation mark), found in some localizations + case '\xA1': + full_width_exclaim(); + return true; + // Expect nothing then non-standard ! (full width exclaimation mark), found in some localizations + case '\xAD': + if (in_size >= 2 && in_buffer + 1 && in_buffer[1] == '\xA1') { + --out_size; + in_buffer += sizeof(char_type); + --in_size; + full_width_exclaim(); + } + return true; + // Unexpected error + default: break; + } + } + return false; + }; +#if defined(_WIN32) + auto iconv_mimic = [&]() -> int64_t { + static constexpr size_t CP_GBK = 936; + static constexpr size_t MB_CHAR_MAX = 16; + + static auto mblen = [](const char* buf, int bufsize) { + int len = 0; + + unsigned char c = *buf; + if (c < 0x80) { + len = 1; + } else if ((c & 0xE0) == 0xC0) { + len = 2; + } else if ((c & 0xF0) == 0xE0) { + len = 3; + } else if ((c & 0xF8) == 0xF0) { + len = 4; + } else if ((c & 0xFC) == 0xF8) { + len = 5; + } else if ((c & 0xFE) == 0xFC) { + len = 6; + } + + if (len == 0) { + errno = EILSEQ; + return -1; + } else if (bufsize < len) { + errno = EINVAL; + return -1; + } + return len; + }; + + while (in_size != 0) { + unsigned short wbuf[MB_CHAR_MAX]; /* enough room for one character */ + size_t wsize = MB_CHAR_MAX; + + int insize = IsDBCSLeadByteEx(CP_GBK, *in_buffer) ? 2 : 1; + if (insize == 2 && in_buffer && in_size >= 2) { + // iconv errors on user-defined double byte characters + // MultiByteToWideChar/WideCharToMultiByte does not + unsigned char byte1 = static_cast(*in_buffer); + unsigned char byte2 = static_cast(in_buffer[1]); + if (byte1 >= 0xAA && byte1 <= 0xAF && byte2 >= 0xA1 && byte2 <= 0xFE) { + errno = EILSEQ; + return -1; + } + if (byte1 >= 0xF8 && byte1 <= 0xFE && byte2 >= 0xA1 && byte2 <= 0xFE) { + errno = EILSEQ; + return -1; + } + if (byte1 >= 0xA1 && byte1 <= 0xA7 && byte2 >= 0x40 && byte2 <= 0xA0 && byte2 != 0x7F) { + errno = EILSEQ; + return -1; + } + } + wsize = MultiByteToWideChar(CP_GBK, MB_ERR_INVALID_CHARS, in_buffer, insize, (wchar_t*)wbuf, wsize); + if (wsize == 0) { + in_buffer += insize; + in_size -= insize; + continue; + } + + if (out_size == 0) { + errno = E2BIG; + return -1; + } + + int outsize = WideCharToMultiByte(CP_UTF8, 0, (const wchar_t*)wbuf, wsize, out_buffer, out_size, NULL, NULL); + if (outsize == 0) { + switch (GetLastError()) { + case ERROR_INVALID_FLAGS: + case ERROR_INVALID_PARAMETER: + case ERROR_INSUFFICIENT_BUFFER: + errno = E2BIG; + return -1; + default: break; + } + errno = EILSEQ; + return -1; + } else if (mblen(out_buffer, outsize) != outsize) { + /* validate result */ + errno = EILSEQ; + return -1; + } + + in_buffer += insize; + out_buffer += outsize; + in_size -= insize; + out_size -= outsize; + } + + return 0; + }; + + const auto end = in_buffer + size; + while (in_size > 0 && out_size > 0 && in_buffer != end) { + if (iconv_mimic() == -1) { + if (!iconv_err_handler()) { + break; + } + } + } +#elif defined(__unix__) || defined(__APPLE__) || __has_include() + const auto end = in_buffer + size; + while (in_size > 0 && out_size > 0 && in_buffer != end) { + if (::iconv(cd, &in_buffer, &in_size, &out_buffer, &out_size) == -1) { + if (!iconv_err_handler()) { + break; + } + } + } + ::iconv_close(cd); +#else +#error "GBK conversion not supported on this platform" +#endif + return lexy::buffer { out_builder.data(), static_cast(out_buffer - out_builder.data()), resource }; + } else { + return lexy::make_buffer_from_raw(_memory, size, resource); + } + } + }; + + template + constexpr auto make_buffer_from_raw = _make_buffer {}; + + template + struct _read_file_user_data : lexy::_read_file_user_data { + using base_type = lexy::_read_file_user_data; + + detail::Encoding encoding; + + _read_file_user_data(detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {} + static auto callback() { + return [](void* _user_data, const char* memory, std::size_t size) { + auto user_data = static_cast<_read_file_user_data*>(_user_data); + + user_data->buffer = make_buffer_from_raw(user_data->encoding, memory, size, user_data->resource); + }; + } + }; + + template + auto read_file( + const char* path, + detail::Encoding encoding, + MemoryResource* resource = lexy::_detail::get_memory_resource()) + -> lexy::read_file_result { + _read_file_user_data user_data(encoding, resource); + auto error = lexy::_detail::read_file(path, user_data.callback(), &user_data); + return lexy::read_file_result(error, LEXY_MOV(user_data.buffer)); + } + + /// Reads stdin into a buffer. + template + auto read_stdin( + detail::Encoding encoding, + MemoryResource* resource = lexy::_detail::get_memory_resource()) + -> lexy::read_file_result { + _read_file_user_data user_data(encoding, resource); + auto error = lexy::_detail::read_stdin(user_data.callback(), &user_data); + return lexy::read_file_result(error, LEXY_MOV(user_data.buffer)); + } +} \ No newline at end of file diff --git a/src/openvic-dataloader/detail/Detect.cpp b/src/openvic-dataloader/detail/Detect.cpp index a12a6b2..12b3163 100644 --- a/src/openvic-dataloader/detail/Detect.cpp +++ b/src/openvic-dataloader/detail/Detect.cpp @@ -1,5 +1,7 @@ #include "detail/Detect.hpp" +#include + using namespace ovdl; using namespace ovdl::encoding_detect; @@ -23,6 +25,15 @@ std::optional AsciiCandidate::read(const std::span& buffer return std::nullopt; } +std::optional GbkCandidate::read(const std::span& buffer) { + auto lexy_buffer = lexy::make_buffer_from_raw(buffer.data(), buffer.size()); + if (is_gbk(lexy_buffer)) { + return 2; + } + + return std::nullopt; +} + std::optional NonLatinCasedCandidate::read(const std::span& buffer) { static constexpr cbyte LATIN_LETTER = 1; static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20; @@ -351,3 +362,5 @@ std::optional LatinCandidate::read(const std::span& buffer template struct ovdl::encoding_detect::DetectUtf8; template struct ovdl::encoding_detect::DetectUtf8; +template struct ovdl::encoding_detect::DetectGbk; +template struct ovdl::encoding_detect::DetectGbk; diff --git a/src/openvic-dataloader/detail/Detect.hpp b/src/openvic-dataloader/detail/Detect.hpp index 7a9ea64..dee4016 100644 --- a/src/openvic-dataloader/detail/Detect.hpp +++ b/src/openvic-dataloader/detail/Detect.hpp @@ -17,7 +17,9 @@ #include #include #include +#include #include +#include #include #include @@ -87,6 +89,50 @@ namespace ovdl::encoding_detect { return lexy::match>(input); } + template + struct DetectGbk { + struct not_gbk { + static constexpr auto name = "not gbk"; + }; + + static constexpr auto rule = [] { + constexpr auto is_not_ascii_flag = lexy::dsl::context_flag; + + constexpr auto ascii_values = lexy::dsl::ascii::character; + auto euro_value = lexy::dsl::lit_c<'\x80'>; + constexpr auto gbk1 = lexy::dsl::token(dsl::lit_b_range<0xA1, 0xA9> >> dsl::lit_b_range<0xA1, 0xFE>); + constexpr auto gbk2 = lexy::dsl::token(dsl::lit_b_range<0xB0, 0xF7> >> dsl::lit_b_range<0xA1, 0xFE>); + constexpr auto gbk3 = dsl::lit_b_range<0x81, 0xA0> >> (lexy::dsl::must(lexy::dsl::lit_b<0x7F>).template error | dsl::lit_b_range<0x40, 0xFE>); + constexpr auto gbk4 = dsl::lit_b_range<0xAA, 0xFE> >> (lexy::dsl::must(lexy::dsl::lit_b<0x7F>).template error | dsl::lit_b_range<0x40, 0xA0>); + constexpr auto gbk5 = dsl::lit_b_range<0xA8, 0xA9> >> (lexy::dsl::must(lexy::dsl::lit_b<0x7F>).template error | dsl::lit_b_range<0x40, 0xA0>); + constexpr auto udef1 = lexy::dsl::token(dsl::lit_b_range<0xAA, 0xAF> >> dsl::lit_b_range<0xA1, 0xFE>); + constexpr auto udef2 = lexy::dsl::token(dsl::lit_b_range<0xF8, 0xFE> >> dsl::lit_b_range<0xA1, 0xFE>); + constexpr auto udef3 = dsl::lit_b_range<0xA1, 0xA7> >> (lexy::dsl::must(lexy::dsl::lit_b<0x7F>).template error | dsl::lit_b_range<0x40, 0xA0>); + + auto gbk_check = (euro_value | gbk1 | gbk2 | udef1 | udef2 | gbk3 | gbk4 | gbk5 | udef3) >> + is_not_ascii_flag.set(); + + return is_not_ascii_flag.template create() + + lexy::dsl::while_(gbk_check | ascii_values) + + lexy::dsl::must(is_not_ascii_flag.is_set()).template error + lexy::dsl::eof; + }(); + + static constexpr auto value = lexy::constant(true); + }; + + extern template struct DetectGbk; + extern template struct DetectGbk; + + template + constexpr bool is_gbk_no_ascii(const Input& input) { + return lexy::match>(input); + } + + template + constexpr bool is_gbk(const Input& input) { + return lexy::match>(input); + } + struct DetectorData { static constexpr std::array latin_ascii = std::to_array({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @@ -424,6 +470,10 @@ namespace ovdl::encoding_detect { std::optional read(const std::span& buffer); }; + struct GbkCandidate { + std::optional read(const std::span& buffer); + }; + struct NonLatinCasedCandidate { enum class CaseState { Space, @@ -484,7 +534,7 @@ namespace ovdl::encoding_detect { std::optional read(const std::span& buffer); }; - using InnerCandidate = std::variant; + using InnerCandidate = std::variant; template struct overloaded : Ts... { @@ -530,6 +580,10 @@ namespace ovdl::encoding_detect { return create_candidate(get_byte_score(index)); } + static constexpr Candidate new_gbk() { + return create_candidate(); + } + constexpr std::optional score(const std::span& buffer, std::size_t encoding, bool expectation_is_valid) { if (auto old_score = score_value) { auto new_score = std::visit([&](auto& inner) { @@ -565,6 +619,9 @@ namespace ovdl::encoding_detect { }, [](const NonLatinCasedCandidate& candidate) { return candidate.score_data.encoding; + }, + [](const GbkCandidate& candidate) { + return Encoding::Gbk; } }, inner); } @@ -576,6 +633,7 @@ namespace ovdl::encoding_detect { Candidate::new_utf8(), Candidate::new_latin(ScoreIndex::Windows1252), Candidate::new_non_latin_cased(ScoreIndex::Windows1251), + Candidate::new_gbk(), }; Encoding default_fallback = Encoding::Unknown; diff --git a/src/openvic-dataloader/detail/ParseHandler.hpp b/src/openvic-dataloader/detail/ParseHandler.hpp index e30a452..c8fa412 100644 --- a/src/openvic-dataloader/detail/ParseHandler.hpp +++ b/src/openvic-dataloader/detail/ParseHandler.hpp @@ -16,6 +16,7 @@ #include "detail/BufferError.hpp" #include "detail/Convert.hpp" +#include "detail/ConvertGbk.hpp" #include "detail/Detect.hpp" #include "detail/InternalConcepts.hpp" @@ -109,6 +110,20 @@ namespace ovdl::detail { *state = { convert::make_buffer_from_raw(encoding, std::move(buffer).release(), size), encoding }; }; + template + static constexpr auto generate_gbk_state(State* state, const char* path, auto&& buffer, Encoding encoding) { + size_t size = buffer.size(); + if (path[0] != '\0') { + *state = { + path, + convert::gbk::make_buffer_from_raw(encoding, std::move(buffer).release(), size), + encoding + }; + return; + } + *state = { convert::gbk::make_buffer_from_raw(encoding, std::move(buffer).release(), size), encoding }; + }; + template static void create_state(State* state, const char* path, lexy::buffer&& buffer, std::optional fallback) { if (!_system_fallback_encoding.has_value()) { @@ -142,6 +157,10 @@ namespace ovdl::detail { generate_conversion_state(state, path, std::move(buffer), encoding); break; } + case Gbk: { + generate_gbk_state(state, path, std::move(buffer), encoding); + break; + } OVDL_DEFAULT_CASE_UNREACHABLE(); } diff --git a/src/openvic-dataloader/v2script/Parser.cpp b/src/openvic-dataloader/v2script/Parser.cpp index d647ec8..6b569d3 100644 --- a/src/openvic-dataloader/v2script/Parser.cpp +++ b/src/openvic-dataloader/v2script/Parser.cpp @@ -66,6 +66,7 @@ struct Parser::ParseHandler final : detail::BasicStateParseHandler(buffer(), parse_state(), parse_state().logger().error_callback()); OVDL_DEFAULT_CASE_UNREACHABLE(Unknown); }