1+ #pragma once
2+
3+ #include < cerrno>
4+ #include < cstddef>
5+ #include < cstring>
6+
7+ #include < openvic-dataloader/detail/Encoding.hpp>
8+
9+ #include < lexy/_detail/memory_resource.hpp>
10+ #include < lexy/encoding.hpp>
11+ #include < lexy/input/buffer.hpp>
12+ #include < lexy/input/file.hpp>
13+
14+ #ifdef _WIN32
15+ #define WIN32_LEAN_AND_MEAN
16+ #include < windows.h>
17+ #undef WIN32_LEAN_AND_MEAN
18+ #elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
19+ #include < iconv.h>
20+ #endif
21+
22+ namespace ovdl ::convert::gbk {
23+ template <typename Encoding, lexy::encoding_endianness Endian>
24+ struct _make_buffer {
25+ static constexpr size_t small_buffer_size = size_t (4 ) * 1024 ;
26+
27+ template <typename MemoryResource = void >
28+ auto operator ()(detail::Encoding encoding, const void * _memory, std::size_t size,
29+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>()) const {
30+ constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big;
31+
32+ using char_type = typename Encoding::char_type;
33+ LEXY_PRECONDITION (size % sizeof (char_type) == 0 );
34+ auto memory = static_cast <const unsigned char *>(_memory);
35+
36+ if constexpr (sizeof (char_type) == 1 || Endian == native_endianness) {
37+ switch (encoding) {
38+ using enum detail::Encoding;
39+ case Ascii:
40+ case Utf8:
41+ return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
42+ default : break ;
43+ }
44+
45+ #if defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
46+ iconv_t cd = ::iconv_open (" UTF-8" , " WINDOWS-936" );
47+ if (cd == (iconv_t )-1 ) {
48+ return lexy::buffer<Encoding, MemoryResource> { resource };
49+ }
50+ #endif
51+
52+ size_t in_size = size;
53+ // While technically illegal, it seems the contract for iconv is wrong, it doesn't modify the content of inbuff
54+ // It only ever does such for convenience
55+ char * in_buffer = const_cast <char *>(static_cast <const char *>(_memory));
56+
57+ if (in_buffer == nullptr ) {
58+ return lexy::buffer<Encoding, MemoryResource> { resource };
59+ }
60+
61+ typename lexy::buffer<Encoding, MemoryResource>::builder out_builder (size * 3 );
62+ char * out_buffer = out_builder.data ();
63+ size_t out_size = out_builder.size ();
64+
65+ auto iconv_err_handler = [&]() {
66+ if (errno == EILSEQ && in_buffer && in_size >= 1 ) {
67+ auto full_width_exclaim = [&] {
68+ // Insert UTF-8 ! (full width exclaimation mark)
69+ *out_buffer++ = ' \xEF ' ;
70+ *out_buffer++ = ' \xBC ' ;
71+ *out_buffer++ = ' \x81 ' ;
72+ out_size -= 3 ;
73+ in_buffer += sizeof (char_type);
74+ --in_size;
75+ };
76+ switch (*in_buffer) {
77+ // Expect non-standard § from Windows-1252, required for color behavior
78+ case ' \xA7 ' :
79+ // Insert UTF-8 §
80+ *out_buffer++ = ' \xC2 ' ;
81+ *out_buffer++ = ' \xA7 ' ;
82+ out_size -= 2 ;
83+ in_buffer += sizeof (char_type);
84+ --in_size;
85+ return true ;
86+ // Expect non-standard ! (full width exclaimation mark), found in some localizations
87+ case ' \xA1 ' :
88+ full_width_exclaim ();
89+ return true ;
90+ // Expect nothing then non-standard ! (full width exclaimation mark), found in some localizations
91+ case ' \xAD ' :
92+ if (in_size >= 2 && in_buffer + 1 && in_buffer[1 ] == ' \xA1 ' ) {
93+ --out_size;
94+ in_buffer += sizeof (char_type);
95+ --in_size;
96+ full_width_exclaim ();
97+ }
98+ return true ;
99+ // Unexpected error
100+ default : break ;
101+ }
102+ }
103+ return false ;
104+ };
105+ #if defined(_WIN32)
106+ auto iconv_mimic = [&]() -> int64_t {
107+ static constexpr size_t CP_GBK = 936 ;
108+ static constexpr size_t MB_CHAR_MAX = 16 ;
109+
110+ static auto mblen = [](const char * buf, int bufsize) {
111+ int len = 0 ;
112+
113+ unsigned char c = *buf;
114+ if (c < 0x80 ) len = 1 ;
115+ else if ((c & 0xE0 ) == 0xC0 ) len = 2 ;
116+ else if ((c & 0xF0 ) == 0xE0 ) len = 3 ;
117+ else if ((c & 0xF8 ) == 0xF0 ) len = 4 ;
118+ else if ((c & 0xFC ) == 0xF8 ) len = 5 ;
119+ else if ((c & 0xFE ) == 0xFC ) len = 6 ;
120+
121+ if (len == 0 ) {
122+ errno = EILSEQ;
123+ return -1 ;
124+ } else if (bufsize < len) {
125+ errno = EINVAL;
126+ return -1 ;
127+ }
128+ return len;
129+ };
130+
131+ while (in_size != 0 ) {
132+ unsigned short wbuf[MB_CHAR_MAX]; /* enough room for one character */
133+ size_t wsize = MB_CHAR_MAX;
134+
135+ int insize = IsDBCSLeadByteEx (CP_GBK, *in_buffer) ? 2 : 1 ;
136+ if (insize == 2 && in_buffer && in_size >= 2 ) {
137+ // iconv errors on user-defined double byte characters
138+ // MultiByteToWideChar/WideCharToMultiByte does not
139+ unsigned char byte1 = static_cast <unsigned char >(*in_buffer);
140+ unsigned char byte2 = static_cast <unsigned char >(in_buffer[1 ]);
141+ if (byte1 >= 0xAA && byte1 <= 0xAF && byte2 >= 0xA1 && byte2 <= 0xFE ) {
142+ errno = EILSEQ;
143+ return -1 ;
144+ }
145+ if (byte1 >= 0xF8 && byte1 <= 0xFE && byte2 >= 0xA1 && byte2 <= 0xFE ) {
146+ errno = EILSEQ;
147+ return -1 ;
148+ }
149+ if (byte1 >= 0xA1 && byte1 <= 0xA7 && byte2 >= 0x40 && byte2 <= 0xA0 && byte2 != 0x7F ) {
150+ errno = EILSEQ;
151+ return -1 ;
152+ }
153+ }
154+ wsize = MultiByteToWideChar (CP_GBK, MB_ERR_INVALID_CHARS, in_buffer, insize, (wchar_t *)wbuf, wsize);
155+ if (wsize == 0 ) {
156+ in_buffer += insize;
157+ in_size -= insize;
158+ continue ;
159+ }
160+
161+ if (out_size == 0 ) {
162+ errno = E2BIG;
163+ return -1 ;
164+ }
165+
166+ int outsize = WideCharToMultiByte (CP_UTF8, 0 , (const wchar_t *)wbuf, wsize, out_buffer, out_size, NULL , NULL );
167+ if (outsize == 0 ) {
168+ switch (GetLastError ()) {
169+ case ERROR_INVALID_FLAGS:
170+ case ERROR_INVALID_PARAMETER:
171+ case ERROR_INSUFFICIENT_BUFFER:
172+ errno = E2BIG;
173+ return -1 ;
174+ default : break ;
175+ }
176+ errno = EILSEQ;
177+ return -1 ;
178+ } else if (mblen (out_buffer, outsize) != outsize) {
179+ /* validate result */
180+ errno = EILSEQ;
181+ return -1 ;
182+ }
183+
184+ in_buffer += insize;
185+ out_buffer += outsize;
186+ in_size -= insize;
187+ out_size -= outsize;
188+ }
189+
190+ return 0 ;
191+ };
192+
193+ const auto end = in_buffer + size;
194+ while (in_size > 0 && out_size > 0 && in_buffer != end) {
195+ if (iconv_mimic () == -1 ) {
196+ if (!iconv_err_handler ()) {
197+ break ;
198+ }
199+ }
200+ }
201+ #elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
202+ const auto end = in_buffer + size;
203+ while (in_size > 0 && out_size > 0 && in_buffer != end) {
204+ if (::iconv (cd, &in_buffer, &in_size, &out_buffer, &out_size) == -1 ) {
205+ if (!iconv_err_handler ()) {
206+ break ;
207+ }
208+ }
209+ }
210+ ::iconv_close (cd);
211+ #else
212+ #error "GBK conversion not supported on this platform"
213+ #endif
214+ return lexy::buffer<Encoding, MemoryResource> { out_builder.data (), static_cast <size_t >(out_buffer - out_builder.data ()), resource };
215+ } else {
216+ return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
217+ }
218+ }
219+ };
220+
221+ template <typename Encoding, lexy::encoding_endianness Endianness = lexy::encoding_endianness::bom>
222+ constexpr auto make_buffer_from_raw = _make_buffer<Encoding, Endianness> {};
223+
224+ template <typename Encoding, lexy::encoding_endianness Endian, typename MemoryResource>
225+ struct _read_file_user_data : lexy::_read_file_user_data<Encoding, Endian, MemoryResource> {
226+ using base_type = lexy::_read_file_user_data<Encoding, Endian, MemoryResource>;
227+
228+ detail::Encoding encoding;
229+
230+ _read_file_user_data (detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {}
231+ static auto callback () {
232+ return [](void * _user_data, const char * memory, std::size_t size) {
233+ auto user_data = static_cast <_read_file_user_data*>(_user_data);
234+
235+ user_data->buffer = make_buffer_from_raw<Encoding, Endian>(user_data->encoding , memory, size, user_data->resource );
236+ };
237+ }
238+ };
239+
240+ template <typename Encoding = lexy::default_encoding,
241+ lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
242+ typename MemoryResource = void >
243+ auto read_file (
244+ const char * path,
245+ detail::Encoding encoding,
246+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
247+ -> lexy::read_file_result<Encoding, MemoryResource> {
248+ _read_file_user_data<Encoding, Endian, MemoryResource> user_data (encoding, resource);
249+ auto error = lexy::_detail::read_file (path, user_data.callback (), &user_data);
250+ return lexy::read_file_result (error, LEXY_MOV (user_data.buffer ));
251+ }
252+
253+ // / Reads stdin into a buffer.
254+ template <typename Encoding = lexy::default_encoding,
255+ lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
256+ typename MemoryResource = void >
257+ auto read_stdin (
258+ detail::Encoding encoding,
259+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
260+ -> lexy::read_file_result<Encoding, MemoryResource> {
261+ _read_file_user_data<Encoding, Endian, MemoryResource> user_data (encoding, resource);
262+ auto error = lexy::_detail::read_stdin (user_data.callback (), &user_data);
263+ return lexy::read_file_result (error, LEXY_MOV (user_data.buffer ));
264+ }
265+ }
0 commit comments