1+ #pragma once
2+
3+ #include < cerrno>
4+ #include < cstddef>
5+ #include < cstring>
6+
7+ #include < openvic-dataloader/detail/Encoding.hpp>
8+
9+ #include < lexy/_detail/memory_resource.hpp>
10+ #include < lexy/encoding.hpp>
11+ #include < lexy/input/buffer.hpp>
12+ #include < lexy/input/file.hpp>
13+
14+ #ifdef _WIN32
15+ #define WIN32_LEAN_AND_MEAN
16+ #include < windows.h>
17+ #undef WIN32_LEAN_AND_MEAN
18+ #elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
19+ #include < iconv.h>
20+ #endif
21+
22+ namespace ovdl ::convert::gbk {
23+ template <typename Encoding, lexy::encoding_endianness Endian>
24+ struct _make_buffer {
25+ static constexpr size_t small_buffer_size = size_t (4 ) * 1024 ;
26+
27+ template <typename MemoryResource = void >
28+ auto operator ()(detail::Encoding encoding, const void * _memory, std::size_t size,
29+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>()) const {
30+ constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big;
31+
32+ using char_type = typename Encoding::char_type;
33+ LEXY_PRECONDITION (size % sizeof (char_type) == 0 );
34+ auto memory = static_cast <const unsigned char *>(_memory);
35+
36+ if constexpr (sizeof (char_type) == 1 || Endian == native_endianness) {
37+ switch (encoding) {
38+ using enum detail::Encoding;
39+ case Ascii:
40+ case Utf8:
41+ return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
42+ default : break ;
43+ }
44+
45+ #if defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
46+ iconv_t cd = ::iconv_open (" UTF-8" , " WINDOWS-936" );
47+ if (cd == (iconv_t )-1 ) {
48+ return lexy::buffer<Encoding, MemoryResource> { resource };
49+ }
50+ #endif
51+
52+ size_t in_size = size;
53+ // While technically illegal, it seems the contract for iconv is wrong, it doesn't modify the content of inbuff
54+ // It only ever does such for convenience
55+ char * in_buffer = const_cast <char *>(static_cast <const char *>(_memory));
56+
57+ if (in_buffer == nullptr ) {
58+ return lexy::buffer<Encoding, MemoryResource> { resource };
59+ }
60+
61+ typename lexy::buffer<Encoding, MemoryResource>::builder out_builder (size * 3 );
62+ char * out_buffer = out_builder.data ();
63+ size_t out_size = out_builder.size ();
64+
65+ auto iconv_err_handler = [&]() {
66+ if (errno == EILSEQ && in_buffer && in_size >= 1 ) {
67+ auto full_width_exclaim = [&] {
68+ // Insert UTF-8 ! (full width exclaimation mark)
69+ *out_buffer++ = ' \xEF ' ;
70+ *out_buffer++ = ' \xBC ' ;
71+ *out_buffer++ = ' \x81 ' ;
72+ out_size -= 3 ;
73+ in_buffer += sizeof (char_type);
74+ --in_size;
75+ };
76+ switch (*in_buffer) {
77+ // Expect non-standard § from Windows-1252, required for color behavior
78+ case ' \xA7 ' :
79+ // Insert UTF-8 §
80+ *out_buffer++ = ' \xC2 ' ;
81+ *out_buffer++ = ' \xA7 ' ;
82+ out_size -= 2 ;
83+ in_buffer += sizeof (char_type);
84+ --in_size;
85+ return true ;
86+ // Expect non-standard ! (full width exclaimation mark), found in some localizations
87+ case ' \xA1 ' :
88+ full_width_exclaim ();
89+ return true ;
90+ // Expect nothing then non-standard ! (full width exclaimation mark), found in some localizations
91+ case ' \xAD ' :
92+ if (in_size >= 2 && in_buffer + 1 && in_buffer[1 ] == ' \xA1 ' ) {
93+ --out_size;
94+ in_buffer += sizeof (char_type);
95+ --in_size;
96+ full_width_exclaim ();
97+ }
98+ return true ;
99+ // Unexpected error
100+ default : break ;
101+ }
102+ }
103+ return false ;
104+ };
105+ #if defined(_WIN32)
106+ auto iconv_mimic = [&]() -> int64_t {
107+ static constexpr size_t CP_GBK = 936 ;
108+ static constexpr size_t MB_CHAR_MAX = 16 ;
109+
110+ static auto mblen = [](const char * buf, int bufsize) {
111+ int len = 0 ;
112+
113+ unsigned char c = *buf;
114+ if (c < 0x80 ) {
115+ len = 1 ;
116+ } else if ((c & 0xE0 ) == 0xC0 ) {
117+ len = 2 ;
118+ } else if ((c & 0xF0 ) == 0xE0 ) {
119+ len = 3 ;
120+ } else if ((c & 0xF8 ) == 0xF0 ) {
121+ len = 4 ;
122+ } else if ((c & 0xFC ) == 0xF8 ) {
123+ len = 5 ;
124+ } else if ((c & 0xFE ) == 0xFC ) {
125+ len = 6 ;
126+ }
127+
128+ if (len == 0 ) {
129+ errno = EILSEQ;
130+ return -1 ;
131+ } else if (bufsize < len) {
132+ errno = EINVAL;
133+ return -1 ;
134+ }
135+ return len;
136+ };
137+
138+ while (in_size != 0 ) {
139+ unsigned short wbuf[MB_CHAR_MAX]; /* enough room for one character */
140+ size_t wsize = MB_CHAR_MAX;
141+
142+ int insize = IsDBCSLeadByteEx (CP_GBK, *in_buffer) ? 2 : 1 ;
143+ if (insize == 2 && in_buffer && in_size >= 2 ) {
144+ // iconv errors on user-defined double byte characters
145+ // MultiByteToWideChar/WideCharToMultiByte does not
146+ unsigned char byte1 = static_cast <unsigned char >(*in_buffer);
147+ unsigned char byte2 = static_cast <unsigned char >(in_buffer[1 ]);
148+ if (byte1 >= 0xAA && byte1 <= 0xAF && byte2 >= 0xA1 && byte2 <= 0xFE ) {
149+ errno = EILSEQ;
150+ return -1 ;
151+ }
152+ if (byte1 >= 0xF8 && byte1 <= 0xFE && byte2 >= 0xA1 && byte2 <= 0xFE ) {
153+ errno = EILSEQ;
154+ return -1 ;
155+ }
156+ if (byte1 >= 0xA1 && byte1 <= 0xA7 && byte2 >= 0x40 && byte2 <= 0xA0 && byte2 != 0x7F ) {
157+ errno = EILSEQ;
158+ return -1 ;
159+ }
160+ }
161+ wsize = MultiByteToWideChar (CP_GBK, MB_ERR_INVALID_CHARS, in_buffer, insize, (wchar_t *)wbuf, wsize);
162+ if (wsize == 0 ) {
163+ in_buffer += insize;
164+ in_size -= insize;
165+ continue ;
166+ }
167+
168+ if (out_size == 0 ) {
169+ errno = E2BIG;
170+ return -1 ;
171+ }
172+
173+ int outsize = WideCharToMultiByte (CP_UTF8, 0 , (const wchar_t *)wbuf, wsize, out_buffer, out_size, NULL , NULL );
174+ if (outsize == 0 ) {
175+ switch (GetLastError ()) {
176+ case ERROR_INVALID_FLAGS:
177+ case ERROR_INVALID_PARAMETER:
178+ case ERROR_INSUFFICIENT_BUFFER:
179+ errno = E2BIG;
180+ return -1 ;
181+ default : break ;
182+ }
183+ errno = EILSEQ;
184+ return -1 ;
185+ } else if (mblen (out_buffer, outsize) != outsize) {
186+ /* validate result */
187+ errno = EILSEQ;
188+ return -1 ;
189+ }
190+
191+ in_buffer += insize;
192+ out_buffer += outsize;
193+ in_size -= insize;
194+ out_size -= outsize;
195+ }
196+
197+ return 0 ;
198+ };
199+
200+ const auto end = in_buffer + size;
201+ while (in_size > 0 && out_size > 0 && in_buffer != end) {
202+ if (iconv_mimic () == -1 ) {
203+ if (!iconv_err_handler ()) {
204+ break ;
205+ }
206+ }
207+ }
208+ #elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
209+ const auto end = in_buffer + size;
210+ while (in_size > 0 && out_size > 0 && in_buffer != end) {
211+ if (::iconv (cd, &in_buffer, &in_size, &out_buffer, &out_size) == -1 ) {
212+ if (!iconv_err_handler ()) {
213+ break ;
214+ }
215+ }
216+ }
217+ ::iconv_close (cd);
218+ #else
219+ #error "GBK conversion not supported on this platform"
220+ #endif
221+ return lexy::buffer<Encoding, MemoryResource> { out_builder.data (), static_cast <size_t >(out_buffer - out_builder.data ()), resource };
222+ } else {
223+ return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
224+ }
225+ }
226+ };
227+
228+ template <typename Encoding, lexy::encoding_endianness Endianness = lexy::encoding_endianness::bom>
229+ constexpr auto make_buffer_from_raw = _make_buffer<Encoding, Endianness> {};
230+
231+ template <typename Encoding, lexy::encoding_endianness Endian, typename MemoryResource>
232+ struct _read_file_user_data : lexy::_read_file_user_data<Encoding, Endian, MemoryResource> {
233+ using base_type = lexy::_read_file_user_data<Encoding, Endian, MemoryResource>;
234+
235+ detail::Encoding encoding;
236+
237+ _read_file_user_data (detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {}
238+ static auto callback () {
239+ return [](void * _user_data, const char * memory, std::size_t size) {
240+ auto user_data = static_cast <_read_file_user_data*>(_user_data);
241+
242+ user_data->buffer = make_buffer_from_raw<Encoding, Endian>(user_data->encoding , memory, size, user_data->resource );
243+ };
244+ }
245+ };
246+
247+ template <typename Encoding = lexy::default_encoding,
248+ lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
249+ typename MemoryResource = void >
250+ auto read_file (
251+ const char * path,
252+ detail::Encoding encoding,
253+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
254+ -> lexy::read_file_result<Encoding, MemoryResource> {
255+ _read_file_user_data<Encoding, Endian, MemoryResource> user_data (encoding, resource);
256+ auto error = lexy::_detail::read_file (path, user_data.callback (), &user_data);
257+ return lexy::read_file_result (error, LEXY_MOV (user_data.buffer ));
258+ }
259+
260+ // / Reads stdin into a buffer.
261+ template <typename Encoding = lexy::default_encoding,
262+ lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
263+ typename MemoryResource = void >
264+ auto read_stdin (
265+ detail::Encoding encoding,
266+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
267+ -> lexy::read_file_result<Encoding, MemoryResource> {
268+ _read_file_user_data<Encoding, Endian, MemoryResource> user_data (encoding, resource);
269+ auto error = lexy::_detail::read_stdin (user_data.callback (), &user_data);
270+ return lexy::read_file_result (error, LEXY_MOV (user_data.buffer ));
271+ }
272+ }
0 commit comments