diff --git a/.clang-format b/.clang-format index 93baaea..aa8ad51 100644 --- a/.clang-format +++ b/.clang-format @@ -46,6 +46,8 @@ AlignTrailingComments: true AlignEscapedNewlines: Left AlignAfterOpenBracket: DontAlign AccessModifierOffset: -4 +Macros: + - "OVDL_DEFAULT_CASE_UNREACHABLE(OPTION)=default: ovdl::detail::unreachable()" IncludeCategories: - Regex: <[[:alnum:]_]+> Priority: 1 diff --git a/include/openvic-dataloader/detail/Encoding.hpp b/include/openvic-dataloader/detail/Encoding.hpp index 12a0524..b1a9aed 100644 --- a/include/openvic-dataloader/detail/Encoding.hpp +++ b/include/openvic-dataloader/detail/Encoding.hpp @@ -3,7 +3,7 @@ #include namespace ovdl::detail { - enum class Encoding : std::int8_t { + enum class Encoding : std::uint8_t { Unknown, Ascii, Utf8, diff --git a/include/openvic-dataloader/detail/Utility.hpp b/include/openvic-dataloader/detail/Utility.hpp index 06c74f3..eb6dc84 100644 --- a/include/openvic-dataloader/detail/Utility.hpp +++ b/include/openvic-dataloader/detail/Utility.hpp @@ -7,6 +7,25 @@ #include +#ifdef DEBUG_ENABLED +#define OVDL_DEFAULT_CASE_UNREACHABLE(...) \ + __VA_OPT__(case __VA_ARGS__ : ovdl::detail::unreachable()) +#else +#define OVDL_DEFAULT_CASE_UNREACHABLE(...) \ + default: ovdl::detail::unreachable() +#endif + +#ifdef __GNUC__ +#define OVDL_BEGIN_IGNORE_WARNING_RETURN_TYPE \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wreturn-type\"") +#define OVDL_END_IGNORE_WARNING_RETURN_TYPE \ + _Pragma("GCC diagnostic pop") +#else +#define OVDL_BEGIN_IGNORE_WARNING_RETURN_TYPE +#define OVDL_END_IGNORE_WARNING_RETURN_TYPE +#endif + #if __has_cpp_attribute(msvc::no_unique_address) #define OVDL_NO_UNIQUE_ADDRESS \ _Pragma("warning(push)") _Pragma("warning(disable : 4848)") \ diff --git a/src/openvic-dataloader/File.hpp b/src/openvic-dataloader/File.hpp index cdab377..fa01d33 100644 --- a/src/openvic-dataloader/File.hpp +++ b/src/openvic-dataloader/File.hpp @@ -17,6 +17,7 @@ namespace ovdl { struct File { using buffer_ids = detail::TypeRegister< lexy::buffer, + lexy::buffer, lexy::buffer, lexy::buffer, lexy::buffer, diff --git a/src/openvic-dataloader/csv/CsvGrammar.hpp b/src/openvic-dataloader/csv/CsvGrammar.hpp index acbeafb..6811586 100644 --- a/src/openvic-dataloader/csv/CsvGrammar.hpp +++ b/src/openvic-dataloader/csv/CsvGrammar.hpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include @@ -14,9 +13,7 @@ #include #include -#include "detail/Convert.hpp" #include "detail/InternalConcepts.hpp" -#include "detail/dsl.hpp" // Grammar Definitions // namespace ovdl::csv::grammar { @@ -38,21 +35,6 @@ namespace ovdl::csv::grammar { } }; - constexpr bool IsUtf8(auto encoding) { - return std::same_as, lexy::utf8_char_encoding>; - } - - template - constexpr auto convert_as_string = convert::convert_as_string< - String, - ConvertErrorHandler>; - - constexpr auto ansi_character = lexy::dsl::ascii::character / dsl::lit_b_range<0x80, 0xFF>; - constexpr auto ansi_control = - lexy::dsl::ascii::control / - lexy::dsl::lit_b<0x81> / lexy::dsl::lit_b<0x8D> / lexy::dsl::lit_b<0x8F> / - lexy::dsl::lit_b<0x90> / lexy::dsl::lit_b<0x9D>; - constexpr auto utf_character = lexy::dsl::unicode::character; constexpr auto utf_control = lexy::dsl::unicode::control; @@ -75,47 +57,20 @@ namespace ovdl::csv::grammar { template struct CsvGrammar { - struct StringValue : lexy::scan_production, - lexy::token_production { - - template - static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsFileParseState auto& state) { - using encoding = typename Reader::encoding; - - constexpr auto rule = [] { - // Arbitrary code points - auto c = [] { - if constexpr (std::same_as || std::same_as) { - return ansi_character - ansi_control; - } else { - return utf_character - utf_control; - } - }(); - - auto back_escape = lexy::dsl::backslash_escape // - .symbol(); - - auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>) // - .template symbol(); - - return lexy::dsl::delimited(lexy::dsl::lit_c<'"'>, lexy::dsl::not_followed_by(lexy::dsl::lit_c<'"'>, lexy::dsl::lit_c<'"'>))(c, back_escape, quote_escape); - }(); - - lexy::scan_result str_result = scanner.template parse(rule); - if (!scanner || !str_result) { - return lexy::scan_failed; - } - return str_result.value(); - } + struct StringValue : lexy::token_production { + static constexpr auto rule = [] { + auto quote = lexy::dsl::lit_c<'"'>; + auto c = utf_character - utf_control; + auto back_escape = lexy::dsl::backslash_escape.symbol(); + auto quote_escape = lexy::dsl::escape(lexy::dsl::lit_c<'"'>).template symbol(); - static constexpr auto rule = lexy::dsl::peek(lexy::dsl::lit_c<'"'>) >> lexy::dsl::scan; + return lexy::dsl::delimited(quote, lexy::dsl::not_followed_by(quote, quote))(c, back_escape, quote_escape); + }(); - static constexpr auto value = convert_as_string >> lexy::forward; + static constexpr auto value = lexy::as_string; }; - struct PlainValue : lexy::scan_production, - lexy::token_production { - + struct PlainValue : lexy::token_production { template static constexpr auto _escape_check = character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline); @@ -124,57 +79,24 @@ namespace ovdl::csv::grammar { static constexpr auto value = lexy::constant('\n'); }; - template - static constexpr scan_result scan(lexy::rule_scanner& scanner, detail::IsFileParseState auto& state) { - using encoding = typename Reader::encoding; - - constexpr auto rule = [] { - constexpr auto character = [] { - if constexpr (std::same_as || std::same_as) { - return ansi_character; - } else { - return utf_character; - } - }(); - - if constexpr (Options.SupportStrings) { - return lexy::dsl::identifier(character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline)); - } else { - constexpr auto backslash = lexy::dsl::lit_b<'\\'>; - - constexpr auto escape_check_char = _escape_check; - constexpr auto escape_rule = lexy::dsl::p; - - return lexy::dsl::list( - lexy::dsl::identifier(escape_check_char - backslash) | - escape_rule | - lexy::dsl::capture(escape_check_char) // - ); - } - }(); - + static constexpr auto rule = [] { if constexpr (Options.SupportStrings) { - auto lexeme_result = scanner.template parse>(rule); - if (!scanner || !lexeme_result) { - return lexy::scan_failed; - } - return std::string { lexeme_result.value().begin(), lexeme_result.value().end() }; + return lexy::dsl::identifier(utf_character - (lexy::dsl::lit_b / lexy::dsl::ascii::newline)); } else { - lexy::scan_result str_result = scanner.template parse(rule); - if (!scanner || !str_result) { - return lexy::scan_failed; - } - return str_result.value(); - } - } + constexpr auto backslash = lexy::dsl::lit_b<'\\'>; - static constexpr auto rule = - dsl::peek( - _escape_check, - _escape_check) >> - lexy::dsl::scan; + constexpr auto escape_check_char = _escape_check; + constexpr auto escape_rule = lexy::dsl::p; + + return lexy::dsl::list( + lexy::dsl::identifier(escape_check_char - backslash) | + escape_rule | + lexy::dsl::capture(escape_check_char) // + ); + } + }(); - static constexpr auto value = convert_as_string >> lexy::forward; + static constexpr auto value = lexy::as_string; }; struct Value { diff --git a/src/openvic-dataloader/csv/Parser.cpp b/src/openvic-dataloader/csv/Parser.cpp index 84bc06a..5caa57d 100644 --- a/src/openvic-dataloader/csv/Parser.cpp +++ b/src/openvic-dataloader/csv/Parser.cpp @@ -29,20 +29,26 @@ using namespace ovdl::csv; struct Parser::ParseHandler final : detail::BasicFileParseHandler { template std::optional parse() { + if (parse_state().encoding() == detail::Encoding::Unknown) { + parse_state().logger().error("tried to parse unknown encoding"); + return parse_state().logger().get_errors(); + } + + OVDL_BEGIN_IGNORE_WARNING_RETURN_TYPE auto result = [&] { switch (parse_state().encoding()) { using enum detail::Encoding; case Ascii: + return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); case Utf8: - return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); - case Unknown: case Windows1251: case Windows1252: - return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); - default: - ovdl::detail::unreachable(); + return lexy::parse(buffer(), parse_state(), parse_state().logger().error_callback()); + OVDL_DEFAULT_CASE_UNREACHABLE(Unknown); } }(); + OVDL_END_IGNORE_WARNING_RETURN_TYPE + if (!result) { return this->parse_state().logger().get_errors(); } diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp index e346185..09255b0 100644 --- a/src/openvic-dataloader/detail/Convert.hpp +++ b/src/openvic-dataloader/detail/Convert.hpp @@ -1,596 +1,421 @@ #pragma once +#include #include #include #include #include -#include -#include -#include #include #include #include +#include #include -#include -#include #include "openvic-dataloader/detail/Encoding.hpp" -#include "ParseState.hpp" // IWYU pragma: keep -#include "detail/InternalConcepts.hpp" -#include "detail/dsl.hpp" -#include "v2script/ParseState.hpp" - namespace ovdl::convert { - struct map_value { - std::string_view _value; - - constexpr map_value() noexcept : _value("") {} - constexpr map_value(std::nullptr_t) noexcept : _value("\0", 1) {} - constexpr explicit map_value(std::string_view val) noexcept : _value(val) {} - - static constexpr map_value invalid_value() noexcept { - return map_value(nullptr); - } - - constexpr bool is_invalid() const noexcept { - return !_value.empty() && _value[0] == '\0'; - } - - constexpr bool is_pass() const noexcept { - return _value.empty(); - } - - constexpr bool is_valid() const noexcept { - return !_value.empty() && _value[0] != '\0'; - } - - constexpr explicit operator bool() const noexcept { - return is_valid(); - } - }; - template - concept IsConverter = requires(unsigned char c, lexy::_pr>& reader) { - { T::try_parse(reader) } -> std::same_as; + concept MapperConcept = requires(char* memory, detail::Encoding encoding) { + { T::get_from(memory, encoding) } -> std::same_as; + { T::map(memory, encoding) } -> std::convertible_to; }; - struct Utf8 { - static constexpr auto map = lexy::symbol_table; - - template - static constexpr map_value try_parse(Reader& reader) { - return {}; + struct AnsiToUtf8Mapper { + static constexpr auto win1252_map = lexy::symbol_table // + .map<'\x80'>("€") + .map<'\x82'>("‚") + .map<'\x83'>("ƒ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("ˆ") + .map<'\x89'>("‰") + .map<'\x8A'>("Š") + .map<'\x8B'>("‹") + .map<'\x8C'>("Œ") + .map<'\x8E'>("Ž") + + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x98'>("˜") + .map<'\x99'>("™") + .map<'\x9A'>("š") + .map<'\x9B'>("›") + .map<'\x9C'>("œ") + .map<'\x9E'>("ž") + .map<'\x9F'>("Ÿ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("¡") + .map<'\xA2'>("¢") + .map<'\xA3'>("£") + .map<'\xA4'>("¤") + .map<'\xA5'>("¥") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("¨") + .map<'\xA9'>("©") + .map<'\xAA'>("ª") + .map<'\xAB'>("«") + .map<'\xAC'>("¬") + .map<'\xAD'>("­") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("¯") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("²") + .map<'\xB3'>("³") + .map<'\xB4'>("´") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("¸") + .map<'\xB9'>("¹") + .map<'\xBA'>("º") + .map<'\xBB'>("»") + .map<'\xBC'>("¼") + .map<'\xBD'>("½") + .map<'\xBE'>("¾") + .map<'\xBF'>("¿") + + .map<'\xC0'>("À") + .map<'\xC1'>("Á") + .map<'\xC2'>("Â") + .map<'\xC3'>("Ã") + .map<'\xC4'>("Ä") + .map<'\xC5'>("Å") + .map<'\xC6'>("Æ") + .map<'\xC7'>("Ç") + .map<'\xC8'>("È") + .map<'\xC9'>("É") + .map<'\xCA'>("Ê") + .map<'\xCB'>("Ë") + .map<'\xCC'>("Ì") + .map<'\xCD'>("Í") + .map<'\xCE'>("Î") + .map<'\xCF'>("Ï") + + .map<'\xD0'>("Ð") + .map<'\xD1'>("Ñ") + .map<'\xD2'>("Ò") + .map<'\xD3'>("Ó") + .map<'\xD4'>("Ô") + .map<'\xD5'>("Õ") + .map<'\xD6'>("Ö") + .map<'\xD7'>("×") + .map<'\xD8'>("Ø") + .map<'\xD9'>("Ù") + .map<'\xDA'>("Ú") + .map<'\xDB'>("Û") + .map<'\xDC'>("Ü") + .map<'\xDD'>("Ý") + .map<'\xDE'>("Þ") + .map<'\xDF'>("ß") + + .map<'\xE0'>("à") + .map<'\xE1'>("á") + .map<'\xE2'>("â") + .map<'\xE3'>("ã") + .map<'\xE4'>("ä") + .map<'\xE5'>("å") + .map<'\xE6'>("æ") + .map<'\xE7'>("ç") + .map<'\xE8'>("è") + .map<'\xE9'>("é") + .map<'\xEA'>("ê") + .map<'\xEB'>("ë") + .map<'\xEC'>("ì") + .map<'\xED'>("í") + .map<'\xEE'>("î") + .map<'\xEF'>("ï") + + .map<'\xF0'>("ð") + .map<'\xF1'>("ñ") + .map<'\xF2'>("ò") + .map<'\xF3'>("ó") + .map<'\xF4'>("ô") + .map<'\xF5'>("õ") + .map<'\xF6'>("ö") + .map<'\xF7'>("÷") + .map<'\xF8'>("ø") + .map<'\xF9'>("ù") + .map<'\xFA'>("ú") + .map<'\xFB'>("û") + .map<'\xFC'>("ü") + .map<'\xFD'>("ý") + .map<'\xFE'>("þ") + .map<'\xFF'>("ÿ") + + // Paradox being special, invalid Windows-1252 + // Used for (semantically incorrect) Polish localization TODOs + .map<'\x8F'>("Ę"); + + static constexpr auto win1251_map = lexy::symbol_table // + .map<'\x80'>("Ђ") + .map<'\x81'>("Ѓ") + .map<'\x82'>("‚") + .map<'\x83'>("ѓ") + .map<'\x84'>("„") + .map<'\x85'>("…") + .map<'\x86'>("†") + .map<'\x87'>("‡") + .map<'\x88'>("€") + .map<'\x89'>("‰") + .map<'\x8A'>("Љ") + .map<'\x8B'>("‹") + .map<'\x8C'>("Њ") + .map<'\x8D'>("Ќ") + .map<'\x8E'>("Ћ") + .map<'\x8F'>("Џ") + + .map<'\x90'>("ђ") + .map<'\x91'>("‘") + .map<'\x92'>("’") + .map<'\x93'>("“") + .map<'\x94'>("”") + .map<'\x95'>("•") + .map<'\x96'>("–") + .map<'\x97'>("—") + .map<'\x99'>("™") + .map<'\x9A'>("љ") + .map<'\x9B'>("›") + .map<'\x9C'>("њ") + .map<'\x9D'>("ќ") + .map<'\x9E'>("ћ") + .map<'\x9F'>("џ") + + .map<'\xA0'>(" ") + .map<'\xA1'>("Ў") + .map<'\xA2'>("ў") + .map<'\xA3'>("Ј") + .map<'\xA4'>("¤") + .map<'\xA5'>("Ґ") + .map<'\xA6'>("¦") + .map<'\xA7'>("§") + .map<'\xA8'>("Ё") + .map<'\xA9'>("©") + .map<'\xAA'>("Є") + .map<'\xAB'>("«") + .map<'\xAC'>("¬") + .map<'\xAD'>("­") // Soft Hyphen + .map<'\xAE'>("®") + .map<'\xAF'>("Ї") + + .map<'\xB0'>("°") + .map<'\xB1'>("±") + .map<'\xB2'>("І") + .map<'\xB3'>("і") + .map<'\xB4'>("ґ") + .map<'\xB5'>("µ") + .map<'\xB6'>("¶") + .map<'\xB7'>("·") + .map<'\xB8'>("ё") + .map<'\xB9'>("№") + .map<'\xBA'>("є") + .map<'\xBB'>("»") + .map<'\xBC'>("ј") + .map<'\xBD'>("Ѕ") + .map<'\xBE'>("ѕ") + .map<'\xBF'>("ї") + + .map<'\xC0'>("А") + .map<'\xC1'>("Б") + .map<'\xC2'>("В") + .map<'\xC3'>("Г") + .map<'\xC4'>("Д") + .map<'\xC5'>("Е") + .map<'\xC6'>("Ж") + .map<'\xC7'>("З") + .map<'\xC8'>("И") + .map<'\xC9'>("Й") + .map<'\xCA'>("К") + .map<'\xCB'>("Л") + .map<'\xCC'>("М") + .map<'\xCD'>("Н") + .map<'\xCE'>("О") + .map<'\xCF'>("П") + + .map<'\xD0'>("Р") + .map<'\xD1'>("С") + .map<'\xD2'>("Т") + .map<'\xD3'>("У") + .map<'\xD4'>("Ф") + .map<'\xD5'>("Х") + .map<'\xD6'>("Ц") + .map<'\xD7'>("Ч") + .map<'\xD8'>("Ш") + .map<'\xD9'>("Щ") + .map<'\xDA'>("Ъ") + .map<'\xDB'>("Ы") + .map<'\xDC'>("Ь") + .map<'\xDD'>("Э") + .map<'\xDE'>("Ю") + .map<'\xDF'>("Я") + + .map<'\xE0'>("а") + .map<'\xE1'>("б") + .map<'\xE2'>("в") + .map<'\xE3'>("г") + .map<'\xE4'>("д") + .map<'\xE5'>("е") + .map<'\xE6'>("ж") + .map<'\xE7'>("з") + .map<'\xE8'>("и") + .map<'\xE9'>("й") + .map<'\xEA'>("к") + .map<'\xEB'>("л") + .map<'\xEC'>("м") + .map<'\xED'>("н") + .map<'\xEE'>("о") + .map<'\xEF'>("п") + + .map<'\xF0'>("р") + .map<'\xF1'>("с") + .map<'\xF2'>("т") + .map<'\xF3'>("у") + .map<'\xF4'>("ф") + .map<'\xF5'>("х") + .map<'\xF6'>("ц") + .map<'\xF7'>("ч") + .map<'\xF8'>("ш") + .map<'\xF9'>("щ") + .map<'\xFA'>("ъ") + .map<'\xFB'>("ы") + .map<'\xFC'>("ь") + .map<'\xFD'>("э") + .map<'\xFE'>("ю") + .map<'\xFF'>("я"); + + static std::string_view get_from(const char* memory, detail::Encoding encoding) { + auto reader = lexy::_range_reader(memory, memory + 1); + + switch (encoding) { + using enum detail::Encoding; + case Windows1251: { + auto index = win1251_map.try_parse(reader); + if (index) { + return win1251_map[index]; + } + } + case Windows1252: { + auto index = win1252_map.try_parse(reader); + if (index) { + return win1252_map[index]; + } + break; + } + default: break; + } + return { memory, 1 }; } - }; - static_assert(IsConverter); - - struct Windows1252 { - static constexpr auto map = lexy::symbol_table // - .map<'\x80'>("€") - .map<'\x82'>("‚") - .map<'\x83'>("ƒ") - .map<'\x84'>("„") - .map<'\x85'>("…") - .map<'\x86'>("†") - .map<'\x87'>("‡") - .map<'\x88'>("ˆ") - .map<'\x89'>("‰") - .map<'\x8A'>("Š") - .map<'\x8B'>("‹") - .map<'\x8C'>("Œ") - .map<'\x8E'>("Ž") - - .map<'\x91'>("‘") - .map<'\x92'>("’") - .map<'\x93'>("“") - .map<'\x94'>("”") - .map<'\x95'>("•") - .map<'\x96'>("–") - .map<'\x97'>("—") - .map<'\x98'>("˜") - .map<'\x99'>("™") - .map<'\x9A'>("š") - .map<'\x9B'>("›") - .map<'\x9C'>("œ") - .map<'\x9E'>("ž") - .map<'\x9F'>("Ÿ") - - .map<'\xA0'>(" ") - .map<'\xA1'>("¡") - .map<'\xA2'>("¢") - .map<'\xA3'>("£") - .map<'\xA4'>("¤") - .map<'\xA5'>("¥") - .map<'\xA6'>("¦") - .map<'\xA7'>("§") - .map<'\xA8'>("¨") - .map<'\xA9'>("©") - .map<'\xAA'>("ª") - .map<'\xAB'>("«") - .map<'\xAC'>("¬") - .map<'\xAD'>("­") // Soft Hyphen - .map<'\xAE'>("®") - .map<'\xAF'>("¯") - - .map<'\xB0'>("°") - .map<'\xB1'>("±") - .map<'\xB2'>("²") - .map<'\xB3'>("³") - .map<'\xB4'>("´") - .map<'\xB5'>("µ") - .map<'\xB6'>("¶") - .map<'\xB7'>("·") - .map<'\xB8'>("¸") - .map<'\xB9'>("¹") - .map<'\xBA'>("º") - .map<'\xBB'>("»") - .map<'\xBC'>("¼") - .map<'\xBD'>("½") - .map<'\xBE'>("¾") - .map<'\xBF'>("¿") - - .map<'\xC0'>("À") - .map<'\xC1'>("Á") - .map<'\xC2'>("Â") - .map<'\xC3'>("Ã") - .map<'\xC4'>("Ä") - .map<'\xC5'>("Å") - .map<'\xC6'>("Æ") - .map<'\xC7'>("Ç") - .map<'\xC8'>("È") - .map<'\xC9'>("É") - .map<'\xCA'>("Ê") - .map<'\xCB'>("Ë") - .map<'\xCC'>("Ì") - .map<'\xCD'>("Í") - .map<'\xCE'>("Î") - .map<'\xCF'>("Ï") - - .map<'\xD0'>("Ð") - .map<'\xD1'>("Ñ") - .map<'\xD2'>("Ò") - .map<'\xD3'>("Ó") - .map<'\xD4'>("Ô") - .map<'\xD5'>("Õ") - .map<'\xD6'>("Ö") - .map<'\xD7'>("×") - .map<'\xD8'>("Ø") - .map<'\xD9'>("Ù") - .map<'\xDA'>("Ú") - .map<'\xDB'>("Û") - .map<'\xDC'>("Ü") - .map<'\xDD'>("Ý") - .map<'\xDE'>("Þ") - .map<'\xDF'>("ß") - - .map<'\xE0'>("à") - .map<'\xE1'>("á") - .map<'\xE2'>("â") - .map<'\xE3'>("ã") - .map<'\xE4'>("ä") - .map<'\xE5'>("å") - .map<'\xE6'>("æ") - .map<'\xE7'>("ç") - .map<'\xE8'>("è") - .map<'\xE9'>("é") - .map<'\xEA'>("ê") - .map<'\xEB'>("ë") - .map<'\xEC'>("ì") - .map<'\xED'>("í") - .map<'\xEE'>("î") - .map<'\xEF'>("ï") - - .map<'\xF0'>("ð") - .map<'\xF1'>("ñ") - .map<'\xF2'>("ò") - .map<'\xF3'>("ó") - .map<'\xF4'>("ô") - .map<'\xF5'>("õ") - .map<'\xF6'>("ö") - .map<'\xF7'>("÷") - .map<'\xF8'>("ø") - .map<'\xF9'>("ù") - .map<'\xFA'>("ú") - .map<'\xFB'>("û") - .map<'\xFC'>("ü") - .map<'\xFD'>("ý") - .map<'\xFE'>("þ") - .map<'\xFF'>("ÿ") - - // Paradox being special, invalid Windows-1252 - // Used for (semantically incorrect) Polish localization TODOs - .map<'\x8F'>("Ę"); - - template - static constexpr map_value try_parse(Reader& reader) { - auto index = map.try_parse(reader); - if (index) { - return map_value(map[index]); - } else if (*reader.position() < 0) { - return map_value::invalid_value(); + + static size_t map(char* memory, detail::Encoding encoding) { + if (std::string_view map = get_from(memory, encoding); !map.empty()) { + for (char const& c : map) { + *memory++ = c; + } + return map.size(); } - return {}; + return 1; } }; - static_assert(IsConverter); - - struct Windows1251 { - static constexpr auto map = lexy::symbol_table // - .map<'\x80'>("Ђ") - .map<'\x81'>("Ѓ") - .map<'\x82'>("‚") - .map<'\x83'>("ѓ") - .map<'\x84'>("„") - .map<'\x85'>("…") - .map<'\x86'>("†") - .map<'\x87'>("‡") - .map<'\x88'>("€") - .map<'\x89'>("‰") - .map<'\x8A'>("Љ") - .map<'\x8B'>("‹") - .map<'\x8C'>("Њ") - .map<'\x8D'>("Ќ") - .map<'\x8E'>("Ћ") - .map<'\x8F'>("Џ") - - .map<'\x90'>("ђ") - .map<'\x91'>("‘") - .map<'\x92'>("’") - .map<'\x93'>("“") - .map<'\x94'>("”") - .map<'\x95'>("•") - .map<'\x96'>("–") - .map<'\x97'>("—") - .map<'\x99'>("™") - .map<'\x9A'>("љ") - .map<'\x9B'>("›") - .map<'\x9C'>("њ") - .map<'\x9D'>("ќ") - .map<'\x9E'>("ћ") - .map<'\x9F'>("џ") - - .map<'\xA0'>(" ") - .map<'\xA1'>("Ў") - .map<'\xA2'>("ў") - .map<'\xA3'>("Ј") - .map<'\xA4'>("¤") - .map<'\xA5'>("Ґ") - .map<'\xA6'>("¦") - .map<'\xA7'>("§") - .map<'\xA8'>("Ё") - .map<'\xA9'>("©") - .map<'\xAA'>("Є") - .map<'\xAB'>("«") - .map<'\xAC'>("¬") - .map<'\xAD'>("­") // Soft Hyphen - .map<'\xAE'>("®") - .map<'\xAF'>("Ї") - - .map<'\xB0'>("°") - .map<'\xB1'>("±") - .map<'\xB2'>("І") - .map<'\xB3'>("і") - .map<'\xB4'>("ґ") - .map<'\xB5'>("µ") - .map<'\xB6'>("¶") - .map<'\xB7'>("·") - .map<'\xB8'>("ё") - .map<'\xB9'>("№") - .map<'\xBA'>("є") - .map<'\xBB'>("»") - .map<'\xBC'>("ј") - .map<'\xBD'>("Ѕ") - .map<'\xBE'>("ѕ") - .map<'\xBF'>("ї") - - .map<'\xC0'>("А") - .map<'\xC1'>("Б") - .map<'\xC2'>("В") - .map<'\xC3'>("Г") - .map<'\xC4'>("Д") - .map<'\xC5'>("Е") - .map<'\xC6'>("Ж") - .map<'\xC7'>("З") - .map<'\xC8'>("И") - .map<'\xC9'>("Й") - .map<'\xCA'>("К") - .map<'\xCB'>("Л") - .map<'\xCC'>("М") - .map<'\xCD'>("Н") - .map<'\xCE'>("О") - .map<'\xCF'>("П") - - .map<'\xD0'>("Р") - .map<'\xD1'>("С") - .map<'\xD2'>("Т") - .map<'\xD3'>("У") - .map<'\xD4'>("Ф") - .map<'\xD5'>("Х") - .map<'\xD6'>("Ц") - .map<'\xD7'>("Ч") - .map<'\xD8'>("Ш") - .map<'\xD9'>("Щ") - .map<'\xDA'>("Ъ") - .map<'\xDB'>("Ы") - .map<'\xDC'>("Ь") - .map<'\xDD'>("Э") - .map<'\xDE'>("Ю") - .map<'\xDF'>("Я") - - .map<'\xE0'>("а") - .map<'\xE1'>("б") - .map<'\xE2'>("в") - .map<'\xE3'>("г") - .map<'\xE4'>("д") - .map<'\xE5'>("е") - .map<'\xE6'>("ж") - .map<'\xE7'>("з") - .map<'\xE8'>("и") - .map<'\xE9'>("й") - .map<'\xEA'>("к") - .map<'\xEB'>("л") - .map<'\xEC'>("м") - .map<'\xED'>("н") - .map<'\xEE'>("о") - .map<'\xEF'>("п") - - .map<'\xF0'>("р") - .map<'\xF1'>("с") - .map<'\xF2'>("т") - .map<'\xF3'>("у") - .map<'\xF4'>("ф") - .map<'\xF5'>("х") - .map<'\xF6'>("ц") - .map<'\xF7'>("ч") - .map<'\xF8'>("ш") - .map<'\xF9'>("щ") - .map<'\xFA'>("ъ") - .map<'\xFB'>("ы") - .map<'\xFC'>("ь") - .map<'\xFD'>("э") - .map<'\xFE'>("ю") - .map<'\xFF'>("я"); - - template - static constexpr map_value try_parse(Reader& reader) { - auto index = map.try_parse(reader); - if (index) { - return map_value(map[index]); - } else if (*reader.position() < 0) { - return map_value::invalid_value(); + static_assert(MapperConcept); + + constexpr auto ansi_to_utf8 = AnsiToUtf8Mapper {}; + + template + struct _make_buffer { + template + auto operator()(detail::Encoding encoding, const void* _memory, std::size_t size, + MemoryResource* resource = lexy::_detail::get_memory_resource()) const { + constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big; + + using char_type = typename Encoding::char_type; + LEXY_PRECONDITION(size % sizeof(char_type) == 0); + auto memory = static_cast(_memory); + + if constexpr (sizeof(char_type) == 1 || Endian == native_endianness) { + switch (encoding) { + using enum detail::Encoding; + case Ascii: + case Utf8: + return lexy::make_buffer_from_raw(_memory, size, resource); + default: break; + } + + size_t utf8_size = 0; + const auto end = memory + size; + + for (auto dest = memory; dest != end; dest += sizeof(char_type)) { + utf8_size += Mapper::get_from(reinterpret_cast(dest), encoding).size(); + } + + typename lexy::buffer::builder builder(utf8_size, resource); + for (auto dest = builder.data(); memory != end; memory += sizeof(char_type)) { + *dest = static_cast(*memory); + dest += Mapper::map(dest, encoding); + } + return LEXY_MOV(builder).finish(); + } else { + return lexy::make_buffer_from_raw(_memory, size, resource); } - return {}; } }; - static_assert(IsConverter); - - template - constexpr map_value try_parse_map(detail::Encoding&& encoding, Reader& reader) { - switch (encoding) { - case detail::Encoding::Unknown: - case detail::Encoding::Ascii: - case detail::Encoding::Utf8: return Utf8::try_parse(reader); - case detail::Encoding::Windows1251: return Windows1251::try_parse(reader); - case detail::Encoding::Windows1252: return Windows1252::try_parse(reader); - } - ovdl::detail::unreachable(); - } - template - using _string_char_type = LEXY_DECAY_DECLTYPE(LEXY_DECLVAL(String)[0]); + template + constexpr auto make_buffer_from_raw = _make_buffer {}; - template - concept IsErrorHandler = - std::is_convertible_v // - && requires(T t, ovdl::v2script::ast::ParseState& state, lexy::_pr> reader) { - { T::on_invalid_character(state, reader) }; - }; - - struct EmptyHandler { - static constexpr void on_invalid_character(detail::IsStateType auto& state, auto reader) {} - }; + template + struct _read_file_user_data : lexy::_read_file_user_data { + using base_type = lexy::_read_file_user_data; - template> Error = EmptyHandler> - constexpr auto convert_as_string = - dsl::sink( - lexy::fold_inplace( - std::initializer_list<_string_char_type> {}, // - [](String& result, detail::IsStateType auto& state, CharT c) { - if constexpr (std::is_convertible_v) { - switch (state.encoding()) { - using enum ovdl::detail::Encoding; - case Ascii: - case Utf8: - break; - // Skip Ascii and Utf8 encoding - default: { - // If within ASCII range - if (c >= CharT {}) { - break; - } - - map_value val = {}; - CharT char_array[] { c, CharT() }; - auto input = lexy::range_input(&char_array[0], &char_array[1]); - auto reader = input.reader(); - - // prefer preserving unknown conversion maps, least things will work, they'll just probably display wrong - // map = make_map_from(state.encoding(), c); - val = try_parse_map(state.encoding(), reader); - - // Invalid characters are dropped - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); - return; - } - - // non-pass characters are not valid ascii and are mapped to utf8 values - if (!val.is_pass()) { - result.append(val._value); - return; - } - - break; - } - } - } + detail::Encoding encoding; - result.push_back(c); // - }, // - [](String& result, detail::IsStateType auto& state, String&& str) { - if constexpr (std::is_convertible_v) { - switch (state.encoding()) { - using enum ovdl::detail::Encoding; - case Ascii: - case Utf8: - break; - // Skip Ascii and Utf8 encoding - default: { - auto input = lexy::string_input(str); - auto reader = input.reader(); - using encoding = decltype(reader)::encoding; - constexpr auto eof = encoding::eof(); - - if constexpr (requires { result.reserve(str.size()); }) { - result.reserve(str.size()); - } - - auto begin = reader.position(); - auto last_it = begin; - while (reader.peek() != eof) { - // If not within ASCII range - if (*reader.position() < 0) { - map_value val = try_parse_map(state.encoding(), reader); - - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); - reader.bump(); - continue; - } else if (!val.is_pass()) { - result.append(val._value); - last_it = reader.position(); - continue; - } - } - - while (reader.peek() != eof && *reader.position() > 0) { - reader.bump(); - } - result.append(last_it, reader.position()); - last_it = reader.position(); - } - if (last_it != begin) { - return; - } - break; - } - } - } + _read_file_user_data(detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {} + static auto callback() { + return [](void* _user_data, const char* memory, std::size_t size) { + auto user_data = static_cast<_read_file_user_data*>(_user_data); - result.append(LEXY_MOV(str)); // - }, // - [](String& result, detail::IsStateType auto& state, Iterator begin, Iterator end) // - -> decltype(void(LEXY_DECLVAL(Str).append(begin, end))) { - if constexpr (std::is_convertible_v) { - switch (state.encoding()) { - using enum ovdl::detail::Encoding; - case Ascii: - case Utf8: - break; - // Skip Ascii and Utf8 encoding - default: { - auto input = lexy::range_input(begin, end); - auto reader = input.reader(); - using encoding = decltype(reader)::encoding; - constexpr auto eof = encoding::eof(); - - if constexpr (requires { result.reserve(end - begin); }) { - result.reserve(end - begin); - } - - auto begin = reader.position(); - auto last_it = begin; - while (reader.peek() != eof) { - // If not within ASCII range - if (*reader.position() < 0) { - map_value val = try_parse_map(state.encoding(), reader); - - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); - reader.bump(); - continue; - } else if (!val.is_pass()) { - result.append(val._value); - last_it = reader.position(); - continue; - } - } - - while (reader.peek() != eof && *reader.position() > 0) { - reader.bump(); - } - result.append(last_it, reader.position()); - last_it = reader.position(); - } - if (last_it != begin) { - return; - } - break; - } - } - } + user_data->buffer = make_buffer_from_raw(user_data->encoding, memory, size, user_data->resource); + }; + } + }; - result.append(begin, end); // - }, // - [](String& result, detail::IsStateType auto& state, lexy::lexeme lex) { - using encoding = typename Reader::encoding; - using _char_type = _string_char_type; - static_assert(lexy::char_type_compatible_with_reader, - "cannot convert lexeme to this string type"); - - if constexpr ((std::same_as || std::same_as) && - std::convertible_to) { - auto input = lexy::range_input(lex.begin(), lex.end()); - auto reader = input.reader(); - using encoding = decltype(reader)::encoding; - constexpr auto eof = encoding::eof(); - - if constexpr (requires { result.reserve(lex.end() - lex.begin()); }) { - result.reserve(lex.end() - lex.begin()); - } - - auto begin = reader.position(); - auto last_it = begin; - while (reader.peek() != eof) { - // If not within ASCII range - if (*reader.position() < 0) { - map_value val = try_parse_map(state.encoding(), reader); - - if (val.is_invalid()) { - Error::on_invalid_character(state, reader); - reader.bump(); - continue; - } else if (!val.is_pass()) { - result.append(val._value); - last_it = reader.position(); - continue; - } - } - - while (reader.peek() != eof && *reader.position() > 0) { - reader.bump(); - } - result.append(last_it, reader.position()); - last_it = reader.position(); - } - if (last_it != begin) { - return; - } - } + template + auto read_file( + const char* path, + detail::Encoding encoding, + Mapper = {}, + MemoryResource* resource = lexy::_detail::get_memory_resource()) + -> lexy::read_file_result { + _read_file_user_data user_data(encoding, resource); + auto error = lexy::_detail::read_file(path, user_data.callback(), &user_data); + return lexy::read_file_result(error, LEXY_MOV(user_data.buffer)); + } - result.append(lex.begin(), lex.end()); // - })); + /// Reads stdin into a buffer. + template + auto read_stdin( + detail::Encoding encoding, + Mapper = {}, + MemoryResource* resource = lexy::_detail::get_memory_resource()) + -> lexy::read_file_result { + _read_file_user_data user_data(encoding, resource); + auto error = lexy::_detail::read_stdin(user_data.callback(), &user_data); + return lexy::read_file_result(error, LEXY_MOV(user_data.buffer)); + } } \ No newline at end of file diff --git a/src/openvic-dataloader/detail/ParseHandler.hpp b/src/openvic-dataloader/detail/ParseHandler.hpp index 813af42..e30a452 100644 --- a/src/openvic-dataloader/detail/ParseHandler.hpp +++ b/src/openvic-dataloader/detail/ParseHandler.hpp @@ -15,6 +15,7 @@ #include "openvic-dataloader/detail/Utility.hpp" #include "detail/BufferError.hpp" +#include "detail/Convert.hpp" #include "detail/Detect.hpp" #include "detail/InternalConcepts.hpp" @@ -94,6 +95,20 @@ namespace ovdl::detail { *state = { lexy::buffer(std::move(buffer)), encoding }; }; + template + static constexpr auto generate_conversion_state(State* state, const char* path, auto&& buffer, Encoding encoding) { + size_t size = buffer.size(); + if (path[0] != '\0') { + *state = { + path, + convert::make_buffer_from_raw(encoding, std::move(buffer).release(), size), + encoding + }; + return; + } + *state = { convert::make_buffer_from_raw(encoding, std::move(buffer).release(), size), encoding }; + }; + template static void create_state(State* state, const char* path, lexy::buffer&& buffer, std::optional fallback) { if (!_system_fallback_encoding.has_value()) { @@ -111,19 +126,23 @@ namespace ovdl::detail { auto [encoding, is_alone] = encoding_detect::Detector { .default_fallback = fallback.value() }.detect_assess(buffer); switch (encoding) { using enum Encoding; - case Ascii: + case Ascii: { + generate_state(state, path, std::move(buffer), encoding); + break; + } case Utf8: { generate_state(state, path, std::move(buffer), encoding); break; } - case Unknown: + case Unknown: { + break; + } case Windows1251: case Windows1252: { - generate_state(state, path, std::move(buffer), encoding); + generate_conversion_state(state, path, std::move(buffer), encoding); break; } - default: - ovdl::detail::unreachable(); + OVDL_DEFAULT_CASE_UNREACHABLE(); } if (!is_alone) { @@ -135,7 +154,7 @@ namespace ovdl::detail { } if (encoding == ovdl::detail::Encoding::Unknown) { - state->logger().warning("could not detect encoding"); + state->logger().error("could not detect encoding"); } } diff --git a/src/openvic-dataloader/detail/dsl.hpp b/src/openvic-dataloader/detail/dsl.hpp index a25d711..4d3b984 100644 --- a/src/openvic-dataloader/detail/dsl.hpp +++ b/src/openvic-dataloader/detail/dsl.hpp @@ -283,4 +283,41 @@ namespace ovdl::dsl { constexpr auto peek(Rule, RuleUtf) { return _peek {}; } + + template + struct _as_string_view { + struct _sink { + StringView _result; + + using return_type = StringView; + + constexpr void operator()(StringView&& str) { + _result = LEXY_MOV(str); + } + + template + constexpr auto operator()(Iterator begin, Iterator end) + -> decltype(void(Str { begin, end })) { + _result = { begin, end }; + } + + template + constexpr void operator()(lexy::lexeme lex) { + static_assert(lexy::char_type_compatible_with_reader, + "cannot convert lexeme to this string type"); + _result = { lex.begin(), lex.end() }; + } + + constexpr StringView&& finish() && { + return LEXY_MOV(_result); + } + }; + + constexpr auto sink() const { + return _sink { StringView() }; + } + }; + + template + constexpr auto as_string_view = _as_string_view {}; } \ No newline at end of file diff --git a/src/openvic-dataloader/v2script/DecisionGrammar.hpp b/src/openvic-dataloader/v2script/DecisionGrammar.hpp index b5010ff..c559eec 100644 --- a/src/openvic-dataloader/v2script/DecisionGrammar.hpp +++ b/src/openvic-dataloader/v2script/DecisionGrammar.hpp @@ -26,13 +26,13 @@ namespace ovdl::v2script::grammar { using helper = dsl::rule_helper; struct List { - static constexpr auto rule = dsl::curly_bracketed.opt_list(helper::p | lexy::dsl::p>); + static constexpr auto rule = dsl::curly_bracketed.opt_list(helper::p | lexy::dsl::p); static constexpr auto value = lexy::as_list >> construct_list; }; static constexpr auto rule = - dsl::p> >> + dsl::p >> (helper::flags + lexy::dsl::equal_sign + lexy::dsl::p); static constexpr auto value = construct; diff --git a/src/openvic-dataloader/v2script/EffectGrammar.hpp b/src/openvic-dataloader/v2script/EffectGrammar.hpp index 10f8348..db664e4 100644 --- a/src/openvic-dataloader/v2script/EffectGrammar.hpp +++ b/src/openvic-dataloader/v2script/EffectGrammar.hpp @@ -10,7 +10,7 @@ namespace ovdl::v2script::grammar { struct EffectStatement { - static constexpr auto rule = lexy::dsl::p>; + static constexpr auto rule = lexy::dsl::p; static constexpr auto value = lexy::forward; }; diff --git a/src/openvic-dataloader/v2script/EventGrammar.hpp b/src/openvic-dataloader/v2script/EventGrammar.hpp index af20ccf..e80282c 100644 --- a/src/openvic-dataloader/v2script/EventGrammar.hpp +++ b/src/openvic-dataloader/v2script/EventGrammar.hpp @@ -26,7 +26,7 @@ namespace ovdl::v2script::grammar { struct EventMtthStatement { struct MonthValue { - static constexpr auto rule = lexy::dsl::p>; + static constexpr auto rule = lexy::dsl::p; static constexpr auto value = dsl::callback( [](detail::IsParseState auto& state, ast::IdentifierValue* value) { bool is_number = true; @@ -52,7 +52,7 @@ namespace ovdl::v2script::grammar { static constexpr auto value = lexy::as_list >> construct_list; }; - static constexpr auto str_or_id = lexy::dsl::p::ValueExpression>; + static constexpr auto str_or_id = lexy::dsl::p; struct EventOptionList { using name = fkeyword_rule<"name", str_or_id>; @@ -86,13 +86,13 @@ namespace ovdl::v2script::grammar { return helper::flags + dsl::curly_bracketed.opt_list( helper::p | lexy::dsl::p | lexy::dsl::p