diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4ee57895..81ea92a6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -11,7 +11,9 @@ FetchContent_MakeAvailable(counters) add_executable(realbenchmark benchmark.cpp) target_link_libraries(realbenchmark PRIVATE counters::counters) add_executable(bench_ip bench_ip.cpp) +add_executable(bench_uint16 bench_uint16.cpp) target_link_libraries(bench_ip PRIVATE counters::counters) +target_link_libraries(bench_uint16 PRIVATE counters::counters) set_property( TARGET realbenchmark @@ -19,8 +21,12 @@ set_property( set_property( TARGET bench_ip PROPERTY CXX_STANDARD 17) +set_property( + TARGET bench_uint16 + PROPERTY CXX_STANDARD 17) target_link_libraries(realbenchmark PUBLIC fast_float) target_link_libraries(bench_ip PUBLIC fast_float) +target_link_libraries(bench_uint16 PUBLIC fast_float) include(ExternalProject) diff --git a/benchmarks/bench_uint16.cpp b/benchmarks/bench_uint16.cpp new file mode 100644 index 00000000..c4cef81b --- /dev/null +++ b/benchmarks/bench_uint16.cpp @@ -0,0 +1,139 @@ +#include "counters/bench.h" +#include "fast_float/fast_float.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void pretty_print(size_t volume, size_t bytes, std::string name, + counters::event_aggregate agg) { + if (agg.inner_count > 1) { + printf("# (inner count: %d)\n", agg.inner_count); + } + printf("%-40s : ", name.c_str()); + printf(" %5.2f GB/s ", bytes / agg.fastest_elapsed_ns()); + printf(" %5.1f Mip/s ", volume * 1000.0 / agg.fastest_elapsed_ns()); + printf(" %5.2f ns/ip ", agg.fastest_elapsed_ns() / volume); + if (counters::event_collector().has_events()) { + printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns()); + printf(" %5.2f c/ip ", agg.fastest_cycles() / volume); + printf(" %5.2f i/ip ", agg.fastest_instructions() / volume); + printf(" %5.2f c/b ", agg.fastest_cycles() / bytes); + printf(" %5.2f i/b ", agg.fastest_instructions() / bytes); + printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles()); + } + printf("\n"); +} + +enum class parse_method { standard, fast_float }; + +void validate(const std::string &buffer, const std::vector &expected, + char delimiter) { + const char *p = buffer.data(); + const char *pend = p + buffer.size(); + + for (size_t i = 0; i < expected.size(); i++) { + uint16_t val; + auto r = fast_float::from_chars(p, pend, val); + if (r.ec != std::errc() || val != expected[i]) { + printf("Validation failed at index %zu: expected %u, got %u\n", i, + expected[i], val); + std::abort(); + } + p = r.ptr; + if (i + 1 < expected.size()) { + if (p >= pend || *p != delimiter) { + printf("Validation failed at index %zu: delimiter mismatch\n", i); + std::abort(); + } + ++p; + } + } + + if (p != pend) { + printf("Validation failed: trailing bytes remain\n"); + std::abort(); + } + printf("Validation passed!\n"); +} + +int main() { + constexpr size_t N = 500000; + constexpr char delimiter = ','; + std::mt19937 rng(1234); + std::uniform_int_distribution dist(0, 65535); + + std::vector expected; + expected.reserve(N); + + std::string buffer; + buffer.reserve(N * 6); // up to 5 digits + delimiter + + for (size_t i = 0; i < N; ++i) { + uint16_t val = (uint16_t)dist(rng); + expected.push_back(val); + std::string s = std::to_string(val); + buffer.append(s); + if (i + 1 < N) { + buffer.push_back(delimiter); + } + } + + size_t total_bytes = buffer.size(); + + validate(buffer, expected, delimiter); + + volatile uint64_t sink = 0; + + pretty_print(N, total_bytes, "parse_uint16_std_fromchars", + counters::bench([&]() { + uint64_t sum = 0; + const char *p = buffer.data(); + const char *pend = p + buffer.size(); + for (size_t i = 0; i < N; ++i) { + uint16_t value = 0; + auto r = std::from_chars(p, pend, value); + if (r.ec != std::errc()) + std::abort(); + sum += value; + p = r.ptr; + if (i + 1 < N) { + if (p >= pend || *p != delimiter) + std::abort(); + ++p; + } + } + if (p != pend) + std::abort(); + sink += sum; + })); + + pretty_print(N, total_bytes, "parse_uint16_fastfloat", counters::bench([&]() { + uint64_t sum = 0; + const char *p = buffer.data(); + const char *pend = p + buffer.size(); + for (size_t i = 0; i < N; ++i) { + uint16_t value = 0; + auto r = fast_float::from_chars(p, pend, value); + if (r.ec != std::errc()) + std::abort(); + sum += value; + p = r.ptr; + if (i + 1 < N) { + if (p >= pend || *p != delimiter) + std::abort(); + ++p; + } + } + if (p != pend) + std::abort(); + sink += sum; + })); + + return EXIT_SUCCESS; +} diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 5609ba1a..85435373 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -32,7 +32,7 @@ template fastfloat_really_inline constexpr bool has_simd_opt() { // able to optimize it well. template fastfloat_really_inline constexpr bool is_integer(UC c) noexcept { - return !(c > UC('9') || c < UC('0')); + return (unsigned)(c - UC('0')) <= 9u; } fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { @@ -68,6 +68,25 @@ read8_to_u64(UC const *chars) { return val; } +// Read 4 UC into a u32. Truncates UC if not char. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t +read4_to_u32(UC const *chars) { + if (cpp20_and_in_constexpr() || !std::is_same::value) { + uint32_t val = 0; + for (int i = 0; i < 4; ++i) { + val |= uint32_t(uint8_t(*chars)) << (i * 8); + ++chars; + } + return val; + } + uint32_t val; + ::memcpy(&val, chars, sizeof(uint32_t)); +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + val = byteswap_32(val); +#endif + return val; +} #ifdef FASTFLOAT_SSE2 fastfloat_really_inline uint64_t simd_read8_to_u64(__m128i const data) { @@ -149,6 +168,18 @@ is_made_of_eight_digits_fast(uint64_t val) noexcept { 0x8080808080808080)); } +fastfloat_really_inline constexpr bool +is_made_of_four_digits_fast(uint32_t val) noexcept { + return !((((val + 0x46464646) | (val - 0x30303030)) & 0x80808080)); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t +parse_four_digits_unrolled(uint32_t val) noexcept { + val -= 0x30303030; + val = (val * 10) + (val >> 8); + return (((val & 0x00FF00FF) * 0x00640001) >> 16) & 0xFFFF; +} + #ifdef FASTFLOAT_HAS_SIMD // Call this if chars might not be 8 digits. @@ -606,6 +637,56 @@ parse_int_string(UC const *p, UC const *pend, T &value, } } + FASTFLOAT_IF_CONSTEXPR17((std::is_same::value)) { + if (base == 10) { + const size_t len = size_t(pend - p); + if (len == 0) { + if (has_leading_zeros) { + value = 0; + answer.ec = std::errc(); + answer.ptr = p; + } else { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + } + return answer; + } + + if (len >= 4) { + uint32_t digits = read4_to_u32(p); + if (is_made_of_four_digits_fast(digits)) { + uint32_t v = parse_four_digits_unrolled(digits); + if (len >= 5 && is_integer(p[4])) { + v = v * 10 + uint32_t(p[4] - '0'); + if (len >= 6 && is_integer(p[5])) { + answer.ec = std::errc::result_out_of_range; + const UC *q = p + 5; + while (q != pend && is_integer(*q)) { + q++; + } + answer.ptr = q; + return answer; + } + if (v > 65535) { + answer.ec = std::errc::result_out_of_range; + answer.ptr = p + 5; + return answer; + } + value = uint16_t(v); + answer.ec = std::errc(); + answer.ptr = p + 5; + return answer; + } + // 4 digits + value = uint16_t(v); + answer.ec = std::errc(); + answer.ptr = p + 4; + return answer; + } + } + } + } + uint64_t i = 0; if (base == 10) { loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible