From 8b49927099c456019bb396ae2e9f81413672910e Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 13:30:39 -0400 Subject: [PATCH 01/10] add IntSuperAccumulator --- Src/Base/AMReX_IntSuperAccumulator.H | 258 ++++++++++++++++++ Src/Base/CMakeLists.txt | 1 + Src/Base/Make.package | 1 + Tests/CMakeLists.txt | 2 +- .../IntSuperAccumulator/CMakeLists.txt | 9 + .../Numerics/IntSuperAccumulator/GNUmakefile | 26 ++ .../Numerics/IntSuperAccumulator/Make.package | 1 + Tests/Numerics/IntSuperAccumulator/main.cpp | 142 ++++++++++ 8 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 Src/Base/AMReX_IntSuperAccumulator.H create mode 100644 Tests/Numerics/IntSuperAccumulator/CMakeLists.txt create mode 100644 Tests/Numerics/IntSuperAccumulator/GNUmakefile create mode 100644 Tests/Numerics/IntSuperAccumulator/Make.package create mode 100644 Tests/Numerics/IntSuperAccumulator/main.cpp diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H new file mode 100644 index 00000000000..c1fcc8d32c3 --- /dev/null +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -0,0 +1,258 @@ +#ifndef AMREX_INT_SUPERACCUMULATOR_H_ +#define AMREX_INT_SUPERACCUMULATOR_H_ + +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace amrex { + +/** + * \brief Integer superaccumulator backed by IArrayBox storage. + * + * The accumulator stores the exact sum of single-precision floating-point + * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$ + * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 32-bit int in + * an `IArrayBox` component. A client that needs M independent sums must + * allocate an accumulator with `M * digits_per_entry` components and index the + * storage using `component = entry * digits_per_entry + digit`. Both + * `accumulate` and `finalize` operate in single precision; callers may cast to + * wider types if needed. All summands are assumed non-negative; subnormal + * inputs are treated as zero to reduce the number of required digits per entry. + */ +class IntSuperAccumulatorFab +{ +public: + static constexpr int digit_bits = 30; //!< Width of each digit in bits. + static constexpr int digit_base = 1 << digit_bits; //!< Base of each digit. + static constexpr int min_exponent = -149; //!< Smallest exponent of scaled mantissa (binary32 minimum normal). + static constexpr int max_exponent = 104; //!< Largest exponent of scaled mantissa (binary32 maximum normal). + static constexpr int significand_bits = 24; //!< Binary32 significand bits including hidden bit. + static constexpr int extra_digits = 1; //!< Safety margin for carry propagation. + static constexpr int digits_for_shift = ((max_exponent - min_exponent) / digit_bits) + 1; // covers exponent window + static constexpr int digits_for_value = 2; //!< Worst-case mantissa occupies two digits at radix 2^30. + static constexpr int digits_per_entry = digits_for_shift + digits_for_value + extra_digits; //!< 12 digits per entry. + + IntSuperAccumulatorFab () = default; + + explicit IntSuperAccumulatorFab (Arena* arena) noexcept + : m_storage(arena) {} + + IntSuperAccumulatorFab (const Box& box, int nentries, Arena* arena=nullptr) + : m_storage(box, nentries * digits_per_entry, arena), m_entries(nentries) {} + + [[nodiscard]] int entries () const noexcept { return m_entries; } + + void define (const Box& box, int nentries, Arena* arena=nullptr) + { + m_entries = nentries; + m_storage.resize(box, nentries * digits_per_entry, arena); + } + + [[nodiscard]] Array4 array () noexcept { return m_storage.array(); } + [[nodiscard]] Array4 const_array () const noexcept { return m_storage.const_array(); } + + void reset () + { + if (amrex::Gpu::inLaunchRegion()) { + m_storage.template setVal(0); + } else { + m_storage.template setVal(0); + } + } + + void reset (Box const& bx, int nentries = -1) + { + const int entries = (nentries < 0) ? m_entries : nentries; + const int ncomp = entries * digits_per_entry; + if (amrex::Gpu::inLaunchRegion()) { + m_storage.template setVal(0, bx, 0, ncomp); + } else { + m_storage.template setVal(0, bx, 0, ncomp); + } + } + + /** + * \brief Accumulate a floating-point value into the superaccumulator. + * + * The caller must provide the digit array and logical comp index for the + * entry being updated. All operations are carried out using integers; the + * routine does not allocate additional storage. + */ + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void + accumulate (Array const& digits, int i, int j, int k, int entry, float value, + int entry_digits = digits_per_entry) noexcept + { + if (value <= 0.0f) { + if (value < 0.0f) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab::accumulate: negative input not supported"); +#endif + } + return; + } + + if (!amrex::Math::isfinite(value)) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab::accumulate: non-finite input"); +#else + return; +#endif + } + + const auto bits = detail::decompose(value); + if (bits.is_zero) { return; } + + const int component_base = entry * entry_digits; + + const int shift = bits.exponent - min_exponent; + int digit_index = shift / digit_bits; + const int bit_offset = shift % digit_bits; + + const unsigned long long magnitude = bits.magnitude; + unsigned long long chunk_value = magnitude << bit_offset; + + const int component_limit = component_base + entry_digits; + + while (chunk_value != 0ULL) { + const unsigned long long chunk = chunk_value & static_cast(digit_base - 1); + chunk_value >>= digit_bits; + if (component_base + digit_index >= component_limit) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab: digit overflow"); +#else + break; +#endif + } + detail::add_to_digit(digits, i, j, k, + component_base + digit_index, + component_limit, + chunk); + ++digit_index; + } + } + + /** + * \brief Finalize the accumulator entry and return a binary32 result. + */ + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static float + finalize (Array const& digits, int i, int j, int k, int entry, + int entry_digits = digits_per_entry) noexcept + { + const int component_base = entry * entry_digits; + + double result = 0.0; + for (int d = entry_digits - 1; d >= 0; --d) { + const int digit = digits(i,j,k, component_base + d); + if (digit == 0) { continue; } + const int exponent = min_exponent + d * digit_bits; + result += std::ldexp(static_cast(digit), exponent); + } + return static_cast(result); + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void + clear_entry (Array const& digits, int i, int j, int k, int entry, + int entry_digits = digits_per_entry) noexcept + { + const int component_base = entry * entry_digits; + for (int d = 0; d < entry_digits; ++d) { + digits(i,j,k, component_base + d) = 0; + } + } + +private: + struct Decomposed + { + unsigned long long magnitude; + int exponent; + bool is_zero; + }; + + struct detail + { + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void + add_to_digit (Array const& digits, int i, int j, int k, + int component, int component_limit, unsigned long long value) noexcept + { + unsigned long long carry = value; + int current_component = component; + constexpr unsigned long long base = static_cast(digit_base); + + while (carry != 0ULL) { + if (current_component >= component_limit) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab: carry overflow"); +#else + break; +#endif + } + + const unsigned long long sum = static_cast(digits(i,j,k,current_component)) + carry; + const unsigned long long remainder = sum & (base - 1ULL); + const unsigned long long quotient = sum >> digit_bits; + + digits(i,j,k,current_component) = static_cast(remainder); + carry = quotient; + ++current_component; + } + } + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static Decomposed + decompose (float value) noexcept + { + Decomposed bits{}; + + if (value <= 0.0f) { + bits.is_zero = true; + return bits; + } + + union { + float f; + std::uint32_t u; + } pun{}; + pun.f = value; + const std::uint32_t u = pun.u; + + const std::uint32_t exponent_field = (u >> 23) & 0xffu; + const std::uint32_t fraction = u & ((1u << 23) - 1u); + + if (exponent_field == 0xffu) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab::decompose: NaN or Inf not supported (float)"); +#else + Decomposed tmp{}; tmp.is_zero = true; return tmp; +#endif + } else if (exponent_field == 0u) { + // Treat subnormals as zero; caller is expected to pre-round. + bits.is_zero = true; + return bits; + } else { + bits.magnitude = (1u << 23) | fraction; + bits.exponent = static_cast(exponent_field) - 127 - significand_bits + 1; + } + + bits.is_zero = false; + return bits; + } + }; + + IArrayBox m_storage; + int m_entries = 0; +}; + +} // namespace amrex + +#endif diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 89f406013d0..07802d1d6b5 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -137,6 +137,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_BaseFab.H AMReX_BaseFab.cpp AMReX_Array4.H + AMReX_IntSuperAccumulator.H AMReX_MakeType.H AMReX_TypeTraits.H AMReX_FabDataType.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index 675d37b2ccf..7b7851076e6 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -163,6 +163,7 @@ C$(AMREX_BASE)_headers += AMReX_TypeTraits.H C$(AMREX_BASE)_headers += AMReX_FabDataType.H C$(AMREX_BASE)_headers += AMReX_Array4.H +C$(AMREX_BASE)_headers += AMReX_IntSuperAccumulator.H C$(AMREX_BASE)_sources += AMReX_BaseFab.cpp C$(AMREX_BASE)_headers += AMReX_BaseFab.H AMReX_BaseFabUtility.H C$(AMREX_BASE)_headers += AMReX_FabFactory.H diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 3db6cccbf8d..c0a4b82e258 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -126,7 +126,7 @@ else() # List of subdirectories to search for CMakeLists. # set( AMREX_TESTS_SUBDIRS Amr AsyncOut CallNoinline CLZ CommType CTOParFor DeviceGlobal - Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser + Enum HeatEquation MultiBlock MultiPeriod Numerics ParmParse Parser Parser2 ParserUserFn Reinit RoundoffDomain SmallMatrix) if (AMReX_PARTICLES) diff --git a/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt b/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt new file mode 100644 index 00000000000..7e8d9a8080d --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files ) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/Numerics/IntSuperAccumulator/GNUmakefile b/Tests/Numerics/IntSuperAccumulator/GNUmakefile new file mode 100644 index 00000000000..a42d27db031 --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME ?= ../../../amrex + +PRECISION = FLOAT + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = FALSE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/Numerics/IntSuperAccumulator/Make.package b/Tests/Numerics/IntSuperAccumulator/Make.package new file mode 100644 index 00000000000..6b4b865e8fc --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp new file mode 100644 index 00000000000..f04bfdb0a89 --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace +{ + +struct Range +{ + float min_value; + float max_value; + int count; + int seed; +}; + +} // namespace + +int +main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + int status = 0; + + { + using amrex::IntSuperAccumulatorFab; + + const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector()); + IntSuperAccumulatorFab accumulator(bx, 1); + + const std::array ranges = {{ + {1.0e-12f, 1.0e-3f, 32768, 11}, + {1.0e-4f, 1.0f, 65536, 23}, + {1.0f, 1.0e6f, 65536, 37}, + {1.0e-2f, 1.0e8f, 131072, 41} + }}; + + auto compute_reference = [] (amrex::Gpu::HostVector const& values) -> float { + volatile double sum = 0.0; + volatile double corr = 0.0; + for (float v : values) { + const volatile double y = static_cast(v) - corr; + const volatile double t = sum + y; + corr = (t - sum) - y; + sum = t; + } + return static_cast(sum); + }; + + auto accumulate_values = [&] (amrex::Gpu::HostVector const& values) -> float { + accumulator.reset(bx, 1); + auto digits = accumulator.array(); + constexpr int i = 0; + constexpr int j = 0; + constexpr int k = 0; + constexpr int entry = 0; + + const int count = static_cast(values.size()); + + if (amrex::Gpu::inLaunchRegion()) { + amrex::Gpu::DeviceVector d_values(count); + amrex::Gpu::copy(amrex::Gpu::hostToDevice, values.begin(), values.end(), d_values.begin()); + + auto ptr = d_values.data(); + auto arr = digits; + + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int idx) { + for (int n = 0; n < count; ++n) { + IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]); + } + }); + + amrex::Gpu::streamSynchronize(); + + amrex::Gpu::DeviceScalar d_result; + auto res_ptr = d_result.dataPtr(); + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) { + res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k, entry); + }); + amrex::Gpu::streamSynchronize(); + return d_result.dataValue(); + } else { + for (float v : values) { + IntSuperAccumulatorFab::accumulate(digits, i, j, k, entry, v); + } + return IntSuperAccumulatorFab::finalize(digits, i, j, k, entry); + } + }; + + for (const auto& cfg : ranges) { + std::mt19937 gen(cfg.seed); + std::uniform_real_distribution dist(cfg.min_value, cfg.max_value); + + amrex::Gpu::HostVector values(cfg.count); + for (int n = 0; n < cfg.count; ++n) { + values[n] = dist(gen); + } + + const float reference = compute_reference(values); + const float result = accumulate_values(values); + + if (result != reference) { + amrex::Print() << "Mismatch for range [" << cfg.min_value << ", " + << cfg.max_value << "] with count " << cfg.count + << ": reference=" << reference + << " accumulator=" << result << '\n'; + status = 1; + break; + } + } + + if (status == 0) { + constexpr float value = 0.125f; + constexpr int repeats = 200000; + + amrex::Gpu::HostVector values(repeats); + for (int n = 0; n < repeats; ++n) { + values[n] = value; + } + + const float reference = compute_reference(values); + const float result = accumulate_values(values); + + if (result != reference) { + amrex::Print() << "Mismatch for constant accumulation: reference=" + << reference << " accumulator=" << result << '\n'; + status = 1; + } + } + } + + amrex::Finalize(); + return status; +} From 1acf9dd946ec1905bfe8189db0450166a8533f95 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 13:38:50 -0400 Subject: [PATCH 02/10] fix warning --- Tests/Numerics/IntSuperAccumulator/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp index f04bfdb0a89..e763944022f 100644 --- a/Tests/Numerics/IntSuperAccumulator/main.cpp +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -72,7 +72,7 @@ main (int argc, char* argv[]) auto ptr = d_values.data(); auto arr = digits; - amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int idx) { + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) { for (int n = 0; n < count; ++n) { IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]); } From b2a6b1d4352e474431f06436e2a968bcd55dac28 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 14:15:00 -0400 Subject: [PATCH 03/10] propagate carries using atomics --- Src/Base/AMReX_IntSuperAccumulator.H | 45 +++++++++++++++++++++ Tests/Numerics/IntSuperAccumulator/main.cpp | 6 +-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H index c1fcc8d32c3..e64396567bb 100644 --- a/Src/Base/AMReX_IntSuperAccumulator.H +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -199,12 +200,56 @@ private: #endif } +#if defined(AMREX_USE_GPU) && AMREX_DEVICE_COMPILE + // When we compile for the device we cannot rely on a carry-save digit layout, so + // each thread performs an immediate normalization using GPU atomics. + int* const digit_ptr = &(digits(i,j,k,current_component)); + + // Break the carry into the current radix chunk and the carry to forward. + const unsigned long long chunk = carry & (base - 1ULL); + carry >>= digit_bits; + + // Atomically accumulate this chunk. Multiple threads may land on the same digit, + // which is safe because atomicAdd enforces a total order. + if (chunk != 0ULL) { + amrex::Gpu::Atomic::Add(digit_ptr, static_cast(chunk)); + } + + // Track how many bases we successfully removed so they can be forwarded upward. + unsigned long long propagated = 0ULL; + while (true) { + // Snapshot the digit atomically without changing it. This avoids racing + // against other threads that may be normalizing the same slot. + const int current_value = amrex::Gpu::Atomic::Add(digit_ptr, 0); + if (current_value < digit_base) { + break; + } + + // Attempt to drain one full base from the digit. The return value is the + // pre-subtraction contents, so we know whether we actually claimed the carry. + const int before = amrex::Gpu::Atomic::Add(digit_ptr, -digit_base); + if (before >= digit_base) { + // We consumed one unit of overflow and should forward it to the next digit. + ++propagated; + continue; + } else { + // Another thread already normalized the digit between the peek and our + // subtract; restore the value and exit. + amrex::Gpu::Atomic::Add(digit_ptr, digit_base); + break; + } + } + + // Fold any carries we successfully removed into the higher-order digits. + carry += propagated; +#else const unsigned long long sum = static_cast(digits(i,j,k,current_component)) + carry; const unsigned long long remainder = sum & (base - 1ULL); const unsigned long long quotient = sum >> digit_bits; digits(i,j,k,current_component) = static_cast(remainder); carry = quotient; +#endif ++current_component; } } diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp index e763944022f..c374a7b47f2 100644 --- a/Tests/Numerics/IntSuperAccumulator/main.cpp +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -72,10 +72,8 @@ main (int argc, char* argv[]) auto ptr = d_values.data(); auto arr = digits; - amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) { - for (int n = 0; n < count; ++n) { - IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]); - } + amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) { + IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]); }); amrex::Gpu::streamSynchronize(); From 9e97b4478ff96f3d419eec4f5ca923f181b74da1 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 14:23:03 -0400 Subject: [PATCH 04/10] fix warnings --- Src/Base/AMReX_IntSuperAccumulator.H | 16 ++++++++-------- Tests/Numerics/IntSuperAccumulator/main.cpp | 14 +++++++------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H index e64396567bb..5b5be51d823 100644 --- a/Src/Base/AMReX_IntSuperAccumulator.H +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -92,8 +92,8 @@ public: accumulate (Array const& digits, int i, int j, int k, int entry, float value, int entry_digits = digits_per_entry) noexcept { - if (value <= 0.0f) { - if (value < 0.0f) { + if (value <= 0.0F) { + if (value < 0.0F) { #ifndef AMREX_USE_GPU amrex::Abort("IntSuperAccumulatorFab::accumulate: negative input not supported"); #endif @@ -259,7 +259,7 @@ private: { Decomposed bits{}; - if (value <= 0.0f) { + if (value <= 0.0F) { bits.is_zero = true; return bits; } @@ -271,21 +271,21 @@ private: pun.f = value; const std::uint32_t u = pun.u; - const std::uint32_t exponent_field = (u >> 23) & 0xffu; - const std::uint32_t fraction = u & ((1u << 23) - 1u); + const std::uint32_t exponent_field = (u >> 23) & 0xFFU; + const std::uint32_t fraction = u & ((1U << 23) - 1U); - if (exponent_field == 0xffu) { + if (exponent_field == 0xFFU) { #ifndef AMREX_USE_GPU amrex::Abort("IntSuperAccumulatorFab::decompose: NaN or Inf not supported (float)"); #else Decomposed tmp{}; tmp.is_zero = true; return tmp; #endif - } else if (exponent_field == 0u) { + } else if (exponent_field == 0U) { // Treat subnormals as zero; caller is expected to pre-round. bits.is_zero = true; return bits; } else { - bits.magnitude = (1u << 23) | fraction; + bits.magnitude = (1U << 23) | fraction; bits.exponent = static_cast(exponent_field) - 127 - significand_bits + 1; } diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp index c374a7b47f2..5dba67731be 100644 --- a/Tests/Numerics/IntSuperAccumulator/main.cpp +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -37,10 +37,10 @@ main (int argc, char* argv[]) IntSuperAccumulatorFab accumulator(bx, 1); const std::array ranges = {{ - {1.0e-12f, 1.0e-3f, 32768, 11}, - {1.0e-4f, 1.0f, 65536, 23}, - {1.0f, 1.0e6f, 65536, 37}, - {1.0e-2f, 1.0e8f, 131072, 41} + {1.0e-12F, 1.0e-3F, 32768, 11}, + {1.0e-4F, 1.0F, 65536, 23}, + {1.0F, 1.0e6F, 65536, 37}, + {1.0e-2F, 1.0e8F, 131072, 41} }}; auto compute_reference = [] (amrex::Gpu::HostVector const& values) -> float { @@ -69,7 +69,7 @@ main (int argc, char* argv[]) amrex::Gpu::DeviceVector d_values(count); amrex::Gpu::copy(amrex::Gpu::hostToDevice, values.begin(), values.end(), d_values.begin()); - auto ptr = d_values.data(); + auto *ptr = d_values.data(); auto arr = digits; amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) { @@ -79,7 +79,7 @@ main (int argc, char* argv[]) amrex::Gpu::streamSynchronize(); amrex::Gpu::DeviceScalar d_result; - auto res_ptr = d_result.dataPtr(); + auto *res_ptr = d_result.dataPtr(); amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) { res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k, entry); }); @@ -116,7 +116,7 @@ main (int argc, char* argv[]) } if (status == 0) { - constexpr float value = 0.125f; + constexpr float value = 0.125F; constexpr int repeats = 200000; amrex::Gpu::HostVector values(repeats); From b793a8343e18df010d0de87d0280e6cb3abcd969 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 14:27:02 -0400 Subject: [PATCH 05/10] fix warning --- Src/Base/AMReX_IntSuperAccumulator.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H index 5b5be51d823..06dffbc05ac 100644 --- a/Src/Base/AMReX_IntSuperAccumulator.H +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -189,7 +189,7 @@ private: { unsigned long long carry = value; int current_component = component; - constexpr unsigned long long base = static_cast(digit_base); + constexpr auto base = static_cast(digit_base); while (carry != 0ULL) { if (current_component >= component_limit) { From 46623fe9a6323756756df601b700d640156f2e21 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 16:25:20 -0400 Subject: [PATCH 06/10] simplify --- Src/Base/AMReX_IntSuperAccumulator.H | 53 ++++++++------------- Tests/Numerics/IntSuperAccumulator/main.cpp | 21 +++++--- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H index 06dffbc05ac..ba1dd63f2c1 100644 --- a/Src/Base/AMReX_IntSuperAccumulator.H +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -21,9 +21,8 @@ namespace amrex { * The accumulator stores the exact sum of single-precision floating-point * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$ * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 32-bit int in - * an `IArrayBox` component. A client that needs M independent sums must - * allocate an accumulator with `M * digits_per_entry` components and index the - * storage using `component = entry * digits_per_entry + digit`. Both + * an `IArrayBox` component. This simplified variant maintains a single logical + * entry whose digits occupy the `digits_per_entry` components. Both * `accumulate` and `finalize` operate in single precision; callers may cast to * wider types if needed. All summands are assumed non-negative; subnormal * inputs are treated as zero to reduce the number of required digits per entry. @@ -46,15 +45,14 @@ public: explicit IntSuperAccumulatorFab (Arena* arena) noexcept : m_storage(arena) {} - IntSuperAccumulatorFab (const Box& box, int nentries, Arena* arena=nullptr) - : m_storage(box, nentries * digits_per_entry, arena), m_entries(nentries) {} + IntSuperAccumulatorFab (const Box& box, Arena* arena=nullptr) + : m_storage(box, digits_per_entry, arena) {} - [[nodiscard]] int entries () const noexcept { return m_entries; } + [[nodiscard]] int entries () const noexcept { return 1; } - void define (const Box& box, int nentries, Arena* arena=nullptr) + void define (const Box& box, Arena* arena=nullptr) { - m_entries = nentries; - m_storage.resize(box, nentries * digits_per_entry, arena); + m_storage.resize(box, digits_per_entry, arena); } [[nodiscard]] Array4 array () noexcept { return m_storage.array(); } @@ -69,10 +67,9 @@ public: } } - void reset (Box const& bx, int nentries = -1) + void reset (Box const& bx) { - const int entries = (nentries < 0) ? m_entries : nentries; - const int ncomp = entries * digits_per_entry; + constexpr int ncomp = digits_per_entry; if (amrex::Gpu::inLaunchRegion()) { m_storage.template setVal(0, bx, 0, ncomp); } else { @@ -83,14 +80,13 @@ public: /** * \brief Accumulate a floating-point value into the superaccumulator. * - * The caller must provide the digit array and logical comp index for the - * entry being updated. All operations are carried out using integers; the - * routine does not allocate additional storage. + * The caller must provide the digit array for the single logical entry. + * All operations are carried out using integers; the routine does not + * allocate additional storage. */ template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void - accumulate (Array const& digits, int i, int j, int k, int entry, float value, - int entry_digits = digits_per_entry) noexcept + accumulate (Array const& digits, int i, int j, int k, float value) noexcept { if (value <= 0.0F) { if (value < 0.0F) { @@ -112,8 +108,6 @@ public: const auto bits = detail::decompose(value); if (bits.is_zero) { return; } - const int component_base = entry * entry_digits; - const int shift = bits.exponent - min_exponent; int digit_index = shift / digit_bits; const int bit_offset = shift % digit_bits; @@ -121,7 +115,8 @@ public: const unsigned long long magnitude = bits.magnitude; unsigned long long chunk_value = magnitude << bit_offset; - const int component_limit = component_base + entry_digits; + constexpr int component_base = 0; + constexpr int component_limit = digits_per_entry; while (chunk_value != 0ULL) { const unsigned long long chunk = chunk_value & static_cast(digit_base - 1); @@ -146,14 +141,11 @@ public: */ template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static float - finalize (Array const& digits, int i, int j, int k, int entry, - int entry_digits = digits_per_entry) noexcept + finalize (Array const& digits, int i, int j, int k) noexcept { - const int component_base = entry * entry_digits; - double result = 0.0; - for (int d = entry_digits - 1; d >= 0; --d) { - const int digit = digits(i,j,k, component_base + d); + for (int d = digits_per_entry - 1; d >= 0; --d) { + const int digit = digits(i,j,k, d); if (digit == 0) { continue; } const int exponent = min_exponent + d * digit_bits; result += std::ldexp(static_cast(digit), exponent); @@ -163,12 +155,10 @@ public: template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void - clear_entry (Array const& digits, int i, int j, int k, int entry, - int entry_digits = digits_per_entry) noexcept + clear_entry (Array const& digits, int i, int j, int k) noexcept { - const int component_base = entry * entry_digits; - for (int d = 0; d < entry_digits; ++d) { - digits(i,j,k, component_base + d) = 0; + for (int d = 0; d < digits_per_entry; ++d) { + digits(i,j,k, d) = 0; } } @@ -295,7 +285,6 @@ private: }; IArrayBox m_storage; - int m_entries = 0; }; } // namespace amrex diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp index 5dba67731be..11f4002a948 100644 --- a/Tests/Numerics/IntSuperAccumulator/main.cpp +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -34,7 +34,7 @@ main (int argc, char* argv[]) using amrex::IntSuperAccumulatorFab; const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector()); - IntSuperAccumulatorFab accumulator(bx, 1); + IntSuperAccumulatorFab accumulator(bx); const std::array ranges = {{ {1.0e-12F, 1.0e-3F, 32768, 11}, @@ -56,12 +56,11 @@ main (int argc, char* argv[]) }; auto accumulate_values = [&] (amrex::Gpu::HostVector const& values) -> float { - accumulator.reset(bx, 1); + accumulator.reset(bx); auto digits = accumulator.array(); constexpr int i = 0; constexpr int j = 0; constexpr int k = 0; - constexpr int entry = 0; const int count = static_cast(values.size()); @@ -73,7 +72,7 @@ main (int argc, char* argv[]) auto arr = digits; amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) { - IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]); + IntSuperAccumulatorFab::accumulate(arr, i, j, k, ptr[n]); }); amrex::Gpu::streamSynchronize(); @@ -81,15 +80,15 @@ main (int argc, char* argv[]) amrex::Gpu::DeviceScalar d_result; auto *res_ptr = d_result.dataPtr(); amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) { - res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k, entry); + res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k); }); amrex::Gpu::streamSynchronize(); return d_result.dataValue(); } else { for (float v : values) { - IntSuperAccumulatorFab::accumulate(digits, i, j, k, entry, v); + IntSuperAccumulatorFab::accumulate(digits, i, j, k, v); } - return IntSuperAccumulatorFab::finalize(digits, i, j, k, entry); + return IntSuperAccumulatorFab::finalize(digits, i, j, k); } }; @@ -105,6 +104,11 @@ main (int argc, char* argv[]) const float reference = compute_reference(values); const float result = accumulate_values(values); + amrex::Print() << "Range [" << cfg.min_value << ", " << cfg.max_value + << "] with count " << cfg.count + << ": reference=" << reference + << " accumulator=" << result << '\n'; + if (result != reference) { amrex::Print() << "Mismatch for range [" << cfg.min_value << ", " << cfg.max_value << "] with count " << cfg.count @@ -127,6 +131,9 @@ main (int argc, char* argv[]) const float reference = compute_reference(values); const float result = accumulate_values(values); + amrex::Print() << "Constant accumulation: reference=" << reference + << " accumulator=" << result << '\n'; + if (result != reference) { amrex::Print() << "Mismatch for constant accumulation: reference=" << reference << " accumulator=" << result << '\n'; From a54ce40a79e049cacef4f5ea7acb449417c584a1 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 16:40:03 -0400 Subject: [PATCH 07/10] add very small and very large summands --- Tests/Numerics/IntSuperAccumulator/main.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp index 11f4002a948..18c3026bbaa 100644 --- a/Tests/Numerics/IntSuperAccumulator/main.cpp +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,14 @@ main (int argc, char* argv[]) const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector()); IntSuperAccumulatorFab accumulator(bx); - const std::array ranges = {{ + const float lowest_exponent_min = std::numeric_limits::min(); + const float lowest_exponent_max = std::nextafter(std::ldexp(1.0f, -125), 0.0f); + const float highest_exponent_min = std::ldexp(1.0f, 125); + const float highest_exponent_max = std::ldexp(1.0f, 126); // OVERFLOWS TO INF: std::numeric_limits::max(); + + const std::array ranges = {{ + {lowest_exponent_min, lowest_exponent_max, 32768, 3}, + {highest_exponent_min, highest_exponent_max, 4, 5}, {1.0e-12F, 1.0e-3F, 32768, 11}, {1.0e-4F, 1.0F, 65536, 23}, {1.0F, 1.0e6F, 65536, 37}, From 21e101ecf37a77a0a092d2a464aac68229d08861 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 16:40:51 -0400 Subject: [PATCH 08/10] fix warning --- Src/Base/AMReX_IntSuperAccumulator.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H index ba1dd63f2c1..acb2429e13f 100644 --- a/Src/Base/AMReX_IntSuperAccumulator.H +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -48,7 +48,7 @@ public: IntSuperAccumulatorFab (const Box& box, Arena* arena=nullptr) : m_storage(box, digits_per_entry, arena) {} - [[nodiscard]] int entries () const noexcept { return 1; } + [[nodiscard]] static constexpr int entries () noexcept { return 1; } void define (const Box& box, Arena* arena=nullptr) { From b2e47f430d815cc7a06de69c1c028271a53652f4 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 24 Sep 2025 16:47:10 -0400 Subject: [PATCH 09/10] fix float literal --- Tests/Numerics/IntSuperAccumulator/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp index 18c3026bbaa..b9577a26ed4 100644 --- a/Tests/Numerics/IntSuperAccumulator/main.cpp +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -38,9 +38,9 @@ main (int argc, char* argv[]) IntSuperAccumulatorFab accumulator(bx); const float lowest_exponent_min = std::numeric_limits::min(); - const float lowest_exponent_max = std::nextafter(std::ldexp(1.0f, -125), 0.0f); - const float highest_exponent_min = std::ldexp(1.0f, 125); - const float highest_exponent_max = std::ldexp(1.0f, 126); // OVERFLOWS TO INF: std::numeric_limits::max(); + const float lowest_exponent_max = std::nextafter(std::ldexp(1.0F, -125), 0.0F); + const float highest_exponent_min = std::ldexp(1.0F, 125); + const float highest_exponent_max = std::ldexp(1.0F, 126); // OVERFLOWS TO INF: std::numeric_limits::max(); const std::array ranges = {{ {lowest_exponent_min, lowest_exponent_max, 32768, 3}, From 9037ed5fb3842dea6cfac1a181e29b6042cf75dd Mon Sep 17 00:00:00 2001 From: Benjamin Wibking Date: Thu, 25 Sep 2025 08:00:48 +1000 Subject: [PATCH 10/10] use int64_t words --- Src/Base/AMReX_IntSuperAccumulator.H | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H index acb2429e13f..8186a40b24d 100644 --- a/Src/Base/AMReX_IntSuperAccumulator.H +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -4,10 +4,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -16,12 +16,12 @@ namespace amrex { /** - * \brief Integer superaccumulator backed by IArrayBox storage. + * \brief Integer superaccumulator backed by BaseFab storage. * * The accumulator stores the exact sum of single-precision floating-point * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$ - * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 32-bit int in - * an `IArrayBox` component. This simplified variant maintains a single logical + * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 64-bit signed + * integer in a `BaseFab` component. This simplified variant maintains a single logical * entry whose digits occupy the `digits_per_entry` components. Both * `accumulate` and `finalize` operate in single precision; callers may cast to * wider types if needed. All summands are assumed non-negative; subnormal @@ -55,8 +55,8 @@ public: m_storage.resize(box, digits_per_entry, arena); } - [[nodiscard]] Array4 array () noexcept { return m_storage.array(); } - [[nodiscard]] Array4 const_array () const noexcept { return m_storage.const_array(); } + [[nodiscard]] Array4 array () noexcept { return m_storage.array(); } + [[nodiscard]] Array4 const_array () const noexcept { return m_storage.const_array(); } void reset () { @@ -145,7 +145,7 @@ public: { double result = 0.0; for (int d = digits_per_entry - 1; d >= 0; --d) { - const int digit = digits(i,j,k, d); + const Long digit = digits(i,j,k, d); if (digit == 0) { continue; } const int exponent = min_exponent + d * digit_bits; result += std::ldexp(static_cast(digit), exponent); @@ -193,7 +193,7 @@ private: #if defined(AMREX_USE_GPU) && AMREX_DEVICE_COMPILE // When we compile for the device we cannot rely on a carry-save digit layout, so // each thread performs an immediate normalization using GPU atomics. - int* const digit_ptr = &(digits(i,j,k,current_component)); + Long* const digit_ptr = &(digits(i,j,k,current_component)); // Break the carry into the current radix chunk and the carry to forward. const unsigned long long chunk = carry & (base - 1ULL); @@ -202,7 +202,7 @@ private: // Atomically accumulate this chunk. Multiple threads may land on the same digit, // which is safe because atomicAdd enforces a total order. if (chunk != 0ULL) { - amrex::Gpu::Atomic::Add(digit_ptr, static_cast(chunk)); + amrex::Gpu::Atomic::Add(digit_ptr, static_cast(chunk)); } // Track how many bases we successfully removed so they can be forwarded upward. @@ -210,14 +210,14 @@ private: while (true) { // Snapshot the digit atomically without changing it. This avoids racing // against other threads that may be normalizing the same slot. - const int current_value = amrex::Gpu::Atomic::Add(digit_ptr, 0); + const Long current_value = amrex::Gpu::Atomic::Add(digit_ptr, static_cast(0)); if (current_value < digit_base) { break; } // Attempt to drain one full base from the digit. The return value is the // pre-subtraction contents, so we know whether we actually claimed the carry. - const int before = amrex::Gpu::Atomic::Add(digit_ptr, -digit_base); + const Long before = amrex::Gpu::Atomic::Add(digit_ptr, static_cast(-digit_base)); if (before >= digit_base) { // We consumed one unit of overflow and should forward it to the next digit. ++propagated; @@ -225,7 +225,7 @@ private: } else { // Another thread already normalized the digit between the peek and our // subtract; restore the value and exit. - amrex::Gpu::Atomic::Add(digit_ptr, digit_base); + amrex::Gpu::Atomic::Add(digit_ptr, static_cast(digit_base)); break; } } @@ -237,7 +237,7 @@ private: const unsigned long long remainder = sum & (base - 1ULL); const unsigned long long quotient = sum >> digit_bits; - digits(i,j,k,current_component) = static_cast(remainder); + digits(i,j,k,current_component) = static_cast(remainder); carry = quotient; #endif ++current_component; @@ -284,7 +284,7 @@ private: } }; - IArrayBox m_storage; + BaseFab m_storage; }; } // namespace amrex