From 8b49927099c456019bb396ae2e9f81413672910e Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 13:30:39 -0400
Subject: [PATCH 01/10] add IntSuperAccumulator

---
 Src/Base/AMReX_IntSuperAccumulator.H          | 258 ++++++++++++++++++
 Src/Base/CMakeLists.txt                       |   1 +
 Src/Base/Make.package                         |   1 +
 Tests/CMakeLists.txt                          |   2 +-
 .../IntSuperAccumulator/CMakeLists.txt        |   9 +
 .../Numerics/IntSuperAccumulator/GNUmakefile  |  26 ++
 .../Numerics/IntSuperAccumulator/Make.package |   1 +
 Tests/Numerics/IntSuperAccumulator/main.cpp   | 142 ++++++++++
 8 files changed, 439 insertions(+), 1 deletion(-)
 create mode 100644 Src/Base/AMReX_IntSuperAccumulator.H
 create mode 100644 Tests/Numerics/IntSuperAccumulator/CMakeLists.txt
 create mode 100644 Tests/Numerics/IntSuperAccumulator/GNUmakefile
 create mode 100644 Tests/Numerics/IntSuperAccumulator/Make.package
 create mode 100644 Tests/Numerics/IntSuperAccumulator/main.cpp
diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
new file mode 100644
index 00000000000..c1fcc8d32c3
--- /dev/null
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -0,0 +1,258 @@
+#ifndef AMREX_INT_SUPERACCUMULATOR_H_
+#define AMREX_INT_SUPERACCUMULATOR_H_
+
+#include <AMReX_Config.H>
+
+#include <AMReX_Array4.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_GpuQualifiers.H>
+#include <AMReX_IArrayBox.H>
+#include <AMReX_Math.H>
+
+#include <cmath>
+#include <cstdint>
+
+namespace amrex {
+
+/**
+ * \brief Integer superaccumulator backed by IArrayBox storage.
+ *
+ * The accumulator stores the exact sum of single-precision floating-point
+ * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$
+ * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 32-bit int in
+ * an `IArrayBox` component. A client that needs M independent sums must
+ * allocate an accumulator with `M * digits_per_entry` components and index the
+ * storage using `component = entry * digits_per_entry + digit`. Both
+ * `accumulate` and `finalize` operate in single precision; callers may cast to
+ * wider types if needed. All summands are assumed non-negative; subnormal
+ * inputs are treated as zero to reduce the number of required digits per entry.
+ */
+class IntSuperAccumulatorFab
+{
+public:
+    static constexpr int digit_bits = 30;                 //!< Width of each digit in bits.
+    static constexpr int digit_base = 1 << digit_bits;    //!< Base of each digit.
+    static constexpr int min_exponent = -149;             //!< Smallest exponent of scaled mantissa (binary32 minimum normal).
+    static constexpr int max_exponent =  104;             //!< Largest exponent of scaled mantissa (binary32 maximum normal).
+    static constexpr int significand_bits = 24;           //!< Binary32 significand bits including hidden bit.
+    static constexpr int extra_digits = 1;                //!< Safety margin for carry propagation.
+    static constexpr int digits_for_shift = ((max_exponent - min_exponent) / digit_bits) + 1; // covers exponent window
+    static constexpr int digits_for_value = 2;            //!< Worst-case mantissa occupies two digits at radix 2^30.
+    static constexpr int digits_per_entry = digits_for_shift + digits_for_value + extra_digits; //!< 12 digits per entry.
+
+    IntSuperAccumulatorFab () = default;
+
+    explicit IntSuperAccumulatorFab (Arena* arena) noexcept
+        : m_storage(arena) {}
+
+    IntSuperAccumulatorFab (const Box& box, int nentries, Arena* arena=nullptr)
+        : m_storage(box, nentries * digits_per_entry, arena), m_entries(nentries) {}
+
+    [[nodiscard]] int entries () const noexcept { return m_entries; }
+
+    void define (const Box& box, int nentries, Arena* arena=nullptr)
+    {
+        m_entries = nentries;
+        m_storage.resize(box, nentries * digits_per_entry, arena);
+    }
+
+    [[nodiscard]] Array4<int> array () noexcept { return m_storage.array(); }
+    [[nodiscard]] Array4<int const> const_array () const noexcept { return m_storage.const_array(); }
+
+    void reset ()
+    {
+        if (amrex::Gpu::inLaunchRegion()) {
+            m_storage.template setVal<RunOn::Device>(0);
+        } else {
+            m_storage.template setVal<RunOn::Host>(0);
+        }
+    }
+
+    void reset (Box const& bx, int nentries = -1)
+    {
+        const int entries = (nentries < 0) ? m_entries : nentries;
+        const int ncomp = entries * digits_per_entry;
+        if (amrex::Gpu::inLaunchRegion()) {
+            m_storage.template setVal<RunOn::Device>(0, bx, 0, ncomp);
+        } else {
+            m_storage.template setVal<RunOn::Host>(0, bx, 0, ncomp);
+        }
+    }
+
+    /**
+     * \brief Accumulate a floating-point value into the superaccumulator.
+     *
+     * The caller must provide the digit array and logical comp index for the
+     * entry being updated. All operations are carried out using integers; the
+     * routine does not allocate additional storage.
+     */
+    template <typename Array>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void
+    accumulate (Array const& digits, int i, int j, int k, int entry, float value,
+                int entry_digits = digits_per_entry) noexcept
+    {
+        if (value <= 0.0f) {
+            if (value < 0.0f) {
+#ifndef AMREX_USE_GPU
+                amrex::Abort("IntSuperAccumulatorFab::accumulate: negative input not supported");
+#endif
+            }
+            return;
+        }
+
+        if (!amrex::Math::isfinite(value)) {
+#ifndef AMREX_USE_GPU
+            amrex::Abort("IntSuperAccumulatorFab::accumulate: non-finite input");
+#else
+            return;
+#endif
+        }
+
+        const auto bits = detail::decompose(value);
+        if (bits.is_zero) { return; }
+
+        const int component_base = entry * entry_digits;
+
+        const int shift = bits.exponent - min_exponent;
+        int digit_index = shift / digit_bits;
+        const int bit_offset = shift % digit_bits;
+
+        const unsigned long long magnitude = bits.magnitude;
+        unsigned long long chunk_value = magnitude << bit_offset;
+
+        const int component_limit = component_base + entry_digits;
+
+        while (chunk_value != 0ULL) {
+            const unsigned long long chunk = chunk_value & static_cast<unsigned long long>(digit_base - 1);
+            chunk_value >>= digit_bits;
+            if (component_base + digit_index >= component_limit) {
+#ifndef AMREX_USE_GPU
+                amrex::Abort("IntSuperAccumulatorFab: digit overflow");
+#else
+                break;
+#endif
+            }
+            detail::add_to_digit(digits, i, j, k,
+                                 component_base + digit_index,
+                                 component_limit,
+                                 chunk);
+            ++digit_index;
+        }
+    }
+
+    /**
+     * \brief Finalize the accumulator entry and return a binary32 result.
+     */
+    template <typename Array>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static float
+    finalize (Array const& digits, int i, int j, int k, int entry,
+              int entry_digits = digits_per_entry) noexcept
+    {
+        const int component_base = entry * entry_digits;
+
+        double result = 0.0;
+        for (int d = entry_digits - 1; d >= 0; --d) {
+            const int digit = digits(i,j,k, component_base + d);
+            if (digit == 0) { continue; }
+            const int exponent = min_exponent + d * digit_bits;
+            result += std::ldexp(static_cast<double>(digit), exponent);
+        }
+        return static_cast<float>(result);
+    }
+
+    template <typename Array>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void
+    clear_entry (Array const& digits, int i, int j, int k, int entry,
+                 int entry_digits = digits_per_entry) noexcept
+    {
+        const int component_base = entry * entry_digits;
+        for (int d = 0; d < entry_digits; ++d) {
+            digits(i,j,k, component_base + d) = 0;
+        }
+    }
+
+private:
+    struct Decomposed
+    {
+        unsigned long long magnitude;
+        int exponent;
+        bool is_zero;
+    };
+
+    struct detail
+    {
+        template <typename Array>
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void
+        add_to_digit (Array const& digits, int i, int j, int k,
+                      int component, int component_limit, unsigned long long value) noexcept
+        {
+            unsigned long long carry = value;
+            int current_component = component;
+            constexpr unsigned long long base = static_cast<unsigned long long>(digit_base);
+
+            while (carry != 0ULL) {
+                if (current_component >= component_limit) {
+#ifndef AMREX_USE_GPU
+                    amrex::Abort("IntSuperAccumulatorFab: carry overflow");
+#else
+                    break;
+#endif
+                }
+
+                const unsigned long long sum = static_cast<unsigned long long>(digits(i,j,k,current_component)) + carry;
+                const unsigned long long remainder = sum & (base - 1ULL);
+                const unsigned long long quotient = sum >> digit_bits;
+
+                digits(i,j,k,current_component) = static_cast<int>(remainder);
+                carry = quotient;
+                ++current_component;
+            }
+        }
+
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static Decomposed
+        decompose (float value) noexcept
+        {
+            Decomposed bits{};
+
+            if (value <= 0.0f) {
+                bits.is_zero = true;
+                return bits;
+            }
+
+            union {
+                float f;
+                std::uint32_t u;
+            } pun{};
+            pun.f = value;
+            const std::uint32_t u = pun.u;
+
+            const std::uint32_t exponent_field = (u >> 23) & 0xffu;
+            const std::uint32_t fraction = u & ((1u << 23) - 1u);
+
+            if (exponent_field == 0xffu) {
+#ifndef AMREX_USE_GPU
+                amrex::Abort("IntSuperAccumulatorFab::decompose: NaN or Inf not supported (float)");
+#else
+                Decomposed tmp{}; tmp.is_zero = true; return tmp;
+#endif
+            } else if (exponent_field == 0u) {
+                // Treat subnormals as zero; caller is expected to pre-round.
+                bits.is_zero = true;
+                return bits;
+            } else {
+                bits.magnitude = (1u << 23) | fraction;
+                bits.exponent = static_cast<int>(exponent_field) - 127 - significand_bits + 1;
+            }
+
+            bits.is_zero = false;
+            return bits;
+        }
+    };
+
+    IArrayBox m_storage;
+    int m_entries = 0;
+};
+
+} // namespace amrex
+
+#endif
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
index 89f406013d0..07802d1d6b5 100644
--- a/Src/Base/CMakeLists.txt
+++ b/Src/Base/CMakeLists.txt
@@ -137,6 +137,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        AMReX_BaseFab.H
        AMReX_BaseFab.cpp
        AMReX_Array4.H
+       AMReX_IntSuperAccumulator.H
        AMReX_MakeType.H
        AMReX_TypeTraits.H
        AMReX_FabDataType.H
diff --git a/Src/Base/Make.package b/Src/Base/Make.package
index 675d37b2ccf..7b7851076e6 100644
--- a/Src/Base/Make.package
+++ b/Src/Base/Make.package
@@ -163,6 +163,7 @@ C$(AMREX_BASE)_headers += AMReX_TypeTraits.H
 C$(AMREX_BASE)_headers += AMReX_FabDataType.H
 
 C$(AMREX_BASE)_headers += AMReX_Array4.H
+C$(AMREX_BASE)_headers += AMReX_IntSuperAccumulator.H
 C$(AMREX_BASE)_sources += AMReX_BaseFab.cpp
 C$(AMREX_BASE)_headers += AMReX_BaseFab.H AMReX_BaseFabUtility.H
 C$(AMREX_BASE)_headers += AMReX_FabFactory.H
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 3db6cccbf8d..c0a4b82e258 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -126,7 +126,7 @@ else()
    # List of subdirectories to search for CMakeLists.
    #
    set( AMREX_TESTS_SUBDIRS Amr AsyncOut CallNoinline CLZ CommType CTOParFor DeviceGlobal
-                            Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser
+                            Enum HeatEquation MultiBlock MultiPeriod Numerics ParmParse Parser
                             Parser2 ParserUserFn Reinit RoundoffDomain SmallMatrix)
 
    if (AMReX_PARTICLES)
diff --git a/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt b/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt
new file mode 100644
index 00000000000..7e8d9a8080d
--- /dev/null
+++ b/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt
@@ -0,0 +1,9 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources main.cpp)
+    set(_input_files )
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/Numerics/IntSuperAccumulator/GNUmakefile b/Tests/Numerics/IntSuperAccumulator/GNUmakefile
new file mode 100644
index 00000000000..a42d27db031
--- /dev/null
+++ b/Tests/Numerics/IntSuperAccumulator/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME ?= ../../../amrex
+
+PRECISION = FLOAT
+
+DEBUG   = FALSE
+
+DIM     = 3
+
+COMP    = gcc
+
+USE_MPI   = FALSE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/Numerics/IntSuperAccumulator/Make.package b/Tests/Numerics/IntSuperAccumulator/Make.package
new file mode 100644
index 00000000000..6b4b865e8fc
--- /dev/null
+++ b/Tests/Numerics/IntSuperAccumulator/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
new file mode 100644
index 00000000000..f04bfdb0a89
--- /dev/null
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -0,0 +1,142 @@
+#include <AMReX.H>
+#include <AMReX_Print.H>
+#include <AMReX_Box.H>
+#include <AMReX_IntVect.H>
+#include <AMReX_IntSuperAccumulator.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_GpuContainers.H>
+
+#include <array>
+#include <cstdint>
+#include <limits>
+#include <random>
+
+namespace
+{
+
+struct Range
+{
+    float min_value;
+    float max_value;
+    int   count;
+    int   seed;
+};
+
+} // namespace
+
+int
+main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    int status = 0;
+
+    {
+        using amrex::IntSuperAccumulatorFab;
+
+        const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector());
+        IntSuperAccumulatorFab accumulator(bx, 1);
+
+        const std::array<Range, 4> ranges = {{
+            {1.0e-12f, 1.0e-3f,   32768, 11},
+            {1.0e-4f,  1.0f,      65536, 23},
+            {1.0f,     1.0e6f,    65536, 37},
+            {1.0e-2f,  1.0e8f,   131072, 41}
+        }};
+
+        auto compute_reference = [] (amrex::Gpu::HostVector<float> const& values) -> float {
+            volatile double sum = 0.0;
+            volatile double corr = 0.0;
+            for (float v : values) {
+                const volatile double y = static_cast<double>(v) - corr;
+                const volatile double t = sum + y;
+                corr = (t - sum) - y;
+                sum = t;
+            }
+            return static_cast<float>(sum);
+        };
+
+        auto accumulate_values = [&] (amrex::Gpu::HostVector<float> const& values) -> float {
+            accumulator.reset(bx, 1);
+            auto digits = accumulator.array();
+            constexpr int i = 0;
+            constexpr int j = 0;
+            constexpr int k = 0;
+            constexpr int entry = 0;
+
+            const int count = static_cast<int>(values.size());
+
+            if (amrex::Gpu::inLaunchRegion()) {
+                amrex::Gpu::DeviceVector<float> d_values(count);
+                amrex::Gpu::copy(amrex::Gpu::hostToDevice, values.begin(), values.end(), d_values.begin());
+
+                auto ptr = d_values.data();
+                auto arr = digits;
+
+                amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int idx) {
+                    for (int n = 0; n < count; ++n) {
+                        IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]);
+                    }
+                });
+
+                amrex::Gpu::streamSynchronize();
+
+                amrex::Gpu::DeviceScalar<float> d_result;
+                auto res_ptr = d_result.dataPtr();
+                amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) {
+                    res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k, entry);
+                });
+                amrex::Gpu::streamSynchronize();
+                return d_result.dataValue();
+            } else {
+                for (float v : values) {
+                    IntSuperAccumulatorFab::accumulate(digits, i, j, k, entry, v);
+                }
+                return IntSuperAccumulatorFab::finalize(digits, i, j, k, entry);
+            }
+        };
+
+        for (const auto& cfg : ranges) {
+            std::mt19937 gen(cfg.seed);
+            std::uniform_real_distribution<float> dist(cfg.min_value, cfg.max_value);
+
+            amrex::Gpu::HostVector<float> values(cfg.count);
+            for (int n = 0; n < cfg.count; ++n) {
+                values[n] = dist(gen);
+            }
+
+            const float reference = compute_reference(values);
+            const float result = accumulate_values(values);
+
+            if (result != reference) {
+                amrex::Print() << "Mismatch for range [" << cfg.min_value << ", "
+                               << cfg.max_value << "] with count " << cfg.count
+                               << ": reference=" << reference
+                               << " accumulator=" << result << '\n';
+                status = 1;
+                break;
+            }
+        }
+
+        if (status == 0) {
+            constexpr float value = 0.125f;
+            constexpr int repeats = 200000;
+
+            amrex::Gpu::HostVector<float> values(repeats);
+            for (int n = 0; n < repeats; ++n) {
+                values[n] = value;
+            }
+
+            const float reference = compute_reference(values);
+            const float result = accumulate_values(values);
+
+            if (result != reference) {
+                amrex::Print() << "Mismatch for constant accumulation: reference="
+                               << reference << " accumulator=" << result << '\n';
+                status = 1;
+            }
+        }
+    }
+
+    amrex::Finalize();
+    return status;
+}

From 1acf9dd946ec1905bfe8189db0450166a8533f95 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 13:38:50 -0400
Subject: [PATCH 02/10] fix warning

---
 Tests/Numerics/IntSuperAccumulator/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
index f04bfdb0a89..e763944022f 100644
--- a/Tests/Numerics/IntSuperAccumulator/main.cpp
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -72,7 +72,7 @@ main (int argc, char* argv[])
                 auto ptr = d_values.data();
                 auto arr = digits;
 
-                amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int idx) {
+                amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) {
                     for (int n = 0; n < count; ++n) {
                         IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]);
                     }

From b2a6b1d4352e474431f06436e2a968bcd55dac28 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 14:15:00 -0400
Subject: [PATCH 03/10] propagate carries using atomics

---
 Src/Base/AMReX_IntSuperAccumulator.H        | 45 +++++++++++++++++++++
 Tests/Numerics/IntSuperAccumulator/main.cpp |  6 +--
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
index c1fcc8d32c3..e64396567bb 100644
--- a/Src/Base/AMReX_IntSuperAccumulator.H
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -5,6 +5,7 @@
 
 #include <AMReX_Array4.H>
 #include <AMReX_Gpu.H>
+#include <AMReX_GpuAtomic.H>
 #include <AMReX_GpuQualifiers.H>
 #include <AMReX_IArrayBox.H>
 #include <AMReX_Math.H>
@@ -199,12 +200,56 @@ private:
 #endif
                 }
 
+#if defined(AMREX_USE_GPU) && AMREX_DEVICE_COMPILE
+                // When we compile for the device we cannot rely on a carry-save digit layout, so
+                // each thread performs an immediate normalization using GPU atomics.
+                int* const digit_ptr = &(digits(i,j,k,current_component));
+
+                // Break the carry into the current radix chunk and the carry to forward.
+                const unsigned long long chunk = carry & (base - 1ULL);
+                carry >>= digit_bits;
+
+                // Atomically accumulate this chunk.  Multiple threads may land on the same digit,
+                // which is safe because atomicAdd enforces a total order.
+                if (chunk != 0ULL) {
+                    amrex::Gpu::Atomic::Add(digit_ptr, static_cast<int>(chunk));
+                }
+
+                // Track how many bases we successfully removed so they can be forwarded upward.
+                unsigned long long propagated = 0ULL;
+                while (true) {
+                    // Snapshot the digit atomically without changing it.  This avoids racing
+                    // against other threads that may be normalizing the same slot.
+                    const int current_value = amrex::Gpu::Atomic::Add(digit_ptr, 0);
+                    if (current_value < digit_base) {
+                        break;
+                    }
+
+                    // Attempt to drain one full base from the digit.  The return value is the
+                    // pre-subtraction contents, so we know whether we actually claimed the carry.
+                    const int before = amrex::Gpu::Atomic::Add(digit_ptr, -digit_base);
+                    if (before >= digit_base) {
+                        // We consumed one unit of overflow and should forward it to the next digit.
+                        ++propagated;
+                        continue;
+                    } else {
+                        // Another thread already normalized the digit between the peek and our
+                        // subtract; restore the value and exit.
+                        amrex::Gpu::Atomic::Add(digit_ptr, digit_base);
+                        break;
+                    }
+                }
+
+                // Fold any carries we successfully removed into the higher-order digits.
+                carry += propagated;
+#else
                 const unsigned long long sum = static_cast<unsigned long long>(digits(i,j,k,current_component)) + carry;
                 const unsigned long long remainder = sum & (base - 1ULL);
                 const unsigned long long quotient = sum >> digit_bits;
 
                 digits(i,j,k,current_component) = static_cast<int>(remainder);
                 carry = quotient;
+#endif
                 ++current_component;
             }
         }
diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
index e763944022f..c374a7b47f2 100644
--- a/Tests/Numerics/IntSuperAccumulator/main.cpp
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -72,10 +72,8 @@ main (int argc, char* argv[])
                 auto ptr = d_values.data();
                 auto arr = digits;
 
-                amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) {
-                    for (int n = 0; n < count; ++n) {
-                        IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]);
-                    }
+                amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) {
+                    IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]);
                 });
 
                 amrex::Gpu::streamSynchronize();

From 9e97b4478ff96f3d419eec4f5ca923f181b74da1 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 14:23:03 -0400
Subject: [PATCH 04/10] fix warnings

---
 Src/Base/AMReX_IntSuperAccumulator.H        | 16 ++++++++--------
 Tests/Numerics/IntSuperAccumulator/main.cpp | 14 +++++++-------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
index e64396567bb..5b5be51d823 100644
--- a/Src/Base/AMReX_IntSuperAccumulator.H
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -92,8 +92,8 @@ public:
     accumulate (Array const& digits, int i, int j, int k, int entry, float value,
                 int entry_digits = digits_per_entry) noexcept
     {
-        if (value <= 0.0f) {
-            if (value < 0.0f) {
+        if (value <= 0.0F) {
+            if (value < 0.0F) {
 #ifndef AMREX_USE_GPU
                 amrex::Abort("IntSuperAccumulatorFab::accumulate: negative input not supported");
 #endif
@@ -259,7 +259,7 @@ private:
         {
             Decomposed bits{};
 
-            if (value <= 0.0f) {
+            if (value <= 0.0F) {
                 bits.is_zero = true;
                 return bits;
             }
@@ -271,21 +271,21 @@ private:
             pun.f = value;
             const std::uint32_t u = pun.u;
 
-            const std::uint32_t exponent_field = (u >> 23) & 0xffu;
-            const std::uint32_t fraction = u & ((1u << 23) - 1u);
+            const std::uint32_t exponent_field = (u >> 23) & 0xFFU;
+            const std::uint32_t fraction = u & ((1U << 23) - 1U);
 
-            if (exponent_field == 0xffu) {
+            if (exponent_field == 0xFFU) {
 #ifndef AMREX_USE_GPU
                 amrex::Abort("IntSuperAccumulatorFab::decompose: NaN or Inf not supported (float)");
 #else
                 Decomposed tmp{}; tmp.is_zero = true; return tmp;
 #endif
-            } else if (exponent_field == 0u) {
+            } else if (exponent_field == 0U) {
                 // Treat subnormals as zero; caller is expected to pre-round.
                 bits.is_zero = true;
                 return bits;
             } else {
-                bits.magnitude = (1u << 23) | fraction;
+                bits.magnitude = (1U << 23) | fraction;
                 bits.exponent = static_cast<int>(exponent_field) - 127 - significand_bits + 1;
             }
 
diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
index c374a7b47f2..5dba67731be 100644
--- a/Tests/Numerics/IntSuperAccumulator/main.cpp
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -37,10 +37,10 @@ main (int argc, char* argv[])
         IntSuperAccumulatorFab accumulator(bx, 1);
 
         const std::array<Range, 4> ranges = {{
-            {1.0e-12f, 1.0e-3f,   32768, 11},
-            {1.0e-4f,  1.0f,      65536, 23},
-            {1.0f,     1.0e6f,    65536, 37},
-            {1.0e-2f,  1.0e8f,   131072, 41}
+            {1.0e-12F, 1.0e-3F,   32768, 11},
+            {1.0e-4F,  1.0F,      65536, 23},
+            {1.0F,     1.0e6F,    65536, 37},
+            {1.0e-2F,  1.0e8F,   131072, 41}
         }};
 
         auto compute_reference = [] (amrex::Gpu::HostVector<float> const& values) -> float {
@@ -69,7 +69,7 @@ main (int argc, char* argv[])
                 amrex::Gpu::DeviceVector<float> d_values(count);
                 amrex::Gpu::copy(amrex::Gpu::hostToDevice, values.begin(), values.end(), d_values.begin());
 
-                auto ptr = d_values.data();
+                auto *ptr = d_values.data();
                 auto arr = digits;
 
                 amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) {
@@ -79,7 +79,7 @@ main (int argc, char* argv[])
                 amrex::Gpu::streamSynchronize();
 
                 amrex::Gpu::DeviceScalar<float> d_result;
-                auto res_ptr = d_result.dataPtr();
+                auto *res_ptr = d_result.dataPtr();
                 amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) {
                     res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k, entry);
                 });
@@ -116,7 +116,7 @@ main (int argc, char* argv[])
         }
 
         if (status == 0) {
-            constexpr float value = 0.125f;
+            constexpr float value = 0.125F;
             constexpr int repeats = 200000;
 
             amrex::Gpu::HostVector<float> values(repeats);

From b793a8343e18df010d0de87d0280e6cb3abcd969 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 14:27:02 -0400
Subject: [PATCH 05/10] fix warning

---
 Src/Base/AMReX_IntSuperAccumulator.H | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
index 5b5be51d823..06dffbc05ac 100644
--- a/Src/Base/AMReX_IntSuperAccumulator.H
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -189,7 +189,7 @@ private:
         {
             unsigned long long carry = value;
             int current_component = component;
-            constexpr unsigned long long base = static_cast<unsigned long long>(digit_base);
+            constexpr auto base = static_cast<unsigned long long>(digit_base);
 
             while (carry != 0ULL) {
                 if (current_component >= component_limit) {

From 46623fe9a6323756756df601b700d640156f2e21 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 16:25:20 -0400
Subject: [PATCH 06/10] simplify

---
 Src/Base/AMReX_IntSuperAccumulator.H        | 53 ++++++++-------------
 Tests/Numerics/IntSuperAccumulator/main.cpp | 21 +++++---
 2 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
index 06dffbc05ac..ba1dd63f2c1 100644
--- a/Src/Base/AMReX_IntSuperAccumulator.H
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -21,9 +21,8 @@ namespace amrex {
  * The accumulator stores the exact sum of single-precision floating-point
  * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$
  * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 32-bit int in
- * an `IArrayBox` component. A client that needs M independent sums must
- * allocate an accumulator with `M * digits_per_entry` components and index the
- * storage using `component = entry * digits_per_entry + digit`. Both
+ * an `IArrayBox` component. This simplified variant maintains a single logical
+ * entry whose digits occupy the `digits_per_entry` components. Both
  * `accumulate` and `finalize` operate in single precision; callers may cast to
  * wider types if needed. All summands are assumed non-negative; subnormal
  * inputs are treated as zero to reduce the number of required digits per entry.
@@ -46,15 +45,14 @@ public:
     explicit IntSuperAccumulatorFab (Arena* arena) noexcept
         : m_storage(arena) {}
 
-    IntSuperAccumulatorFab (const Box& box, int nentries, Arena* arena=nullptr)
-        : m_storage(box, nentries * digits_per_entry, arena), m_entries(nentries) {}
+    IntSuperAccumulatorFab (const Box& box, Arena* arena=nullptr)
+        : m_storage(box, digits_per_entry, arena) {}
 
-    [[nodiscard]] int entries () const noexcept { return m_entries; }
+    [[nodiscard]] int entries () const noexcept { return 1; }
 
-    void define (const Box& box, int nentries, Arena* arena=nullptr)
+    void define (const Box& box, Arena* arena=nullptr)
     {
-        m_entries = nentries;
-        m_storage.resize(box, nentries * digits_per_entry, arena);
+        m_storage.resize(box, digits_per_entry, arena);
     }
 
     [[nodiscard]] Array4<int> array () noexcept { return m_storage.array(); }
@@ -69,10 +67,9 @@ public:
         }
     }
 
-    void reset (Box const& bx, int nentries = -1)
+    void reset (Box const& bx)
     {
-        const int entries = (nentries < 0) ? m_entries : nentries;
-        const int ncomp = entries * digits_per_entry;
+        constexpr int ncomp = digits_per_entry;
         if (amrex::Gpu::inLaunchRegion()) {
             m_storage.template setVal<RunOn::Device>(0, bx, 0, ncomp);
         } else {
@@ -83,14 +80,13 @@ public:
     /**
      * \brief Accumulate a floating-point value into the superaccumulator.
      *
-     * The caller must provide the digit array and logical comp index for the
-     * entry being updated. All operations are carried out using integers; the
-     * routine does not allocate additional storage.
+     * The caller must provide the digit array for the single logical entry.
+     * All operations are carried out using integers; the routine does not
+     * allocate additional storage.
      */
     template <typename Array>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void
-    accumulate (Array const& digits, int i, int j, int k, int entry, float value,
-                int entry_digits = digits_per_entry) noexcept
+    accumulate (Array const& digits, int i, int j, int k, float value) noexcept
     {
         if (value <= 0.0F) {
             if (value < 0.0F) {
@@ -112,8 +108,6 @@ public:
         const auto bits = detail::decompose(value);
         if (bits.is_zero) { return; }
 
-        const int component_base = entry * entry_digits;
-
         const int shift = bits.exponent - min_exponent;
         int digit_index = shift / digit_bits;
         const int bit_offset = shift % digit_bits;
@@ -121,7 +115,8 @@ public:
         const unsigned long long magnitude = bits.magnitude;
         unsigned long long chunk_value = magnitude << bit_offset;
 
-        const int component_limit = component_base + entry_digits;
+        constexpr int component_base = 0;
+        constexpr int component_limit = digits_per_entry;
 
         while (chunk_value != 0ULL) {
             const unsigned long long chunk = chunk_value & static_cast<unsigned long long>(digit_base - 1);
@@ -146,14 +141,11 @@ public:
      */
     template <typename Array>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static float
-    finalize (Array const& digits, int i, int j, int k, int entry,
-              int entry_digits = digits_per_entry) noexcept
+    finalize (Array const& digits, int i, int j, int k) noexcept
     {
-        const int component_base = entry * entry_digits;
-
         double result = 0.0;
-        for (int d = entry_digits - 1; d >= 0; --d) {
-            const int digit = digits(i,j,k, component_base + d);
+        for (int d = digits_per_entry - 1; d >= 0; --d) {
+            const int digit = digits(i,j,k, d);
             if (digit == 0) { continue; }
             const int exponent = min_exponent + d * digit_bits;
             result += std::ldexp(static_cast<double>(digit), exponent);
@@ -163,12 +155,10 @@ public:
 
     template <typename Array>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void
-    clear_entry (Array const& digits, int i, int j, int k, int entry,
-                 int entry_digits = digits_per_entry) noexcept
+    clear_entry (Array const& digits, int i, int j, int k) noexcept
     {
-        const int component_base = entry * entry_digits;
-        for (int d = 0; d < entry_digits; ++d) {
-            digits(i,j,k, component_base + d) = 0;
+        for (int d = 0; d < digits_per_entry; ++d) {
+            digits(i,j,k, d) = 0;
         }
     }
 
@@ -295,7 +285,6 @@ private:
     };
 
     IArrayBox m_storage;
-    int m_entries = 0;
 };
 
 } // namespace amrex
diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
index 5dba67731be..11f4002a948 100644
--- a/Tests/Numerics/IntSuperAccumulator/main.cpp
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -34,7 +34,7 @@ main (int argc, char* argv[])
         using amrex::IntSuperAccumulatorFab;
 
         const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector());
-        IntSuperAccumulatorFab accumulator(bx, 1);
+        IntSuperAccumulatorFab accumulator(bx);
 
         const std::array<Range, 4> ranges = {{
             {1.0e-12F, 1.0e-3F,   32768, 11},
@@ -56,12 +56,11 @@ main (int argc, char* argv[])
         };
 
         auto accumulate_values = [&] (amrex::Gpu::HostVector<float> const& values) -> float {
-            accumulator.reset(bx, 1);
+            accumulator.reset(bx);
             auto digits = accumulator.array();
             constexpr int i = 0;
             constexpr int j = 0;
             constexpr int k = 0;
-            constexpr int entry = 0;
 
             const int count = static_cast<int>(values.size());
 
@@ -73,7 +72,7 @@ main (int argc, char* argv[])
                 auto arr = digits;
 
                 amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) {
-                    IntSuperAccumulatorFab::accumulate(arr, i, j, k, entry, ptr[n]);
+                    IntSuperAccumulatorFab::accumulate(arr, i, j, k, ptr[n]);
                 });
 
                 amrex::Gpu::streamSynchronize();
@@ -81,15 +80,15 @@ main (int argc, char* argv[])
                 amrex::Gpu::DeviceScalar<float> d_result;
                 auto *res_ptr = d_result.dataPtr();
                 amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) {
-                    res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k, entry);
+                    res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k);
                 });
                 amrex::Gpu::streamSynchronize();
                 return d_result.dataValue();
             } else {
                 for (float v : values) {
-                    IntSuperAccumulatorFab::accumulate(digits, i, j, k, entry, v);
+                    IntSuperAccumulatorFab::accumulate(digits, i, j, k, v);
                 }
-                return IntSuperAccumulatorFab::finalize(digits, i, j, k, entry);
+                return IntSuperAccumulatorFab::finalize(digits, i, j, k);
             }
         };
 
@@ -105,6 +104,11 @@ main (int argc, char* argv[])
             const float reference = compute_reference(values);
             const float result = accumulate_values(values);
 
+            amrex::Print() << "Range [" << cfg.min_value << ", " << cfg.max_value
+                           << "] with count " << cfg.count
+                           << ": reference=" << reference
+                           << " accumulator=" << result << '\n';
+
             if (result != reference) {
                 amrex::Print() << "Mismatch for range [" << cfg.min_value << ", "
                                << cfg.max_value << "] with count " << cfg.count
@@ -127,6 +131,9 @@ main (int argc, char* argv[])
             const float reference = compute_reference(values);
             const float result = accumulate_values(values);
 
+            amrex::Print() << "Constant accumulation: reference=" << reference
+                           << " accumulator=" << result << '\n';
+
             if (result != reference) {
                 amrex::Print() << "Mismatch for constant accumulation: reference="
                                << reference << " accumulator=" << result << '\n';

From a54ce40a79e049cacef4f5ea7acb449417c584a1 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 16:40:03 -0400
Subject: [PATCH 07/10] add very small and very large summands

---
 Tests/Numerics/IntSuperAccumulator/main.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
index 11f4002a948..18c3026bbaa 100644
--- a/Tests/Numerics/IntSuperAccumulator/main.cpp
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -7,6 +7,7 @@
 #include <AMReX_GpuContainers.H>
 
 #include <array>
+#include <cmath>
 #include <cstdint>
 #include <limits>
 #include <random>
@@ -36,7 +37,14 @@ main (int argc, char* argv[])
         const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector());
         IntSuperAccumulatorFab accumulator(bx);
 
-        const std::array<Range, 4> ranges = {{
+        const float lowest_exponent_min = std::numeric_limits<float>::min();
+        const float lowest_exponent_max = std::nextafter(std::ldexp(1.0f, -125), 0.0f);
+        const float highest_exponent_min = std::ldexp(1.0f, 125);
+        const float highest_exponent_max = std::ldexp(1.0f, 126); // OVERFLOWS TO INF: std::numeric_limits<float>::max();
+
+        const std::array<Range, 6> ranges = {{
+            {lowest_exponent_min, lowest_exponent_max, 32768, 3},
+            {highest_exponent_min, highest_exponent_max, 4, 5},
             {1.0e-12F, 1.0e-3F,   32768, 11},
             {1.0e-4F,  1.0F,      65536, 23},
             {1.0F,     1.0e6F,    65536, 37},

From 21e101ecf37a77a0a092d2a464aac68229d08861 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 16:40:51 -0400
Subject: [PATCH 08/10] fix warning

---
 Src/Base/AMReX_IntSuperAccumulator.H | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
index ba1dd63f2c1..acb2429e13f 100644
--- a/Src/Base/AMReX_IntSuperAccumulator.H
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -48,7 +48,7 @@ public:
     IntSuperAccumulatorFab (const Box& box, Arena* arena=nullptr)
         : m_storage(box, digits_per_entry, arena) {}
 
-    [[nodiscard]] int entries () const noexcept { return 1; }
+    [[nodiscard]] static constexpr int entries () noexcept { return 1; }
 
     void define (const Box& box, Arena* arena=nullptr)
     {

From b2e47f430d815cc7a06de69c1c028271a53652f4 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 24 Sep 2025 16:47:10 -0400
Subject: [PATCH 09/10] fix float literal

---
 Tests/Numerics/IntSuperAccumulator/main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp
index 18c3026bbaa..b9577a26ed4 100644
--- a/Tests/Numerics/IntSuperAccumulator/main.cpp
+++ b/Tests/Numerics/IntSuperAccumulator/main.cpp
@@ -38,9 +38,9 @@ main (int argc, char* argv[])
         IntSuperAccumulatorFab accumulator(bx);
 
         const float lowest_exponent_min = std::numeric_limits<float>::min();
-        const float lowest_exponent_max = std::nextafter(std::ldexp(1.0f, -125), 0.0f);
-        const float highest_exponent_min = std::ldexp(1.0f, 125);
-        const float highest_exponent_max = std::ldexp(1.0f, 126); // OVERFLOWS TO INF: std::numeric_limits<float>::max();
+        const float lowest_exponent_max = std::nextafter(std::ldexp(1.0F, -125), 0.0F);
+        const float highest_exponent_min = std::ldexp(1.0F, 125);
+        const float highest_exponent_max = std::ldexp(1.0F, 126); // OVERFLOWS TO INF: std::numeric_limits<float>::max();
 
         const std::array<Range, 6> ranges = {{
             {lowest_exponent_min, lowest_exponent_max, 32768, 3},

From 9037ed5fb3842dea6cfac1a181e29b6042cf75dd Mon Sep 17 00:00:00 2001
From: Benjamin Wibking <benjamin.wibking@anu.edu.au>
Date: Thu, 25 Sep 2025 08:00:48 +1000
Subject: [PATCH 10/10] use int64_t words

---
 Src/Base/AMReX_IntSuperAccumulator.H | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H
index acb2429e13f..8186a40b24d 100644
--- a/Src/Base/AMReX_IntSuperAccumulator.H
+++ b/Src/Base/AMReX_IntSuperAccumulator.H
@@ -4,10 +4,10 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_Array4.H>
+#include <AMReX_BaseFab.H>
 #include <AMReX_Gpu.H>
 #include <AMReX_GpuAtomic.H>
 #include <AMReX_GpuQualifiers.H>
-#include <AMReX_IArrayBox.H>
 #include <AMReX_Math.H>
 
 #include <cmath>
@@ -16,12 +16,12 @@
 namespace amrex {
 
 /**
- * \brief Integer superaccumulator backed by IArrayBox storage.
+ * \brief Integer superaccumulator backed by BaseFab<Long> storage.
  *
  * The accumulator stores the exact sum of single-precision floating-point
  * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$
- * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 32-bit int in
- * an `IArrayBox` component. This simplified variant maintains a single logical
+ * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 64-bit signed
+ * integer in a `BaseFab<Long>` component. This simplified variant maintains a single logical
  * entry whose digits occupy the `digits_per_entry` components. Both
  * `accumulate` and `finalize` operate in single precision; callers may cast to
  * wider types if needed. All summands are assumed non-negative; subnormal
@@ -55,8 +55,8 @@ public:
         m_storage.resize(box, digits_per_entry, arena);
     }
 
-    [[nodiscard]] Array4<int> array () noexcept { return m_storage.array(); }
-    [[nodiscard]] Array4<int const> const_array () const noexcept { return m_storage.const_array(); }
+    [[nodiscard]] Array4<Long> array () noexcept { return m_storage.array(); }
+    [[nodiscard]] Array4<Long const> const_array () const noexcept { return m_storage.const_array(); }
 
     void reset ()
     {
@@ -145,7 +145,7 @@ public:
     {
         double result = 0.0;
         for (int d = digits_per_entry - 1; d >= 0; --d) {
-            const int digit = digits(i,j,k, d);
+            const Long digit = digits(i,j,k, d);
             if (digit == 0) { continue; }
             const int exponent = min_exponent + d * digit_bits;
             result += std::ldexp(static_cast<double>(digit), exponent);
@@ -193,7 +193,7 @@ private:
 #if defined(AMREX_USE_GPU) && AMREX_DEVICE_COMPILE
                 // When we compile for the device we cannot rely on a carry-save digit layout, so
                 // each thread performs an immediate normalization using GPU atomics.
-                int* const digit_ptr = &(digits(i,j,k,current_component));
+                Long* const digit_ptr = &(digits(i,j,k,current_component));
 
                 // Break the carry into the current radix chunk and the carry to forward.
                 const unsigned long long chunk = carry & (base - 1ULL);
@@ -202,7 +202,7 @@ private:
                 // Atomically accumulate this chunk.  Multiple threads may land on the same digit,
                 // which is safe because atomicAdd enforces a total order.
                 if (chunk != 0ULL) {
-                    amrex::Gpu::Atomic::Add(digit_ptr, static_cast<int>(chunk));
+                    amrex::Gpu::Atomic::Add(digit_ptr, static_cast<Long>(chunk));
                 }
 
                 // Track how many bases we successfully removed so they can be forwarded upward.
@@ -210,14 +210,14 @@ private:
                 while (true) {
                     // Snapshot the digit atomically without changing it.  This avoids racing
                     // against other threads that may be normalizing the same slot.
-                    const int current_value = amrex::Gpu::Atomic::Add(digit_ptr, 0);
+                    const Long current_value = amrex::Gpu::Atomic::Add(digit_ptr, static_cast<Long>(0));
                     if (current_value < digit_base) {
                         break;
                     }
 
                     // Attempt to drain one full base from the digit.  The return value is the
                     // pre-subtraction contents, so we know whether we actually claimed the carry.
-                    const int before = amrex::Gpu::Atomic::Add(digit_ptr, -digit_base);
+                    const Long before = amrex::Gpu::Atomic::Add(digit_ptr, static_cast<Long>(-digit_base));
                     if (before >= digit_base) {
                         // We consumed one unit of overflow and should forward it to the next digit.
                         ++propagated;
@@ -225,7 +225,7 @@ private:
                     } else {
                         // Another thread already normalized the digit between the peek and our
                         // subtract; restore the value and exit.
-                        amrex::Gpu::Atomic::Add(digit_ptr, digit_base);
+                        amrex::Gpu::Atomic::Add(digit_ptr, static_cast<Long>(digit_base));
                         break;
                     }
                 }
@@ -237,7 +237,7 @@ private:
                 const unsigned long long remainder = sum & (base - 1ULL);
                 const unsigned long long quotient = sum >> digit_bits;
 
-                digits(i,j,k,current_component) = static_cast<int>(remainder);
+                digits(i,j,k,current_component) = static_cast<Long>(remainder);
                 carry = quotient;
 #endif
                 ++current_component;
@@ -284,7 +284,7 @@ private:
         }
     };
 
-    IArrayBox m_storage;
+    BaseFab<Long> m_storage;
 };
 
 } // namespace amrex