alpaka-group · psychocoderHPC · Feb 2, 2026 · Feb 2, 2026
diff --git a/examples/native-cuda/source/main.cu b/examples/native-cuda/source/main.cu
@@ -26,12 +26,13 @@
   THE SOFTWARE.
 */
 
+#include "mallocMC/span.hpp"
+
 #include <mallocMC/mallocMC.cuh>
 
 #include <cstdint>
 #include <cstdlib>
 #include <functional>
-#include <span>
 
 /**
  * @brief Computes the sum of squares of the first `n` natural numbers.
@@ -66,15 +67,14 @@ __device__ auto sumOfSquares(auto const n)
  */
 __global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
 {
+    using mallocMC::span;
     uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
 
     // Not very realistic, all threads are doing this on their own:
-    auto a = std::span<uint64_t>(
-        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
-        numValues);
-    auto b = std::span<uint64_t>(
-        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
-        numValues);
+    auto a
+        = span<uint64_t>(reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))), numValues);
+    auto b
+        = span<uint64_t>(reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))), numValues);
 
     std::iota(std::begin(a), std::end(a), tid);
     std::iota(std::begin(b), std::end(b), tid);

diff --git a/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp b/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp
@@ -28,6 +28,7 @@
 
 #include "mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp"
 #include "mallocMC/mallocMC_utils.hpp"
+#include "mallocMC/span.hpp"
 
 #include <alpaka/core/Common.hpp>
 #include <alpaka/intrinsic/Traits.hpp>
@@ -36,7 +37,6 @@
 
 #include <cstdint>
 #include <limits>
-#include <span>
 #include <type_traits>
 
 namespace mallocMC::CreationPolicies::FlatterScatterAlloc
@@ -344,7 +344,7 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc
     template<uint32_t MyBitMaskSize = BitMaskSize>
     struct BitFieldFlatImpl
     {
-        std::span<BitMaskImpl<MyBitMaskSize>> data;
+        mallocMC::span<BitMaskImpl<MyBitMaskSize>> data;
 
         /**
          * @brief Check if the index-th bit in the bit field is set (=1).

diff --git a/include/mallocMC/span.hpp b/include/mallocMC/span.hpp
@@ -0,0 +1,86 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  http://www.icg.tugraz.at/project/mvp
+  https://www.hzdr.de/crp
+
+  Copyright (C) 2012 Institute for Computer Graphics and Vision,
+                     Graz University of Technology
+  Copyright (C) 2014-2026 Institute of Radiation Physics,
+                     Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
+              Bernhard Kainz - kainz ( at ) icg.tugraz.at
+              Michael Kenzel - kenzel ( at ) icg.tugraz.at
+              Rene Widera - r.widera ( at ) hzdr.de
+              Axel Huebl - a.huebl ( at ) hzdr.de
+              Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+// This is a workaround for the following issue:
+// https://github.com/llvm/llvm-project/pull/136133
+// If clang or clang-based compilers like hipcc try to compile device code
+// with a too recent version of libstdc++ from GCC,
+// they run into issues like
+// error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function
+
+#pragma once
+#include <cstddef>
+
+namespace mallocMC
+{
+    template<typename TData>
+    struct span
+    {
+        TData* ptr_;
+        size_t size_;
+
+        constexpr span(TData* ptr, size_t size) : ptr_(ptr), size_(size) {};
+
+        // This is explicitly NOT `explcit` because we want to be able to
+        // silently wrap an array into a span within other constructor calls.
+        template<size_t N>
+        constexpr span(TData (&arr)[N]) : ptr_(arr)
+                                        , size_(N)
+        {
+        }
+
+        [[nodiscard]] constexpr auto size() const -> size_t
+        {
+            return size_;
+        }
+
+        [[nodiscard]] constexpr auto operator[](size_t index) const -> decltype(auto)
+        {
+            return ptr_[index];
+        }
+
+        [[nodiscard]] constexpr auto begin() const -> decltype(auto)
+        {
+            return ptr_;
+        }
+
+        [[nodiscard]] constexpr auto end() const -> decltype(auto)
+        {
+            return &(ptr_[size_]);
+        }
+    };
+} // namespace mallocMC
diff --git a/test/multithreaded/source/AccessBlock.cpp b/test/multithreaded/source/AccessBlock.cpp
@@ -28,6 +28,7 @@
 #include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp"
 
 #include "mallocMC/mallocMC_utils.hpp"
+#include "mallocMC/span.hpp"
 #include "mocks.hpp"
 
 #include <alpaka/acc/AccCpuSerial.hpp>
@@ -61,11 +62,11 @@
 #include <cstdio>
 #include <functional>
 #include <iterator>
-#include <span>
 #include <tuple>
 #include <type_traits>
 
 using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock;
+using mallocMC::span;
 
 using Dim = alpaka::DimInt<1>;
 using Idx = std::uint32_t;
@@ -78,7 +79,6 @@ constexpr uint32_t pteSize = 4 + 4;
 constexpr uint32_t blockSize = numPages * (pageSize + pteSize);
 
 using MyAccessBlock = AccessBlock<HeapConfig<blockSize, pageSize>, AlignmentPolicy>;
-using std::span;
 
 // Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the
 // behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen
@@ -185,8 +185,8 @@ struct IsValid
         bool* results,
         uint32_t const size) const
     {
-        std::span<void*> tmpPointers(pointers, size);
-        std::span<bool> tmpResults(results, size);
+        span<void*> tmpPointers(pointers, size);
+        span<bool> tmpResults(results, size);
         std::transform(
             std::begin(tmpPointers),
             std::end(tmpPointers),
@@ -236,7 +236,7 @@ auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue)
 auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size)
 {
     auto pointers = makeBuffer<void*>(devHost, devAcc, size);
-    std::span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+    span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
     std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast<void*>(1U));
     alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost);
     return pointers;
@@ -310,7 +310,7 @@ auto freeAllButOneOnFirstPage(
     AccessBlock<HeapConfig<T_blockSize, T_pageSize>, AlignmentPolicy>* accessBlock,
     auto& pointers)
 {
-    std::span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+    span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
     std::sort(std::begin(tmp), std::end(tmp));
     // This points to the first chunk of page 0.
     auto* pointer1 = tmp[0];
@@ -375,7 +375,7 @@ auto checkContent(
     alpaka::wait(queue);
 
 
-    std::span<bool> tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]);
+    span<bool> tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]);
     auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies<bool>{});
 
     return writtenCorrectly;
@@ -758,7 +758,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags)
             devHost,
             devAcc,
             getAvailableSlots<Acc>(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]));
-        std::span<uint32_t> tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]);
+        span<uint32_t> tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]);
         std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{});
         alpaka::memcpy(queue, content.m_onDevice, content.m_onHost);
         alpaka::wait(queue);
@@ -814,7 +814,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags)
         alpaka::memcpy(queue, result.m_onHost, result.m_onDevice);
         alpaka::wait(queue);
 
-        std::span<bool> tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]);
+        span<bool> tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]);
         CHECK(std::none_of(std::cbegin(tmpResults), std::cend(tmpResults), [](auto const val) { return val; }));
 
         CHECK(getAvailableSlots<Acc>(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots);
@@ -837,7 +837,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags)
         alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
         alpaka::wait(queue);
 
-        std::span<void*> tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+        span<void*> tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
         std::sort(std::begin(tmpPointers), std::end(tmpPointers));
         CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers));
     }
@@ -865,7 +865,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags)
 
         // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a
         // nullptr.
-        std::span<void*> tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]);
+        span<void*> tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]);
         auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1;
 
         CHECK(std::all_of(
@@ -880,7 +880,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags)
     SECTION("can handle many different chunk sizes.")
     {
         auto chunkSizes = makeBuffer<uint32_t>(devHost, devAcc, pageSize);
-        std::span<uint32_t> chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]);
+        span<uint32_t> chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]);
         std::iota(std::begin(chunkSizesSpan), std::end(chunkSizesSpan), 1U);
         alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost);
         alpaka::wait(queue);
@@ -892,14 +892,14 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags)
             CreateAllChunkSizes{},
             accessBlock,
             span<void*>(alpaka::getPtrNative(pointers.m_onDevice), MyAccessBlock::numPages()),
-            std::span<uint32_t>(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0]));
+            span<uint32_t>(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0]));
 
         alpaka::wait(queue);
 
         alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
         alpaka::wait(queue);
 
-        std::span<void*> tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages());
+        span<void*> tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages());
         std::sort(std::begin(tmpPointers), std::end(tmpPointers));
         CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers));
     }

diff --git a/test/multithreaded/source/Scatter.cpp b/test/multithreaded/source/Scatter.cpp
@@ -33,6 +33,7 @@
 #include "mallocMC/distributionPolicies/Noop.hpp"
 #include "mallocMC/mallocMC_utils.hpp"
 #include "mallocMC/oOMPolicies/ReturnNull.hpp"
+#include "mallocMC/span.hpp"
 #include "mocks.hpp"
 
 #include <alpaka/acc/AccCpuSerial.hpp>
@@ -66,10 +67,10 @@
 #include <cstdio>
 #include <functional>
 #include <iterator>
-#include <span>
 #include <tuple>
 #include <type_traits>
 
+using mallocMC::span;
 using Dim = alpaka::DimInt<1>;
 using Idx = std::uint32_t;
 
@@ -98,8 +99,6 @@ using MyDeviceAllocator = mallocMC::DeviceAllocator<
     mallocMC::OOMPolicies::ReturnNull,
     mallocMC::AlignmentPolicies::Shrink<>>;
 
-using std::span;
-
 // Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the
 // behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen
 // regardless of the underlying access optimisations etc.
@@ -236,7 +235,7 @@ auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue)
 auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size)
 {
     auto pointers = makeBuffer<void*>(devHost, devAcc, size);
-    std::span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+    span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
     std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast<void*>(1U));
     alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost);
     return pointers;
@@ -307,7 +306,7 @@ auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto&
 template<typename TAcc>
 auto freeAllButOneOnFirstPage(auto& queue, auto* accessBlock, auto& pointers)
 {
-    std::span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+    span<void*> tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
     std::sort(std::begin(tmp), std::end(tmp));
     // This points to the first chunk of page 0.
     auto* pointer1 = tmp[0];
@@ -371,7 +370,7 @@ auto checkContent(
     alpaka::wait(queue);
 
 
-    std::span<bool> tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]);
+    span<bool> tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]);
     auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies<bool>{});
 
     return writtenCorrectly;
@@ -735,7 +734,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags)
             devHost,
             devAcc,
             getAvailableSlots<Acc>(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]));
-        std::span<uint32_t> tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]);
+        span<uint32_t> tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]);
         std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{});
         alpaka::memcpy(queue, content.m_onDevice, content.m_onHost);
         alpaka::wait(queue);
@@ -796,7 +795,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags)
         alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice);
         alpaka::wait(queue);
 
-        std::span<void*> tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
+        span<void*> tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]);
         std::sort(std::begin(tmpPointers), std::end(tmpPointers));
         CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers));
     }
@@ -824,7 +823,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags)
 
         // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a
         // nullptr.
-        std::span<void*> tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]);
+        span<void*> tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]);
         auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1;
 
         CHECK(std::all_of(