diff --git a/examples/native-cuda/source/main.cu b/examples/native-cuda/source/main.cu index 00c429a9..c91baa2d 100644 --- a/examples/native-cuda/source/main.cu +++ b/examples/native-cuda/source/main.cu @@ -26,12 +26,13 @@ THE SOFTWARE. */ +#include "mallocMC/span.hpp" + #include #include #include #include -#include /** * @brief Computes the sum of squares of the first `n` natural numbers. @@ -66,15 +67,14 @@ __device__ auto sumOfSquares(auto const n) */ __global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues) { + using mallocMC::span; uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Not very realistic, all threads are doing this on their own: - auto a = std::span( - reinterpret_cast(memoryManager.malloc(numValues * sizeof(uint64_t))), - numValues); - auto b = std::span( - reinterpret_cast(memoryManager.malloc(numValues * sizeof(uint64_t))), - numValues); + auto a + = span(reinterpret_cast(memoryManager.malloc(numValues * sizeof(uint64_t))), numValues); + auto b + = span(reinterpret_cast(memoryManager.malloc(numValues * sizeof(uint64_t))), numValues); std::iota(std::begin(a), std::end(a), tid); std::iota(std::begin(b), std::end(b), tid); diff --git a/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp b/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp index c7596c07..eb76eda4 100644 --- a/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp +++ b/include/mallocMC/creationPolicies/FlatterScatter/BitField.hpp @@ -28,6 +28,7 @@ #include "mallocMC/creationPolicies/FlatterScatter/wrappingLoop.hpp" #include "mallocMC/mallocMC_utils.hpp" +#include "mallocMC/span.hpp" #include #include @@ -36,7 +37,6 @@ #include #include -#include #include namespace mallocMC::CreationPolicies::FlatterScatterAlloc @@ -344,7 +344,7 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc template struct BitFieldFlatImpl { - std::span> data; + mallocMC::span> data; /** * @brief Check if the index-th bit 
in the bit field is set (=1). diff --git a/include/mallocMC/span.hpp b/include/mallocMC/span.hpp new file mode 100644 index 00000000..b0004d5f --- /dev/null +++ b/include/mallocMC/span.hpp @@ -0,0 +1,86 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + http://www.icg.tugraz.at/project/mvp + https://www.hzdr.de/crp + + Copyright (C) 2012 Institute for Computer Graphics and Vision, + Graz University of Technology + Copyright (C) 2014-2026 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at + Bernhard Kainz - kainz ( at ) icg.tugraz.at + Michael Kenzel - kenzel ( at ) icg.tugraz.at + Rene Widera - r.widera ( at ) hzdr.de + Axel Huebl - a.huebl ( at ) hzdr.de + Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +// This is a workaround for the following issue: +// https://github.com/llvm/llvm-project/pull/136133 +// If clang or clang-based compilers like hipcc try to compile device code +// with a too recent version of libstdc++ from GCC, +// they run into issues like +// error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function + +#pragma once +#include + +namespace mallocMC +{ + template + struct span + { + TData* ptr_; + size_t size_; + + constexpr span(TData* ptr, size_t size) : ptr_(ptr), size_(size) {}; + + // This is explicitly NOT `explicit` because we want to be able to + // silently wrap an array into a span within other constructor calls. + template + constexpr span(TData (&arr)[N]) : ptr_(arr) + , size_(N) + { + } + + [[nodiscard]] constexpr auto size() const -> size_t + { + return size_; + } + + [[nodiscard]] constexpr auto operator[](size_t index) const -> decltype(auto) + { + return ptr_[index]; + } + + [[nodiscard]] constexpr auto begin() const -> decltype(auto) + { + return ptr_; + } + + [[nodiscard]] constexpr auto end() const -> decltype(auto) + { + return &(ptr_[size_]); + } + }; +} // namespace mallocMC diff --git a/test/multithreaded/source/AccessBlock.cpp b/test/multithreaded/source/AccessBlock.cpp index 8acfa6bc..5f127147 100644 --- a/test/multithreaded/source/AccessBlock.cpp +++ b/test/multithreaded/source/AccessBlock.cpp @@ -28,6 +28,7 @@ #include "mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp" #include "mallocMC/mallocMC_utils.hpp" +#include "mallocMC/span.hpp" #include "mocks.hpp" #include @@ -61,11 +62,11 @@ #include #include #include -#include #include #include using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock; +using mallocMC::span; using Dim = alpaka::DimInt<1>; using Idx = std::uint32_t; @@ -78,7 +79,6 @@ constexpr uint32_t pteSize = 4 + 4; constexpr uint32_t blockSize = numPages * (pageSize + pteSize); using MyAccessBlock = AccessBlock, AlignmentPolicy>; -using
std::span; // Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the // behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen @@ -185,8 +185,8 @@ struct IsValid bool* results, uint32_t const size) const { - std::span tmpPointers(pointers, size); - std::span tmpResults(results, size); + span tmpPointers(pointers, size); + span tmpResults(results, size); std::transform( std::begin(tmpPointers), std::end(tmpPointers), @@ -236,7 +236,7 @@ auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) { auto pointers = makeBuffer(devHost, devAcc, size); - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); return pointers; @@ -310,7 +310,7 @@ auto freeAllButOneOnFirstPage( AccessBlock, AlignmentPolicy>* accessBlock, auto& pointers) { - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); std::sort(std::begin(tmp), std::end(tmp)); // This points to the first chunk of page 0. 
auto* pointer1 = tmp[0]; @@ -375,7 +375,7 @@ auto checkContent( alpaka::wait(queue); - std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); + span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); return writtenCorrectly; @@ -758,7 +758,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) devHost, devAcc, getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); - std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); + span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); alpaka::wait(queue); @@ -814,7 +814,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) alpaka::memcpy(queue, result.m_onHost, result.m_onDevice); alpaka::wait(queue); - std::span tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]); + span tmpResults(alpaka::getPtrNative(result.m_onHost), result.m_extents[0]); CHECK(std::none_of(std::cbegin(tmpResults), std::cend(tmpResults), [](auto const val) { return val; })); CHECK(getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0]) == allSlots); @@ -837,7 +837,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); alpaka::wait(queue); - std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); std::sort(std::begin(tmpPointers), std::end(tmpPointers)); CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); } @@ -865,7 +865,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded 
AccessBlock", "", alpaka::EnabledAccTags) // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a // nullptr. - std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); + span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; CHECK(std::all_of( @@ -880,7 +880,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) SECTION("can handle many different chunk sizes.") { auto chunkSizes = makeBuffer(devHost, devAcc, pageSize); - std::span chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]); + span chunkSizesSpan(alpaka::getPtrNative(chunkSizes.m_onHost), chunkSizes.m_extents[0]); std::iota(std::begin(chunkSizesSpan), std::end(chunkSizesSpan), 1U); alpaka::memcpy(queue, chunkSizes.m_onDevice, chunkSizes.m_onHost); alpaka::wait(queue); @@ -892,14 +892,14 @@ TEMPLATE_LIST_TEST_CASE("Threaded AccessBlock", "", alpaka::EnabledAccTags) CreateAllChunkSizes{}, accessBlock, span(alpaka::getPtrNative(pointers.m_onDevice), MyAccessBlock::numPages()), - std::span(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0])); + span(alpaka::getPtrNative(chunkSizes.m_onDevice), chunkSizes.m_extents[0])); alpaka::wait(queue); alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); alpaka::wait(queue); - std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages()); + span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), MyAccessBlock::numPages()); std::sort(std::begin(tmpPointers), std::end(tmpPointers)); CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); } diff --git a/test/multithreaded/source/Scatter.cpp b/test/multithreaded/source/Scatter.cpp index 2328b1cf..15e4db11 100644 --- 
a/test/multithreaded/source/Scatter.cpp +++ b/test/multithreaded/source/Scatter.cpp @@ -33,6 +33,7 @@ #include "mallocMC/distributionPolicies/Noop.hpp" #include "mallocMC/mallocMC_utils.hpp" #include "mallocMC/oOMPolicies/ReturnNull.hpp" +#include "mallocMC/span.hpp" #include "mocks.hpp" #include @@ -66,10 +67,10 @@ #include #include #include -#include #include #include +using mallocMC::span; using Dim = alpaka::DimInt<1>; using Idx = std::uint32_t; @@ -98,8 +99,6 @@ using MyDeviceAllocator = mallocMC::DeviceAllocator< mallocMC::OOMPolicies::ReturnNull, mallocMC::AlignmentPolicies::Shrink<>>; -using std::span; - // Fill all pages of the given access block with occupied chunks of the given size. This is useful to test the // behaviour near full filling but also to have a deterministic page and chunk where an allocation must happen // regardless of the underlying access optimisations etc. @@ -236,7 +235,7 @@ auto createChunkSizes(auto const& devHost, auto const& devAcc, auto& queue) auto createPointers(auto const& devHost, auto const& devAcc, auto& queue, uint32_t const size) { auto pointers = makeBuffer(devHost, devAcc, size); - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); std::fill(std::begin(tmp), std::end(tmp), reinterpret_cast(1U)); alpaka::memcpy(queue, pointers.m_onDevice, pointers.m_onHost); return pointers; @@ -307,7 +306,7 @@ auto fillAllButOne(auto& queue, auto* accessBlock, auto const& chunkSize, auto& template auto freeAllButOneOnFirstPage(auto& queue, auto* accessBlock, auto& pointers) { - std::span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + span tmp(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); std::sort(std::begin(tmp), std::end(tmp)); // This points to the first chunk of page 0. 
auto* pointer1 = tmp[0]; @@ -371,7 +370,7 @@ auto checkContent( alpaka::wait(queue); - std::span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); + span tmpResults(alpaka::getPtrNative(results.m_onHost), results.m_extents[0]); auto writtenCorrectly = std::reduce(std::cbegin(tmpResults), std::cend(tmpResults), true, std::multiplies{}); return writtenCorrectly; @@ -735,7 +734,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags) devHost, devAcc, getAvailableSlots(accessBlock, queue, devHost, devAcc, chunkSizes.m_onHost[0])); - std::span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); + span tmp(alpaka::getPtrNative(content.m_onHost), content.m_extents[0]); std::generate(std::begin(tmp), std::end(tmp), ContentGenerator{}); alpaka::memcpy(queue, content.m_onDevice, content.m_onHost); alpaka::wait(queue); @@ -796,7 +795,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags) alpaka::memcpy(queue, pointers.m_onHost, pointers.m_onDevice); alpaka::wait(queue); - std::span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); + span tmpPointers(alpaka::getPtrNative(pointers.m_onHost), pointers.m_extents[0]); std::sort(std::begin(tmpPointers), std::end(tmpPointers)); CHECK(std::unique(std::begin(tmpPointers), std::end(tmpPointers)) == std::end(tmpPointers)); } @@ -824,7 +823,7 @@ TEMPLATE_LIST_TEST_CASE("Threaded Scatter", "", alpaka::EnabledAccTags) // We only let the last (availableSlots-1) keep their memory. So, the rest at the beginning should have a // nullptr. 
- std::span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); + span tmpManyPointers(alpaka::getPtrNative(manyPointers.m_onHost), manyPointers.m_extents[0]); auto beginNonNull = std::begin(tmpManyPointers) + (oversubscriptionFactor - 1) * availableSlots + 1; CHECK(std::all_of( diff --git a/test/unit/source/AccessBlock.cpp b/test/unit/source/AccessBlock.cpp index d3358298..80e780fe 100644 --- a/test/unit/source/AccessBlock.cpp +++ b/test/unit/source/AccessBlock.cpp @@ -29,6 +29,7 @@ #include "mallocMC/creationPolicies/FlatterScatter/BitField.hpp" #include "mallocMC/creationPolicies/FlatterScatter/PageInterpretation.hpp" #include "mallocMC/mallocMC_utils.hpp" +#include "mallocMC/span.hpp" #include "mocks.hpp" #include @@ -54,6 +55,8 @@ #include #include +using mallocMC::span; + template struct TestableAccessBlock : mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock @@ -608,7 +611,7 @@ TEST_CASE("AccessBlock (Regression)") // Fill all memory with ones. 
for(void* pointer : pointers) { - auto mem = std::span(static_cast(pointer), chunkSizeOneMask); + auto mem = span(static_cast(pointer), chunkSizeOneMask); for(auto& byte : mem) { byte = std::numeric_limits::max(); @@ -621,7 +624,7 @@ TEST_CASE("AccessBlock (Regression)") accessBlock.destroy(accSerial, freedPointer); void* pointerTwoMasks = accessBlock.create(accSerial, chunkSizeTwoMasks); - for(auto& c : std::span(static_cast(pointerTwoMasks), chunkSizeTwoMasks)) + for(auto& c : span(static_cast(pointerTwoMasks), chunkSizeTwoMasks)) { c = 0U; } @@ -631,7 +634,7 @@ TEST_CASE("AccessBlock (Regression)") { if(pointer != freedPointer) { - auto mem = std::span(static_cast(pointer), chunkSizeOneMask); + auto mem = span(static_cast(pointer), chunkSizeOneMask); CHECK(std::all_of( mem.begin(), mem.end(), @@ -640,7 +643,7 @@ TEST_CASE("AccessBlock (Regression)") } } - auto mem = std::span(static_cast(pointerTwoMasks), chunkSizeTwoMasks); + auto mem = span(static_cast(pointerTwoMasks), chunkSizeTwoMasks); CHECK(std::all_of(mem.begin(), mem.end(), [](auto const val) { return val == 0U; })); // Now, we want to be really explicit: diff --git a/test/unit/source/BitField.cpp b/test/unit/source/BitField.cpp index e7912891..5aace911 100644 --- a/test/unit/source/BitField.cpp +++ b/test/unit/source/BitField.cpp @@ -25,6 +25,7 @@ */ #include "mallocMC/mallocMC_utils.hpp" +#include "mallocMC/span.hpp" #include "mocks.hpp" #include @@ -40,6 +41,7 @@ using mallocMC::CreationPolicies::FlatterScatterAlloc::BitFieldFlatImpl; using mallocMC::CreationPolicies::FlatterScatterAlloc::BitMaskImpl; +using mallocMC::span; using BitMaskSizes = std::tuple< std::integral_constant, // NOLINT(*magic-number*) @@ -157,7 +159,7 @@ TEMPLATE_LIST_TEST_CASE("BitFieldFlat", "", BitMaskSizes) SECTION("knows a free bit if later ones are free, too.") { uint32_t const index = GENERATE(0, 1, numChunks / 2, numChunks - 1); - for(auto& mask : std::span{static_cast(data), index / BitMaskSize}) + for(auto& mask : 
span{static_cast(data), index / BitMaskSize}) { mask.set(accSerial); } @@ -174,7 +176,7 @@ TEMPLATE_LIST_TEST_CASE("BitFieldFlat", "", BitMaskSizes) SECTION("knows its first free bit for different numChunks.") { auto localNumChunks = numChunks / GENERATE(1, 2, 3); - std::span localData{static_cast(data), mallocMC::ceilingDivision(localNumChunks, BitMaskSize)}; + span localData{static_cast(data), mallocMC::ceilingDivision(localNumChunks, BitMaskSize)}; uint32_t const index = GENERATE(0, 1, 10, 12); for(auto& mask : localData) {