-
Notifications
You must be signed in to change notification settings - Fork 14
Updating Healpix CUDA primitive #290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 38 commits
2fd3c8a
2b591ca
8fe86c2
933ac2a
e2cc68c
b5cbeac
9e0f121
92fe6a0
a70b262
0e03787
6f6c07e
f8a9a6d
866d1f2
a83dbd1
fd7860e
d29af9b
1ac3541
b75c0ce
00b169c
9775bba
fb8d0df
850cd43
25b2cc1
ba5a531
bfe89dc
64b1ceb
2e52da3
f6cd7f4
5152e2c
ac1609d
928ea12
bca4837
50e2840
757e022
6400681
77cbc96
2a2e6e7
3bcb69a
a6ad576
3060be6
de306c5
1583db5
a71a232
fbdcfbc
d322c77
7adef39
bf123e4
c4f1cb5
d6bb41b
56282fd
22d7636
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -9,8 +9,11 @@ set(CMAKE_CUDA_STANDARD 17) | |||||
|
|
||||||
| # Set default build type to Release | ||||||
| if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) | ||||||
| set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) | ||||||
| set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") | ||||||
| set(CMAKE_BUILD_TYPE | ||||||
| Release | ||||||
| CACHE STRING "Choose the type of build." FORCE) | ||||||
| set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" | ||||||
| "MinSizeRel" "RelWithDebInfo") | ||||||
| endif() | ||||||
|
|
||||||
| # Check for CUDA | ||||||
|
|
@@ -23,35 +26,48 @@ if(CMAKE_CUDA_COMPILER) | |||||
| message(STATUS "CUDA compiler found: ${CMAKE_CUDA_COMPILER}") | ||||||
|
|
||||||
| if(NOT SKBUILD) | ||||||
| message(FATAL_ERROR "Building standalone project directly without pip install is not supported" | ||||||
| "Please use pip install to build the project") | ||||||
| message( | ||||||
| FATAL_ERROR | ||||||
| "Building standalone project directly without pip install is not supported" | ||||||
| "Please use pip install to build the project") | ||||||
| else() | ||||||
| find_package(CUDAToolkit REQUIRED) | ||||||
|
|
||||||
| find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) | ||||||
| # Add the executable | ||||||
| find_package( | ||||||
| Python 3.8 REQUIRED | ||||||
| COMPONENTS Interpreter Development.Module | ||||||
| OPTIONAL_COMPONENTS Development.SABIModule) | ||||||
| execute_process( | ||||||
| COMMAND "${Python_EXECUTABLE}" "-c" | ||||||
| "from jax import ffi; print(ffi.include_dir())" | ||||||
| OUTPUT_STRIP_TRAILING_WHITESPACE | ||||||
| OUTPUT_VARIABLE XLA_DIR) | ||||||
| message(STATUS "XLA include directory: ${XLA_DIR}") | ||||||
|
|
||||||
| # Detect the installed nanobind package and import it into CMake | ||||||
| execute_process( | ||||||
| COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir | ||||||
| OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE nanobind_ROOT) | ||||||
| find_package(nanobind CONFIG REQUIRED) | ||||||
|
|
||||||
| nanobind_add_module(_s2fft STABLE_ABI | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/extensions.cc | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft.cu | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft_callbacks.cu | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/plan_cache.cc | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft_kernels.cu | ||||||
| ) | ||||||
| find_package(nanobind CONFIG REQUIRED) | ||||||
|
|
||||||
| nanobind_add_module( | ||||||
| _s2fft | ||||||
| STABLE_ABI | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/extensions.cc | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft.cu | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/plan_cache.cc | ||||||
| ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft_kernels.cu) | ||||||
|
|
||||||
| target_link_libraries(_s2fft PRIVATE CUDA::cudart_static CUDA::cufft_static CUDA::culibos) | ||||||
| target_include_directories(_s2fft PUBLIC ${CMAKE_CURRENT_LIST_DIR}/lib/include) | ||||||
| set_target_properties(_s2fft PROPERTIES | ||||||
| LINKER_LANGUAGE CUDA | ||||||
| CUDA_SEPARABLE_COMPILATION ON) | ||||||
| set(CMAKE_CUDA_ARCHITECTURES "70;80;89" CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") | ||||||
| target_include_directories( | ||||||
| _s2fft PUBLIC ${CMAKE_CURRENT_LIST_DIR}/lib/include ${XLA_DIR} ${CUDAToolkit_INCLUDE_DIRS}) | ||||||
| set_target_properties(_s2fft PROPERTIES LINKER_LANGUAGE CUDA | ||||||
| CUDA_SEPARABLE_COMPILATION ON) | ||||||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true") | ||||||
| set(CMAKE_CUDA_ARCHITECTURES | ||||||
| "70;80;89" | ||||||
| CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") | ||||||
| message(STATUS "CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") | ||||||
| set_target_properties(_s2fft PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}") | ||||||
| set_target_properties(_s2fft PROPERTIES CUDA_ARCHITECTURES | ||||||
| "${CMAKE_CUDA_ARCHITECTURES}") | ||||||
|
|
||||||
| install(TARGETS _s2fft LIBRARY DESTINATION s2fft_lib) | ||||||
| endif() | ||||||
|
|
@@ -60,26 +76,35 @@ else() | |||||
| if(SKBUILD) | ||||||
| message(WARNING "CUDA compiler not found, building without CUDA support") | ||||||
|
|
||||||
| find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) | ||||||
| find_package( | ||||||
| Python 3.8 | ||||||
|
||||||
| Python 3.8 | |
| Python 3.11 |
Same rationale as above comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Again think this comment still applies.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
|
|
||
| /** | ||
| * @file cudastreamhandler.hpp | ||
| * @brief Singleton class for managing CUDA streams and events. | ||
| * | ||
| * This header provides a singleton implementation that encapsulates the creation, | ||
| * management, and cleanup of CUDA streams and events. It offers functions to fork | ||
| * streams, add new streams, and synchronize (join) streams with a given dependency. | ||
| * | ||
| * Usage example: | ||
| * @code | ||
| * #include "cudastreamhandler.hpp" | ||
| * | ||
| * int main() { | ||
| * // Create a handler instance | ||
| * CudaStreamHandler handler; | ||
| * | ||
| * // Fork 4 streams dependent on a given stream 'stream_main' | ||
| * handler.Fork(stream_main, 4); | ||
| * | ||
| * // Do work on the forked streams... | ||
| * | ||
| * // Join the streams back to 'stream_main' | ||
| * handler.join(stream_main); | ||
| * | ||
| * return 0; | ||
| * } | ||
| * @endcode | ||
| * | ||
| * Author: Wassim KABALAN | ||
| */ | ||
|
|
||
| #ifndef CUDASTREAMHANDLER_HPP | ||
| #define CUDASTREAMHANDLER_HPP | ||
|
|
||
| #include <algorithm> | ||
| #include <atomic> | ||
| #include <cuda_runtime.h> | ||
| #include <stdexcept> | ||
| #include <thread> | ||
| #include <vector> | ||
|
|
||
| // Singleton class managing CUDA streams and events | ||
| class CudaStreamHandlerImpl { | ||
| public: | ||
| static CudaStreamHandlerImpl &instance() { | ||
| static CudaStreamHandlerImpl instance; | ||
| return instance; | ||
| } | ||
|
|
||
| void AddStreams(int numStreams) { | ||
| if (numStreams > m_streams.size()) { | ||
| int streamsToAdd = numStreams - m_streams.size(); | ||
| m_streams.resize(numStreams); | ||
| std::generate(m_streams.end() - streamsToAdd, m_streams.end(), []() { | ||
| cudaStream_t stream; | ||
| cudaStreamCreate(&stream); | ||
| return stream; | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| void join(cudaStream_t finalStream) { | ||
| std::for_each(m_streams.begin(), m_streams.end(), [this, finalStream](cudaStream_t stream) { | ||
| cudaEvent_t event; | ||
| cudaEventCreate(&event); | ||
| cudaEventRecord(event, stream); | ||
| cudaStreamWaitEvent(finalStream, event, 0); | ||
| m_events.push_back(event); | ||
| }); | ||
|
|
||
| if (!cleanup_thread.joinable()) { | ||
| stop_thread.store(false); | ||
| cleanup_thread = std::thread([this]() { this->AsyncEventCleanup(); }); | ||
| } | ||
| } | ||
|
|
||
| // Fork function to add streams and set dependency on a given stream | ||
| void Fork(cudaStream_t dependentStream, int N) { | ||
| AddStreams(N); // Add N streams | ||
|
|
||
| // Set dependency on the provided stream | ||
| std::for_each(m_streams.end() - N, m_streams.end(), [this, dependentStream](cudaStream_t stream) { | ||
| cudaEvent_t event; | ||
| cudaEventCreate(&event); | ||
| cudaEventRecord(event, dependentStream); | ||
| cudaStreamWaitEvent(stream, event, 0); // Set the stream to wait on the event | ||
| m_events.push_back(event); | ||
| }); | ||
| } | ||
|
|
||
| auto getIterator() { return StreamIterator(m_streams.begin(), m_streams.end()); } | ||
|
|
||
| ~CudaStreamHandlerImpl() { | ||
| stop_thread.store(true); | ||
| if (cleanup_thread.joinable()) { | ||
| cleanup_thread.join(); | ||
| } | ||
|
|
||
| std::for_each(m_streams.begin(), m_streams.end(), cudaStreamDestroy); | ||
| std::for_each(m_events.begin(), m_events.end(), cudaEventDestroy); | ||
| } | ||
|
|
||
| // Custom Iterator class to iterate over streams | ||
| class StreamIterator { | ||
| public: | ||
| StreamIterator(std::vector<cudaStream_t>::iterator begin, std::vector<cudaStream_t>::iterator end) | ||
| : current(begin), end(end) {} | ||
|
|
||
| cudaStream_t next() { | ||
| if (current == end) { | ||
| throw std::out_of_range("No more streams."); | ||
| } | ||
| return *current++; | ||
| } | ||
|
|
||
| bool hasNext() const { return current != end; } | ||
|
|
||
| private: | ||
| std::vector<cudaStream_t>::iterator current; | ||
| std::vector<cudaStream_t>::iterator end; | ||
| }; | ||
|
|
||
| private: | ||
| CudaStreamHandlerImpl() : stop_thread(false) {} | ||
| CudaStreamHandlerImpl(const CudaStreamHandlerImpl &) = delete; | ||
| CudaStreamHandlerImpl &operator=(const CudaStreamHandlerImpl &) = delete; | ||
|
|
||
| void AsyncEventCleanup() { | ||
| while (!stop_thread.load()) { | ||
| std::for_each(m_events.begin(), m_events.end(), [this](cudaEvent_t &event) { | ||
| if (cudaEventQuery(event) == cudaSuccess) { | ||
| cudaEventDestroy(event); | ||
| event = nullptr; | ||
| } | ||
| }); | ||
| std::this_thread::sleep_for(std::chrono::milliseconds(10)); | ||
| } | ||
| } | ||
|
|
||
| std::vector<cudaStream_t> m_streams; | ||
| std::vector<cudaEvent_t> m_events; | ||
| std::thread cleanup_thread; | ||
| std::atomic<bool> stop_thread; | ||
| }; | ||
|
|
||
| // Public class for encapsulating the singleton operations | ||
| class CudaStreamHandler { | ||
| public: | ||
| CudaStreamHandler() = default; | ||
| ~CudaStreamHandler() = default; | ||
|
|
||
| void AddStreams(int numStreams) { CudaStreamHandlerImpl::instance().AddStreams(numStreams); } | ||
|
|
||
| void join(cudaStream_t finalStream) { CudaStreamHandlerImpl::instance().join(finalStream); } | ||
|
|
||
| void Fork(cudaStream_t cudastream, int N) { CudaStreamHandlerImpl::instance().Fork(cudastream, N); } | ||
|
|
||
| // Get the custom iterator for CUDA streams | ||
| CudaStreamHandlerImpl::StreamIterator getIterator() { | ||
| return CudaStreamHandlerImpl::instance().getIterator(); | ||
| } | ||
| }; | ||
|
|
||
| #endif // CUDASTREAMHANDLER_HPP |
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. With this file removed we can remove the comment in README, Lines 350 to 352 in d77e9cb.
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With #305 merged I think should now be able to tighten version requirement here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this comment still applies. Looking at the JAX FFI docs' example
`CMakeLists.txt`, it appears they specify Python 3.11 here, so unless there is a strong reason to keep this at Python 3.8 I would say to keep it in sync with our `project.requires-python` minimum in `pyproject.toml`, as that will make it easier to keep this from going stale when we do subsequent minimum Python version bumps.