Skip to content

Commit 214b5d4

Browse files
committed
fix some cuda issues...more to come
1 parent 312c16e commit 214b5d4

File tree

13 files changed

+192
-49
lines changed

13 files changed

+192
-49
lines changed

cmake/blt

Submodule blt updated 610 files
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#################################################################################
2+
# Generated host-config - Edit at own risk!
3+
#################################################################################
4+
#--------------------------------------------------------------------------------
5+
# SYS_TYPE: toss_4_x86_64_ib
6+
# Compiler Spec: gcc@=13.3.1
7+
# CMake executable path: /usr/tce/backend/installations/linux-rhel8-x86_64/gcc-10.3.1/cmake-3.26.3-nz532rvfpaf5lf74zxmplgiobuhol7lu/bin/cmake
8+
#--------------------------------------------------------------------------------
9+
10+
#--------------------------------------------------------------------------------
11+
# Compilers
12+
#--------------------------------------------------------------------------------
13+
14+
set(CMAKE_C_COMPILER "/usr/tce/packages/gcc/gcc-13.3.1-magic/bin/gcc" CACHE PATH "")
15+
16+
set(CMAKE_CXX_COMPILER "/usr/tce/packages/gcc/gcc-13.3.1-magic/bin/g++" CACHE PATH "")
17+
18+
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "")
19+
20+
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG" CACHE STRING "")
21+
22+
set(CMAKE_CXX_FLAGS_DEBUG "-g" CACHE STRING "")
23+
24+
#--------------------------------------------------------------------------------
25+
# CMake Standard
26+
#--------------------------------------------------------------------------------
27+
28+
set(BLT_CXX_STD "c++17" CACHE STRING "")
29+
30+
31+
#--------------------------------------------------------------------------------
32+
# Cuda
33+
#--------------------------------------------------------------------------------
34+
35+
set(ENABLE_CUDA ON CACHE BOOL "")
36+
37+
set(CMAKE_CUDA_STANDARD "17" CACHE PATH "")
38+
39+
set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-12.9.1" CACHE PATH "")
40+
41+
set(CMAKE_CUDA_COMPILER "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" CACHE PATH "")
42+
43+
set(CMAKE_CUDA_ARCHITECTURES "90" CACHE STRING "")
44+
45+
set(CMAKE_CUDA_FLAGS "-restrict --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations -arch sm_90" CACHE STRING "")
46+
47+
set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "")
48+
49+
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")
50+
51+
52+
#--------------------------------------------------------------------------------
53+
# System Math Libraries
54+
#--------------------------------------------------------------------------------
55+
56+
set(ENABLE_MKL ON CACHE BOOL "")
57+
58+
set(MKL_INCLUDE_DIRS "/usr/tce/packages/mkl/mkl-2022.1.0/include" CACHE PATH "")
59+
60+
set(MKL_LIBRARIES /usr/tce/packages/mkl/mkl-2022.1.0/mkl/2022.1.0/lib/intel64/libmkl_intel_lp64.so
61+
/usr/tce/packages/mkl/mkl-2022.1.0/mkl/2022.1.0/lib/intel64/libmkl_gnu_thread.so
62+
/usr/tce/packages/mkl/mkl-2022.1.0/mkl/2022.1.0/lib/intel64/libmkl_core.so
63+
/usr/tce/backend/installations/linux-rhel8-x86_64/gcc-13.3.1/llvm-19.1.3-gy2lu5xbi4csr2k47emlajzfs5mlsd4g/bin/../lib/x86_64-unknown-linux-gnu/libomp.so
64+
/lib64/libpthread.so
65+
/lib64/libm.so
66+
/lib64/libdl.so CACHE STRING "")
67+
68+
69+
#--------------------------------------------------------------------------------
70+
# Documentation
71+
#--------------------------------------------------------------------------------
72+
73+
set(ENABLE_DOCS OFF CACHE BOOL "")
74+
75+
set(ENABLE_DOXYGEN OFF CACHE BOOL "")
76+
77+
set(ENABLE_SPHINX OFF CACHE BOOL "")
78+
79+
#--------------------------------------------------------------------------------
80+
# Development tools
81+
#--------------------------------------------------------------------------------
82+
83+
set(ENABLE_UNCRUSTIFY OFF CACHE BOOL "")

src/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,16 @@ set( hpcReact_sources)
2626
find_package(LAPACK REQUIRED)
2727
find_package(BLAS REQUIRED)
2828

29-
30-
31-
#target_link_libraries(MyExecutable PRIVATE LAPACK::LAPACK)
3229

33-
set( hpcReack_dependencies
30+
set( hpcReact_dependencies
3431
LAPACK::LAPACK
3532
BLAS::BLAS
3633
)
3734

35+
if( ENABLE_CUDA )
36+
list( APPEND hpcReact_dependencies cuda )
37+
endif()
38+
3839
blt_add_library( NAME hpcReact
3940
# SOURCES ${hpcReact_sources}
4041
HEADERS ${hpcReact_headers}

src/common/CArrayWrapper.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ template< typename T, int DIM0 >
3939
struct CArrayWrapper< T, DIM0 >
4040
{
4141
// default constructor
42+
HPCREACT_HOST_DEVICE
4243
constexpr CArrayWrapper() = default;
4344

4445
/**
@@ -53,6 +54,7 @@ struct CArrayWrapper< T, DIM0 >
5354
*
5455
* @note No runtime bounds checking is performed on the initializer size.
5556
*/
57+
HPCREACT_HOST_DEVICE
5658
constexpr CArrayWrapper( std::initializer_list< T > init )
5759
{
5860
// static_assert(init.size() == DIM0, "Size mismatch"); // needs c++20
@@ -66,6 +68,7 @@ struct CArrayWrapper< T, DIM0 >
6668
* @brief Copy constructor.
6769
* @param src The source CArrayWrapper to copy from.
6870
*/
71+
HPCREACT_HOST_DEVICE
6972
constexpr CArrayWrapper( CArrayWrapper const & src )
7073
{
7174
for( std::size_t i = 0; i < DIM0; i++ )
@@ -79,27 +82,31 @@ struct CArrayWrapper< T, DIM0 >
7982
* @param dim The index (must be in range [0, DIM0)).
8083
* @return Reference to the element at the specified index.
8184
*/
85+
HPCREACT_HOST_DEVICE
8286
constexpr HPCREACT_HOST_DEVICE inline T & operator()( int const dim ) { return data[dim]; }
8387

8488
/**
8589
* @brief Read-only access to an element by index (const overload).
8690
* @param dim The index (must be in range [0, DIM0)).
8791
* @return Const reference to the element at the specified index.
8892
*/
93+
HPCREACT_HOST_DEVICE
8994
constexpr HPCREACT_HOST_DEVICE inline T const & operator()( int const dim ) const { return data[dim]; }
9095

9196
/**
9297
* @brief Subscript operator for read/write access.
9398
* @param dim The index (must be in range [0, DIM0)).
9499
* @return Reference to the element at the specified index.
95100
*/
101+
HPCREACT_HOST_DEVICE
96102
constexpr HPCREACT_HOST_DEVICE inline T & operator[]( int const dim ) { return data[dim]; }
97103

98104
/**
99105
* @brief Subscript operator for read-only access (const overload).
100106
* @param dim The index (must be in range [0, DIM0)).
101107
* @return Const reference to the element at the specified index.
102108
*/
109+
HPCREACT_HOST_DEVICE
103110
constexpr HPCREACT_HOST_DEVICE inline T const & operator[]( int const dim ) const { return data[dim]; }
104111

105112
/// The underlying 1D C-style array.
@@ -120,12 +127,14 @@ template< typename T, int DIM0, int DIM1 >
120127
struct CArrayWrapper< T, DIM0, DIM1 >
121128
{
122129
// default constructor
130+
HPCREACT_HOST_DEVICE
123131
constexpr CArrayWrapper() = default;
124132

125133
/**
126134
* @brief Copy constructor.
127135
* @param src The source CArrayWrapper to copy from.
128136
*/
137+
HPCREACT_HOST_DEVICE
129138
constexpr CArrayWrapper( CArrayWrapper const & src )
130139
{
131140
for( std::size_t i = 0; i < DIM0; i++ )
@@ -150,6 +159,7 @@ struct CArrayWrapper< T, DIM0, DIM1 >
150159
*
151160
* @note No runtime bounds checking is performed on the initializer dimensions.
152161
*/
162+
HPCREACT_HOST_DEVICE
153163
constexpr CArrayWrapper( std::initializer_list< std::initializer_list< T > > init )
154164
{
155165
// static_assert(init.size() == DIM0, "Size mismatch"); // needs c++20
@@ -172,6 +182,7 @@ struct CArrayWrapper< T, DIM0, DIM1 >
172182
* @param dim1 Index in the second dimension (range [0, DIM1)).
173183
* @return Reference to the element at the specified 2D location.
174184
*/
185+
HPCREACT_HOST_DEVICE
175186
constexpr HPCREACT_HOST_DEVICE inline T & operator()( int const dim0, int const dim1 )
176187
{
177188
return data[dim0][dim1];
@@ -183,6 +194,7 @@ struct CArrayWrapper< T, DIM0, DIM1 >
183194
* @param dim1 Index in the second dimension (range [0, DIM1)).
184195
* @return Const reference to the element at the specified 2D location.
185196
*/
197+
HPCREACT_HOST_DEVICE
186198
constexpr HPCREACT_HOST_DEVICE inline T const & operator()( int const dim0, int const dim1 ) const
187199
{
188200
return data[dim0][dim1];
@@ -195,6 +207,7 @@ struct CArrayWrapper< T, DIM0, DIM1 >
195207
*
196208
* This allows usage like `obj[dim0][dim1]`.
197209
*/
210+
HPCREACT_HOST_DEVICE
198211
constexpr HPCREACT_HOST_DEVICE inline T ( & operator[]( int const dim0 ))[DIM1]
199212
{
200213
return data[dim0];
@@ -205,6 +218,7 @@ struct CArrayWrapper< T, DIM0, DIM1 >
205218
* @param dim0 The row index (range [0, DIM0)).
206219
* @return Const reference to an array of type T[DIM1].
207220
*/
221+
HPCREACT_HOST_DEVICE
208222
constexpr HPCREACT_HOST_DEVICE inline T const (&operator[]( int const dim0 ) const)[DIM1]
209223
{
210224
return data[dim0];
@@ -229,6 +243,7 @@ template< typename T, int DIM0, int DIM1, int DIM2 >
229243
struct CArrayWrapper< T, DIM0, DIM1, DIM2 >
230244
{
231245
// default constructor
246+
HPCREACT_HOST_DEVICE
232247
constexpr CArrayWrapper() = default;
233248

234249
/**
@@ -254,6 +269,7 @@ struct CArrayWrapper< T, DIM0, DIM1, DIM2 >
254269
* @note This constructor does not perform size validation. Incorrect initializer sizes
255270
* may lead to undefined behavior.
256271
*/
272+
HPCREACT_HOST_DEVICE
257273
constexpr CArrayWrapper( std::initializer_list< std::initializer_list< std::initializer_list< T > > > init )
258274
{
259275
// static_assert(init.size() == DIM0, "Size mismatch"); // needs c++20
@@ -286,6 +302,7 @@ struct CArrayWrapper< T, DIM0, DIM1, DIM2 >
286302
* @note Currently, this function incorrectly indexes data[dim0][dim1], missing dim2.
287303
* It should be `data[dim0][dim1][dim2]`. Please correct if intended.
288304
*/
305+
HPCREACT_HOST_DEVICE
289306
constexpr HPCREACT_HOST_DEVICE inline T & operator()( int const dim0, int const dim1, int const dim2 )
290307
{
291308
// NOTE: This looks like a bug in your original code. Should be data[dim0][dim1][dim2].
@@ -299,6 +316,7 @@ struct CArrayWrapper< T, DIM0, DIM1, DIM2 >
299316
* @param dim2 Index in the third dimension (range [0, DIM2)).
300317
* @return Const reference to the element at the specified 3D location.
301318
*/
319+
HPCREACT_HOST_DEVICE
302320
constexpr HPCREACT_HOST_DEVICE inline T const & operator()( int const dim0, int const dim1, int const dim2 ) const
303321
{
304322
// NOTE: Same potential bug as above. Should be data[dim0][dim1][dim2].
@@ -312,6 +330,7 @@ struct CArrayWrapper< T, DIM0, DIM1, DIM2 >
312330
*
313331
* This allows usage like `obj[dim0][dim1][dim2]`.
314332
*/
333+
HPCREACT_HOST_DEVICE
315334
constexpr HPCREACT_HOST_DEVICE inline T ( & operator[]( int const dim0 ))[DIM1][DIM2]
316335
{
317336
return data[dim0];
@@ -322,6 +341,7 @@ struct CArrayWrapper< T, DIM0, DIM1, DIM2 >
322341
* @param dim0 The slice index (range [0, DIM0)).
323342
* @return Const reference to an array of type T[DIM1][DIM2].
324343
*/
344+
HPCREACT_HOST_DEVICE
325345
constexpr HPCREACT_HOST_DEVICE inline T const (&operator[]( int const dim0 ) const)[DIM1][DIM2]
326346
{
327347
return data[dim0];

src/common/DirectSystemSolve.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ namespace hpcReact
1919
{
2020

2121
template< typename REAL_TYPE, int N >
22+
HPCREACT_HOST_DEVICE
2223
bool isPositiveDefinite( REAL_TYPE const (&A)[N][N] )
2324
{
2425
REAL_TYPE temp[N][N];
@@ -52,6 +53,7 @@ bool isPositiveDefinite( REAL_TYPE const (&A)[N][N] )
5253

5354

5455
template< typename REAL_TYPE, int N >
56+
HPCREACT_HOST_DEVICE
5557
void solveNxN_Cholesky( REAL_TYPE const (&A)[N][N], REAL_TYPE const (&b)[N], REAL_TYPE (& x)[N] )
5658
{
5759
REAL_TYPE L[N][N] = {{0}};
@@ -96,6 +98,7 @@ void solveNxN_Cholesky( REAL_TYPE const (&A)[N][N], REAL_TYPE const (&b)[N], REA
9698

9799

98100
template< typename REAL_TYPE, int N >
101+
HPCREACT_HOST_DEVICE
99102
void solveNxN_Cholesky( symmetricMatrix< REAL_TYPE, int, N > const & A,
100103
REAL_TYPE const (&b)[N],
101104
REAL_TYPE (& x)[N] )

src/common/macros.hpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,23 @@
1313

1414

1515

16-
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
16+
#if defined( __CUDACC__ )
1717
#define HPCREACT_USE_DEVICE
18-
#define HPCREACT_HOST_DEVICE __host__ __device__
19-
#else
20-
#define HPCREACT_HOST_DEVICE
18+
#define HPCREACT_USE_CUDA
19+
#elif defined( __HIPCC__ )
20+
#define HPCREACT_USE_DEVICE
21+
#define HPCREACT_USE_HIP
2122
#endif
2223

2324

2425
#if defined( HPCREACT_USE_DEVICE )
2526
#define HPCREACT_GLOBAL __global__
27+
#define HPCREACT_DEVICE __device__
28+
#define HPCREACT_HOST_DEVICE __host__ __device__
2629
#else
2730
#define HPCREACT_GLOBAL
31+
#define HPCREACT_DEVICE
32+
#define HPCREACT_HOST_DEVICE
2833
#endif
2934

3035
/// This macro is used to ignore warnings that that a variable is

src/common/pmpl.hpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,26 @@
1313

1414
#include "common/macros.hpp"
1515

16+
#include <cstdio>
1617
#include <utility>
18+
#include <iostream>
1719

1820

1921
namespace hpcReact
2022
{
2123
#if defined(HPCREACT_USE_DEVICE)
2224
#if defined(HPCREACT_USE_CUDA)
23-
#define deviceMalloc( PTR, BYTES ) cudaMalloc( PTR, BYTES );
24-
#define deviceMallocManaged( PTR, BYTES ) cudaMallocManaged( PTR, BYTES );
25-
#define deviceDeviceSynchronize() cudaDeviceSynchronize();
26-
#define deviceMemCpy( DST, SRC, BYTES, KIND ) cudaMemcpy( DST, SRC, BYTES, KIND );
27-
#define deviceFree( PTR ) cudaFree( PTR );
25+
#define deviceMalloc( PTR, BYTES ) cudaMalloc( PTR, BYTES );
26+
#define deviceMallocManaged( PTR, BYTES ) cudaMallocManaged( PTR, BYTES );
27+
#define deviceDeviceSynchronize() cudaDeviceSynchronize();
28+
#define deviceMemCpy( DST, SRC, BYTES, KIND ) cudaMemcpy( DST, SRC, BYTES, KIND );
29+
#define deviceFree( PTR ) cudaFree( PTR );
2830
#elif defined(HPCREACT_USE_HIP)
29-
#define deviceMalloc( PTR, BYTES ) hipMalloc( PTR, BYTES );
30-
#define deviceMallocManaged( PTR, BYTES ) hipMallocManaged( PTR, BYTES );
31-
#define deviceDeviceSynchronize() hipDeviceSynchronize();
32-
#define deviceMemCpy( DST, SRC, BYTES, KIND ) hipMemcpy( DST, SRC, BYTES, KIND );
33-
#define deviceFree( PTR ) hipFree( PTR );
31+
#define deviceMalloc( PTR, BYTES ) hipMalloc( PTR, BYTES );
32+
#define deviceMallocManaged( PTR, BYTES ) hipMallocManaged( PTR, BYTES );
33+
#define deviceDeviceSynchronize() hipDeviceSynchronize();
34+
#define deviceMemCpy( DST, SRC, BYTES, KIND ) hipMemcpy( DST, SRC, BYTES, KIND );
35+
#define deviceFree( PTR ) hipFree( PTR );
3436
#endif
3537
#endif
3638

@@ -121,13 +123,20 @@ HPCREACT_GLOBAL void genericKernel( LAMBDA func, DATA_TYPE * const data )
121123
template< typename DATA_TYPE, typename LAMBDA >
122124
void genericKernelWrapper( int const N, DATA_TYPE * const hostData, LAMBDA && func )
123125
{
124-
125126
#if defined(HPCREACT_USE_DEVICE)
126127
DATA_TYPE * deviceData;
127128
deviceMalloc( &deviceData, N * sizeof(DATA_TYPE) );
128129
deviceMemCpy( deviceData, hostData, N * sizeof(DATA_TYPE), cudaMemcpyHostToDevice );
129130
genericKernel <<< 1, 1 >>> ( std::forward< LAMBDA >( func ), deviceData );
131+
132+
cudaError_t e = cudaGetLastError();
133+
if (e != cudaSuccess) { fprintf(stderr, "launch error: %s\n", cudaGetErrorString(e)); abort(); }
134+
130135
deviceDeviceSynchronize();
136+
137+
e = cudaGetLastError();
138+
if (e != cudaSuccess) { fprintf(stderr, "post-sync error: %s\n", cudaGetErrorString(e)); abort(); }
139+
131140
deviceMemCpy( hostData, deviceData, N * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost );
132141
deviceFree( deviceData );
133142
#else

src/common/unitTests/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,13 @@
22
set( testSourceFiles
33
testDirectSystemSolve.cpp )
44

5+
56
set( dependencyList hpcReact gtest )
67

8+
if( ENABLE_CUDA )
9+
list( APPEND dependencyList cuda )
10+
endif()
11+
712
# Add gtest C++ based tests
813
foreach(test ${testSourceFiles})
914
get_filename_component( test_name ${test} NAME_WE )

0 commit comments

Comments
 (0)