Skip to content

Commit 38f1afb

Browse files
authored
Merge #1954 Add ARM github action, ROCm 7, CUDA 13 gitlab job and corresponding update
This PR adds an ARM GitHub action and ROCm 7/CUDA 13 GitLab jobs, together with the corresponding fixes. Note: Ginkgo still only supports a wavefront size of 64, even with ROCm 7. Related PR: #1954
2 parents 3bfe1f8 + 2096aa2 commit 38f1afb

File tree

13 files changed

+265
-66
lines changed

13 files changed

+265
-66
lines changed

.github/workflows/arm.yml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
name: ARM-build
2+
3+
on:
4+
push:
5+
branches:
6+
- 'main'
7+
- 'master'
8+
- 'develop'
9+
- 'release/**'
10+
tags:
11+
- '**'
12+
pull_request:
13+
types: [opened,synchronize]
14+
paths-ignore:
15+
- 'doc/**'
16+
workflow_dispatch:
17+
inputs:
18+
debug_enabled:
19+
description: 'Run the build with tmate debugging enabled by `debug_enabled` keyword (https://github.com/marketplace/actions/debugging-with-tmate)'
20+
required: false
21+
default: false
22+
23+
concurrency:
24+
group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
25+
cancel-in-progress: true
26+
27+
jobs:
28+
arm-omp:
29+
strategy:
30+
fail-fast: false
31+
matrix:
32+
config:
33+
- {shared: "ON", build_type: "Debug", name: "arm/omp/debug/shared", mixed: "OFF", half: "ON", bfloat16: "OFF"}
34+
- {shared: "OFF", build_type: "Release", name: "arm/omp/release/static", mixed: "ON", half: "ON", bfloat16: "OFF"}
35+
- {shared: "ON", build_type: "Release", name: "arm/omp/release/shared", mixed: "ON", half: "OFF", bfloat16: "ON"}
36+
- {shared: "ON", build_type: "Release", name: "arm/omp/release/shared-16bit", mixed: "ON", half: "ON", bfloat16: "ON"}
37+
name: ${{ matrix.config.name }}
38+
runs-on: [ubuntu-24.04-arm]
39+
40+
steps:
41+
- name: Checkout the latest code (shallow clone)
42+
uses: actions/checkout@v4
43+
44+
- name: info
45+
run: |
46+
g++ -v
47+
cmake --version
48+
49+
- name: Debug over SSH (tmate)
50+
uses: mxschmitt/action-tmate@v3.5
51+
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
52+
53+
- name: configure
54+
run: |
55+
mkdir build
56+
mkdir install
57+
export INSTALL_PREFIX=`pwd`/install
58+
cd build
59+
cmake .. -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-contract=off" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_ENABLE_HALF=${{ matrix.config.half }} -DGINKGO_ENABLE_BFLOAT16=${{ matrix.config.bfloat16 }}
60+
make -j4
61+
ctest -j4 --output-on-failure
62+
63+
- name: install
64+
run: |
65+
cd build
66+
make install
67+
make test_install

.gitlab-ci.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,20 @@ build/cuda126/nompi/gcc/cuda/release/shared:
218218
BUILD_TYPE: "Release"
219219
MODULE_LOAD: "cmake/3.30.8 cuda/12.6.3 gcc/13.3.0"
220220

221+
build/cuda130/nompi/gcc/cuda/release/shared:
222+
extends:
223+
- .build_and_test_tum_template
224+
- .default_variables
225+
- .full_test_condition
226+
- .use_tum-nvidia
227+
variables:
228+
BUILD_CUDA: "ON"
229+
BUILD_HWLOC: "OFF"
230+
ENABLE_HALF: "ON"
231+
ENABLE_BFLOAT16: "ON"
232+
BUILD_TYPE: "Release"
233+
MODULE_LOAD: "cmake/3.30.8 cuda/13.0.2 gcc/14.3.0"
234+
221235
# ROCm 4.5 and friends
222236
build/amd/nompi/gcc/rocm45/release/shared:
223237
extends:
@@ -341,6 +355,21 @@ build/amd/openmpi/gcc/rocm634_wo_omp/release/shared:
341355
BUILD_TYPE: "Release"
342356
MODULE_LOAD: "cmake/3.29.6 rocm/6.3.4 gcc/13.3.0 openmpi/5.0.7"
343357

358+
# mi50 is not officially supported by ROCm >= 7
359+
build/amd/nompi/gcc/rocm710/release/shared:
360+
extends:
361+
- .build_and_test_tum_template
362+
- .default_variables
363+
- .full_test_condition
364+
- .use_tum-amd-mi210
365+
variables:
366+
BUILD_HIP: "ON"
367+
BUILD_HWLOC: "OFF"
368+
BUILD_MPI: "OFF"
369+
BUILD_OMP: "OFF"
370+
BUILD_TYPE: "Release"
371+
MODULE_LOAD: "cmake/3.29.6 rocm/7.1.0 gcc/14.3.0"
372+
344373
# no cuda but latest gcc and clang
345374
build/nocuda/nompi/gcc/core/debug/static:
346375
extends:

.gitlab/image.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,12 @@
7979
- amd-gpus
8080
- tum
8181

82+
.use_tum-amd-mi210:
83+
image: rocky_tum
84+
tags:
85+
- amd-gpus-mi210
86+
- tum
87+
8288
.use_tum-intel:
8389
image: rocky_tum_intel
8490
tags:

.gitlab/scripts.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,13 @@
1212
- export CCACHE_DIR=${CCACHE_DIR}
1313
- export CCACHE_MAXSIZE=${CCACHE_MAXSIZE}
1414
- source /storage/apps/opt/spack/share/spack/setup-env.sh
15-
- export MODULEPATH=/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
15+
- mkdir -p lmod/cuda
16+
- echo 'prepend_path("PATH","/storage/apps/usr/local/cuda-13.0.2/bin")' > lmod/cuda/13.0.2.lua
17+
- echo 'prepend_path("CMAKE_PREFIX_PATH","/storage/apps/usr/local/cuda-13.0.2/.")' >> lmod/cuda/13.0.2.lua
18+
- echo 'setenv("CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
19+
- echo 'setenv("NVHPC_CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
20+
- export MODULEPATH="$(pwd)/lmod":/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
21+
- module av
1622

1723
.before_script_git_template:
1824
before_script:

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,18 @@ if(GINKGO_BUILD_HIP)
234234
"Disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7"
235235
)
236236
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
237+
elseif(
238+
GINKGO_HIP_PLATFORM_AMD
239+
AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 7.1
240+
AND GINKGO_HIP_VERSION VERSION_LESS 7.2
241+
)
242+
# https://github.com/ROCm/rocm-libraries/pull/1769 should fix this issue in ROCm 7.1.1.
243+
# The HIP version does not use exactly the same version number as ROCm, so we need to wait for ROCm 7.1.1 to set the proper range for ROCm 7.1.0.
244+
message(
245+
STATUS
246+
"Disable custom thrust namespace for hip 7.1 because hip does not adapt the custom namespace fully."
247+
)
248+
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
237249
else()
238250
message(STATUS "Enable custom thrust namespace for hip")
239251
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE ON)

benchmark/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
4040
cusparse_linops_${type}
4141
PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse
4242
)
43+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
44+
target_link_libraries(cusparse_linops_${type} PRIVATE Thrust)
45+
endif()
4346
ginkgo_compile_features(cusparse_linops_${type})
4447
endfunction()
4548

cmake/cuda.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ endif()
1414

1515
find_package(NVTX REQUIRED)
1616

17+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
18+
find_package(Thrust REQUIRED)
19+
thrust_create_target(Thrust)
20+
endif()
21+
1722
if(
1823
CMAKE_CUDA_HOST_COMPILER
1924
AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER

common/cuda_hip/solver/cb_gmres_kernels.cpp

Lines changed: 67 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22
//
33
// SPDX-License-Identifier: BSD-3-Clause
44

@@ -623,14 +623,18 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
623623
const auto block_dim = default_block_size;
624624
constexpr auto block_size = default_block_size;
625625

626-
initialize_kernel<block_size>
627-
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
628-
b->get_size()[0], b->get_size()[1], krylov_dim,
629-
as_device_type(b->get_const_values()), b->get_stride(),
630-
as_device_type(residual->get_values()), residual->get_stride(),
631-
as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
632-
as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
633-
as_device_type(stop_status->get_data()));
626+
if (grid_dim != 0) {
627+
initialize_kernel<block_size>
628+
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
629+
b->get_size()[0], b->get_size()[1], krylov_dim,
630+
as_device_type(b->get_const_values()), b->get_stride(),
631+
as_device_type(residual->get_values()), residual->get_stride(),
632+
as_device_type(givens_sin->get_values()),
633+
givens_sin->get_stride(),
634+
as_device_type(givens_cos->get_values()),
635+
givens_cos->get_stride(),
636+
as_device_type(stop_status->get_data()));
637+
}
634638
}
635639

636640
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
@@ -661,12 +665,14 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
661665
constexpr auto block_size = default_block_size;
662666
const auto stride_arnoldi = arnoldi_norm->get_stride();
663667

664-
restart_1_kernel<block_size>
665-
<<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
666-
residual->get_size()[0], residual->get_size()[1], krylov_dim,
667-
acc::as_device_range(krylov_bases),
668-
as_device_type(residual_norm_collection->get_values()),
669-
residual_norm_collection->get_stride());
668+
if (grid_dim_1 != 0) {
669+
restart_1_kernel<block_size>
670+
<<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
671+
residual->get_size()[0], residual->get_size()[1], krylov_dim,
672+
acc::as_device_range(krylov_bases),
673+
as_device_type(residual_norm_collection->get_values()),
674+
residual_norm_collection->get_stride());
675+
}
670676
kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
671677
exec, residual, residual_norm, reduction_tmp);
672678

@@ -695,21 +701,23 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
695701
2 * stride_arnoldi),
696702
stride_arnoldi, acc::as_device_range(krylov_bases));
697703
}
698-
699704
const auto grid_dim_2 =
700705
ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
701706
default_block_size);
702-
restart_2_kernel<block_size>
703-
<<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
704-
residual->get_size()[0], residual->get_size()[1],
705-
as_device_type(residual->get_const_values()),
706-
residual->get_stride(),
707-
as_device_type(residual_norm->get_const_values()),
708-
as_device_type(residual_norm_collection->get_values()),
709-
acc::as_device_range(krylov_bases),
710-
as_device_type(next_krylov_basis->get_values()),
711-
next_krylov_basis->get_stride(),
712-
as_device_type(final_iter_nums->get_data()));
707+
708+
if (grid_dim_2 != 0) {
709+
restart_2_kernel<block_size>
710+
<<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
711+
residual->get_size()[0], residual->get_size()[1],
712+
as_device_type(residual->get_const_values()),
713+
residual->get_stride(),
714+
as_device_type(residual_norm->get_const_values()),
715+
as_device_type(residual_norm_collection->get_values()),
716+
acc::as_device_range(krylov_bases),
717+
as_device_type(next_krylov_basis->get_values()),
718+
next_krylov_basis->get_stride(),
719+
as_device_type(final_iter_nums->get_data()));
720+
}
713721
}
714722

715723
GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
@@ -919,18 +927,21 @@ void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
919927
const auto block_dim = block_size;
920928
const auto grid_dim =
921929
static_cast<unsigned int>(ceildiv(num_cols, block_size));
922-
923-
givens_rotation_kernel<block_size>
924-
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
925-
hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
926-
iter, as_device_type(hessenberg_iter->get_values()),
927-
hessenberg_iter->get_stride(),
928-
as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
929-
as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
930-
as_device_type(residual_norm->get_values()),
931-
as_device_type(residual_norm_collection->get_values()),
932-
residual_norm_collection->get_stride(),
933-
stop_status->get_const_data());
930+
if (grid_dim != 0) {
931+
givens_rotation_kernel<block_size>
932+
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
933+
hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
934+
iter, as_device_type(hessenberg_iter->get_values()),
935+
hessenberg_iter->get_stride(),
936+
as_device_type(givens_sin->get_values()),
937+
givens_sin->get_stride(),
938+
as_device_type(givens_cos->get_values()),
939+
givens_cos->get_stride(),
940+
as_device_type(residual_norm->get_values()),
941+
as_device_type(residual_norm_collection->get_values()),
942+
residual_norm_collection->get_stride(),
943+
stop_status->get_const_data());
944+
}
934945
}
935946

936947

@@ -949,12 +960,14 @@ void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
949960
array<stopping_status>* reorth_status,
950961
array<size_type>* num_reorth)
951962
{
952-
increase_final_iteration_numbers_kernel<<<
953-
static_cast<unsigned int>(
954-
ceildiv(final_iter_nums->get_size(), default_block_size)),
955-
default_block_size, 0, exec->get_stream()>>>(
956-
as_device_type(final_iter_nums->get_data()),
957-
stop_status->get_const_data(), final_iter_nums->get_size());
963+
if (final_iter_nums->get_size() != 0) {
964+
increase_final_iteration_numbers_kernel<<<
965+
static_cast<unsigned int>(
966+
ceildiv(final_iter_nums->get_size(), default_block_size)),
967+
default_block_size, 0, exec->get_stream()>>>(
968+
as_device_type(final_iter_nums->get_data()),
969+
stop_status->get_const_data(), final_iter_nums->get_size());
970+
}
958971
finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
959972
buffer_iter, arnoldi_norm, iter,
960973
stop_status->get_const_data(), reorth_status->get_data(),
@@ -1007,14 +1020,15 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
10071020
const auto grid_dim = static_cast<unsigned int>(
10081021
ceildiv(num_rows * stride_before_preconditioner, block_size));
10091022
const auto block_dim = block_size;
1010-
1011-
calculate_Qy_kernel<block_size>
1012-
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
1013-
num_rows, num_cols, acc::as_device_range(krylov_bases),
1014-
as_device_type(y->get_const_values()), y->get_stride(),
1015-
as_device_type(before_preconditioner->get_values()),
1016-
stride_before_preconditioner,
1017-
as_device_type(final_iter_nums->get_const_data()));
1023+
if (grid_dim != 0) {
1024+
calculate_Qy_kernel<block_size>
1025+
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
1026+
num_rows, num_cols, acc::as_device_range(krylov_bases),
1027+
as_device_type(y->get_const_values()), y->get_stride(),
1028+
as_device_type(before_preconditioner->get_values()),
1029+
stride_before_preconditioner,
1030+
as_device_type(final_iter_nums->get_const_data()));
1031+
}
10181032
// Calculate qy
10191033
// before_preconditioner = krylov_bases * y
10201034
}

cuda/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ target_link_libraries(
156156
CUDA::cufft
157157
nvtx::nvtx
158158
)
159+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
160+
target_link_libraries(ginkgo_cuda PRIVATE Thrust)
161+
endif()
159162
# NVTX3 is header-only and requires dlopen/dlclose in static builds
160163
target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})
161164

hip/base/config.hip.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22
//
33
// SPDX-License-Identifier: BSD-3-Clause
44

@@ -32,7 +32,11 @@ struct config {
3232
* `device_functions.h`.
3333
*/
3434
#if GINKGO_HIP_PLATFORM_HCC
35-
static constexpr uint32 warp_size = warpSize;
35+
// Workaround for ROCm >= 7, which does not provide warpSize at compile time.
36+
// We cannot define warpSize via the compiler because amd_warp_functions.h
37+
// also defines a struct variable called warpSize. A warp size of 32 is not
38+
// supported on AMD GPUs yet.
39+
static constexpr uint32 warp_size = 64;
3640
#else // GINKGO_HIP_PLATFORM_NVCC
3741
static constexpr uint32 warp_size = 32;
3842
#endif

0 commit comments

Comments
 (0)