Skip to content

Commit 6b939eb

Browse files
committed
Merge remote-tracking branch 'origin/master' into davschneller/ze-compress
2 parents d123ae8 + 952c71a commit 6b939eb

File tree

11 files changed

+132
-77
lines changed

11 files changed

+132
-77
lines changed

.github/workflows/test.yml

Lines changed: 53 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,81 +13,95 @@ jobs:
1313
device-build-test:
1414
name: device-build-test
1515
runs-on: ${{ matrix.setup.runner }}
16-
container: ${{ matrix.setup.container }}
16+
container:
17+
image: ${{ matrix.setup.container }}
18+
options: ${{ matrix.setup.container-options || ' ' }}
1719
strategy:
1820
fail-fast: false
1921
matrix:
2022
arch:
21-
- snb # <-- needed for the self-hosted CI node for now :/
23+
- hsw
2224
build_type:
2325
- Release
2426
- Debug
2527
setup:
26-
- arch: sm_60
28+
- arch: sm_86
2729
backend: cuda
2830
cc: gcc-13
2931
cxx: g++-13
3032
fc: gfortran-13
31-
container: seissol/gha-gpu-nv:davschneller-gpu-image
32-
runner: ubuntu-24.04
33+
container: seissol/gha-gpu-nv:davschneller-ci-merge
34+
runner: self-hosted
35+
container-options: --runtime=nvidia --gpus=all
3336
pythonbreak: true
34-
- arch: sm_60
37+
test: true
38+
- arch: sm_86
3539
backend: acpp
3640
cc: gcc-13
3741
cxx: g++-13
3842
fc: gfortran-13
39-
container: seissol/gha-gpu-nv:davschneller-gpu-image
40-
runner: ubuntu-24.04
43+
container: seissol/gha-gpu-nv:davschneller-ci-merge
44+
runner: self-hosted
45+
container-options: --runtime=nvidia --gpus=all
4146
pythonbreak: true
42-
- arch: sm_60
47+
test: true
48+
- arch: sm_86
4349
backend: cuda
4450
cc: clang-18
4551
cxx: clang++-18
4652
fc: gfortran-13 # TODO?
47-
container: seissol/gha-gpu-nv:davschneller-gpu-image
48-
runner: ubuntu-24.04
53+
container: seissol/gha-gpu-nv:davschneller-ci-merge
54+
runner: self-hosted
55+
container-options: --runtime=nvidia --gpus=all
4956
pythonbreak: true
50-
# TODO: needs a working GPU runner
51-
#- arch: sm_60
52-
# backend: cuda
53-
# cc: nvc
54-
# cxx: nvc++
55-
# fc: nvfortran
56-
# container: seissol/gha-gpu-nvhpc:davschneller-gpu-image
57-
# runner: sccs-ci-nv-sm60
58-
# pythonbreak: true
57+
test: true
58+
- arch: sm_86
59+
backend: cuda
60+
cc: nvc
61+
cxx: nvc++
62+
fc: nvfortran
63+
container: seissol/gha-gpu-nvhpc:davschneller-ci-merge
64+
runner: self-hosted
65+
container-options: --runtime=nvidia --gpus=all
66+
pythonbreak: true
67+
test: true
5968
- arch: gfx906
6069
backend: hip
6170
cc: gcc-13
6271
cxx: g++-13
6372
fc: gfortran-13
64-
container: seissol/gha-gpu-amd:davschneller-gpu-image
73+
container: seissol/gha-gpu-amd:davschneller-ci-merge
6574
runner: ubuntu-24.04
6675
pythonbreak: true
76+
test: false
6777
- arch: gfx906
6878
backend: acpp
6979
cc: gcc-13
7080
cxx: g++-13
7181
fc: gfortran-13
72-
container: seissol/gha-gpu-amd:davschneller-gpu-image
82+
container: seissol/gha-gpu-amd:davschneller-ci-merge
7383
runner: ubuntu-24.04
7484
pythonbreak: true
85+
test: false
7586
- arch: gfx906
7687
backend: hip
7788
cc: clang-18
7889
cxx: clang++-18
7990
fc: gfortran-13 # TODO?
80-
container: seissol/gha-gpu-amd:davschneller-gpu-image
91+
container: seissol/gha-gpu-amd:davschneller-ci-merge
8192
runner: ubuntu-24.04
8293
pythonbreak: true
94+
test: false
8395
- arch: skl
8496
backend: oneapi
8597
cc: icx
8698
cxx: icpx
8799
fc: ifx
88100
container: seissol/gha-gpu-intel:davschneller-gpu-image
89-
runner: ubuntu-24.04
101+
container-options: --device /dev/dri
102+
runner: self-hosted
90103
pythonbreak: false
104+
test: true
91105
steps:
92106
- name: install-gtest
93107
run: |
@@ -98,7 +112,7 @@ jobs:
98112
cd ../..
99113
100114
- name: checkout-device
101-
uses: actions/checkout@v4
115+
uses: actions/checkout@v6
102116
with:
103117
submodules: recursive
104118

@@ -119,5 +133,18 @@ jobs:
119133
export CXX=${{matrix.setup.cxx}}
120134
export FC=${{matrix.setup.fc}}
121135
122-
cmake .. -GNinja -DDEVICE_BACKEND=${{matrix.setup.backend}} -DSM=${{matrix.setup.arch}}
136+
cmake .. -GNinja \
137+
-DDEVICE_BACKEND=${{matrix.setup.backend}} \
138+
-DSM=${{matrix.setup.arch}} \
139+
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
140+
123141
ninja
142+
143+
- id: test
144+
name: test-device
145+
if: ${{matrix.setup.test}}
146+
run: |
147+
cd tests
148+
cd build
149+
150+
./tests

algorithms/cudahip/ArrayManip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ template void
8080

8181
//--------------------------------------------------------------------------------------------------
8282
__global__ void kernel_touchMemory(void* ptr, size_t size, bool clean) {
83-
int id = threadIdx.x + blockIdx.x * blockDim.x;
83+
const int id = threadIdx.x + blockIdx.x * blockDim.x;
8484
if (clean) {
8585
imemset(ptr, size, id, blockDim.x * gridDim.x);
8686
} else {

algorithms/cudahip/BatchManip.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,13 @@ template void Algorithms::setToValue(
131131
int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
132132
template void Algorithms::setToValue(
133133
unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
134+
template void Algorithms::setToValue(
135+
long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
136+
template void Algorithms::setToValue(unsigned long** out,
137+
unsigned long value,
138+
size_t elementSize,
139+
size_t numElements,
140+
void* streamPtr);
134141
template void Algorithms::setToValue(
135142
char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
136143

algorithms/cudahip/Reduction.cpp

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,20 +56,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(
5656
const auto threadInWarp = threadIdx.x % warpSize;
5757
const auto warpsNeeded = (size + warpSize - 1) / warpSize;
5858

59+
auto value = operation.defaultValue;
5960
auto acc = operation.defaultValue;
6061

6162
#pragma unroll 4
6263
for (std::size_t i = currentWarp; i < warpsNeeded; i += warpCount) {
6364
const auto id = threadInWarp + i * warpSize;
64-
auto value = (id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;
65+
const auto valueNew =
66+
(id < size) ? static_cast<AccT>(ntload(&vector[id])) : operation.defaultValue;
6567

66-
for (int offset = 1; offset < warpSize; offset *= 2) {
67-
value = operation(value, shuffledown(value, offset));
68-
}
68+
value = operation(value, valueNew);
69+
}
6970

70-
acc = operation(acc, value);
71+
for (int offset = 1; offset < warpSize; offset *= 2) {
72+
value = operation(value, shuffledown(value, offset));
7173
}
7274

75+
acc = operation(acc, value);
76+
7377
if (threadInWarp == 0) {
7478
shmem[currentWarp] = acc;
7579
}
@@ -78,19 +82,24 @@ __launch_bounds__(1024) void __global__ kernel_reduce(
7882

7983
if (currentWarp == 0) {
8084
const auto lastWarpsNeeded = (warpCount + warpSize - 1) / warpSize;
85+
86+
auto value = operation.defaultValue;
8187
auto lastAcc = operation.defaultValue;
88+
8289
#pragma unroll 2
8390
for (int i = 0; i < lastWarpsNeeded; ++i) {
8491
const auto id = threadInWarp + i * warpSize;
85-
auto value = (id < warpCount) ? shmem[id] : operation.defaultValue;
92+
const auto valueNew = (id < warpCount) ? shmem[id] : operation.defaultValue;
8693

87-
for (int offset = 1; offset < warpSize; offset *= 2) {
88-
value = operation(value, shuffledown(value, offset));
89-
}
94+
value = operation(value, valueNew);
95+
}
9096

91-
lastAcc = operation(lastAcc, value);
97+
for (int offset = 1; offset < warpSize; offset *= 2) {
98+
value = operation(value, shuffledown(value, offset));
9299
}
93100

101+
lastAcc = operation(lastAcc, value);
102+
94103
if (threadIdx.x == 0) {
95104
if (overrideResult) {
96105
ntstore(result, lastAcc);

algorithms/sycl/BatchManip.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,13 @@ template void Algorithms::setToValue(
118118
int** out, int value, size_t elementSize, size_t numElements, void* streamPtr);
119119
template void Algorithms::setToValue(
120120
unsigned** out, unsigned value, size_t elementSize, size_t numElements, void* streamPtr);
121+
template void Algorithms::setToValue(
122+
long** out, long value, size_t elementSize, size_t numElements, void* streamPtr);
123+
template void Algorithms::setToValue(unsigned long** out,
124+
unsigned long value,
125+
size_t elementSize,
126+
size_t numElements,
127+
void* streamPtr);
121128
template void Algorithms::setToValue(
122129
char** out, char value, size_t elementSize, size_t numElements, void* streamPtr);
123130

algorithms/sycl/Reduction.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ void launchReduction(AccT* result,
9393
(size + (workGroupSize * itemsPerWorkItem) - 1) / (workGroupSize * itemsPerWorkItem);
9494

9595
cgh.parallel_for(
96-
sycl::nd_range<1>{numWorkGroups * itemsPerWorkItem, workGroupSize},
97-
[=](sycl::nd_item<1> idx) {
96+
sycl::nd_range<1>{numWorkGroups * workGroupSize, workGroupSize}, [=](sycl::nd_item<1> idx) {
9897
const auto localId = idx.get_local_id(0);
9998
const auto groupId = idx.get_group(0);
10099

interfaces/sycl/Internals.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include <sycl/sycl.hpp>
1313

1414
namespace device::internals {
15-
constexpr static int DefaultBlockDim = 1024;
15+
constexpr static int DefaultBlockDim = 256;
1616

1717
template <typename T>
1818
void waitCheck(T&& result) {

tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ add_subdirectory(.. root)
2222

2323
find_package(GTest REQUIRED)
2424

25-
add_executable(tests main.cpp reductions.cpp array_manip.cpp memory.cpp batch_manip.cpp)
25+
add_executable(tests main.cpp reductions.cpp memory.cpp array_manip.cpp batch_manip.cpp)
2626
target_link_libraries(tests PRIVATE device ${GTEST_BOTH_LIBRARIES})
2727
target_include_directories(tests PRIVATE ${GTEST_INCLUDE_DIR})
2828

tests/array_manip.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,17 @@ TEST_F(ArrayManip, touchNoClean32) {
7373
device->api->freeGlobMem(arr);
7474
}
7575

76+
// Avoid double here: some architectures (including ones used in the CI) do not seem to fully support it.
77+
7678
TEST_F(ArrayManip, touchClean64) {
7779

7880
const int N = 100;
79-
double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
81+
long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));
8082
device->algorithms.touchMemory(arr, N, true, device->api->getDefaultStream());
81-
std::vector<double> hostVector(N, 1);
83+
std::vector<long> hostVector(N, 1);
8284

8385
device->api->copyFromAsync(
84-
&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
86+
&hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());
8587

8688
device->api->syncDefaultStreamWithHost();
8789

@@ -95,14 +97,13 @@ TEST_F(ArrayManip, touchClean64) {
9597
TEST_F(ArrayManip, touchNoClean64) {
9698

9799
const int N = 100;
98-
double* arr = (double*)device->api->allocGlobMem(N * sizeof(double));
99-
std::vector<double> hostVector(N, 0);
100+
long* arr = (long*)device->api->allocGlobMem(N * sizeof(long));
101+
std::vector<long> hostVector(N, 0);
100102

101-
device->api->copyToAsync(
102-
arr, &hostVector[0], N * sizeof(double), device->api->getDefaultStream());
103+
device->api->copyToAsync(arr, &hostVector[0], N * sizeof(long), device->api->getDefaultStream());
103104
device->algorithms.touchMemory(arr, N, false, device->api->getDefaultStream());
104105
device->api->copyFromAsync(
105-
&hostVector[0], arr, N * sizeof(double), device->api->getDefaultStream());
106+
&hostVector[0], arr, N * sizeof(long), device->api->getDefaultStream());
106107

107108
device->api->syncDefaultStreamWithHost();
108109

0 commit comments

Comments
 (0)