Skip to content

Commit 6ec59bb

Browse files
committed
Fix error in SYCL's basic data parallel kernel (only manifested in multi-GPU setups): the grid_x_offset and grid_y_offset were swapped.
1 parent a7dcd91 commit 6ec59bb

File tree

4 files changed

+16
-16
lines changed

4 files changed

+16
-16
lines changed

include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ class device_kernel_symm {
7373
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
7474

7575
// calculate the indices used in the current work-item
76-
const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs
77-
const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows
76+
const auto i_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs
77+
const auto j_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows
7878

7979
// create a work-item private array used for internal caching
8080
std::array<std::array<real_type, INTERNAL_BLOCK_SIZE_uz>, INTERNAL_BLOCK_SIZE_uz> temp{};
@@ -212,8 +212,8 @@ class device_kernel_symm_mirror {
212212
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
213213

214214
// calculate the indices used in the current work-item
215-
const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs
216-
const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows
215+
const auto i_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs
216+
const auto j_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows
217217

218218
// create a work-item private array used for internal caching
219219
std::array<std::array<real_type, INTERNAL_BLOCK_SIZE_uz>, INTERNAL_BLOCK_SIZE_uz> temp{};
@@ -321,8 +321,8 @@ class device_kernel_inplace_matrix_add {
321321
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
322322

323323
// calculate the indices used in the current work-item
324-
const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows
325-
const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs
324+
const auto i_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows
325+
const auto j_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs
326326

327327
for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
328328
for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {

include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ class device_kernel_assembly {
8181
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
8282

8383
// calculate the indices used in the current work-item
84-
const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset
85-
const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows
84+
const auto i_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset
85+
const auto j_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows
8686

8787
// only calculate the upper triangular matrix
8888
if (i_idx >= j_idx) {

include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ class device_kernel_assembly_symm {
8787
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
8888

8989
// calculate the indices used in the current work-item
90-
const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset
91-
const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows
90+
const auto i_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset
91+
const auto j_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows
9292

9393
// only calculate the upper triangular matrix
9494
if (i_idx >= j_idx) {

include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ class device_kernel_w_linear {
7373
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
7474

7575
// calculate the indices used in the current work-item
76-
const auto feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features
77-
const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes
76+
const auto feature_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features
77+
const auto class_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes
7878

7979
// create a work-item private array used for internal caching
8080
std::array<std::array<real_type, INTERNAL_BLOCK_SIZE_uz>, INTERNAL_BLOCK_SIZE_uz> temp{};
@@ -185,8 +185,8 @@ class device_kernel_predict_linear {
185185
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
186186

187187
// calculate the indices used in the current work-item
188-
const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points
189-
const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes
188+
const auto pp_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points
189+
const auto class_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes
190190

191191
// create a work-item private array used for internal caching
192192
std::array<std::array<real_type, INTERNAL_BLOCK_SIZE_uz>, INTERNAL_BLOCK_SIZE_uz> temp{};
@@ -305,8 +305,8 @@ class device_kernel_predict {
305305
constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
306306

307307
// calculate the indices used in the current work-item
308-
const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points
309-
const auto sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors
308+
const auto pp_idx = (idx.get_id(1) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points
309+
const auto sv_idx = (idx.get_id(0) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors
310310

311311
// create a work-item private array used for internal caching
312312
std::array<std::array<real_type, INTERNAL_BLOCK_SIZE_uz>, INTERNAL_BLOCK_SIZE_uz> temp{};

0 commit comments

Comments
 (0)