
Commit 9150e1b

Try fixing an MSVC bug regarding unsigned OpenMP loop bounds.
1 parent: 8047d7f
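MSVC only implements OpenMP 2.0, whose canonical loop form requires a signed integral induction variable: a #pragma omp parallel for over a std::size_t counter is rejected with error C3016. The changes below therefore switch the affected loop counters to the signed plssvm::detail::ssize_t (or a local ssize_t) and, where needed, cast the unsigned bound once in the loop condition. A minimal sketch of the failure mode and the workaround, independent of PLSSVM:

    #include <cstddef> // std::ptrdiff_t, std::size_t
    #include <vector>

    void scale(std::vector<double> &v, const double factor) {
        // rejected by MSVC's OpenMP 2.0 front end (error C3016), since the
        // induction variable must have *signed* integral type:
        //   #pragma omp parallel for
        //   for (std::size_t i = 0; i < v.size(); ++i) { v[i] *= factor; }

        // portable form: signed counter, unsigned bound cast once in the condition
    #pragma omp parallel for
        for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(v.size()); ++i) {
            v[i] *= factor;
        }
    }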

22 files changed: +174 -113 lines changed


bindings/Python/sklearn_svc.cpp

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 #include "plssvm/constants.hpp"                        // plssvm::real_type
 #include "plssvm/csvm_factory.hpp"                     // plssvm::make_csvc
 #include "plssvm/data_set/classification_data_set.hpp" // plssvm::classification_data_set
+#include "plssvm/detail/ssize.hpp"                     // plssvm::detail::ssize_t
 #include "plssvm/detail/type_traits.hpp"               // plssvm::detail::remove_cvref_t
 #include "plssvm/gamma.hpp"                            // plssvm::gamma_coefficient_type, plssvm::gamma_type
 #include "plssvm/kernel_function_types.hpp"            // plssvm::kernel_function_type
@@ -556,7 +557,7 @@ void init_sklearn_svc(py::module_ &m) {
     std::merge(index_sets[i].cbegin(), index_sets[i].cend(), index_sets[j].cbegin(), index_sets[j].cend(), sorted_indices.begin());
     // copy the support vectors to the binary support vectors
 #pragma omp parallel for collapse(2)
-    for (std::size_t si = 0; si < num_data_points_in_sub_matrix; ++si) {
+    for (ssize_t si = 0; si < static_cast<ssize_t>(num_data_points_in_sub_matrix); ++si) {
         for (std::size_t dim = 0; dim < num_features; ++dim) {
             temp(si, dim) = model.support_vectors()(sorted_indices[si], dim);
         }
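plssvm/detail/ssize.hpp itself is not part of the shown hunks. Judging from the include comments (plssvm::detail::ssize_t here, plssvm::detail::{ssize_t, ssize} below), a plausible minimal sketch of that header, assuming ssize_t aliases std::ptrdiff_t and ssize() mirrors C++20's std::ssize:

    #pragma once

    #include <cstddef> // std::ptrdiff_t

    namespace plssvm::detail {

    // signed counterpart to std::size_t for OpenMP loop counters (assumed alias)
    using ssize_t = std::ptrdiff_t;

    // signed size of a container, in the spirit of C++20's std::ssize (assumed helper)
    template <typename Container>
    [[nodiscard]] constexpr ssize_t ssize(const Container &c) {
        return static_cast<ssize_t>(c.size());
    }

    } // namespace plssvm::detail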

bindings/Python/type_caster/matrix_type_caster.hpp

Lines changed: 5 additions & 4 deletions
@@ -14,6 +14,7 @@
 #pragma once

 #include "plssvm/constants.hpp"                // plssvm::PADDING_SIZE
+#include "plssvm/detail/ssize.hpp"             // plssvm::detail::{ssize_t, ssize}
 #include "plssvm/detail/string_conversion.hpp" // plssvm::detail::convert_to
 #include "plssvm/matrix.hpp"                   // plssvm::matrix, plssvm::layout_type
 #include "plssvm/shape.hpp"                    // plssvm::shape
@@ -115,13 +116,13 @@ struct type_caster<plssvm::matrix<T, layout>> {
         if constexpr (layout == plssvm::layout_type::aos) {
             // memory layout of Python Numpy array and PLSSVM matrix are the same -> can use memcpy to convert
 #pragma omp parallel for
-            for (std::size_t row = 0; row < num_rows; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_rows); ++row) {
                 std::memcpy(value.data() + row * value.num_cols_padded(), ptr + row * num_cols, num_cols * sizeof(T));
             }
         } else if constexpr (layout == plssvm::layout_type::soa) {
             // the memory layouts don't match -> must use loops to convert layouts
 #pragma omp parallel for collapse(2)
-            for (std::size_t row = 0; row < num_rows; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_rows); ++row) {
                 for (std::size_t col = 0; col < num_cols; ++col) {
                     value(row, col) = ptr[row * num_cols + col];
                 }
@@ -134,15 +135,15 @@ struct type_caster<plssvm::matrix<T, layout>> {
         if constexpr (layout == plssvm::layout_type::aos) {
             // the memory layouts don't match -> must use loops to convert layouts
 #pragma omp parallel for collapse(2)
-            for (std::size_t row = 0; row < num_rows; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_rows); ++row) {
                 for (std::size_t col = 0; col < num_cols; ++col) {
                     value(row, col) = ptr[col * num_rows + row];
                 }
             }
         } else if constexpr (layout == plssvm::layout_type::soa) {
             // memory layout of Python Numpy array and PLSSVM matrix are the same -> can use memcpy to convert
 #pragma omp parallel for
-            for (std::size_t row = 0; row < num_cols; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_cols); ++row) {
                 std::memcpy(value.data() + row * value.num_rows_padded(), ptr + row * num_rows, num_rows * sizeof(T));
             }
         } else {
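When the NumPy and PLSSVM layouts agree, the caster copies whole rows with std::memcpy into a padded destination; when they differ, it transposes element by element. A self-contained sketch of the row-wise memcpy path; dst, src, and num_cols_padded are illustrative names, not the caster's actual locals:

    #include <cstddef> // std::ptrdiff_t, std::size_t
    #include <cstring> // std::memcpy

    // copy a row-major num_rows x num_cols source into a destination whose rows
    // are padded to num_cols_padded >= num_cols (hypothetical padded layout)
    void copy_aos(double *dst, const double *src,
                  const std::size_t num_rows, const std::size_t num_cols,
                  const std::size_t num_cols_padded) {
        // layouts agree row by row -> one memcpy per row; the counter is signed
        // so that MSVC's OpenMP 2.0 accepts the parallel loop
    #pragma omp parallel for
        for (std::ptrdiff_t row = 0; row < static_cast<std::ptrdiff_t>(num_rows); ++row) {
            std::memcpy(dst + row * static_cast<std::ptrdiff_t>(num_cols_padded),
                        src + row * static_cast<std::ptrdiff_t>(num_cols),
                        num_cols * sizeof(double));
        }
    }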

include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp

Lines changed: 3 additions & 2 deletions
@@ -15,6 +15,7 @@

 #include "plssvm/constants.hpp"     // plssvm::real_type
 #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT
+#include "plssvm/detail/ssize.hpp"  // plssvm::detail::ssize_t
 #include "plssvm/matrix.hpp"        // plssvm::aos_matrix
 #include "plssvm/shape.hpp"         // plssvm::shape

@@ -51,8 +52,8 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num
     const auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);

 #pragma omp parallel for collapse(2)
-    for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t row = 0; row < blocked_num_rows; row += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t row = 0; row < blocked_num_rows; row += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) {
                 for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) {
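Note that, unlike the Python bindings above, the bounds blocked_num_rhs/blocked_num_rows and the step THREAD_BLOCK_SIZE_uz apparently remain std::size_t, so the new loop conditions compare signed with unsigned. If that draws warnings, signed copies of the bounds can be hoisted out of the loop; a standalone sketch, assuming the bounds really are std::size_t:

    #include <cstddef> // std::ptrdiff_t, std::size_t

    void blocked_loops(const std::size_t blocked_num_rhs, const std::size_t blocked_num_rows,
                       const std::size_t thread_block_size) {
        // hoist signed copies once so the conditions compare signed with signed
        const auto rhs_end = static_cast<std::ptrdiff_t>(blocked_num_rhs);
        const auto row_end = static_cast<std::ptrdiff_t>(blocked_num_rows);
        const auto step = static_cast<std::ptrdiff_t>(thread_block_size);

    #pragma omp parallel for collapse(2)
        for (std::ptrdiff_t rhs = 0; rhs < rhs_end; rhs += step) {
            for (std::ptrdiff_t row = 0; row < row_end; row += step) {
                // perform operations on the current block (elided)
            }
        }
    }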

include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
 #include "plssvm/backends/OpenMP/kernel/kernel_functions.hpp" // plssvm::openmp::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                               // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::PADDING_SIZE
 #include "plssvm/detail/assert.hpp"                           // PLSSVM_ASSERT
+#include "plssvm/detail/ssize.hpp"                            // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp"                   // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                                  // plssvm::aos_matrix

@@ -54,8 +55,8 @@ void device_kernel_assembly(const std::vector<real_type> &q, std::vector<real_ty
     const auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);

 #pragma omp parallel for collapse(2) schedule(dynamic)
-    for (std::size_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) {
                 for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) {

include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
 #include "plssvm/constants.hpp"             // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
 #include "plssvm/detail/assert.hpp"         // PLSSVM_ASSERT
 #include "plssvm/detail/operators.hpp"      // overloaded arithmetic operations for a plssvm::matrix
+#include "plssvm/detail/ssize.hpp"          // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                // aos_matrix

@@ -63,8 +64,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector
     const auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);

 #pragma omp parallel for collapse(2) schedule(dynamic)
-    for (std::size_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) {
                 for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) {

include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp

Lines changed: 11 additions & 10 deletions
@@ -15,6 +15,7 @@

 #include "plssvm/constants.hpp"             // plssvm::real_type
 #include "plssvm/detail/assert.hpp"         // PLSSVM_ASSERT
+#include "plssvm/detail/ssize.hpp"          // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                // plssvm::aos_matrix, plssvm::matrix
 #include "plssvm/shape.hpp"                 // plssvm::shape
@@ -42,11 +43,11 @@ inline void device_kernel_w_linear(soa_matrix<real_type> &w, const aos_matrix<re
     const std::size_t num_features = support_vectors.num_cols();

 #pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, num_support_vectors)
-    for (std::size_t a = 0; a < num_classes; ++a) {
-        for (std::size_t dim = 0; dim < num_features; ++dim) {
+    for (plssvm::detail::ssize_t a = 0; a < num_classes; ++a) {
+        for (plssvm::detail::ssize_t dim = 0; dim < num_features; ++dim) {
             real_type temp{ 0.0 };
 #pragma omp simd reduction(+ : temp)
-            for (std::size_t idx = 0; idx < num_support_vectors; ++idx) {
+            for (plssvm::detail::ssize_t idx = 0; idx < num_support_vectors; ++idx) {
                 temp = std::fma(alpha(a, idx), support_vectors(idx, dim), temp);
             }
             w(a, dim) = temp;
@@ -72,11 +73,11 @@ inline void device_kernel_predict_linear(aos_matrix<real_type> &prediction, cons
     const std::size_t num_features = predict_points.num_cols();

 #pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, num_predict_points)
-    for (std::size_t point_index = 0; point_index < num_predict_points; ++point_index) {
-        for (std::size_t a = 0; a < num_classes; ++a) {
+    for (plssvm::detail::ssize_t point_index = 0; point_index < num_predict_points; ++point_index) {
+        for (plssvm::detail::ssize_t a = 0; a < num_classes; ++a) {
             real_type temp{ 0.0 };
 #pragma omp simd reduction(+ : temp)
-            for (std::size_t dim = 0; dim < num_features; ++dim) {
+            for (plssvm::detail::ssize_t dim = 0; dim < num_features; ++dim) {
                 temp = std::fma(w(a, dim), predict_points(point_index, dim), temp);
             }
             prediction(point_index, a) = temp - rho[a];
@@ -115,15 +116,15 @@ inline void device_kernel_predict(aos_matrix<real_type> &prediction, const aos_m
     const auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);

 #pragma omp parallel for collapse(2)
-    for (std::size_t point_index = 0; point_index < num_predict_points; ++point_index) {
-        for (std::size_t a = 0; a < num_classes; ++a) {
+    for (plssvm::detail::ssize_t point_index = 0; point_index < num_predict_points; ++point_index) {
+        for (plssvm::detail::ssize_t a = 0; a < num_classes; ++a) {
             prediction(point_index, a) -= rho[a];
         }
     }

 #pragma omp parallel for collapse(2)
-    for (std::size_t pp = 0; pp < blocked_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t pp = 0; pp < blocked_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) {
                 for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) {
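The innermost loops pair #pragma omp simd with a scalar + reduction and accumulate through std::fma. A minimal standalone instance of that pattern (dot is an illustrative name):

    #include <cmath>   // std::fma
    #include <cstddef> // std::ptrdiff_t
    #include <vector>

    // dot product vectorized via an OpenMP SIMD reduction, mirroring the inner loops
    double dot(const std::vector<double> &x, const std::vector<double> &y) {
        double temp{ 0.0 };
    #pragma omp simd reduction(+ : temp)
        for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(x.size()); ++i) {
            temp = std::fma(x[i], y[i], temp);
        }
        return temp;
    }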

include/plssvm/backends/gpu_csvm.hpp

Lines changed: 15 additions & 14 deletions
@@ -18,6 +18,7 @@
 #include "plssvm/detail/assert.hpp"            // PLSSVM_ASSERT
 #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution}
 #include "plssvm/detail/move_only_any.hpp"     // plssvm::detail::{move_only_any, move_only_any_cast}
+#include "plssvm/detail/ssize.hpp"             // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp"    // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                   // plssvm::aos_matrix, plssvm::soa_matrix
 #include "plssvm/parameter.hpp"                // plssvm::parameter
@@ -255,7 +256,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvm<device_ptr_t, queue_t, pin

     // split memory allocation and memory copy! (necessary to remove locks on some systems and setups)
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -271,7 +272,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvm<device_ptr_t, queue_t, pin
     const pinned_memory_type pm{ A };

 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -346,7 +347,7 @@ void gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::blas_level_3(const solver

     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -359,7 +360,7 @@ void gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::blas_level_3(const solver
     }

 #pragma omp parallel for ordered if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -504,14 +505,14 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_

     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         const queue_type &device = devices_[device_id];

         // allocate memory on the device
         alpha_d[device_id] = device_ptr_type{ alpha.shape(), alpha.padding(), device };
     }
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // copy data to the device
         alpha_d[device_id].copy_to_device(alpha);
     }
@@ -532,7 +533,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
     std::vector<device_ptr_type> sv_d(num_devices);
     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -544,7 +545,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
     }

 #pragma omp parallel for ordered if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -599,29 +600,29 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
         // upload the w vector to all devices
         // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             const queue_type &device = devices_[device_id];

             // allocate memory on the device
             sv_or_w_d[device_id] = device_ptr_type{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
         }
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             // copy data to the device
             sv_or_w_d[device_id].copy_to_device(w);
         }
     } else {
         // use the support vectors for all other kernel functions
         // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             const queue_type &device = devices_[device_id];

             // allocate memory on the device
             sv_or_w_d[device_id] = device_ptr_type{ support_vectors.shape(), support_vectors.padding(), device };
         }
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             // copy data to the device
             sv_or_w_d[device_id].copy_to_device(support_vectors);
         }
@@ -637,7 +638,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_

     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -651,7 +652,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
     }

 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
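All of these device loops carry an if (num_devices > 1) clause: OpenMP spawns a thread team only when the condition holds, so single-device runs execute the loop sequentially without fork/join overhead. A minimal sketch of the pattern with a hypothetical per-device task:

    #include <cstddef> // std::ptrdiff_t
    #include <cstdio>  // std::printf

    // hypothetical per-device work item
    void process_device(const std::ptrdiff_t device_id) {
        std::printf("processing device %td\n", device_id);
    }

    void process_all_devices(const std::ptrdiff_t num_devices) {
        // parallelize only when it pays off; the counter is signed for MSVC
    #pragma omp parallel for if (num_devices > 1)
        for (std::ptrdiff_t device_id = 0; device_id < num_devices; ++device_id) {
            process_device(device_id);
        }
    }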
