
Commit 9150e1b

Try fixing an MSVC bug regarding unsigned OpenMP loop bounds.
1 parent: 8047d7f
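MSVC only implements OpenMP 2.0, whose canonical loop form requires a signed integral induction variable: a #pragma omp parallel for over a std::size_t counter is rejected with error C3016. The changes below therefore switch the affected loop counters to the signed plssvm::detail::ssize_t (or a local ssize_t) and, where needed, cast the unsigned bound once in the loop condition. A minimal sketch of the failure mode and the workaround, independent of PLSSVM:

    #include <cstddef> // std::ptrdiff_t, std::size_t
    #include <vector>

    void scale(std::vector<double> &v, const double factor) {
        // rejected by MSVC's OpenMP 2.0 front end (error C3016), since the
        // induction variable must have *signed* integral type:
        //   #pragma omp parallel for
        //   for (std::size_t i = 0; i < v.size(); ++i) { v[i] *= factor; }

        // portable form: signed counter, unsigned bound cast once in the condition
    #pragma omp parallel for
        for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(v.size()); ++i) {
            v[i] *= factor;
        }
    }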

22 files changed: +174 -113 lines changed


bindings/Python/sklearn_svc.cpp

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 #include "plssvm/constants.hpp"                        // plssvm::real_type
 #include "plssvm/csvm_factory.hpp"                     // plssvm::make_csvc
 #include "plssvm/data_set/classification_data_set.hpp" // plssvm::classification_data_set
+#include "plssvm/detail/ssize.hpp"                     // plssvm::detail::ssize_t
 #include "plssvm/detail/type_traits.hpp"               // plssvm::detail::remove_cvref_t
 #include "plssvm/gamma.hpp"                            // plssvm::gamma_coefficient_type, plssvm::gamma_type
 #include "plssvm/kernel_function_types.hpp"            // plssvm::kernel_function_type
@@ -556,7 +557,7 @@ void init_sklearn_svc(py::module_ &m) {
     std::merge(index_sets[i].cbegin(), index_sets[i].cend(), index_sets[j].cbegin(), index_sets[j].cend(), sorted_indices.begin());
     // copy the support vectors to the binary support vectors
 #pragma omp parallel for collapse(2)
-    for (std::size_t si = 0; si < num_data_points_in_sub_matrix; ++si) {
+    for (ssize_t si = 0; si < static_cast<ssize_t>(num_data_points_in_sub_matrix); ++si) {
         for (std::size_t dim = 0; dim < num_features; ++dim) {
             temp(si, dim) = model.support_vectors()(sorted_indices[si], dim);
         }
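plssvm/detail/ssize.hpp itself is not part of the shown hunks. Judging from the include comments (plssvm::detail::ssize_t here, plssvm::detail::{ssize_t, ssize} below), a plausible minimal sketch of that header, assuming ssize_t aliases std::ptrdiff_t and ssize() mirrors C++20's std::ssize:

    #pragma once

    #include <cstddef> // std::ptrdiff_t

    namespace plssvm::detail {

    // signed counterpart to std::size_t for OpenMP loop counters (assumed alias)
    using ssize_t = std::ptrdiff_t;

    // signed size of a container, in the spirit of C++20's std::ssize (assumed helper)
    template <typename Container>
    [[nodiscard]] constexpr ssize_t ssize(const Container &c) {
        return static_cast<ssize_t>(c.size());
    }

    } // namespace plssvm::detail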

bindings/Python/type_caster/matrix_type_caster.hpp

Lines changed: 5 additions & 4 deletions
@@ -14,6 +14,7 @@
 #pragma once

 #include "plssvm/constants.hpp"                // plssvm::PADDING_SIZE
+#include "plssvm/detail/ssize.hpp"             // plssvm::detail::{ssize_t, ssize}
 #include "plssvm/detail/string_conversion.hpp" // plssvm::detail::convert_to
 #include "plssvm/matrix.hpp"                   // plssvm::matrix, plssvm::layout_type
 #include "plssvm/shape.hpp"                    // plssvm::shape
@@ -115,13 +116,13 @@ struct type_caster<plssvm::matrix<T, layout>> {
         if constexpr (layout == plssvm::layout_type::aos) {
             // memory layout of Python Numpy array and PLSSVM matrix are the same -> can use memcpy to convert
 #pragma omp parallel for
-            for (std::size_t row = 0; row < num_rows; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_rows); ++row) {
                 std::memcpy(value.data() + row * value.num_cols_padded(), ptr + row * num_cols, num_cols * sizeof(T));
             }
         } else if constexpr (layout == plssvm::layout_type::soa) {
             // the memory layouts don't match -> must use loops to convert layouts
 #pragma omp parallel for collapse(2)
-            for (std::size_t row = 0; row < num_rows; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_rows); ++row) {
                 for (std::size_t col = 0; col < num_cols; ++col) {
                     value(row, col) = ptr[row * num_cols + col];
                 }
@@ -134,15 +135,15 @@ struct type_caster<plssvm::matrix<T, layout>> {
         if constexpr (layout == plssvm::layout_type::aos) {
             // the memory layouts don't match -> must use loops to convert layouts
 #pragma omp parallel for collapse(2)
-            for (std::size_t row = 0; row < num_rows; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_rows); ++row) {
                 for (std::size_t col = 0; col < num_cols; ++col) {
                     value(row, col) = ptr[col * num_rows + row];
                 }
             }
         } else if constexpr (layout == plssvm::layout_type::soa) {
             // memory layout of Python Numpy array and PLSSVM matrix are the same -> can use memcpy to convert
 #pragma omp parallel for
-            for (std::size_t row = 0; row < num_cols; ++row) {
+            for (plssvm::detail::ssize_t row = 0; row < static_cast<plssvm::detail::ssize_t>(num_cols); ++row) {
                 std::memcpy(value.data() + row * value.num_rows_padded(), ptr + row * num_rows, num_rows * sizeof(T));
             }
         } else {
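When the NumPy and PLSSVM layouts agree, the caster copies whole rows with std::memcpy into a padded destination; when they differ, it transposes element by element. A self-contained sketch of the row-wise memcpy path; dst, src, and num_cols_padded are illustrative names, not the caster's actual locals:

    #include <cstddef> // std::ptrdiff_t, std::size_t
    #include <cstring> // std::memcpy

    // copy a row-major num_rows x num_cols source into a destination whose rows
    // are padded to num_cols_padded >= num_cols (hypothetical padded layout)
    void copy_aos(double *dst, const double *src,
                  const std::size_t num_rows, const std::size_t num_cols,
                  const std::size_t num_cols_padded) {
        // layouts agree row by row -> one memcpy per row; the counter is signed
        // so that MSVC's OpenMP 2.0 accepts the parallel loop
    #pragma omp parallel for
        for (std::ptrdiff_t row = 0; row < static_cast<std::ptrdiff_t>(num_rows); ++row) {
            std::memcpy(dst + row * static_cast<std::ptrdiff_t>(num_cols_padded),
                        src + row * static_cast<std::ptrdiff_t>(num_cols),
                        num_cols * sizeof(double));
        }
    }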

include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp

Lines changed: 3 additions & 2 deletions
@@ -15,6 +15,7 @@

 #include "plssvm/constants.hpp"     // plssvm::real_type
 #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT
+#include "plssvm/detail/ssize.hpp"  // plssvm::detail::ssize_t
 #include "plssvm/matrix.hpp"        // plssvm::aos_matrix
 #include "plssvm/shape.hpp"         // plssvm::shape

@@ -51,8 +52,8 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num
     const auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);

 #pragma omp parallel for collapse(2)
-    for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t row = 0; row < blocked_num_rows; row += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t row = 0; row < blocked_num_rows; row += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) {
                 for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) {
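Note that, unlike the Python bindings above, the bounds blocked_num_rhs/blocked_num_rows and the step THREAD_BLOCK_SIZE_uz apparently remain std::size_t, so the new loop conditions compare signed with unsigned. If that draws warnings, signed copies of the bounds can be hoisted out of the loop; a standalone sketch, assuming the bounds really are std::size_t:

    #include <cstddef> // std::ptrdiff_t, std::size_t

    void blocked_loops(const std::size_t blocked_num_rhs, const std::size_t blocked_num_rows,
                       const std::size_t thread_block_size) {
        // hoist signed copies once so the conditions compare signed with signed
        const auto rhs_end = static_cast<std::ptrdiff_t>(blocked_num_rhs);
        const auto row_end = static_cast<std::ptrdiff_t>(blocked_num_rows);
        const auto step = static_cast<std::ptrdiff_t>(thread_block_size);

    #pragma omp parallel for collapse(2)
        for (std::ptrdiff_t rhs = 0; rhs < rhs_end; rhs += step) {
            for (std::ptrdiff_t row = 0; row < row_end; row += step) {
                // perform operations on the current block (elided)
            }
        }
    }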

include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
 #include "plssvm/backends/OpenMP/kernel/kernel_functions.hpp" // plssvm::openmp::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                               // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::PADDING_SIZE
 #include "plssvm/detail/assert.hpp"                           // PLSSVM_ASSERT
+#include "plssvm/detail/ssize.hpp"                            // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp"                   // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                                  // plssvm::aos_matrix

@@ -54,8 +55,8 @@ void device_kernel_assembly(const std::vector<real_type> &q, std::vector<real_ty
     const auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);

 #pragma omp parallel for collapse(2) schedule(dynamic)
-    for (std::size_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) {
                 for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) {

include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
 #include "plssvm/constants.hpp"             // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
 #include "plssvm/detail/assert.hpp"         // PLSSVM_ASSERT
 #include "plssvm/detail/operators.hpp"      // overloaded arithmetic operations for a plssvm::matrix
+#include "plssvm/detail/ssize.hpp"          // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                // aos_matrix

@@ -63,8 +64,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector
     const auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);

 #pragma omp parallel for collapse(2) schedule(dynamic)
-    for (std::size_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t row = 0; row < blocked_dept; row += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t col = 0; col < blocked_dept; col += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) {
                 for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) {

include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp

Lines changed: 11 additions & 10 deletions
@@ -15,6 +15,7 @@

 #include "plssvm/constants.hpp"             // plssvm::real_type
 #include "plssvm/detail/assert.hpp"         // PLSSVM_ASSERT
+#include "plssvm/detail/ssize.hpp"          // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                // plssvm::aos_matrix, plssvm::matrix
 #include "plssvm/shape.hpp"                 // plssvm::shape
@@ -42,11 +43,11 @@ inline void device_kernel_w_linear(soa_matrix<real_type> &w, const aos_matrix<re
     const std::size_t num_features = support_vectors.num_cols();

 #pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, num_support_vectors)
-    for (std::size_t a = 0; a < num_classes; ++a) {
-        for (std::size_t dim = 0; dim < num_features; ++dim) {
+    for (plssvm::detail::ssize_t a = 0; a < num_classes; ++a) {
+        for (plssvm::detail::ssize_t dim = 0; dim < num_features; ++dim) {
             real_type temp{ 0.0 };
 #pragma omp simd reduction(+ : temp)
-            for (std::size_t idx = 0; idx < num_support_vectors; ++idx) {
+            for (plssvm::detail::ssize_t idx = 0; idx < num_support_vectors; ++idx) {
                 temp = std::fma(alpha(a, idx), support_vectors(idx, dim), temp);
             }
             w(a, dim) = temp;
@@ -72,11 +73,11 @@ inline void device_kernel_predict_linear(aos_matrix<real_type> &prediction, cons
     const std::size_t num_features = predict_points.num_cols();

 #pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, num_predict_points)
-    for (std::size_t point_index = 0; point_index < num_predict_points; ++point_index) {
-        for (std::size_t a = 0; a < num_classes; ++a) {
+    for (plssvm::detail::ssize_t point_index = 0; point_index < num_predict_points; ++point_index) {
+        for (plssvm::detail::ssize_t a = 0; a < num_classes; ++a) {
             real_type temp{ 0.0 };
 #pragma omp simd reduction(+ : temp)
-            for (std::size_t dim = 0; dim < num_features; ++dim) {
+            for (plssvm::detail::ssize_t dim = 0; dim < num_features; ++dim) {
                 temp = std::fma(w(a, dim), predict_points(point_index, dim), temp);
             }
             prediction(point_index, a) = temp - rho[a];
@@ -115,15 +116,15 @@ inline void device_kernel_predict(aos_matrix<real_type> &prediction, const aos_m
     const auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);

 #pragma omp parallel for collapse(2)
-    for (std::size_t point_index = 0; point_index < num_predict_points; ++point_index) {
-        for (std::size_t a = 0; a < num_classes; ++a) {
+    for (plssvm::detail::ssize_t point_index = 0; point_index < num_predict_points; ++point_index) {
+        for (plssvm::detail::ssize_t a = 0; a < num_classes; ++a) {
             prediction(point_index, a) -= rho[a];
         }
     }

 #pragma omp parallel for collapse(2)
-    for (std::size_t pp = 0; pp < blocked_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) {
-        for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) {
+    for (plssvm::detail::ssize_t pp = 0; pp < blocked_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) {
+        for (plssvm::detail::ssize_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) {
             // perform operations on the current block
             for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) {
                 for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) {
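The innermost loops pair #pragma omp simd with a scalar + reduction and accumulate through std::fma. A minimal standalone instance of that pattern (dot is an illustrative name):

    #include <cmath>   // std::fma
    #include <cstddef> // std::ptrdiff_t
    #include <vector>

    // dot product vectorized via an OpenMP SIMD reduction, mirroring the inner loops
    double dot(const std::vector<double> &x, const std::vector<double> &y) {
        double temp{ 0.0 };
    #pragma omp simd reduction(+ : temp)
        for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(x.size()); ++i) {
            temp = std::fma(x[i], y[i], temp);
        }
        return temp;
    }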

include/plssvm/backends/gpu_csvm.hpp

Lines changed: 15 additions & 14 deletions
@@ -18,6 +18,7 @@
 #include "plssvm/detail/assert.hpp"            // PLSSVM_ASSERT
 #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution}
 #include "plssvm/detail/move_only_any.hpp"     // plssvm::detail::{move_only_any, move_only_any_cast}
+#include "plssvm/detail/ssize.hpp"             // plssvm::detail::ssize_t
 #include "plssvm/kernel_function_types.hpp"    // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                   // plssvm::aos_matrix, plssvm::soa_matrix
 #include "plssvm/parameter.hpp"                // plssvm::parameter
@@ -255,7 +256,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvm<device_ptr_t, queue_t, pin

     // split memory allocation and memory copy! (necessary to remove locks on some systems and setups)
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -271,7 +272,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvm<device_ptr_t, queue_t, pin
     const pinned_memory_type pm{ A };

 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -346,7 +347,7 @@ void gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::blas_level_3(const solver

     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -359,7 +360,7 @@ void gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::blas_level_3(const solver
     }

 #pragma omp parallel for ordered if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -504,14 +505,14 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_

     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         const queue_type &device = devices_[device_id];

         // allocate memory on the device
         alpha_d[device_id] = device_ptr_type{ alpha.shape(), alpha.padding(), device };
     }
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // copy data to the device
         alpha_d[device_id].copy_to_device(alpha);
     }
@@ -532,7 +533,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
     std::vector<device_ptr_type> sv_d(num_devices);
     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -544,7 +545,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
     }

 #pragma omp parallel for ordered if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -599,29 +600,29 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
         // upload the w vector to all devices
         // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             const queue_type &device = devices_[device_id];

             // allocate memory on the device
             sv_or_w_d[device_id] = device_ptr_type{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
         }
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             // copy data to the device
             sv_or_w_d[device_id].copy_to_device(w);
         }
     } else {
         // use the support vectors for all other kernel functions
         // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             const queue_type &device = devices_[device_id];

             // allocate memory on the device
             sv_or_w_d[device_id] = device_ptr_type{ support_vectors.shape(), support_vectors.padding(), device };
         }
 #pragma omp parallel for if (num_devices > 1)
-        for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+        for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
             // copy data to the device
             sv_or_w_d[device_id].copy_to_device(support_vectors);
         }
@@ -637,7 +638,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_

     // split memory allocation and memory copy!
 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
@@ -651,7 +652,7 @@ aos_matrix<real_type> gpu_csvm<device_ptr_t, queue_t, pinned_memory_t>::predict_
     }

 #pragma omp parallel for if (num_devices > 1)
-    for (std::size_t device_id = 0; device_id < num_devices; ++device_id) {
+    for (ssize_t device_id = 0; device_id < num_devices; ++device_id) {
         // check whether the current device is responsible for at least one data point!
         if (data_distribution_->place_specific_num_rows(device_id) == 0) {
             continue;
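All of these device loops carry an if (num_devices > 1) clause: OpenMP spawns a thread team only when the condition holds, so single-device runs execute the loop sequentially without fork/join overhead. A minimal sketch of the pattern with a hypothetical per-device task:

    #include <cstddef> // std::ptrdiff_t
    #include <cstdio>  // std::printf

    // hypothetical per-device work item
    void process_device(const std::ptrdiff_t device_id) {
        std::printf("processing device %td\n", device_id);
    }

    void process_all_devices(const std::ptrdiff_t num_devices) {
        // parallelize only when it pays off; the counter is signed for MSVC
    #pragma omp parallel for if (num_devices > 1)
        for (std::ptrdiff_t device_id = 0; device_id < num_devices; ++device_id) {
            process_device(device_id);
        }
    }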
