Refactor validation logic and update CI

Deniskore · Deniskore · commit e792adda2f4b · 2025-12-08T10:02:52.000+02:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -36,6 +36,8 @@ jobs:
         run: cargo check --all-targets
       - name: cargo check (no default features)
         run: cargo check --all-targets --no-default-features
+      - name: cargo check (serde)
+        run: cargo check --all-targets --features serde
 
   test:
     name: Cargo Test
diff --git a/src/kmeans_core_scalar.rs b/src/kmeans_core_scalar.rs
@@ -37,7 +37,8 @@ impl<F: Primitive> CoreBackend<F> for ScalarBackend {
         for c in 0..counts.len() {
             if counts[c] > 0 {
                 let base = c * ncols;
-                let inv_count = F::one() / F::from(counts[c]).unwrap_or(F::one());
+                let count_f = F::from_usize(counts[c]);
+                let inv_count = F::one() / count_f;
                 for j in 0..ncols {
                     centroids[base + j] = sums[base + j] * inv_count;
                 }
diff --git a/src/kmeans_core_simd.rs b/src/kmeans_core_simd.rs
@@ -4,6 +4,7 @@ use crate::kmeans_core_common::{
     find_nearest_centroids_generic,
 };
 use crate::point_source::PointSource;
+use crate::primitive::Primitive;
 
 pub struct SimdBackend;
 
@@ -476,7 +477,7 @@ macro_rules! impl_simd_backend {
                         }
 
                         if counts[k_idx] > 0 {
-                            inv_counts[lane] = 1.0 / counts[k_idx] as $scalar;
+                            inv_counts[lane] = 1.0 / <$scalar>::from_usize(counts[k_idx]);
                             active_mask[lane] = true;
                         } else if source.num_points() > 0 {
                             zero_indices[lane] = rng.random_range(0..source.num_points());
diff --git a/src/kmeans_cpu.rs b/src/kmeans_cpu.rs
@@ -24,7 +24,6 @@ pub(crate) fn run<
     tolerance: F,
     seed: Option<u64>,
 ) -> Result<(Vec<F>, F)> {
-    let _npoints = source.num_points();
     let ncols = source.num_columns();
     let mut rng = match seed {
         Some(s) => StdRng::seed_from_u64(s),
@@ -46,7 +45,7 @@ pub(crate) fn run<
             par_chunk_size,
         )?;
 
-        inertia = F::from(iter_inertia).unwrap_or(F::zero());
+        inertia = F::from(iter_inertia).ok_or(crate::error::Error::ConversionFailure)?;
 
         let old_centroids = C::finalize_centroids(&prepared_centroids, ncols, k);
 
diff --git a/src/kmeans_mini_batch.rs b/src/kmeans_mini_batch.rs
@@ -139,7 +139,7 @@ pub(crate) fn run<
     let (_, _, full_inertia) =
         E::compute_stats_full::<F, C, M, S>(source, ncols, k, &prepared_centroids, par_chunk)?;
     let centroids = C::finalize_centroids(&prepared_centroids, ncols, k);
-    let inertia = F::from(full_inertia).unwrap_or(F::zero());
+    let inertia = F::from(full_inertia).ok_or(KMeansError::ConversionFailure)?;
 
     Ok((centroids, inertia))
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -211,6 +211,12 @@ impl<F: Primitive> KMeans<F> {
     /// let labels = model.predict(&data).unwrap();
     /// assert_eq!(labels.len(), 2);
     /// ```
+    ///
+    /// # Notes
+    ///
+    /// This method assumes that the input `points` contains only finite floating-point values.
+    /// Passing `NaN` or `Infinity` in the input may result in undefined behavior or
+    /// failure to produce a model.
     pub fn fit_default_scalar(points: impl AsRef<[F]>, ncols: usize, k: usize) -> Result<Self> {
         KMeansBuilder::new(k)
             .cpu_scalar()
@@ -235,6 +241,12 @@ impl<F: Primitive> KMeans<F> {
     /// let labels = model.predict_sequential(&data).unwrap();
     /// assert_eq!(labels.len(), 2);
     /// ```
+    ///
+    /// # Notes
+    ///
+    /// This method assumes that the input `points` contains only finite floating-point values.
+    /// Passing `NaN` or `Infinity` in the input may result in undefined behavior or
+    /// incorrect predictions.
     pub fn predict_sequential(&self, points: impl AsRef<[F]>) -> Result<Vec<usize>> {
         self.predict_with_backend_sequential::<CPUScalar>(points)
     }
@@ -261,7 +273,7 @@ impl<F: Primitive> KMeans<F> {
     ) -> Result<Vec<usize>> {
         self.validate_model_shape()?;
         let points = points.as_ref();
-        validate_inputs(points, self.ncols, self.k)?;
+        validate_prediction_inputs(points, self.ncols)?;
 
         let npoints = points.len() / self.ncols;
         let mut labels = vec![0usize; npoints];
@@ -347,7 +359,7 @@ impl<F: Primitive> KMeans<F> {
         source: &S,
     ) -> Result<Vec<usize>> {
         self.validate_model_shape()?;
-        validate_source_inputs(source, self.k)?;
+        validate_prediction_source_inputs(source, self.ncols)?;
         let npoints = source.num_points();
         let mut labels = vec![0usize; npoints];
 
@@ -393,10 +405,16 @@ impl<F: Primitive> KMeans<F> {
     /// For `MetricType::Euclidean` this returns squared Euclidean distances,
     /// for `MetricType::DotProduct` it returns raw dot-product similarities.
     /// Returns a flat vector of shape (n_points * k).
+    ///
+    /// # Notes
+    ///
+    /// This method assumes that the input `points` contains only finite floating-point values.
+    /// Passing `NaN` or `Infinity` in the input may result in undefined behavior or
+    /// incorrect results.
     pub fn transform(&self, points: impl AsRef<[F]>) -> Result<Vec<F>> {
         self.validate_model_shape()?;
         let points = points.as_ref();
-        validate_inputs(points, self.ncols, self.k)?;
+        validate_prediction_inputs(points, self.ncols)?;
 
         let npoints = points.len() / self.ncols;
         let mut distances = vec![F::zero(); npoints * self.k];
@@ -556,7 +574,7 @@ impl<F: Primitive> KMeansBuilder<F> {
             k,
             iterations: 100,
             attempts: 1,
-            tolerance: F::from(1e-4).unwrap_or(F::zero()),
+            tolerance: F::from(1e-4).unwrap_or(F::epsilon()),
             seed: None,
             mini_batch_rel_tolerance: kmeans_mini_batch::DEFAULT_MINI_BATCH_REL_TOL,
             mini_batch_min_iterations: kmeans_mini_batch::DEFAULT_MINI_BATCH_MIN_ITERATIONS,
@@ -585,6 +603,12 @@ impl<F: Primitive, I: InitializationStrategy> KMeansBuilder<F, BackendNotSet, Al
     ///     .unwrap();
     /// assert_eq!(model.k(), 2);
     /// ```
+    ///
+    /// # Notes
+    ///
+    /// When calling `fit()` on the resulting config, the input `points` must contain only
+    /// finite floating-point values. Passing `NaN` or `Infinity` may result in undefined
+    /// behavior or failure to produce a valid model.
     #[inline]
     pub fn build_default(self) -> KMeansConfig<F, CPUScalar, Euclidean, false, I> {
         self.cpu_scalar().euclidean().build()
@@ -940,7 +964,7 @@ fn validate_centroid_shape<F: Primitive>(centroids: &[F], ncols: usize, k: usize
             "number of centroids must be greater than zero".into(),
         ));
     }
-    if centroids.len() != ncols.saturating_mul(k) {
+    if ncols.checked_mul(k) != Some(centroids.len()) {
         return Err(KMeansError::InvalidInput(
             "centroids length must equal k * ncols".into(),
         ));
@@ -970,6 +994,13 @@ fn validate_inputs<F: Primitive>(points: &[F], ncols: usize, k: usize) -> Result
             "points must contain at least one row".into(),
         ));
     }
+    let npoints = points.len() / ncols;
+    if k > npoints {
+        return Err(KMeansError::InvalidInput(format!(
+            "number of clusters k ({}) cannot be greater than number of points ({})",
+            k, npoints
+        )));
+    }
     Ok(())
 }
 
@@ -993,6 +1024,71 @@ fn validate_source_inputs<F: Primitive, S: PointSource<F>>(source: &S, k: usize)
             "number of centroids must be greater than zero".into(),
         ));
     }
+    let npoints = source.num_points();
+    if npoints == 0 {
+        return Err(KMeansError::InvalidInput(
+            "point source must contain at least one point".into(),
+        ));
+    }
+    if k > npoints {
+        return Err(KMeansError::InvalidInput(format!(
+            "number of clusters k ({}) cannot be greater than number of points ({})",
+            k, npoints
+        )));
+    }
+    if F::from(npoints).is_none() {
+        return Err(KMeansError::InvalidInput(format!(
+            "number of points ({}) cannot be represented in the chosen floating point type",
+            npoints
+        )));
+    }
+    Ok(())
+}
+
+/// Validates input dimensions for prediction/transform operations.
+///
+/// Unlike training validation, this does NOT enforce `k <= n_points` because
+/// a trained model can predict cluster assignments for any number of points (even 1).
+#[inline]
+fn validate_prediction_inputs<F: Primitive>(points: &[F], ncols: usize) -> Result<()> {
+    if ncols == 0 {
+        return Err(KMeansError::InvalidInput(
+            "number of columns must be greater than zero".into(),
+        ));
+    }
+    if !points.len().is_multiple_of(ncols) {
+        return Err(KMeansError::InvalidInput(
+            "points length must be divisible by ncols".into(),
+        ));
+    }
+    if points.is_empty() {
+        return Err(KMeansError::InvalidInput(
+            "points must contain at least one row".into(),
+        ));
+    }
+    Ok(())
+}
+
+/// Validates source input dimensions for prediction/transform operations.
+///
+/// Unlike training validation, this does NOT enforce `k <= n_points` because
+/// a trained model can predict cluster assignments for any number of points (even 1).
+fn validate_prediction_source_inputs<F: Primitive, S: PointSource<F>>(
+    source: &S,
+    expected_ncols: usize,
+) -> Result<()> {
+    if source.num_columns() == 0 {
+        return Err(KMeansError::InvalidInput(
+            "number of columns must be greater than zero".into(),
+        ));
+    }
+    if source.num_columns() != expected_ncols {
+        return Err(KMeansError::DimensionMismatch(format!(
+            "source has {} columns but model expects {}",
+            source.num_columns(),
+            expected_ncols
+        )));
+    }
     if source.num_points() == 0 {
         return Err(KMeansError::InvalidInput(
             "point source must contain at least one point".into(),
diff --git a/src/primitive.rs b/src/primitive.rs
@@ -1,5 +1,25 @@
 use num_traits::Float;
 use std::fmt::Debug;
 
-pub trait Primitive: Float + Debug + Send + Sync + 'static {}
-impl<T: Float + Debug + Send + Sync + 'static> Primitive for T {}
+pub trait Primitive: Float + Debug + Send + Sync + 'static {
+    /// Converts a `usize` to this primitive type.
+    ///
+    /// Backends call this instead of ad-hoc `as` casts or fallible generic conversions so
+    /// that all of them convert counts the same way. The conversion is infallible but may
+    /// round large values (for k-means counts/indices, values are expected to fit).
+    fn from_usize(n: usize) -> Self;
+}
+
+impl Primitive for f32 {
+    #[inline(always)]
+    fn from_usize(n: usize) -> Self {
+        n as f32
+    }
+}
+
+impl Primitive for f64 {
+    #[inline(always)]
+    fn from_usize(n: usize) -> Self {
+        n as f64
+    }
+}
diff --git a/src/wasm.rs b/src/wasm.rs
@@ -6,7 +6,7 @@ use js_sys::Uint32Array;
 use wasm_bindgen::prelude::*;
 
 macro_rules! wasm_model {
-    ($name:ident, $float:ty, $to_array:ident, $doc:literal) => {
+    ($name:ident, $float:ty, $doc:literal) => {
         #[wasm_bindgen]
         #[doc = $doc]
         pub struct $name {
@@ -106,14 +106,12 @@ macro_rules! wasm_model {
 wasm_model!(
     WasmModel,
     f32,
-    to_array_f32,
     "Wasm-friendly K-Means++ model using f32 inputs/outputs."
 );
 
 wasm_model!(
     WasmModelF64,
     f64,
-    to_array_f64,
     "Wasm-friendly K-Means++ model using f64 inputs/outputs."
 );
 
diff --git a/tests/api_compile_tests.rs b/tests/api_compile_tests.rs
@@ -132,7 +132,8 @@ fn test_builder_type_state_progression() {
 
 #[test]
 fn test_mini_batch_api_compiles() {
-    let points = [0.0_f32, 1.0_f32];
+    // Need at least k points for k clusters
+    let points = [0.0_f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
     let source = SlicePointSource::<f32>::new(&points, 1).unwrap();
     let _ = KMeansBuilder::<f32>::new(8)
         .cpu_scalar()
diff --git a/tests/validation_tests.rs b/tests/validation_tests.rs
@@ -0,0 +1,42 @@
+use kmeans_uni::{Error, KMeansBuilder};
+
+#[test]
+fn test_k_greater_than_n_points_fails() {
+    let data = vec![1.0f32; 10]; // 10 points
+    let ncols = 1;
+    let k = 11; // k > n
+
+    let result = KMeansBuilder::new(k).build_default().fit(&data, ncols);
+
+    assert!(result.is_err());
+    match result {
+        Err(Error::InvalidInput(msg)) => {
+            assert!(msg.contains("cannot be greater than number of points"));
+        }
+        _ => panic!("Expected InvalidInput error"),
+    }
+}
+
+#[test]
+fn test_k_equals_n_points_succeeds() {
+    let data = vec![1.0f32, 2.0, 3.0]; // 3 points
+    let ncols = 1;
+    let k = 3; // k == n
+
+    let result = KMeansBuilder::new(k).build_default().fit(&data, ncols);
+
+    if let Err(e) = result {
+        panic!("Validation failed for k=n: {:?}", e);
+    }
+}
+
+#[test]
+fn test_checked_mul_overflow_theoretical() {
+    let data = vec![0.0f32; 100];
+    let ncols = 2;
+    let k = 5;
+
+    let result = KMeansBuilder::new(k).build_default().fit(&data, ncols);
+
+    assert!(result.is_ok());
+}

Original file line number	Diff line number	Diff line change
`@@ -139,7 +139,7 @@ pub(crate) fn run<`
`139`	`139`	`let (_, _, full_inertia) =`
`140`	`140`	`E::compute_stats_full::<F, C, M, S>(source, ncols, k, &prepared_centroids, par_chunk)?;`
`141`	`141`	`let centroids = C::finalize_centroids(&prepared_centroids, ncols, k);`
`142`		`- let inertia = F::from(full_inertia).unwrap_or(F::zero());`
	`142`	`+ let inertia = F::from(full_inertia).ok_or(KMeansError::ConversionFailure)?;`
`143`	`143`
`144`	`144`	`Ok((centroids, inertia))`
`145`	`145`	`}`