Skip to content

Commit 60f520c

Browse files
Added sample weight to SINDy fit method.
To do: account for weak-form SINDy.
1 parent dcf84dc commit 60f520c

File tree

3 files changed

+84
-4
lines changed

3 files changed

+84
-4
lines changed

pysindy/optimizers/stlsq.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,15 @@ class STLSQ(BaseOptimizer):
7878
history_ : list
7979
History of ``coef_``. ``history_[k]`` contains the values of
8080
``coef_`` at iteration k of sequentially thresholded least-squares.
81+
82+
83+
Notes
84+
-----
85+
- Supports ``sample_weight`` during :meth:`fit`. Sample weights are applied
86+
by rescaling rows of the regression problem (X, y) before column
87+
normalization and thresholding. This allows weighted least squares
88+
formulations in SINDy.
89+
- When ``sample_weight`` is not provided, all samples are treated equally.
8190
8291
Examples
8392
--------

pysindy/pysindy.py

Lines changed: 72 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,8 @@ def __init__(
294294
differentiation_method = FiniteDifference(axis=-2)
295295
self.differentiation_method = differentiation_method
296296
self.discrete_time = discrete_time
297+
self.set_fit_request(sample_weight=True)
298+
self.set_score_request(sample_weight=True)
297299

298300
def fit(
299301
self,
@@ -302,6 +304,7 @@ def fit(
302304
x_dot=None,
303305
u=None,
304306
feature_names: Optional[list[str]] = None,
307+
sample_weight=None,
305308
):
306309
"""
307310
Fit a SINDy model.
@@ -342,6 +345,11 @@ def fit(
342345
feature_names : list of string, length n_input_features, optional
343346
Names for the input features (e.g. :code:`['x', 'y', 'z']`).
344347
If None, will use :code:`['x0', 'x1', ...]`.
348+
349+
sample_weight : array-like of shape (n_trajectories,) or (n_samples,), optional
350+
Per-trajectory or per-sample weights for the regression. Passed internally to
351+
the optimizer (e.g. STLSQ). Supports compatibility with
352+
scikit-learn tools such as GridSearchCV when using weighted data.
345353
346354
Returns
347355
-------
@@ -371,14 +379,18 @@ def fit(
371379

372380
self.feature_names = feature_names
373381

382+
# User may give one weight per trajectory or one weight per sample
383+
if sample_weight is not None:
384+
sample_weight = _expand_sample_weights(sample_weight, x)
385+
374386
steps = [
375387
("features", self.feature_library),
376388
("shaping", SampleConcatter()),
377389
("model", self.optimizer),
378390
]
379391
x_dot = concat_sample_axis(x_dot)
380392
self.model = Pipeline(steps)
381-
self.model.fit(x, x_dot)
393+
self.model.fit(x, x_dot, model__sample_weight=sample_weight)
382394
self._fit_shape()
383395

384396
return self
@@ -412,6 +424,7 @@ def predict(self, x, u=None):
412424
x, _, u = _comprehend_and_validate_inputs(x, 1, None, u, self.feature_library)
413425

414426
check_is_fitted(self, "model")
427+
415428
if self.n_control_features_ > 0 and u is None:
416429
raise TypeError("Model was fit using control variables, so u is required")
417430
if self.n_control_features_ == 0 and u is not None:
@@ -467,7 +480,7 @@ def print(self, lhs=None, precision=3, **kwargs):
467480
names = f"{lhs[i]}"
468481
print(f"{names} = {eqn}", **kwargs)
469482

470-
def score(self, x, t, x_dot=None, u=None, metric=r2_score, **metric_kws):
483+
def score(self, x, t, x_dot=None, u=None, metric=r2_score, sample_weight=None, **metric_kws):
471484
"""
472485
Returns a score for the time derivative prediction produced by the model.
473486
@@ -500,9 +513,14 @@ def score(self, x, t, x_dot=None, u=None, metric=r2_score, **metric_kws):
500513
See `Scikit-learn \
501514
<https://scikit-learn.org/stable/modules/model_evaluation.html>`_
502515
for more options.
516+
517+
sample_weight : array-like of shape (n_trajectories,) or (n_samples,), optional
518+
Per-sample weights passed directly to the metric. This is the
519+
preferred way to supply weights.
503520
504521
metric_kws: dict, optional
505522
Optional keyword arguments to pass to the metric function.
523+
506524
507525
Returns
508526
-------
@@ -523,10 +541,21 @@ def score(self, x, t, x_dot=None, u=None, metric=r2_score, **metric_kws):
523541

524542
x, x_dot = self._process_trajectories(x, t, x_dot)
525543

544+
if sample_weight is not None:
545+
sample_weight = _expand_sample_weights(sample_weight, x)
546+
526547
x_dot = concat_sample_axis(x_dot)
527548
x_dot_predict = concat_sample_axis(x_dot_predict)
528549

529-
x_dot, x_dot_predict = drop_nan_samples(x_dot, x_dot_predict)
550+
if sample_weight is not None:
551+
x_dot, x_dot_predict, good_idx = drop_nan_samples(
552+
x_dot, x_dot_predict, return_indices=True
553+
)
554+
sample_weight = sample_weight[good_idx]
555+
metric_kws = {**metric_kws, "sample_weight": sample_weight}
556+
else:
557+
x_dot, x_dot_predict = drop_nan_samples(x_dot, x_dot_predict)
558+
530559
return metric(x_dot, x_dot_predict, **metric_kws)
531560

532561
def _process_trajectories(self, x, t, x_dot):
@@ -910,3 +939,43 @@ def comprehend_and_validate(arr, t):
910939
)
911940
u = [comprehend_and_validate(ui, ti) for ui, ti in _zip_like_sequence(u, t)]
912941
return x, x_dot, u
942+
943+
def _expand_sample_weights(sample_weight, trajectories):
944+
"""Expand trajectory-level weights to per-sample weights.
945+
946+
Parameters
947+
----------
948+
sample_weight : array-like of shape (n_trajectories,) or (n_samples,), default=None
949+
If length == n_trajectories, each trajectory weight is expanded to cover
950+
all samples in that trajectory.
951+
If length == n_samples, interpreted as per-sample weights directly.
952+
If None, uniform weighting is applied.
953+
954+
trajectories : list of array-like
955+
The list of input trajectories, each shape (n_samples_i, n_features).
956+
957+
Returns
958+
-------
959+
sample_weight : ndarray of shape (sum_i n_samples_i,)
960+
Per-sample weights, ready to use in metrics.
961+
"""
962+
if sample_weight is None:
963+
return None
964+
965+
sample_weight = np.asarray(sample_weight)
966+
967+
n_traj = len(trajectories)
968+
n_samples_total = sum(len(traj) for traj in trajectories)
969+
970+
if sample_weight.ndim == 1 and len(sample_weight) == n_traj:
971+
# Efficient expansion using np.repeat
972+
traj_lengths = [len(traj) for traj in trajectories]
973+
return np.repeat(sample_weight, traj_lengths)
974+
975+
if sample_weight.ndim == 1 and len(sample_weight) == n_samples_total:
976+
return sample_weight
977+
978+
raise ValueError(
979+
f"sample_weight must be length {n_traj} (per trajectory) or "
980+
f"{n_samples_total} (per sample), got {len(sample_weight)}"
981+
)

pysindy/utils/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _check_control_shape(x, u, trim_last_point):
128128
return u_arr
129129

130130

131-
def drop_nan_samples(x, y):
131+
def drop_nan_samples(x, y, return_indices: bool = False):
132132
"""Drops samples from x and y where either has a nan value"""
133133
x_non_sample_axes = tuple(ax for ax in range(x.ndim) if ax != x.ax_sample)
134134
y_non_sample_axes = tuple(ax for ax in range(y.ndim) if ax != y.ax_sample)
@@ -137,6 +137,8 @@ def drop_nan_samples(x, y):
137137
good_sample_ind = np.nonzero(x_good_samples & y_good_samples)[0]
138138
x = x.take(good_sample_ind, axis=x.ax_sample)
139139
y = y.take(good_sample_ind, axis=y.ax_sample)
140+
if return_indices:
141+
return x, y, good_sample_ind
140142
return x, y
141143

142144

0 commit comments

Comments
 (0)