
Commit 12f332a

Generate features as part of dataloader for optimized multiprocessing and batching
1 parent 984f563 commit 12f332a

5 files changed, +99 −148 lines changed

deeplc/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -1,8 +1,8 @@
-__all__ = ["DeepLC"]
+# __all__ = ["DeepLC"]
 
-from importlib.metadata import version
+# from importlib.metadata import version
 
-__version__ = version("deeplc")
+# __version__ = version("deeplc")
 
 
-from deeplc.deeplc import DeepLC
+# from deeplc.deeplc import DeepLC

deeplc/_data.py

Lines changed: 34 additions & 39 deletions
@@ -1,49 +1,44 @@
 import torch
+from psm_utils.psm_list import PSMList
 from torch.utils.data import Dataset
 
+from deeplc._features import encode_peptidoform
 
-class DeepLCDataset(Dataset):
-    """
-    Custom Dataset class for DeepLC used for loading features from peptide sequences.
-
-    Parameters
-    ----------
-    X : ndarray
-        Feature matrix for input data.
-    X_sum : ndarray
-        Feature matrix for sum of input data.
-    X_global : ndarray
-        Feature matrix for global input data.
-    X_hc : ndarray
-        Feature matrix for high-order context features.
-    target : ndarray, optional
-        The target retention times. Default is None.
-    """
 
-    def __init__(self, X, X_sum, X_global, X_hc, target=None):
-        self.X = torch.from_numpy(X).float()
-        self.X_sum = torch.from_numpy(X_sum).float()
-        self.X_global = torch.from_numpy(X_global).float()
-        self.X_hc = torch.from_numpy(X_hc).float()
+class DeepLCDataset(Dataset):
+    """Custom Dataset class for DeepLC used for loading features from peptide sequences."""
 
-        if target is not None:
-            self.target = torch.from_numpy(target).float()  # Add target values if provided
+    def __init__(self, psm_list: PSMList, add_ccs_features: bool = False):
+        self.psm_list = psm_list
+        self.add_ccs_features = add_ccs_features
+
+        self._targets = self._get_targets(psm_list)
+
+    @staticmethod
+    def _get_targets(psm_list: PSMList) -> torch.Tensor | None:
+        retention_times = [psm.retention_time for psm in psm_list]
+        if None not in retention_times:
+            return torch.tensor(retention_times, dtype=torch.float32)
         else:
-            self.target = None  # If no target is provided, set it to None
+            return None
 
     def __len__(self):
-        return self.X.shape[0]
+        return len(self.psm_list)
 
-    def __getitem__(self, idx):
-        if self.target is not None:
-            # Return both features and target during training
-            return (
-                self.X[idx],
-                self.X_sum[idx],
-                self.X_global[idx],
-                self.X_hc[idx],
-                self.target[idx],
-            )
-        else:
-            # Return only features during prediction
-            return (self.X[idx], self.X_sum[idx], self.X_global[idx], self.X_hc[idx])
+    def __getitem__(self, idx) -> tuple:
+        if not isinstance(idx, int):
+            raise TypeError(f"Index must be an integer, got {type(idx)} instead.")
+        features = encode_peptidoform(
+            self.psm_list[idx].peptidoform,
+            add_ccs_features=self.add_ccs_features
+        )
+        feature_tuples = (
+            torch.from_numpy(features["matrix"]).to(dtype=torch.float32),
+            torch.from_numpy(features["matrix_sum"]).to(dtype=torch.float32),
+            torch.from_numpy(features["matrix_global"]).to(dtype=torch.float32),
+            torch.from_numpy(features["matrix_hc"]).to(dtype=torch.float32),
+        )
+        targets = self._targets[idx] if self._targets is not None else torch.full_like(
+            feature_tuples[0], fill_value=float('nan'), dtype=torch.float32
+        )
+        return feature_tuples, targets
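
Because encoding now happens per item inside __getitem__, a standard torch.utils.data.DataLoader can run feature generation in worker processes and batch the resulting tensors with its default collate function. The following is a minimal usage sketch, not part of the commit itself; the example PSMs and the batch_size/num_workers values are illustrative assumptions.

from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from torch.utils.data import DataLoader

from deeplc._data import DeepLCDataset

# Illustrative input: two peptidoforms with known retention times
psm_list = PSMList(
    psm_list=[
        PSM(peptidoform="ACDEFGHIK/2", spectrum_id="1", retention_time=12.3),
        PSM(peptidoform="LMNPQRST/2", spectrum_id="2", retention_time=23.4),
    ]
)

dataset = DeepLCDataset(psm_list, add_ccs_features=False)

# num_workers > 0 moves the encode_peptidoform calls into worker processes;
# guard this with `if __name__ == "__main__":` on platforms that spawn workers.
loader = DataLoader(dataset, batch_size=2, num_workers=2)
for features, targets in loader:
    matrix, matrix_sum, matrix_global, matrix_hc = features
    print(matrix.shape, targets.shape)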
deeplc/_features.py

Lines changed: 5 additions & 43 deletions
@@ -148,7 +148,7 @@ def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray:
 
 def encode_peptidoform(
     peptidoform: Peptidoform | str,
-    predict_ccs: bool = False,
+    add_ccs_features: bool = False,
     padding_length: int = 60,
     positions: set[int] | None = None,
     positions_pos: set[int] | None = None,
@@ -188,7 +188,7 @@ def encode_peptidoform(
 
     matrix_all = np.sum(std_matrix, axis=0)
     matrix_all = np.append(matrix_all, seq_len)
-    if predict_ccs:
+    if add_ccs_features:
         matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len)
         matrix_all = np.append(
             matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len
@@ -198,50 +198,12 @@ def encode_peptidoform(
     matrix_all = np.append(matrix_all, charge)
 
     matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T
+
+    matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])
 
     return {
         "matrix": std_matrix,
         "matrix_sum": matrix_sum,
-        "matrix_all": matrix_all,
-        "pos_matrix": pos_matrix.flatten(),
+        "matrix_global": matrix_global,
         "matrix_hc": onehot_matrix,
     }
-
-
-def extract_features(
-    peptidoforms: list[str | Peptidoform] | PSMList,
-    predict_ccs: bool = False,
-) -> dict[str, dict[int, np.ndarray]]:
-    """Extract features for all peptidoforms."""
-    if isinstance(peptidoforms, PSMList):
-        peptidoforms = [psm.peptidoform for psm in peptidoforms]
-
-    encodings = [encode_peptidoform(pf, predict_ccs=predict_ccs) for pf in peptidoforms]
-    aggregated_encodings = aggregate_encodings(encodings)
-
-    return aggregated_encodings
-
-
-def aggregate_encodings(
-    encodings: list[dict[str, np.ndarray]],
-) -> dict[str, dict[int, np.ndarray]]:
-    """Aggregate list of encodings into single dictionary."""
-    return {key: {i: enc[key] for i, enc in enumerate(encodings)} for key in encodings[0]}
-
-
-def unpack_features(
-    features: dict[str, np.ndarray],
-) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-    """Unpack dictionary with features to numpy arrays."""
-    X_sum = np.stack(list(features["matrix_sum"].values()))
-    X_global = np.concatenate(
-        (
-            np.stack(list(features["matrix_all"].values())),
-            np.stack(list(features["pos_matrix"].values())),
-        ),
-        axis=1,
-    )
-    X_hc = np.stack(list(features["matrix_hc"].values()))
-    X_main = np.stack(list(features["matrix"].values()))
-
-    return X_sum, X_global, X_hc, X_main
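
With matrix_all and pos_matrix now concatenated into matrix_global at encoding time, encode_peptidoform returns exactly the four arrays that DeepLCDataset converts to tensors, so the removed extract_features/aggregate_encodings/unpack_features round-trip is no longer needed. A small sketch of inspecting the new return value (the peptidoform string is an arbitrary example):

from deeplc._features import encode_peptidoform

# Arbitrary example peptidoform in ProForma notation, precursor charge 2
features = encode_peptidoform("ACDEFGHIK/2", add_ccs_features=True)

# Expected keys after this commit: matrix, matrix_sum, matrix_global, matrix_hc
for key, array in features.items():
    print(key, array.dtype, array.shape)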
Lines changed: 28 additions & 4 deletions
@@ -15,8 +15,8 @@
 LOGGER = logging.getLogger(__name__)
 
 
-class Calibrator(ABC):
-    """Abstract base class for retention time calibrators."""
+class Calibration(ABC):
+    """Abstract base class for retention time calibration."""
 
     @abstractmethod
     def __init__(self, *args, **kwargs):
@@ -27,9 +27,33 @@ def fit(measured_tr: np.ndarray, predicted_tr: np.ndarray) -> None: ...
 
     @abstractmethod
     def transform(tr: np.ndarray) -> np.ndarray: ...
+
 
+class IdentityCalibration(Calibration):
+    """No calibration, just returns the predicted retention times."""
 
-class PiecewiseLinearCalibrator(Calibrator):
+    def fit(self, measured_tr: np.ndarray, predicted_tr: np.ndarray) -> None:
+        """No fitting required for IdentityCalibration."""
+        pass
+
+    def transform(self, tr: np.ndarray) -> np.ndarray:
+        """
+        Transform the predicted retention times without any calibration.
+
+        Parameters
+        ----------
+        tr
+            Retention times to be transformed.
+
+        Returns
+        -------
+        np.ndarray
+            Transformed retention times (same as input).
+        """
+        return tr
+
+
+class PiecewiseLinearCalibration(Calibration):
     def __init__(
         self,
         split_cal: int = 50,
@@ -202,7 +226,7 @@ def transform(self, tr: np.ndarray) -> np.ndarray:
         return np.array(cal_preds)
 
 
-class SplineTransformerCalibrator(Calibrator):
+class SplineTransformerCalibration(Calibration):
     def __init__(self):
         """SplineTransformer calibration for retention time."""
         super().__init__()
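
The renamed Calibration hierarchy keeps a single fit/transform interface, so callers can swap calibration strategies without branching. Below is a hedged usage sketch: the module path deeplc._calibration is an assumption (the file name is not shown in this extract) and the input arrays are synthetic. Note that IdentityCalibration as committed does not override the abstract __init__, so direct instantiation would raise a TypeError until an __init__ is added.

import numpy as np

# Assumed module path; not shown in this extract
from deeplc._calibration import PiecewiseLinearCalibration

# Synthetic example data: predicted retention times with a linear offset
predicted_tr = np.linspace(0.0, 100.0, 200)
measured_tr = 1.05 * predicted_tr + 2.0

# split_cal=50 is the constructor default shown in the diff
calibration = PiecewiseLinearCalibration(split_cal=50)
calibration.fit(measured_tr, predicted_tr)
calibrated_tr = calibration.transform(predicted_tr)
print(calibrated_tr[:5])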

0 commit comments

Comments
 (0)