minor refactoring

Jhsmit · Jhsmit · commit 9eee11543e5c · 2025-07-31T14:42:24.000+02:00
diff --git a/examples/create_dataset.py b/examples/create_dataset.py
@@ -6,7 +6,7 @@
 from hdxms_datasets.models import (
     DatasetMetadata,
     HDXDataSet,
-    HDXState,
+    State,
     DeuterationType,
     Author,
     PeptideFormat,
@@ -16,6 +16,7 @@
     Publication,
     Structure,
 )
+from hdxms_datasets.utils import verify_sequence
 
 # %%
 
@@ -76,10 +77,16 @@
     ),
 ]
 
+# %%
+# %%
+# test loading the peptides and verifying the sequence
+# by comparing sequences of peptides to the protein state sequence
+for peptide in peptides:
+    verify_sequence(peptide.load(), protein_state.sequence, n_term=protein_state.n_term)
 
 # %%
 states = [
-    HDXState(
+    State(
         name="Tetramer",
         description="SecB WT in tetrameric state",
         protein_state=protein_state,
@@ -99,7 +106,7 @@
         "Y109A",
         "T115A",
         "S119A",
-    ],  # this information is also deducible from comparing sequences
+    ],  # this information is also deducible by comparing sequences between states
 )
 
 peptides = [
@@ -118,16 +125,22 @@
     )
 ]
 
+# %%
+# test loading the peptides and verifying the sequence
+# by comparing sequences of peptides to the protein state sequence
+for peptide in peptides:
+    verify_sequence(peptide.load(), protein_state.sequence, n_term=protein_state.n_term)
+
+# %%
+
 states.append(
-    HDXState(
+    State(
         name="Dimer",
         description="SecB mutatant in dimeric state",
         protein_state=protein_state,
         peptides=peptides,
     )
 )
-states
-
 
 # %%
 pub = Publication(
diff --git a/hdxms_datasets/database.py b/hdxms_datasets/database.py
@@ -207,14 +207,14 @@ def submit_dataset(
 
 
 class DataBase:
-    def __init__(self, database_root: Path | str):
-        self.database_root = Path(database_root)
-        self.database_root.mkdir(exist_ok=True, parents=True)
+    def __init__(self, database_dir: Path | str):
+        self.database_dir = Path(database_dir)
+        self.database_dir.mkdir(exist_ok=True, parents=True)
 
     @property
     def datasets(self) -> list[str]:
         """List of available datasets in the cache dir"""
-        return [d.stem for d in self.database_root.iterdir() if self.is_dataset(d)]
+        return [d.stem for d in self.database_dir.iterdir() if self.is_dataset(d)]
 
     @staticmethod
     def is_dataset(path: Path) -> bool:
@@ -225,11 +225,11 @@ def is_dataset(path: Path) -> bool:
         return (path / "dataset.json").exists()
 
     def clear_cache(self) -> None:
-        for dir in self.database_root.iterdir():
+        for dir in self.database_dir.iterdir():
             shutil.rmtree(dir)
 
     def load_dataset(self, dataset_id: str) -> HDXDataSet:
-        dataset_root = self.database_root / dataset_id
+        dataset_root = self.database_dir / dataset_id
         dataset = HDXDataSet.model_validate_json(
             Path(dataset_root, "dataset.json").read_text(),
             context={"dataset_root": dataset_root},
@@ -249,8 +249,8 @@ class RemoteDataBase(DataBase):
         remote_url: URL of the remote repository (default: DATABASE_URL).
     """
 
-    def __init__(self, data_root_path: Path | str, remote_url: str = DATABASE_URL):
-        super().__init__(data_root_path)
+    def __init__(self, database_dir: Path | str, remote_url: str = DATABASE_URL):
+        super().__init__(database_dir)
         self.remote_url = remote_url
 
     def get_index(self) -> nw.DataFrame:
diff --git a/hdxms_datasets/loader.py b/hdxms_datasets/loader.py
@@ -71,7 +71,7 @@ def read_csv(source: Path | str | IO | bytes) -> nw.DataFrame:
         try:
             import pandas as pd
 
-            return nw.from_native(pd.read_csv(source))
+            return nw.from_native(pd.read_csv(source))  # type: ignore
         except ImportError:
             raise ValueError("No suitable backend found for reading file-like objects or bytes.")
 
diff --git a/hdxms_datasets/process.py b/hdxms_datasets/process.py
@@ -2,11 +2,9 @@
 
 from pathlib import Path
 import warnings
-from collections import defaultdict
 from functools import reduce
 from operator import and_
-from typing import Literal, Optional, TypedDict, Union
-
+from typing import Optional
 import narwhals as nw
 from statsmodels.stats.weightstats import DescrStatsW
 from uncertainties import Variable, ufloat
@@ -139,9 +137,10 @@ def apply_filters(df, **filters):
     return df.filter(f_expr)
 
 
+@nw.narwhalify
 def aggregate_columns(
     df: nw.DataFrame, columns: list[str], by: list[str] = ["start", "end", "exposure"]
-):
+) -> nw.DataFrame:
     """
     Aggregate the DataFrame the specified columns by intensity-weighted average.
     """