Add LRGB only valid sequences option (#529)

j-adamczyk · web-flow · commit b0d79610761b · 2026-03-16T11:00:34.000+01:00
diff --git a/skfp/datasets/lrgb/benchmark.py b/skfp/datasets/lrgb/benchmark.py
@@ -203,6 +203,7 @@ def load_lrgb_mol_dataset(
 @validate_params(
     {
         "dataset_name": [StrOptions({"Peptides-func", "Peptides-struct"})],
+        "valid_sequences_only": ["boolean"],
         "data_dir": [None, str, os.PathLike],
         "as_frame": ["boolean"],
         "verbose": ["boolean"],
@@ -211,6 +212,7 @@ def load_lrgb_mol_dataset(
 )
 def load_lrgb_mol_splits(
     dataset_name: str,
+    valid_sequences_only: bool = False,
     data_dir: str | os.PathLike | None = None,
     as_dict: bool = False,
     verbose: bool = False,
@@ -227,7 +229,12 @@ def load_lrgb_mol_splits(
     Parameters
     ----------
     dataset_name : {"Peptides-func", "Peptides-struct"}
-        Name of the dataset to loads splits for.
+        Name of the dataset to load splits for.
+
+    valid_sequences_only : bool, default=False
+        Whether to load only rows with valid amino acid sequences, which can be loaded
+        as RDKit ``Mol`` objects. This removes some sequences that have valid SMILES
+        but use custom notation for chemical modifications.
 
     data_dir : {None, str, path-like}, default=None
         Path to the root data directory. If ``None``, currently set scikit-learn directory
@@ -255,10 +262,13 @@ def load_lrgb_mol_splits(
         Advances in Neural Information Processing Systems 35 (2022): 22326-22340
         <https://proceedings.neurips.cc/paper_files/paper/2022/hash/8c3c666820ea055a77726d66fc7d447f-Abstract-Datasets_and_Benchmarks.html>`_
     """
+    file_dataset_name = dataset_name.lower().replace("-", "_")
+    valid_only = "_valid_seqs" if valid_sequences_only else ""
+
     splits = fetch_splits(
         data_dir,
         dataset_name=f"LRGB_{dataset_name}",
-        filename=f"lrgb_splits_{dataset_name.lower().replace('-', '_')}.json",
+        filename=f"lrgb_splits_{file_dataset_name}{valid_only}.json",
         verbose=verbose,
     )
     if as_dict:
diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py
@@ -323,7 +323,7 @@ def load_moleculeace_splits(
     Parameters
     ----------
     dataset_name : str
-        Name of the dataset to loads splits for.
+        Name of the dataset to load splits for.
 
     split_type: {"random", "activity_cliff"}
         Type of the split to load.
diff --git a/skfp/datasets/moleculenet/benchmark.py b/skfp/datasets/moleculenet/benchmark.py
@@ -257,7 +257,7 @@ def load_ogb_splits(
     Parameters
     ----------
     dataset_name : {"ESOL", "FreeSolv", "Lipophilicity","BACE", "BBBP", "HIV", "ClinTox", "MUV", "SIDER", "Tox21", "ToxCast", "PCBA"}
-        Name of the dataset to loads splits for.
+        Name of the dataset to load splits for.
 
     data_dir : {None, str, path-like}, default=None
         Path to the root data directory. If ``None``, currently set scikit-learn directory
diff --git a/skfp/datasets/tdc/benchmark.py b/skfp/datasets/tdc/benchmark.py
@@ -351,7 +351,7 @@ def load_tdc_splits(
     Parameters
     ----------
     dataset_name : str
-        Name of the dataset to loads splits for.
+        Name of the dataset to load splits for.
 
     data_dir : {None, str, path-like}, default=None
         Path to the root data directory. If ``None``, currently set scikit-learn directory
diff --git a/tests/datasets/lrgb.py b/tests/datasets/lrgb.py
@@ -77,14 +77,16 @@ def test_load_lrgb_splits_as_dict(dataset_name):
     only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"],
 )
 @pytest.mark.parametrize(
-    "dataset_name, dataset_length",
+    "dataset_name, valid_sequences_only, dataset_length",
     [
-        ("Peptides-func", 15535),
-        ("Peptides-struct", 15535),
+        ("Peptides-func", False, 15535),
+        ("Peptides-struct", False, 15535),
+        ("Peptides-func", True, 15424),
+        ("Peptides-struct", True, 15424),
     ],
 )
-def test_load_lrgb_splits_lengths(dataset_name, dataset_length):
-    train, valid, test = load_lrgb_mol_splits(dataset_name)
+def test_load_lrgb_splits_lengths(dataset_name, valid_sequences_only, dataset_length):
+    train, valid, test = load_lrgb_mol_splits(dataset_name, valid_sequences_only)
     loaded_length = len(train) + len(valid) + len(test)
     assert_equal(loaded_length, dataset_length)