Skip to content

Commit b0d7961

Browse files
authored
Add LRGB only valid sequences option (#529)
1 parent a881f2c commit b0d7961

File tree

5 files changed

+22
-10
lines changed

5 files changed

+22
-10
lines changed

skfp/datasets/lrgb/benchmark.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ def load_lrgb_mol_dataset(
203203
@validate_params(
204204
{
205205
"dataset_name": [StrOptions({"Peptides-func", "Peptides-struct"})],
206+
"valid_sequences_only": ["boolean"],
206207
"data_dir": [None, str, os.PathLike],
207208
"as_frame": ["boolean"],
208209
"verbose": ["boolean"],
@@ -211,6 +212,7 @@ def load_lrgb_mol_dataset(
211212
)
212213
def load_lrgb_mol_splits(
213214
dataset_name: str,
215+
valid_sequences_only: bool = False,
214216
data_dir: str | os.PathLike | None = None,
215217
as_dict: bool = False,
216218
verbose: bool = False,
@@ -227,7 +229,12 @@ def load_lrgb_mol_splits(
227229
Parameters
228230
----------
229231
dataset_name : {"Peptides-func", "Peptides-struct"}
230-
Name of the dataset to loads splits for.
232+
Name of the dataset to load splits for.
233+
234+
valid_sequences_only : bool, default=False
235+
Whether to load only rows with valid amino acid sequences, which can be loaded
236+
as RDKit ``Mol`` objects. This removes some sequences that have valid SMILES but use custom
237+
notation for chemical modifications.
231238
232239
data_dir : {None, str, path-like}, default=None
233240
Path to the root data directory. If ``None``, currently set scikit-learn directory
@@ -255,10 +262,13 @@ def load_lrgb_mol_splits(
255262
Advances in Neural Information Processing Systems 35 (2022): 22326-22340
256263
<https://proceedings.neurips.cc/paper_files/paper/2022/hash/8c3c666820ea055a77726d66fc7d447f-Abstract-Datasets_and_Benchmarks.html>`_
257264
"""
265+
file_dataset_name = dataset_name.lower().replace("-", "_")
266+
valid_only = "_valid_seqs" if valid_sequences_only else ""
267+
258268
splits = fetch_splits(
259269
data_dir,
260270
dataset_name=f"LRGB_{dataset_name}",
261-
filename=f"lrgb_splits_{dataset_name.lower().replace('-', '_')}.json",
271+
filename=f"lrgb_splits_{file_dataset_name}{valid_only}.json",
262272
verbose=verbose,
263273
)
264274
if as_dict:

skfp/datasets/moleculeace/benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def load_moleculeace_splits(
323323
Parameters
324324
----------
325325
dataset_name : str
326-
Name of the dataset to loads splits for.
326+
Name of the dataset to load splits for.
327327
328328
split_type : {"random", "activity_cliff"}
329329
Type of the split to load.

skfp/datasets/moleculenet/benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def load_ogb_splits(
257257
Parameters
258258
----------
259259
dataset_name : {"ESOL", "FreeSolv", "Lipophilicity", "BACE", "BBBP", "HIV", "ClinTox", "MUV", "SIDER", "Tox21", "ToxCast", "PCBA"}
260-
Name of the dataset to loads splits for.
260+
Name of the dataset to load splits for.
261261
262262
data_dir : {None, str, path-like}, default=None
263263
Path to the root data directory. If ``None``, currently set scikit-learn directory

skfp/datasets/tdc/benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ def load_tdc_splits(
351351
Parameters
352352
----------
353353
dataset_name : str
354-
Name of the dataset to loads splits for.
354+
Name of the dataset to load splits for.
355355
356356
data_dir : {None, str, path-like}, default=None
357357
Path to the root data directory. If ``None``, currently set scikit-learn directory

tests/datasets/lrgb.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,16 @@ def test_load_lrgb_splits_as_dict(dataset_name):
7777
only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"],
7878
)
7979
@pytest.mark.parametrize(
80-
"dataset_name, dataset_length",
80+
"dataset_name, valid_sequences_only, dataset_length",
8181
[
82-
("Peptides-func", 15535),
83-
("Peptides-struct", 15535),
82+
("Peptides-func", False, 15535),
83+
("Peptides-struct", False, 15535),
84+
("Peptides-func", True, 15424),
85+
("Peptides-struct", True, 15424),
8486
],
8587
)
86-
def test_load_lrgb_splits_lengths(dataset_name, dataset_length):
87-
train, valid, test = load_lrgb_mol_splits(dataset_name)
88+
def test_load_lrgb_splits_lengths(dataset_name, valid_sequences_only, dataset_length):
89+
train, valid, test = load_lrgb_mol_splits(dataset_name, valid_sequences_only)
8890
loaded_length = len(train) + len(valid) + len(test)
8991
assert_equal(loaded_length, dataset_length)
9092

0 commit comments

Comments
 (0)