Skip to content

Commit 1d279a9

Browse files
mjste and j-adamczyk authored
Add examples to train-test split functions in various splitters (#502)
Co-authored-by: Jakub Adamczyk <jakubadamczyk10@gmail.com>
1 parent 6532038 commit 1d279a9

File tree

5 files changed

+130
-9
lines changed

5 files changed

+130
-9
lines changed

skfp/model_selection/splitters/butina_split.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,14 @@ def butina_train_test_split(
151151
.. [6] `Leland McInnes
152152
"PyNNDescent for fast Approximate Nearest Neighbors"
153153
<https://pynndescent.readthedocs.io/en/latest/>`_
154+
155+
Examples
156+
--------
157+
>>> from skfp.model_selection.splitters import butina_train_test_split
158+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
159+
>>> train_smiles, test_smiles = butina_train_test_split(smiles, train_size=0.75, test_size=0.25)
160+
>>> train_smiles
161+
['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC']
154162
"""
155163
train_size, test_size = validate_train_test_split_sizes(
156164
train_size, test_size, len(data)
@@ -336,6 +344,16 @@ def butina_train_valid_test_split(
336344
.. [6] `Leland McInnes
337345
"PyNNDescent for fast Approximate Nearest Neighbors"
338346
<https://pynndescent.readthedocs.io/en/latest/>`_
347+
348+
Examples
349+
--------
350+
>>> from skfp.model_selection.splitters import butina_train_valid_test_split
351+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
352+
>>> train_smiles, valid_smiles, test_smiles = butina_train_valid_test_split(
353+
... smiles, train_size=0.5, valid_size=0.25, test_size=0.25
354+
... )
355+
>>> train_smiles
356+
['CCF', 'CC=O', 'CCO', 'CCC']
339357
"""
340358
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
341359
train_size, valid_size, test_size, len(data)

skfp/model_selection/splitters/maxmin_split.py

Lines changed: 52 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -101,18 +101,28 @@ def maxmin_train_test_split(
101101
.. [1] `Mark Ashton et al.
102102
"Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions"
103103
Quant. Struct.-Act. Relat., 21: 598-604
104-
<https://onlinelibrary.wiley.com/doi/10.1002/qsar.200290002>_`
104+
<https://onlinelibrary.wiley.com/doi/10.1002/qsar.200290002>`_
105105
106106
.. [2] `Roger Sayle
107107
"Improved RDKit implementation"
108-
<https://github.com/rdkit/UGM_2017/blob/master/Presentations/Sayle_RDKitDiversity_Berlin17.pdf>_`
108+
<https://github.com/rdkit/UGM_2017/blob/master/Presentations/Sayle_RDKitDiversity_Berlin17.pdf>`_
109109
110110
.. [3] `Tim Dudgeon
111111
"Revisiting the MaxMinPicker"
112-
<https://rdkit.org/docs/cppapi/classRDPickers_1_1MaxMinPicker.html>_`
112+
<https://rdkit.org/docs/cppapi/classRDPickers_1_1MaxMinPicker.html>`_
113113
114114
.. [4] `Squonk - RDKit MaxMin Picker
115-
<https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker>_`
115+
<https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker>`_
116+
117+
Examples
118+
--------
119+
>>> from skfp.model_selection.splitters import maxmin_train_test_split
120+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
121+
>>> train_smiles, test_smiles = maxmin_train_test_split(
122+
... smiles, train_size=0.75, test_size=0.25, random_state=42
123+
... )
124+
>>> train_smiles
125+
['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
116126
"""
117127
data_size = len(data)
118128
train_size, test_size = validate_train_test_split_sizes(
@@ -249,18 +259,28 @@ def maxmin_train_valid_test_split(
249259
.. [1] `Mark Ashton et al.
250260
"Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions"
251261
Quant. Struct.-Act. Relat., 21: 598-604
252-
<https://onlinelibrary.wiley.com/doi/10.1002/qsar.200290002>_`
262+
<https://onlinelibrary.wiley.com/doi/10.1002/qsar.200290002>`_
253263
254264
.. [2] `Roger Sayle
255265
"Improved RDKit implementation"
256-
<https://github.com/rdkit/UGM_2017/blob/master/Presentations/Sayle_RDKitDiversity_Berlin17.pdf>_`
266+
<https://github.com/rdkit/UGM_2017/blob/master/Presentations/Sayle_RDKitDiversity_Berlin17.pdf>`_
257267
258268
.. [3] `Tim Dudgeon
259269
"Revisiting the MaxMinPicker"
260-
<https://rdkit.org/docs/cppapi/classRDPickers_1_1MaxMinPicker.html>_`
270+
<https://rdkit.org/docs/cppapi/classRDPickers_1_1MaxMinPicker.html>`_
261271
262272
.. [4] `Squonk - RDKit MaxMin Picker
263-
<https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker>_`
273+
<https://squonk.it/docs/cells/RDKit%20MaxMin%20Picker>`_
274+
275+
Examples
276+
--------
277+
>>> from skfp.model_selection.splitters import maxmin_train_valid_test_split
278+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
279+
>>> train_smiles, valid_smiles, test_smiles = maxmin_train_valid_test_split(
280+
... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42
281+
... )
282+
>>> train_smiles
283+
['CCCl', 'CCBr', 'CCI', 'CCF']
264284
"""
265285
data_size = len(data)
266286
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
@@ -405,6 +425,17 @@ def maxmin_stratified_train_test_split(
405425
See Also
406426
--------
407427
:func:`maxmin_train_test_split` : Regular MaxMin split.
428+
429+
Examples
430+
--------
431+
>>> from skfp.model_selection.splitters import maxmin_stratified_train_test_split
432+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
433+
>>> labels = [0, 0, 1, 1, 0, 1, 0, 1]
434+
>>> train_smiles, test_smiles, train_labels, test_labels = maxmin_stratified_train_test_split(
435+
... smiles, labels, train_size=0.75, test_size=0.25, random_state=42
436+
... )
437+
>>> print('Train SMILES:', train_smiles)
438+
Train SMILES: ['CCO', 'CCBr', 'CCF', 'CCC', 'CCI', 'CC=O']
408439
"""
409440
data_arr = np.array(data)
410441
labels = np.array(labels, dtype=int)
@@ -561,6 +592,19 @@ def maxmin_stratified_train_valid_test_split(
561592
See Also
562593
--------
563594
:func:`maxmin_train_valid_test_split` : Regular MaxMin split.
595+
596+
Examples
597+
--------
598+
>>> from skfp.model_selection.splitters import maxmin_stratified_train_valid_test_split
599+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
600+
>>> labels = [0, 0, 1, 1, 0, 1, 0, 1]
601+
>>> train_smiles, valid_smiles, test_smiles, train_labels, valid_labels, test_labels = (
602+
... maxmin_stratified_train_valid_test_split(
603+
... smiles, labels, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42
604+
... )
605+
... )
606+
>>> print('Train SMILES:', train_smiles)
607+
Train SMILES: ['CCBr', 'CCF', 'CCC', 'CCI']
564608
"""
565609
data_arr = np.array(data)
566610
labels = np.array(labels, dtype=int)

skfp/model_selection/splitters/pubchem_split.py

Lines changed: 22 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,16 @@ def pubchem_train_test_split(
130130
"An update on PUG-REST: RESTful interface for programmatic access to PubChem."
131131
Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570.
132132
<https://doi.org/10.1093/nar/gky294>`_
133+
134+
Examples
135+
--------
136+
>>> from skfp.model_selection.splitters import pubchem_train_test_split
137+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
138+
>>> train_smiles, test_smiles = pubchem_train_test_split(
139+
... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1
140+
... )
141+
>>> train_smiles
142+
['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC']
133143
"""
134144
years = _get_pubchem_years(data, n_jobs, n_retries, verbose)
135145

@@ -296,6 +306,16 @@ def pubchem_train_valid_test_split(
296306
"An update on PUG-REST: RESTful interface for programmatic access to PubChem."
297307
Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570.
298308
<https://doi.org/10.1093/nar/gky294>`_
309+
310+
Examples
311+
--------
312+
>>> from skfp.model_selection.splitters import pubchem_train_valid_test_split
313+
>>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
314+
>>> train_smiles, valid_smiles, test_smiles = pubchem_train_valid_test_split(
315+
... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, n_jobs=1, n_retries=1, verbose=0
316+
... )
317+
>>> train_smiles
318+
['CCCl', 'CCI', 'CCO', 'CCN']
299319
"""
300320
years = _get_pubchem_years(data, n_jobs, n_retries, verbose)
301321

@@ -381,7 +401,8 @@ def _get_cid_for_smiles(smiles: str, n_retries: int, verbosity: int) -> str | No
381401
"""
382402
Get PubChem CID from SMILES, or None if molecule cannot be found.
383403
"""
384-
print(smiles)
404+
if verbosity > 0:
405+
print(smiles)
385406
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{quote(smiles)}/cids/JSON"
386407

387408
response = None

skfp/model_selection/splitters/randomized_scaffold_split.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -125,6 +125,16 @@ def randomized_scaffold_train_test_split(
125125
"Does GNN Pretraining Help Molecular Representation?"
126126
Advances in Neural Information Processing Systems 35 (NeurIPS 2022).
127127
<https://proceedings.neurips.cc/paper_files/paper/2022/hash/4ec360efb3f52643ac43fda570ec0118-Abstract-Conference.html>`_
128+
129+
Examples
130+
--------
131+
>>> from skfp.model_selection.splitters import randomized_scaffold_train_test_split
132+
>>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
133+
>>> train_smiles, test_smiles = randomized_scaffold_train_test_split(
134+
... smiles, train_size=6, test_size=2, random_state=42
135+
... )
136+
>>> train_smiles
137+
['C1CCCCC1', 'c1ccccc1']
128138
"""
129139
train_size, test_size = validate_train_test_split_sizes(
130140
train_size, test_size, len(data)
@@ -289,6 +299,16 @@ def randomized_scaffold_train_valid_test_split(
289299
"Does GNN Pretraining Help Molecular Representation?"
290300
Advances in Neural Information Processing Systems 35 (NeurIPS 2022).
291301
<https://proceedings.neurips.cc/paper_files/paper/2022/hash/4ec360efb3f52643ac43fda570ec0118-Abstract-Conference.html>`_
302+
303+
Examples
304+
--------
305+
>>> from skfp.model_selection.splitters import randomized_scaffold_train_valid_test_split
306+
>>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
307+
>>> train_smiles, valid_smiles, test_smiles = randomized_scaffold_train_valid_test_split(
308+
... smiles, train_size=6, valid_size=1, test_size=1, random_state=42
309+
... )
310+
>>> train_smiles
311+
['c1ccccc1']
292312
"""
293313
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
294314
train_size, valid_size, test_size, len(data)

skfp/model_selection/splitters/scaffold_split.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,14 @@ def scaffold_train_test_split(
117117
118118
.. [3] `Bemis-Murcko scaffolds and their variants
119119
<https://github.com/rdkit/rdkit/discussions/6844>`_
120+
121+
Examples
122+
--------
123+
>>> from skfp.model_selection.splitters import scaffold_train_test_split
124+
>>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
125+
>>> train_smiles, test_smiles = scaffold_train_test_split(smiles, train_size=6, test_size=2)
126+
>>> train_smiles
127+
['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
120128
"""
121129
train_size, test_size = validate_train_test_split_sizes(
122130
train_size, test_size, len(data)
@@ -272,6 +280,16 @@ def scaffold_train_valid_test_split(
272280
273281
.. [3] `Bemis-Murcko scaffolds and their variants
274282
<https://github.com/rdkit/rdkit/discussions/6844>`_
283+
284+
Examples
285+
--------
286+
>>> from skfp.model_selection.splitters import scaffold_train_valid_test_split
287+
>>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
288+
>>> train_smiles, valid_smiles, test_smiles = scaffold_train_valid_test_split(
289+
... smiles, train_size=6, valid_size=1, test_size=1
290+
... )
291+
>>> train_smiles
292+
['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
275293
"""
276294
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
277295
train_size, valid_size, test_size, len(data)

0 commit comments

Comments
 (0)