diff --git a/skfp/model_selection/splitters/butina_split.py b/skfp/model_selection/splitters/butina_split.py index c947f445..6e9e184a 100644 --- a/skfp/model_selection/splitters/butina_split.py +++ b/skfp/model_selection/splitters/butina_split.py @@ -151,6 +151,14 @@ def butina_train_test_split( .. [6] `Leland McInnes "PyNNDescent for fast Approximate Nearest Neighbors" `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import butina_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, test_smiles = butina_train_test_split(smiles, train_size=0.75, test_size=0.25) + >>> train_smiles + ['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -336,6 +344,16 @@ def butina_train_valid_test_split( .. [6] `Leland McInnes "PyNNDescent for fast Approximate Nearest Neighbors" `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import butina_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, valid_smiles, test_smiles = butina_train_valid_test_split( + ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25 + ... ) + >>> train_smiles + ['CCF', 'CC=O', 'CCO', 'CCC'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/maxmin_split.py b/skfp/model_selection/splitters/maxmin_split.py index 1917593c..54f5aa11 100644 --- a/skfp/model_selection/splitters/maxmin_split.py +++ b/skfp/model_selection/splitters/maxmin_split.py @@ -101,18 +101,28 @@ def maxmin_train_test_split( .. [1] `Mark Ashton et al. "Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions" Quant. Struct.-Act. Relat., 21: 598-604 - _` + `_ .. [2] `Roger Sayle "Improved RDKit implementation" - _` + `_ .. [3] `Tim Dudgeon "Revisiting the MaxMinPicker" - _` + `_ .. [4] `Squonk - RDKit MaxMin Picker - _` + `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, test_smiles = maxmin_train_test_split( + ... smiles, train_size=0.75, test_size=0.25, random_state=42 + ... ) + >>> train_smiles + ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ data_size = len(data) train_size, test_size = validate_train_test_split_sizes( @@ -249,18 +259,28 @@ def maxmin_train_valid_test_split( .. [1] `Mark Ashton et al. "Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions" Quant. Struct.-Act. Relat., 21: 598-604 - _` + `_ .. [2] `Roger Sayle "Improved RDKit implementation" - _` + `_ .. [3] `Tim Dudgeon "Revisiting the MaxMinPicker" - _` + `_ .. [4] `Squonk - RDKit MaxMin Picker - _` + `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, valid_smiles, test_smiles = maxmin_train_valid_test_split( + ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 + ... ) + >>> train_smiles + ['CCCl', 'CCBr', 'CCI', 'CCF'] """ data_size = len(data) train_size, valid_size, test_size = validate_train_valid_test_split_sizes( @@ -405,6 +425,17 @@ def maxmin_stratified_train_test_split( See Also -------- :func:`maxmin_train_test_split` : Regular MaxMin split. + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_stratified_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> labels = [0, 0, 1, 1, 0, 1, 0, 1] + >>> train_smiles, test_smiles, train_labels, test_labels = maxmin_stratified_train_test_split( + ... smiles, labels, train_size=0.75, test_size=0.25, random_state=42 + ... ) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCO', 'CCBr', 'CCF', 'CCC', 'CCI', 'CC=O'] """ data_arr = np.array(data) labels = np.array(labels, dtype=int) @@ -561,6 +592,19 @@ def maxmin_stratified_train_valid_test_split( See Also -------- :func:`maxmin_train_valid_test_split` : Regular MaxMin split. + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_stratified_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> labels = [0, 0, 1, 1, 0, 1, 0, 1] + >>> train_smiles, valid_smiles, test_smiles, train_labels, valid_labels, test_labels = ( + ... maxmin_stratified_train_valid_test_split( + ... smiles, labels, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 + ... ) + ... ) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCBr', 'CCF', 'CCC', 'CCI'] """ data_arr = np.array(data) labels = np.array(labels, dtype=int) diff --git a/skfp/model_selection/splitters/pubchem_split.py b/skfp/model_selection/splitters/pubchem_split.py index c56e155e..4bc3a6d5 100644 --- a/skfp/model_selection/splitters/pubchem_split.py +++ b/skfp/model_selection/splitters/pubchem_split.py @@ -130,6 +130,16 @@ def pubchem_train_test_split( "An update on PUG-REST: RESTful interface for programmatic access to PubChem." Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570. `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import pubchem_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, test_smiles = pubchem_train_test_split( + ... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1 + ... ) + >>> train_smiles + ['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) @@ -296,6 +306,16 @@ def pubchem_train_valid_test_split( "An update on PUG-REST: RESTful interface for programmatic access to PubChem." Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570. `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import pubchem_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, valid_smiles, test_smiles = pubchem_train_valid_test_split( + ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, n_jobs=1, n_retries=1, verbose=0 + ... ) + >>> train_smiles + ['CCCl', 'CCI', 'CCO', 'CCN'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) @@ -381,7 +401,8 @@ def _get_cid_for_smiles(smiles: str, n_retries: int, verbosity: int) -> str | No """ Get PubChem CID from SMILES, or None if molecule cannot be found. """ - print(smiles) + if verbosity > 0: + print(smiles) url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{quote(smiles)}/cids/JSON" response = None diff --git a/skfp/model_selection/splitters/randomized_scaffold_split.py b/skfp/model_selection/splitters/randomized_scaffold_split.py index d60198fc..828b7040 100644 --- a/skfp/model_selection/splitters/randomized_scaffold_split.py +++ b/skfp/model_selection/splitters/randomized_scaffold_split.py @@ -125,6 +125,16 @@ def randomized_scaffold_train_test_split( "Does GNN Pretraining Help Molecular Representation?" Advances in Neural Information Processing Systems 35 (NeurIPS 2022). `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import randomized_scaffold_train_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, test_smiles = randomized_scaffold_train_test_split( + ... smiles, train_size=6, test_size=2, random_state=42 + ... ) + >>> train_smiles + ['C1CCCCC1', 'c1ccccc1'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -289,6 +299,16 @@ def randomized_scaffold_train_valid_test_split( "Does GNN Pretraining Help Molecular Representation?" Advances in Neural Information Processing Systems 35 (NeurIPS 2022). `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import randomized_scaffold_train_valid_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, valid_smiles, test_smiles = randomized_scaffold_train_valid_test_split( + ... smiles, train_size=6, valid_size=1, test_size=1, random_state=42 + ... ) + >>> train_smiles + ['c1ccccc1'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/scaffold_split.py b/skfp/model_selection/splitters/scaffold_split.py index fb792319..6c6f767a 100644 --- a/skfp/model_selection/splitters/scaffold_split.py +++ b/skfp/model_selection/splitters/scaffold_split.py @@ -117,6 +117,14 @@ def scaffold_train_test_split( .. [3] ` Bemis-Murcko scaffolds and their variants `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import scaffold_train_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, test_smiles = scaffold_train_test_split(smiles, train_size=6, test_size=2) + >>> train_smiles + ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -272,6 +280,16 @@ def scaffold_train_valid_test_split( .. [3] ` Bemis-Murcko scaffolds and their variants `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import scaffold_train_valid_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, valid_smiles, test_smiles = scaffold_train_valid_test_split( + ... smiles, train_size=6, valid_size=1, test_size=1 + ... ) + >>> train_smiles + ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data)