diff --git a/skfp/model_selection/splitters/butina_split.py b/skfp/model_selection/splitters/butina_split.py
index c947f445..6e9e184a 100644
--- a/skfp/model_selection/splitters/butina_split.py
+++ b/skfp/model_selection/splitters/butina_split.py
@@ -151,6 +151,14 @@ def butina_train_test_split(
.. [6] `Leland McInnes
"PyNNDescent for fast Approximate Nearest Neighbors"
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import butina_train_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> train_smiles, test_smiles = butina_train_test_split(smiles, train_size=0.75, test_size=0.25)
+ >>> train_smiles
+ ['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC']
"""
train_size, test_size = validate_train_test_split_sizes(
train_size, test_size, len(data)
@@ -336,6 +344,16 @@ def butina_train_valid_test_split(
.. [6] `Leland McInnes
"PyNNDescent for fast Approximate Nearest Neighbors"
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import butina_train_valid_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> train_smiles, valid_smiles, test_smiles = butina_train_valid_test_split(
+ ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25
+ ... )
+ >>> train_smiles
+ ['CCF', 'CC=O', 'CCO', 'CCC']
"""
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
train_size, valid_size, test_size, len(data)
diff --git a/skfp/model_selection/splitters/maxmin_split.py b/skfp/model_selection/splitters/maxmin_split.py
index 1917593c..54f5aa11 100644
--- a/skfp/model_selection/splitters/maxmin_split.py
+++ b/skfp/model_selection/splitters/maxmin_split.py
@@ -101,18 +101,28 @@ def maxmin_train_test_split(
.. [1] `Mark Ashton et al.
"Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions"
Quant. Struct.-Act. Relat., 21: 598-604
- _`
+ `_
.. [2] `Roger Sayle
"Improved RDKit implementation"
- _`
+ `_
.. [3] `Tim Dudgeon
"Revisiting the MaxMinPicker"
- _`
+ `_
.. [4] `Squonk - RDKit MaxMin Picker
- _`
+ `_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import maxmin_train_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> train_smiles, test_smiles = maxmin_train_test_split(
+ ... smiles, train_size=0.75, test_size=0.25, random_state=42
+ ... )
+ >>> train_smiles
+ ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
"""
data_size = len(data)
train_size, test_size = validate_train_test_split_sizes(
@@ -249,18 +259,28 @@ def maxmin_train_valid_test_split(
.. [1] `Mark Ashton et al.
"Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions"
Quant. Struct.-Act. Relat., 21: 598-604
- _`
+ `_
.. [2] `Roger Sayle
"Improved RDKit implementation"
- _`
+ `_
.. [3] `Tim Dudgeon
"Revisiting the MaxMinPicker"
- _`
+ `_
.. [4] `Squonk - RDKit MaxMin Picker
- _`
+ `_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import maxmin_train_valid_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> train_smiles, valid_smiles, test_smiles = maxmin_train_valid_test_split(
+ ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42
+ ... )
+ >>> train_smiles
+ ['CCCl', 'CCBr', 'CCI', 'CCF']
"""
data_size = len(data)
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
@@ -405,6 +425,17 @@ def maxmin_stratified_train_test_split(
See Also
--------
:func:`maxmin_train_test_split` : Regular MaxMin split.
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import maxmin_stratified_train_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> labels = [0, 0, 1, 1, 0, 1, 0, 1]
+ >>> train_smiles, test_smiles, train_labels, test_labels = maxmin_stratified_train_test_split(
+ ... smiles, labels, train_size=0.75, test_size=0.25, random_state=42
+ ... )
+ >>> train_smiles
+ ['CCO', 'CCBr', 'CCF', 'CCC', 'CCI', 'CC=O']
"""
data_arr = np.array(data)
labels = np.array(labels, dtype=int)
@@ -561,6 +592,19 @@ def maxmin_stratified_train_valid_test_split(
See Also
--------
:func:`maxmin_train_valid_test_split` : Regular MaxMin split.
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import maxmin_stratified_train_valid_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> labels = [0, 0, 1, 1, 0, 1, 0, 1]
+ >>> train_smiles, valid_smiles, test_smiles, train_labels, valid_labels, test_labels = (
+ ... maxmin_stratified_train_valid_test_split(
+ ... smiles, labels, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42
+ ... )
+ ... )
+ >>> train_smiles
+ ['CCBr', 'CCF', 'CCC', 'CCI']
"""
data_arr = np.array(data)
labels = np.array(labels, dtype=int)
diff --git a/skfp/model_selection/splitters/pubchem_split.py b/skfp/model_selection/splitters/pubchem_split.py
index c56e155e..4bc3a6d5 100644
--- a/skfp/model_selection/splitters/pubchem_split.py
+++ b/skfp/model_selection/splitters/pubchem_split.py
@@ -130,6 +130,16 @@ def pubchem_train_test_split(
"An update on PUG-REST: RESTful interface for programmatic access to PubChem."
Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570.
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import pubchem_train_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> train_smiles, test_smiles = pubchem_train_test_split(  # doctest: +SKIP
+ ... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1
+ ... )
+ >>> train_smiles  # doctest: +SKIP
+ ['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC']
"""
years = _get_pubchem_years(data, n_jobs, n_retries, verbose)
@@ -296,6 +306,16 @@ def pubchem_train_valid_test_split(
"An update on PUG-REST: RESTful interface for programmatic access to PubChem."
Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570.
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import pubchem_train_valid_test_split
+ >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O']
+ >>> train_smiles, valid_smiles, test_smiles = pubchem_train_valid_test_split(  # doctest: +SKIP
+ ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, n_jobs=1, n_retries=1, verbose=0
+ ... )
+ >>> train_smiles  # doctest: +SKIP
+ ['CCCl', 'CCI', 'CCO', 'CCN']
"""
years = _get_pubchem_years(data, n_jobs, n_retries, verbose)
@@ -381,7 +401,8 @@ def _get_cid_for_smiles(smiles: str, n_retries: int, verbosity: int) -> str | No
"""
Get PubChem CID from SMILES, or None if molecule cannot be found.
"""
- print(smiles)
+ if verbosity > 0:
+ print(smiles)
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{quote(smiles)}/cids/JSON"
response = None
diff --git a/skfp/model_selection/splitters/randomized_scaffold_split.py b/skfp/model_selection/splitters/randomized_scaffold_split.py
index d60198fc..828b7040 100644
--- a/skfp/model_selection/splitters/randomized_scaffold_split.py
+++ b/skfp/model_selection/splitters/randomized_scaffold_split.py
@@ -125,6 +125,16 @@ def randomized_scaffold_train_test_split(
"Does GNN Pretraining Help Molecular Representation?"
Advances in Neural Information Processing Systems 35 (NeurIPS 2022).
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import randomized_scaffold_train_test_split
+ >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
+ >>> train_smiles, test_smiles = randomized_scaffold_train_test_split(
+ ... smiles, train_size=6, test_size=2, random_state=42
+ ... )
+ >>> train_smiles
+ ['C1CCCCC1', 'c1ccccc1']
"""
train_size, test_size = validate_train_test_split_sizes(
train_size, test_size, len(data)
@@ -289,6 +299,16 @@ def randomized_scaffold_train_valid_test_split(
"Does GNN Pretraining Help Molecular Representation?"
Advances in Neural Information Processing Systems 35 (NeurIPS 2022).
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import randomized_scaffold_train_valid_test_split
+ >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
+ >>> train_smiles, valid_smiles, test_smiles = randomized_scaffold_train_valid_test_split(
+ ... smiles, train_size=6, valid_size=1, test_size=1, random_state=42
+ ... )
+ >>> train_smiles
+ ['c1ccccc1']
"""
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
train_size, valid_size, test_size, len(data)
diff --git a/skfp/model_selection/splitters/scaffold_split.py b/skfp/model_selection/splitters/scaffold_split.py
index fb792319..6c6f767a 100644
--- a/skfp/model_selection/splitters/scaffold_split.py
+++ b/skfp/model_selection/splitters/scaffold_split.py
@@ -117,6 +117,14 @@ def scaffold_train_test_split(
.. [3] ` Bemis-Murcko scaffolds and their variants
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import scaffold_train_test_split
+ >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
+ >>> train_smiles, test_smiles = scaffold_train_test_split(smiles, train_size=6, test_size=2)
+ >>> train_smiles
+ ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
"""
train_size, test_size = validate_train_test_split_sizes(
train_size, test_size, len(data)
@@ -272,6 +280,16 @@ def scaffold_train_valid_test_split(
.. [3] ` Bemis-Murcko scaffolds and their variants
`_
+
+ Examples
+ --------
+ >>> from skfp.model_selection.splitters import scaffold_train_valid_test_split
+ >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
+ >>> train_smiles, valid_smiles, test_smiles = scaffold_train_valid_test_split(
+ ... smiles, train_size=6, valid_size=1, test_size=1
+ ... )
+ >>> train_smiles
+ ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF']
"""
train_size, valid_size, test_size = validate_train_valid_test_split_sizes(
train_size, valid_size, test_size, len(data)