From a148b7f6220d5641a48981b80f7aa8633a233eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Stefanik?= Date: Tue, 28 Oct 2025 20:52:29 +0000 Subject: [PATCH 1/4] Add examples to train-test split functions in various splitters --- .../model_selection/splitters/butina_split.py | 24 ++++ .../model_selection/splitters/maxmin_split.py | 114 ++++++++++++++++++ .../splitters/pubchem_split.py | 42 +++++++ .../splitters/randomized_scaffold_split.py | 26 ++++ .../splitters/scaffold_split.py | 24 ++++ 5 files changed, 230 insertions(+) diff --git a/skfp/model_selection/splitters/butina_split.py b/skfp/model_selection/splitters/butina_split.py index c947f445..74c796c4 100644 --- a/skfp/model_selection/splitters/butina_split.py +++ b/skfp/model_selection/splitters/butina_split.py @@ -151,6 +151,16 @@ def butina_train_test_split( .. [6] `Leland McInnes "PyNNDescent for fast Approximate Nearest Neighbors" `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import butina_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, test_smiles = butina_train_test_split(smiles, train_size=0.75, test_size=0.25) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCN', 'CCCl'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -336,6 +346,20 @@ def butina_train_valid_test_split( .. [6] `Leland McInnes "PyNNDescent for fast Approximate Nearest Neighbors" `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import butina_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, valid_smiles, test_smiles = butina_train_valid_test_split( + ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25 + ... 
) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCF', 'CC=O', 'CCO', 'CCC'] + >>> print('Valid SMILES:', valid_smiles) + Valid SMILES: ['CCBr', 'CCI'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCN', 'CCCl'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/maxmin_split.py b/skfp/model_selection/splitters/maxmin_split.py index 1917593c..6c51e14c 100644 --- a/skfp/model_selection/splitters/maxmin_split.py +++ b/skfp/model_selection/splitters/maxmin_split.py @@ -113,6 +113,27 @@ def maxmin_train_test_split( .. [4] `Squonk - RDKit MaxMin Picker _` + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, test_smiles = maxmin_train_test_split( + ... smiles, train_size=0.75, test_size=0.25, random_state=42 + ... ) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCC', 'CC=O'] + >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', + ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] + >>> train_smiles, test_smiles, train_names, test_names = maxmin_train_test_split( + ... smiles, additional_names, train_size=0.75, test_size=0.25, random_state=42 + ... ) + >>> print('Train Names:', train_names) + Train Names: ['ethanol', 'ethylamine', 'chloroethane', 'bromoethane', 'iodoethane', 'fluoroethane'] + >>> print('Test Names:', test_names) + Test Names: ['propane', 'acetaldehyde'] """ data_size = len(data) train_size, test_size = validate_train_test_split_sizes( @@ -261,6 +282,34 @@ def maxmin_train_valid_test_split( .. 
[4] `Squonk - RDKit MaxMin Picker _` + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, valid_smiles, test_smiles = maxmin_train_valid_test_split( + ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 + ... ) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCCl', 'CCBr', 'CCI', 'CCF'] + >>> print('Valid SMILES:', valid_smiles) + Valid SMILES: ['CCO', 'CCN'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCC', 'CC=O'] + >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', + ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] + >>> train_smiles, valid_smiles, test_smiles, train_names, valid_names, test_names = ( + ... maxmin_train_valid_test_split( + ... smiles, additional_names, + ... train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 + ... ) + ... ) + >>> print('Train Names:', train_names) + Train Names: ['chloroethane', 'bromoethane', 'iodoethane', 'fluoroethane'] + >>> print('Valid Names:', valid_names) + Valid Names: ['ethanol', 'ethylamine'] + >>> print('Test Names:', test_names) + Test Names: ['propane', 'acetaldehyde'] """ data_size = len(data) train_size, valid_size, test_size = validate_train_valid_test_split_sizes( @@ -405,6 +454,34 @@ def maxmin_stratified_train_test_split( See Also -------- :func:`maxmin_train_test_split` : Regular MaxMin split. + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_stratified_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> labels = [0, 0, 1, 1, 0, 1, 0, 1] + >>> train_smiles, test_smiles, train_labels, test_labels = maxmin_stratified_train_test_split( + ... smiles, labels, train_size=0.75, test_size=0.25, random_state=42 + ... 
) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCO', 'CCBr', 'CCF', 'CCC', 'CCI', 'CC=O'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCN', 'CCCl'] + >>> print('Train Labels:', train_labels) + Train Labels: [0 0 0 1 1 1] + >>> print('Test Labels:', test_labels) + Test Labels: [0 1] + >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', + ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] + >>> train_smiles, test_smiles, train_labels, test_labels, train_names, test_names = ( + ... maxmin_stratified_train_test_split( + ... smiles, labels, additional_names, train_size=0.75, test_size=0.25, random_state=42 + ... ) + ... ) + >>> print('Train Names:', train_names) + Train Names: ['ethanol', 'bromoethane', 'fluoroethane', 'propane', 'iodoethane', 'acetaldehyde'] + >>> print('Test Names:', test_names) + Test Names: ['ethylamine', 'chloroethane'] """ data_arr = np.array(data) labels = np.array(labels, dtype=int) @@ -561,6 +638,43 @@ def maxmin_stratified_train_valid_test_split( See Also -------- :func:`maxmin_train_valid_test_split` : Regular MaxMin split. + + Examples + -------- + >>> from skfp.model_selection.splitters import maxmin_stratified_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> labels = [0, 0, 1, 1, 0, 1, 0, 1] + >>> train_smiles, valid_smiles, test_smiles, train_labels, valid_labels, test_labels = ( + ... maxmin_stratified_train_valid_test_split( + ... smiles, labels, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 + ... ) + ... 
) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCBr', 'CCF', 'CCC', 'CCI'] + >>> print('Valid SMILES:', valid_smiles) + Valid SMILES: ['CCO', 'CC=O'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCN', 'CCCl'] + >>> print('Train Labels:', train_labels) + Train Labels: [0 0 1 1] + >>> print('Valid Labels:', valid_labels) + Valid Labels: [0 1] + >>> print('Test Labels:', test_labels) + Test Labels: [0 1] + >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', + ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] + >>> res = maxmin_stratified_train_valid_test_split( + ... smiles, labels, additional_names, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 + ... ) + >>> len(res) + 9 + >>> train_smiles, valid_smiles, test_smiles, train_labels, valid_labels, test_labels, train_names, valid_names, test_names = res + >>> print('Train Names:', train_names) + Train Names: ['bromoethane', 'fluoroethane', 'propane', 'iodoethane'] + >>> print('Valid Names:', valid_names) + Valid Names: ['ethanol', 'acetaldehyde'] + >>> print('Test Names:', test_names) + Test Names: ['ethylamine', 'chloroethane'] """ data_arr = np.array(data) labels = np.array(labels, dtype=int) diff --git a/skfp/model_selection/splitters/pubchem_split.py b/skfp/model_selection/splitters/pubchem_split.py index c56e155e..37090601 100644 --- a/skfp/model_selection/splitters/pubchem_split.py +++ b/skfp/model_selection/splitters/pubchem_split.py @@ -130,6 +130,26 @@ def pubchem_train_test_split( "An update on PUG-REST: RESTful interface for programmatic access to PubChem." Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570. `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import pubchem_train_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, test_smiles = pubchem_train_test_split( + ... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1, verbose=0 + ... 
) + CCO + CCN + CCC + CCCl + CCBr + CCI + CCF + CC=O + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CC=O', 'CCF'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) @@ -296,6 +316,28 @@ def pubchem_train_valid_test_split( "An update on PUG-REST: RESTful interface for programmatic access to PubChem." Nucleic Acids Res. 2018 Jul 2;46(W1):W563-W570. `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import pubchem_train_valid_test_split + >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] + >>> train_smiles, valid_smiles, test_smiles = pubchem_train_valid_test_split( + ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, n_jobs=1, n_retries=1, verbose=0 + ... ) + CCO + CCN + CCC + CCCl + CCBr + CCI + CCF + CC=O + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCCl', 'CCI', 'CCO', 'CCN'] + >>> print('Valid SMILES:', valid_smiles) + Valid SMILES: ['CCBr', 'CCC'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CC=O', 'CCF'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) diff --git a/skfp/model_selection/splitters/randomized_scaffold_split.py b/skfp/model_selection/splitters/randomized_scaffold_split.py index d60198fc..754ffbb1 100644 --- a/skfp/model_selection/splitters/randomized_scaffold_split.py +++ b/skfp/model_selection/splitters/randomized_scaffold_split.py @@ -125,6 +125,18 @@ def randomized_scaffold_train_test_split( "Does GNN Pretraining Help Molecular Representation?" Advances in Neural Information Processing Systems 35 (NeurIPS 2022). `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import randomized_scaffold_train_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, test_smiles = randomized_scaffold_train_test_split( + ... 
smiles, train_size=6, test_size=2, random_state=42 + ... ) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['C1CCCCC1', 'c1ccccc1'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -289,6 +301,20 @@ def randomized_scaffold_train_valid_test_split( "Does GNN Pretraining Help Molecular Representation?" Advances in Neural Information Processing Systems 35 (NeurIPS 2022). `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import randomized_scaffold_train_valid_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, valid_smiles, test_smiles = randomized_scaffold_train_valid_test_split( + ... smiles, train_size=6, valid_size=1, test_size=1, random_state=42 + ... ) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['c1ccccc1'] + >>> print('Valid SMILES:', valid_smiles) + Valid SMILES: ['C1CCCCC1'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/scaffold_split.py b/skfp/model_selection/splitters/scaffold_split.py index fb792319..3897f1fb 100644 --- a/skfp/model_selection/splitters/scaffold_split.py +++ b/skfp/model_selection/splitters/scaffold_split.py @@ -117,6 +117,16 @@ def scaffold_train_test_split( .. 
[3] ` Bemis-Murcko scaffolds and their variants `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import scaffold_train_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, test_smiles = scaffold_train_test_split(smiles, train_size=6, test_size=2) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['c1ccccc1', 'C1CCCCC1'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -272,6 +282,20 @@ def scaffold_train_valid_test_split( .. [3] ` Bemis-Murcko scaffolds and their variants `_ + + Examples + -------- + >>> from skfp.model_selection.splitters import scaffold_train_valid_test_split + >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles, valid_smiles, test_smiles = scaffold_train_valid_test_split( + ... smiles, train_size=6, valid_size=1, test_size=1 + ... 
) + >>> print('Train SMILES:', train_smiles) + Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> print('Valid SMILES:', valid_smiles) + Valid SMILES: ['C1CCCCC1'] + >>> print('Test SMILES:', test_smiles) + Test SMILES: ['c1ccccc1'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) From 46d8c426d1dcf8b897aa60557fc250f7dcff0a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Stefanik?= Date: Tue, 28 Oct 2025 21:03:16 +0000 Subject: [PATCH 2/4] Fix formatting of references in maxmin_split.py documentation --- skfp/model_selection/splitters/maxmin_split.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/skfp/model_selection/splitters/maxmin_split.py b/skfp/model_selection/splitters/maxmin_split.py index 6c51e14c..0daf7f90 100644 --- a/skfp/model_selection/splitters/maxmin_split.py +++ b/skfp/model_selection/splitters/maxmin_split.py @@ -101,18 +101,18 @@ def maxmin_train_test_split( .. [1] `Mark Ashton et al. "Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions" Quant. Struct.-Act. Relat., 21: 598-604 - _` + `_ .. [2] `Roger Sayle "Improved RDKit implementation" - _` + `_ .. [3] `Tim Dudgeon "Revisiting the MaxMinPicker" - _` + `_ .. [4] `Squonk - RDKit MaxMin Picker - _` + `_ Examples -------- @@ -270,18 +270,18 @@ def maxmin_train_valid_test_split( .. [1] `Mark Ashton et al. "Identification of Diverse Database Subsets using Property-Based and Fragment-Based Molecular Descriptions" Quant. Struct.-Act. Relat., 21: 598-604 - _` + `_ .. [2] `Roger Sayle "Improved RDKit implementation" - _` + `_ .. [3] `Tim Dudgeon "Revisiting the MaxMinPicker" - _` + `_ .. 
[4] `Squonk - RDKit MaxMin Picker - _` + `_ Examples -------- From 727cadc862de4bc02e139e2b402f9038b10a9a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Stefanik?= Date: Thu, 30 Oct 2025 19:52:01 +0000 Subject: [PATCH 3/4] Remove unnecessary lines, fix verbosity issue --- .../model_selection/splitters/butina_split.py | 6 -- .../model_selection/splitters/maxmin_split.py | 70 ------------------- .../splitters/pubchem_split.py | 27 +------ .../splitters/randomized_scaffold_split.py | 6 -- .../splitters/scaffold_split.py | 6 -- 5 files changed, 3 insertions(+), 112 deletions(-) diff --git a/skfp/model_selection/splitters/butina_split.py b/skfp/model_selection/splitters/butina_split.py index 74c796c4..c6dec33a 100644 --- a/skfp/model_selection/splitters/butina_split.py +++ b/skfp/model_selection/splitters/butina_split.py @@ -159,8 +159,6 @@ def butina_train_test_split( >>> train_smiles, test_smiles = butina_train_test_split(smiles, train_size=0.75, test_size=0.25) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCN', 'CCCl'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -356,10 +354,6 @@ def butina_train_valid_test_split( ... ) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCF', 'CC=O', 'CCO', 'CCC'] - >>> print('Valid SMILES:', valid_smiles) - Valid SMILES: ['CCBr', 'CCI'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCN', 'CCCl'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/maxmin_split.py b/skfp/model_selection/splitters/maxmin_split.py index 0daf7f90..976b1031 100644 --- a/skfp/model_selection/splitters/maxmin_split.py +++ b/skfp/model_selection/splitters/maxmin_split.py @@ -123,17 +123,6 @@ def maxmin_train_test_split( ... 
) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCC', 'CC=O'] - >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', - ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] - >>> train_smiles, test_smiles, train_names, test_names = maxmin_train_test_split( - ... smiles, additional_names, train_size=0.75, test_size=0.25, random_state=42 - ... ) - >>> print('Train Names:', train_names) - Train Names: ['ethanol', 'ethylamine', 'chloroethane', 'bromoethane', 'iodoethane', 'fluoroethane'] - >>> print('Test Names:', test_names) - Test Names: ['propane', 'acetaldehyde'] """ data_size = len(data) train_size, test_size = validate_train_test_split_sizes( @@ -292,24 +281,6 @@ def maxmin_train_valid_test_split( ... ) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCCl', 'CCBr', 'CCI', 'CCF'] - >>> print('Valid SMILES:', valid_smiles) - Valid SMILES: ['CCO', 'CCN'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCC', 'CC=O'] - >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', - ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] - >>> train_smiles, valid_smiles, test_smiles, train_names, valid_names, test_names = ( - ... maxmin_train_valid_test_split( - ... smiles, additional_names, - ... train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 - ... ) - ... ) - >>> print('Train Names:', train_names) - Train Names: ['chloroethane', 'bromoethane', 'iodoethane', 'fluoroethane'] - >>> print('Valid Names:', valid_names) - Valid Names: ['ethanol', 'ethylamine'] - >>> print('Test Names:', test_names) - Test Names: ['propane', 'acetaldehyde'] """ data_size = len(data) train_size, valid_size, test_size = validate_train_valid_test_split_sizes( @@ -465,23 +436,6 @@ def maxmin_stratified_train_test_split( ... 
) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCO', 'CCBr', 'CCF', 'CCC', 'CCI', 'CC=O'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCN', 'CCCl'] - >>> print('Train Labels:', train_labels) - Train Labels: [0 0 0 1 1 1] - >>> print('Test Labels:', test_labels) - Test Labels: [0 1] - >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', - ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] - >>> train_smiles, test_smiles, train_labels, test_labels, train_names, test_names = ( - ... maxmin_stratified_train_test_split( - ... smiles, labels, additional_names, train_size=0.75, test_size=0.25, random_state=42 - ... ) - ... ) - >>> print('Train Names:', train_names) - Train Names: ['ethanol', 'bromoethane', 'fluoroethane', 'propane', 'iodoethane', 'acetaldehyde'] - >>> print('Test Names:', test_names) - Test Names: ['ethylamine', 'chloroethane'] """ data_arr = np.array(data) labels = np.array(labels, dtype=int) @@ -651,30 +605,6 @@ def maxmin_stratified_train_valid_test_split( ... ) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCBr', 'CCF', 'CCC', 'CCI'] - >>> print('Valid SMILES:', valid_smiles) - Valid SMILES: ['CCO', 'CC=O'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCN', 'CCCl'] - >>> print('Train Labels:', train_labels) - Train Labels: [0 0 1 1] - >>> print('Valid Labels:', valid_labels) - Valid Labels: [0 1] - >>> print('Test Labels:', test_labels) - Test Labels: [0 1] - >>> additional_names = ['ethanol', 'ethylamine', 'propane', 'chloroethane', - ... 'bromoethane', 'iodoethane', 'fluoroethane', 'acetaldehyde'] - >>> res = maxmin_stratified_train_valid_test_split( - ... smiles, labels, additional_names, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 - ... 
) - >>> len(res) - 9 - >>> train_smiles, valid_smiles, test_smiles, train_labels, valid_labels, test_labels, train_names, valid_names, test_names = res - >>> print('Train Names:', train_names) - Train Names: ['bromoethane', 'fluoroethane', 'propane', 'iodoethane'] - >>> print('Valid Names:', valid_names) - Valid Names: ['ethanol', 'acetaldehyde'] - >>> print('Test Names:', test_names) - Test Names: ['ethylamine', 'chloroethane'] """ data_arr = np.array(data) labels = np.array(labels, dtype=int) diff --git a/skfp/model_selection/splitters/pubchem_split.py b/skfp/model_selection/splitters/pubchem_split.py index 37090601..42e1f19e 100644 --- a/skfp/model_selection/splitters/pubchem_split.py +++ b/skfp/model_selection/splitters/pubchem_split.py @@ -136,20 +136,10 @@ def pubchem_train_test_split( >>> from skfp.model_selection.splitters import pubchem_train_test_split >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] >>> train_smiles, test_smiles = pubchem_train_test_split( - ... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1, verbose=0 + ... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1 ... ) - CCO - CCN - CCC - CCCl - CCBr - CCI - CCF - CC=O >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CC=O', 'CCF'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) @@ -324,20 +314,8 @@ def pubchem_train_valid_test_split( >>> train_smiles, valid_smiles, test_smiles = pubchem_train_valid_test_split( ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, n_jobs=1, n_retries=1, verbose=0 ... 
) - CCO - CCN - CCC - CCCl - CCBr - CCI - CCF - CC=O >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCCl', 'CCI', 'CCO', 'CCN'] - >>> print('Valid SMILES:', valid_smiles) - Valid SMILES: ['CCBr', 'CCC'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CC=O', 'CCF'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) @@ -423,7 +401,8 @@ def _get_cid_for_smiles(smiles: str, n_retries: int, verbosity: int) -> str | No """ Get PubChem CID from SMILES, or None if molecule cannot be found. """ - print(smiles) + if verbosity > 0: + print(smiles) url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{quote(smiles)}/cids/JSON" response = None diff --git a/skfp/model_selection/splitters/randomized_scaffold_split.py b/skfp/model_selection/splitters/randomized_scaffold_split.py index 754ffbb1..33966f8d 100644 --- a/skfp/model_selection/splitters/randomized_scaffold_split.py +++ b/skfp/model_selection/splitters/randomized_scaffold_split.py @@ -135,8 +135,6 @@ def randomized_scaffold_train_test_split( ... ) >>> print('Train SMILES:', train_smiles) Train SMILES: ['C1CCCCC1', 'c1ccccc1'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -311,10 +309,6 @@ def randomized_scaffold_train_valid_test_split( ... 
) >>> print('Train SMILES:', train_smiles) Train SMILES: ['c1ccccc1'] - >>> print('Valid SMILES:', valid_smiles) - Valid SMILES: ['C1CCCCC1'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/scaffold_split.py b/skfp/model_selection/splitters/scaffold_split.py index 3897f1fb..af4cb75c 100644 --- a/skfp/model_selection/splitters/scaffold_split.py +++ b/skfp/model_selection/splitters/scaffold_split.py @@ -125,8 +125,6 @@ def scaffold_train_test_split( >>> train_smiles, test_smiles = scaffold_train_test_split(smiles, train_size=6, test_size=2) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['c1ccccc1', 'C1CCCCC1'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -292,10 +290,6 @@ def scaffold_train_valid_test_split( ... 
) >>> print('Train SMILES:', train_smiles) Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] - >>> print('Valid SMILES:', valid_smiles) - Valid SMILES: ['C1CCCCC1'] - >>> print('Test SMILES:', test_smiles) - Test SMILES: ['c1ccccc1'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) From 6bb0503d498880060e5c7090512664290761253e Mon Sep 17 00:00:00 2001 From: Jakub Adamczyk Date: Fri, 31 Oct 2025 10:08:03 +0100 Subject: [PATCH 4/4] Simplify examples --- skfp/model_selection/splitters/butina_split.py | 8 ++++---- skfp/model_selection/splitters/maxmin_split.py | 8 ++++---- skfp/model_selection/splitters/pubchem_split.py | 8 ++++---- .../splitters/randomized_scaffold_split.py | 8 ++++---- skfp/model_selection/splitters/scaffold_split.py | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/skfp/model_selection/splitters/butina_split.py b/skfp/model_selection/splitters/butina_split.py index c6dec33a..6e9e184a 100644 --- a/skfp/model_selection/splitters/butina_split.py +++ b/skfp/model_selection/splitters/butina_split.py @@ -157,8 +157,8 @@ def butina_train_test_split( >>> from skfp.model_selection.splitters import butina_train_test_split >>> smiles = ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI', 'CCF', 'CC=O'] >>> train_smiles, test_smiles = butina_train_test_split(smiles, train_size=0.75, test_size=0.25) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC'] + >>> train_smiles + ['CCBr', 'CCI', 'CCF', 'CC=O', 'CCO', 'CCC'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -352,8 +352,8 @@ def butina_train_valid_test_split( >>> train_smiles, valid_smiles, test_smiles = butina_train_valid_test_split( ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25 ... 
) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCF', 'CC=O', 'CCO', 'CCC'] + >>> train_smiles + ['CCF', 'CC=O', 'CCO', 'CCC'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/maxmin_split.py b/skfp/model_selection/splitters/maxmin_split.py index 976b1031..54f5aa11 100644 --- a/skfp/model_selection/splitters/maxmin_split.py +++ b/skfp/model_selection/splitters/maxmin_split.py @@ -121,8 +121,8 @@ def maxmin_train_test_split( >>> train_smiles, test_smiles = maxmin_train_test_split( ... smiles, train_size=0.75, test_size=0.25, random_state=42 ... ) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles + ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ data_size = len(data) train_size, test_size = validate_train_test_split_sizes( @@ -279,8 +279,8 @@ def maxmin_train_valid_test_split( >>> train_smiles, valid_smiles, test_smiles = maxmin_train_valid_test_split( ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, random_state=42 ... ) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles + ['CCCl', 'CCBr', 'CCI', 'CCF'] """ data_size = len(data) train_size, valid_size, test_size = validate_train_valid_test_split_sizes( diff --git a/skfp/model_selection/splitters/pubchem_split.py b/skfp/model_selection/splitters/pubchem_split.py index 42e1f19e..4bc3a6d5 100644 --- a/skfp/model_selection/splitters/pubchem_split.py +++ b/skfp/model_selection/splitters/pubchem_split.py @@ -138,8 +138,8 @@ def pubchem_train_test_split( >>> train_smiles, test_smiles = pubchem_train_test_split( ... smiles, train_size=0.75, test_size=0.25, n_jobs=1, n_retries=1 ... 
) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC'] + >>> train_smiles + ['CCCl', 'CCI', 'CCO', 'CCN', 'CCBr', 'CCC'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) @@ -314,8 +314,8 @@ def pubchem_train_valid_test_split( >>> train_smiles, valid_smiles, test_smiles = pubchem_train_valid_test_split( ... smiles, train_size=0.5, valid_size=0.25, test_size=0.25, n_jobs=1, n_retries=1, verbose=0 ... ) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCCl', 'CCI', 'CCO', 'CCN'] + >>> train_smiles + ['CCCl', 'CCI', 'CCO', 'CCN'] """ years = _get_pubchem_years(data, n_jobs, n_retries, verbose) diff --git a/skfp/model_selection/splitters/randomized_scaffold_split.py b/skfp/model_selection/splitters/randomized_scaffold_split.py index 33966f8d..828b7040 100644 --- a/skfp/model_selection/splitters/randomized_scaffold_split.py +++ b/skfp/model_selection/splitters/randomized_scaffold_split.py @@ -133,8 +133,8 @@ def randomized_scaffold_train_test_split( >>> train_smiles, test_smiles = randomized_scaffold_train_test_split( ... smiles, train_size=6, test_size=2, random_state=42 ... ) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['C1CCCCC1', 'c1ccccc1'] + >>> train_smiles + ['C1CCCCC1', 'c1ccccc1'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -307,8 +307,8 @@ def randomized_scaffold_train_valid_test_split( >>> train_smiles, valid_smiles, test_smiles = randomized_scaffold_train_valid_test_split( ... smiles, train_size=6, valid_size=1, test_size=1, random_state=42 ... 
) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['c1ccccc1'] + >>> train_smiles + ['c1ccccc1'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data) diff --git a/skfp/model_selection/splitters/scaffold_split.py b/skfp/model_selection/splitters/scaffold_split.py index af4cb75c..6c6f767a 100644 --- a/skfp/model_selection/splitters/scaffold_split.py +++ b/skfp/model_selection/splitters/scaffold_split.py @@ -123,8 +123,8 @@ def scaffold_train_test_split( >>> from skfp.model_selection.splitters import scaffold_train_test_split >>> smiles = ['c1ccccc1', 'C1CCCCC1', 'CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] >>> train_smiles, test_smiles = scaffold_train_test_split(smiles, train_size=6, test_size=2) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles + ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, test_size = validate_train_test_split_sizes( train_size, test_size, len(data) @@ -288,8 +288,8 @@ def scaffold_train_valid_test_split( >>> train_smiles, valid_smiles, test_smiles = scaffold_train_valid_test_split( ... smiles, train_size=6, valid_size=1, test_size=1 ... ) - >>> print('Train SMILES:', train_smiles) - Train SMILES: ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] + >>> train_smiles + ['CCO', 'CCN', 'CCCl', 'CCBr', 'CCI', 'CCF'] """ train_size, valid_size, test_size = validate_train_valid_test_split_sizes( train_size, valid_size, test_size, len(data)