Skip to content

Commit 693d4ea

Browse files
authored
Substructural filters conditions (#475)
1 parent 5045a2e commit 693d4ea

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+765
-787
lines changed

examples/09_molecular_filters.ipynb

Lines changed: 91 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@
8181
"name": "stderr",
8282
"output_type": "stream",
8383
"text": [
84-
"<frozen importlib._bootstrap>:241: RuntimeWarning: to-Python converter for boost::shared_ptr<RDKit::FilterHierarchyMatcher> already registered; second conversion method ignored.\n",
85-
"<frozen importlib._bootstrap>:241: RuntimeWarning: to-Python converter for boost::shared_ptr<RDKit::FilterCatalogEntry> already registered; second conversion method ignored.\n"
84+
"<frozen importlib._bootstrap>:241: RuntimeWarning: to-Python converter for boost::shared_ptr<RDKit::FilterHierarchyMatcher> already registered; second conversion method ignored.\n"
8685
]
8786
}
8887
],
@@ -108,7 +107,7 @@
108107
},
109108
{
110109
"cell_type": "code",
111-
"execution_count": 5,
110+
"execution_count": 2,
112111
"id": "0f8eefe0",
113112
"metadata": {},
114113
"outputs": [],
@@ -123,7 +122,7 @@
123122
},
124123
{
125124
"cell_type": "code",
126-
"execution_count": 6,
125+
"execution_count": 3,
127126
"id": "3137351a",
128127
"metadata": {},
129128
"outputs": [],
@@ -136,7 +135,7 @@
136135
},
137136
{
138137
"cell_type": "code",
139-
"execution_count": 7,
138+
"execution_count": 4,
140139
"id": "ab862891",
141140
"metadata": {},
142141
"outputs": [
@@ -149,7 +148,7 @@
149148
" 'C[C@H](CCC(=O)O)CC1=CC=CC=C1']"
150149
]
151150
},
152-
"execution_count": 7,
151+
"execution_count": 4,
153152
"metadata": {},
154153
"output_type": "execute_result"
155154
}
@@ -161,7 +160,7 @@
161160
},
162161
{
163162
"cell_type": "code",
164-
"execution_count": 8,
163+
"execution_count": 5,
165164
"id": "93f94568",
166165
"metadata": {},
167166
"outputs": [
@@ -174,7 +173,7 @@
174173
" 'C[C@H](CCC(=O)O)CC1=CC=CC=C1']"
175174
]
176175
},
177-
"execution_count": 8,
176+
"execution_count": 5,
178177
"metadata": {},
179178
"output_type": "execute_result"
180179
}
@@ -186,7 +185,7 @@
186185
},
187186
{
188187
"cell_type": "code",
189-
"execution_count": 9,
188+
"execution_count": 6,
190189
"id": "cab9fc95",
191190
"metadata": {},
192191
"outputs": [
@@ -199,7 +198,7 @@
199198
" 'C[C@H](CCC(=O)O)CC1=CC=CC=C1']"
200199
]
201200
},
202-
"execution_count": 9,
201+
"execution_count": 6,
203202
"metadata": {},
204203
"output_type": "execute_result"
205204
}
@@ -211,7 +210,7 @@
211210
},
212211
{
213212
"cell_type": "code",
214-
"execution_count": 10,
213+
"execution_count": 7,
215214
"id": "19a21c0e",
216215
"metadata": {},
217216
"outputs": [
@@ -221,7 +220,7 @@
221220
"['CC(=O)OC1=CC=CC=C1C(=O)O', 'C[C@H](CCC(=O)O)CC1=CC=CC=C1']"
222221
]
223222
},
224-
"execution_count": 10,
223+
"execution_count": 7,
225224
"metadata": {},
226225
"output_type": "execute_result"
227226
}
@@ -241,10 +240,19 @@
241240
},
242241
{
243242
"cell_type": "code",
244-
"execution_count": 14,
243+
"execution_count": 8,
245244
"id": "02a47287",
246245
"metadata": {},
247-
"outputs": [],
246+
"outputs": [
247+
{
248+
"name": "stderr",
249+
"output_type": "stream",
250+
"text": [
251+
"/home/jakub/PycharmProjects/scikit-fingerprints/skfp/bases/base_filter.py:109: UserWarning: return_indicators is deprecated and will be removed in 2.0, use return_type instead\n",
252+
" warnings.warn(\n"
253+
]
254+
}
255+
],
248256
"source": [
249257
"mw_mask = MolecularWeightFilter(return_indicators=True)\n",
250258
"lipinski_mask = LipinskiFilter(return_indicators=True)\n",
@@ -254,7 +262,7 @@
254262
},
255263
{
256264
"cell_type": "code",
257-
"execution_count": 15,
265+
"execution_count": 9,
258266
"id": "4d53f1e4",
259267
"metadata": {},
260268
"outputs": [],
@@ -272,7 +280,7 @@
272280
},
273281
{
274282
"cell_type": "code",
275-
"execution_count": 16,
283+
"execution_count": 10,
276284
"id": "307f6e05",
277285
"metadata": {},
278286
"outputs": [],
@@ -285,7 +293,7 @@
285293
},
286294
{
287295
"cell_type": "code",
288-
"execution_count": 17,
296+
"execution_count": 11,
289297
"id": "11970880",
290298
"metadata": {},
291299
"outputs": [
@@ -384,7 +392,7 @@
384392
"4 True False False "
385393
]
386394
},
387-
"execution_count": 17,
395+
"execution_count": 11,
388396
"metadata": {},
389397
"output_type": "execute_result"
390398
}
@@ -405,39 +413,95 @@
405413
"cell_type": "markdown",
406414
"id": "a581bb4f-ba1e-4d96-afcc-c219afd442ac",
407415
"metadata": {},
408-
"source": "Substructural filters use sets of SMARTS patterns to define unwanted substructures. An example is Brenk filter ([docs](https://scikit-fingerprints.readthedocs.io/latest/modules/generated/skfp.filters.BrenkFilter.html)), designed to filter out molecules containing substructures with undesirable pharmacokinetics or toxicity, e.g. sulfates, phosphates, nitro groups. Other filters from this group often work based on similar principles, but differing in how aggressively they filter the molecules."
416+
"source": [
417+
"Substructural filters use sets of SMARTS patterns to define unwanted substructures. An example is the Brenk filter ([docs](https://scikit-fingerprints.readthedocs.io/latest/modules/generated/skfp.filters.BrenkFilter.html)), designed to filter out molecules containing substructures with undesirable pharmacokinetics or toxicity, e.g. sulfates, phosphates, nitro groups. Other filters from this group often work based on similar principles, but differ in how aggressively they filter the molecules."
418+
]
409419
},
410420
{
411421
"cell_type": "code",
412-
"execution_count": 84,
422+
"execution_count": 12,
413423
"id": "b0285ab1",
414424
"metadata": {
415425
"scrolled": true
416426
},
427+
"outputs": [],
428+
"source": [
429+
"from skfp.filters import BrenkFilter\n",
430+
"\n",
431+
"brenk_filter = BrenkFilter()"
432+
]
433+
},
434+
{
435+
"cell_type": "markdown",
436+
"id": "9af3ce4f",
437+
"metadata": {},
438+
"source": [
439+
"The meanings of filter conditions are available through the `.get_feature_names_out()` method. Generally, they are interpretable names given by their creators, rather than raw SMARTS patterns."
440+
]
441+
},
442+
{
443+
"cell_type": "code",
444+
"execution_count": 13,
445+
"id": "94b2989e-f6ac-46a7-b1d6-17662b8a4579",
446+
"metadata": {
447+
"scrolled": true
448+
},
417449
"outputs": [
418450
{
419451
"data": {
420452
"text/plain": [
421-
"['description', 'FilterSet', 'Reference', 'Scope']"
453+
"array(['>_2_ester_groups', '2-halo_pyridine', 'acid_halide',\n",
454+
" 'acyclic_C=C-O', 'acyl_cyanide', 'acyl_hydrazine', 'aldehyde',\n",
455+
" 'Aliphatic_long_chain', 'alkyl_halide', 'amidotetrazole',\n",
456+
" 'aniline', 'azepane', 'Azido_group', 'Azo_group', 'azocane',\n",
457+
" 'benzidine', 'beta-keto/anhydride', 'biotin_analogue',\n",
458+
" 'Carbo_cation/anion', 'catechol', 'charged_oxygen_or_sulfur_atoms',\n",
459+
" 'chinone_1', 'chinone_2', 'conjugated_nitrile_group',\n",
460+
" 'crown_ether', 'cumarine', 'cyanamide',\n",
461+
" 'cyanate_/aminonitrile_/thiocyanate', 'cyanohydrins',\n",
462+
" 'cycloheptane_1', 'cycloheptane_2', 'cyclooctane_1',\n",
463+
" 'cyclooctane_2', 'diaminobenzene_1', 'diaminobenzene_2',\n",
464+
" 'diaminobenzene_3', 'diazo_group', 'diketo_group', 'disulphide',\n",
465+
" 'enamine', 'ester_of_HOBT', 'four_member_lactones',\n",
466+
" 'halogenated_ring_1', 'halogenated_ring_2', 'heavy_metal',\n",
467+
" 'het-C-het_not_in_ring', 'hydantoin', 'hydrazine', 'hydroquinone',\n",
468+
" 'hydroxamic_acid', 'imine_1', 'imine_2', 'iodine', 'isocyanate',\n",
469+
" 'isolated_alkene', 'ketene', 'methylidene-1,3-dithiole',\n",
470+
" 'Michael_acceptor_1', 'Michael_acceptor_2', 'Michael_acceptor_3',\n",
471+
" 'Michael_acceptor_4', 'Michael_acceptor_5', 'N_oxide',\n",
472+
" 'N-acyl-2-amino-5-mercapto-1,3,4-_thiadiazole', 'N-C-halo',\n",
473+
" 'N-halo', 'N-hydroxyl_pyridine', 'nitro_group', 'N-nitroso',\n",
474+
" 'oxime_1', 'oxime_2', 'Oxygen-nitrogen_single_bond',\n",
475+
" 'Perfluorinated_chain', 'peroxide', 'phenol_ester',\n",
476+
" 'phenyl_carbonate', 'phosphor', 'phthalimide',\n",
477+
" 'Polycyclic_aromatic_hydrocarbon_1',\n",
478+
" 'Polycyclic_aromatic_hydrocarbon_2',\n",
479+
" 'Polycyclic_aromatic_hydrocarbon_3', 'polyene',\n",
480+
" 'quaternary_nitrogen_1', 'quaternary_nitrogen_2',\n",
481+
" 'quaternary_nitrogen_3', 'saponine_derivative', 'silicon_halogen',\n",
482+
" 'stilbene', 'sulfinic_acid', 'Sulfonic_acid_1', 'Sulfonic_acid_2',\n",
483+
" 'sulfonyl_cyanide', 'sulfur_oxygen_single_bond', 'sulphate',\n",
484+
" 'sulphur_nitrogen_single_bond', 'Thiobenzothiazole_1',\n",
485+
" 'thiobenzothiazole_2', 'Thiocarbonyl_group', 'thioester',\n",
486+
" 'thiol_1', 'thiol_2', 'Three-membered_heterocycle', 'triflate',\n",
487+
" 'triphenyl_methyl-silyl', 'triple_bond'], dtype='<U44')"
422488
]
423489
},
424-
"execution_count": 84,
490+
"execution_count": 13,
425491
"metadata": {},
426492
"output_type": "execute_result"
427493
}
428494
],
429495
"source": [
430-
"from skfp.filters import BrenkFilter\n",
431-
"\n",
432-
"brenk_filter = BrenkFilter()"
496+
"brenk_filter.get_feature_names_out()"
433497
]
434498
},
435499
{
436500
"cell_type": "markdown",
437-
"id": "9af3ce4f",
501+
"id": "6c6f153b-a8c9-4118-b51d-8f164f47c5d8",
438502
"metadata": {},
439503
"source": [
440-
"Patterns are saved in `._filters` attribute of the filter object. They are represented using RDKit `FilterCatalog` objects, which are quite efficient in checking the patterns, but make it challenging to inspect SMARTS patterns from Python. The first few patterns from BRENK and their meanings are:"
504+
"Underlying SMARTS patterns are represented using RDKit `FilterCatalog` objects, which are quite efficient in checking the patterns. Unfortunately, getting the actual SMARTS strings in Python is challenging, and they are more easily inferred from RDKit files. The first few patterns from BRENK and their meanings are:"
441505
]
442506
},
443507
{

skfp/bases/base_filter.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ class BaseFilter(ABC, BaseEstimator, TransformerMixin):
3434
3535
Parameters
3636
----------
37+
condition_names : list[str]
38+
Names of filter conditions, e.g. physicochemical properties and their limits,
39+
or SMARTS patterns.
40+
3741
allow_one_violation : bool, default=False
3842
Whether to allow violating one of the rules for a molecule. This makes the
3943
filter less restrictive.
@@ -85,13 +89,15 @@ class BaseFilter(ABC, BaseEstimator, TransformerMixin):
8589

8690
def __init__(
8791
self,
92+
condition_names: list[str],
8893
allow_one_violation: bool = False,
8994
return_type: str = "mol",
9095
return_indicators: bool = False,
9196
n_jobs: int | None = None,
9297
batch_size: int | None = None,
9398
verbose: int | dict = 0,
9499
):
100+
self.condition_names = condition_names
95101
self.allow_one_violation = allow_one_violation
96102
self.return_type = return_type
97103
self.return_indicators = return_indicators
@@ -127,13 +133,7 @@ def get_feature_names_out(self, input_features=None) -> np.ndarray:
127133
feature_names_out : ndarray of str objects
128134
Filter condition names.
129135
"""
130-
if not hasattr(self, "_condition_names"):
131-
raise AttributeError(
132-
f"Filter condition names not yet supported for "
133-
f"{self.__class__.__name__}"
134-
)
135-
136-
return np.array(self._condition_names)
136+
return np.array(self.condition_names)
137137

138138
def fit(self, X: Sequence[str | Mol], y: np.ndarray | None = None):
139139
"""Unused, kept for scikit-learn compatibility.

0 commit comments

Comments
 (0)