11import os .path
22import warnings
33from collections .abc import Sequence
4+ from numbers import Integral
45
6+ from joblib import effective_n_jobs
57from rdkit .Chem import Mol , SDMolSupplier , SDWriter
68from rdkit .Chem .PropertyMol import PropertyMol
79
810from skfp .bases import BasePreprocessor
911from skfp .utils import require_mols
12+ from skfp .utils .functions import _get_rdkit_version
13+
# Lowest RDKit version, as a tuple of integers, for which SDF files are read
# in parallel; it is compared against the tuple returned by
# ``_get_rdkit_version()``. Corresponds to the "2025.09.1" release named in
# the fallback warning message.
_MIN_MULTITHREADED_SDF_VERSION = (2025, 9, 1)
1015
1116
1217class MolFromSDFTransformer (BasePreprocessor ):
@@ -29,6 +34,11 @@ class MolFromSDFTransformer(BasePreprocessor):
2934 Remove explicit hydrogens from the molecule where possible, using RDKit
3035 implicit hydrogens instead.
3136
37+ n_jobs : int, default=None
38+ The number of jobs to use when reading molecules from an SDF file path.
39+ If ``n_jobs > 1`` and the installed RDKit version is at least ``2025.09.1``
40+ the file is read in parallel. Raw SDF text input is always processed sequentially.
41+
3242 References
3343 ----------
3444 .. [1] `RDKit SDMolSupplier documentation
@@ -50,14 +60,16 @@ class MolFromSDFTransformer(BasePreprocessor):
5060 _parameter_constraints : dict = {
5161 "sanitize" : ["boolean" ],
5262 "remove_hydrogens" : ["boolean" ],
63+ "n_jobs" : [Integral , None ],
5364 }
5465
5566 def __init__ (
5667 self ,
5768 sanitize : bool = True ,
5869 remove_hydrogens : bool = True ,
70+ n_jobs : int | None = None ,
5971 ):
60- super ().__init__ ()
72+ super ().__init__ (n_jobs = n_jobs )
6173 self .sanitize = sanitize
6274 self .remove_hydrogens = remove_hydrogens
6375
@@ -84,12 +96,9 @@ def transform(self, X: str, copy: bool = False) -> list[Mol]: # type: ignore[ov
8496 if not os .path .exists (X ):
8597 raise FileNotFoundError (f"SDF file at path '{ X } ' not found" )
8698
87- with open (X ) as file :
88- X = file .read ()
89-
90- supplier = SDMolSupplier ()
91- supplier .SetData (X , sanitize = self .sanitize , removeHs = self .remove_hydrogens )
92- mols = list (supplier )
99+ mols = self ._read_sdf_file (X )
100+ else :
101+ mols = self ._read_sdf_text (X )
93102
94103 if not mols :
95104 warnings .warn ("No molecules detected in provided SDF file" )
@@ -99,6 +108,61 @@ def transform(self, X: str, copy: bool = False) -> list[Mol]: # type: ignore[ov
99108 def _transform_batch (self , X ):
100109 pass # unused
101110
def _read_sdf_file(self, filepath: str) -> list[Mol]:
    """
    Load molecules from an SDF file on disk.

    The number of workers is resolved from ``self.n_jobs`` with joblib's
    ``effective_n_jobs``. When more than one worker is requested and the
    installed RDKit is not older than ``_MIN_MULTITHREADED_SDF_VERSION``,
    reading is delegated to ``_read_sdf_file_parallel``. Otherwise the file
    is read with a plain ``SDMolSupplier``; if parallelism was requested but
    RDKit is too old, a warning is emitted first.

    Parameters
    ----------
    filepath : str
        Path to the SDF file.

    Returns
    -------
    mols : list of Mol
        Everything yielded by the supplier, in supplier order.
    """
    workers = effective_n_jobs(self.n_jobs)

    if workers > 1:
        installed = _get_rdkit_version()
        if not installed < _MIN_MULTITHREADED_SDF_VERSION:
            return self._read_sdf_file_parallel(filepath, workers)

        # Parallelism was requested, but this RDKit build is too old for it.
        installed_str = ".".join(map(str, installed))
        warnings.warn(
            "Parallel SDF reading requires RDKit >= 2025.09.1. "
            f"Installed version is {installed_str}. "
            "Falling back to sequential loading."
        )

    # NOTE(review): unlike the parallel reader, no ``None`` filtering is done
    # here - confirm both paths treat unparsable records the same way.
    supplier = SDMolSupplier(
        filepath, sanitize=self.sanitize, removeHs=self.remove_hydrogens
    )
    return list(supplier)
132+
def _read_sdf_file_parallel(self, filepath: str, n_jobs: int) -> list[Mol]:
    """
    Load molecules from an SDF file with RDKit's multithreaded supplier.

    Each molecule is paired with the record ID reported by the supplier
    right after it is yielded, and the pairs are sorted by that ID before
    the molecules are returned. ``None`` items yielded by the supplier are
    skipped.

    Parameters
    ----------
    filepath : str
        Path to the SDF file.

    n_jobs : int
        Passed to the supplier as ``numWriterThreads``.

    Returns
    -------
    mols : list of Mol
        Non-``None`` molecules, ordered by ascending record ID.
    """
    # Imported lazily: callers only reach this method after the RDKit
    # version check, so older installations never attempt this import.
    from rdkit.Chem import MultithreadedSDMolSupplier

    indexed_mols = []
    with MultithreadedSDMolSupplier(
        filepath,
        sanitize=self.sanitize,
        removeHs=self.remove_hydrogens,
        numWriterThreads=n_jobs,
    ) as supplier:
        for mol in supplier:
            # multithreaded supplier may yield None duplicates
            if mol is None:
                continue
            indexed_mols.append((supplier.GetLastRecordId(), mol))

    ordered = sorted(indexed_mols, key=lambda pair: pair[0])
    return [mol for _, mol in ordered]
150+
def _read_sdf_text(self, sdf_text: str) -> list[Mol]:
    """
    Parse molecules from raw SDF text.

    Text input is always handled by a single ``SDMolSupplier``. If
    ``self.n_jobs`` resolves to more than one worker, a warning explains
    that sequential loading is used instead.

    Parameters
    ----------
    sdf_text : str
        Contents of an SDF file.

    Returns
    -------
    mols : list of Mol
        Everything yielded by the supplier, in supplier order.
    """
    parallel_requested = effective_n_jobs(self.n_jobs) > 1
    if parallel_requested:
        warnings.warn(
            "Parallel SDF reading requires a file path. Falling back to sequential "
            "loading for raw SDF text input."
        )

    supplier = SDMolSupplier()
    supplier.SetData(
        sdf_text, sanitize=self.sanitize, removeHs=self.remove_hydrogens
    )
    return list(supplier)
165+
102166
103167class MolToSDFTransformer (BasePreprocessor ):
104168 """
0 commit comments