Add canonicalize_smiles flag to ChemDataReader

sfluegel05 · sfluegel05 · commit e85a9c19a2a3 · 2025-08-01T14:06:05.000+02:00
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
@@ -177,6 +177,11 @@ class ChemDataReader(TokenIndexerReader):
 
     COLLATOR = RaggedCollator
 
+    def __init__(self, canonicalize_smiles=True, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.canonicalize_smiles = canonicalize_smiles
+        print(f"Using SMILES canonicalization: {self.canonicalize_smiles}")
+
     @classmethod
     def name(cls) -> str:
         """Returns the name of the data reader."""
@@ -192,13 +197,14 @@ def _read_data(self, raw_data: str) -> List[int]:
         Returns:
             List[int]: A list of integers representing the indices of the SMILES tokens.
         """
-        try:
-            mol = Chem.MolFromSmiles(raw_data.strip())
-            if mol is not None:
-                raw_data = Chem.MolToSmiles(mol, canonical=True)
-        except Exception as e:
-            print(f"RDKit failed to process {raw_data}")
-            print(f"\t{e}")
+        if self.canonicalize_smiles:
+            try:
+                mol = Chem.MolFromSmiles(raw_data.strip())
+                if mol is not None:
+                    raw_data = Chem.MolToSmiles(mol, canonical=True)
+            except Exception as e:
+                print(f"RDKit failed to process {raw_data}")
+                print(f"\t{e}")
 
         return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]