
Commit a11b0b7

restructure error handling and update cache indexing for gnn

1 parent: e8e4ec3
File tree

chebifier/prediction_models/gnn_predictor.py
chebifier/prediction_models/nn_predictor.py

2 files changed: 14 additions, 13 deletions

chebifier/prediction_models/gnn_predictor.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -60,7 +60,7 @@ def read_smiles(self, smiles):
             if isinstance(prop.encoder, IndexEncoder):
                 if str(value) in prop.encoder.cache:
                     index = (
-                        prop.encoder.cache.index(str(value)) + prop.encoder.offset
+                        prop.encoder.cache[str(value)] + prop.encoder.offset
                     )
                 else:
                     index = 0
```
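The change above swaps a linear list search (`cache.index(...)`) for a direct subscript lookup (`cache[...]`), which only works if `prop.encoder.cache` maps values to positions rather than merely storing them in order. Below is a minimal sketch of the difference, assuming a dict-backed cache; the variable names and example values are illustrative, not taken from chebifier:

```python
# Illustrative sketch only -- not the chebifier IndexEncoder implementation.
value = "O"
offset = 10

# Before: the cache behaves like a list, so every hit requires a linear scan.
cache_as_list = ["C", "N", "O"]
if str(value) in cache_as_list:                        # O(n) membership test
    index = cache_as_list.index(str(value)) + offset   # O(n) search again

# After: the cache behaves like a dict mapping value -> position.
cache_as_dict = {"C": 0, "N": 1, "O": 2}
if str(value) in cache_as_dict:                        # O(1) membership test
    index = cache_as_dict[str(value)] + offset         # O(1) lookup
```

With a dict, both the membership test and the lookup are constant time, whereas `list.index` rescans the cache on every hit.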

chebifier/prediction_models/nn_predictor.py

Lines changed: 13 additions & 12 deletions
```diff
@@ -57,25 +57,26 @@ def predict_smiles_list(self, smiles_list) -> list:
         could_not_parse = []
         index_map = dict()
         for i, smiles in enumerate(smiles_list):
+            if not smiles:
+                print(f"Model {self.model_name} received a missing SMILES string at position {i}.")
+                could_not_parse.append(i)
+                continue
             try:
-                # Try to parse the smiles string
-                if not smiles:
-                    raise ValueError()
                 d = self.read_smiles(smiles)
+
                 # This is just for sanity checks
                 rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
-            except Exception as e:
-                # Note if it fails
-                could_not_parse.append(i)
-                print(f"Failing to parse {smiles} due to {e}")
-            else:
                 if rdmol is None:
+                    print(f"Model {self.model_name} received a SMILES string RDKit can't read at position {i}: {smiles}")
                     could_not_parse.append(i)
-                else:
-                    index_map[i] = len(token_dicts)
-                    token_dicts.append(d)
+                    continue
+            except Exception as e:
+                could_not_parse.append(i)
+                print(f"Model {self.model_name} failed to parse a SMILES string at position {i}: {smiles}")
+            index_map[i] = len(token_dicts)
+            token_dicts.append(d)
         results = []
-        if token_dicts:
+        if len(token_dicts) > 0:
             for batch in tqdm.tqdm(
                 self.batchify(token_dicts),
                 desc=f"{self.model_name}",
```
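The restructuring replaces the old try/except/else nesting with guard clauses: empty SMILES strings and strings RDKit cannot read are reported and skipped with an early `continue`, and only inputs that survive both checks are tokenised and recorded in `token_dicts`/`index_map`. Below is a standalone sketch of that control flow under those assumptions, with a placeholder `tokenize` standing in for `self.read_smiles`, the model plumbing omitted, and an explicit `continue` in the except branch so a failed parse is never appended; everything except `Chem.MolFromSmiles` is hypothetical:

```python
# Illustrative sketch of the restructured parsing loop -- not the chebifier code itself.
from rdkit import Chem


def tokenize(smiles: str) -> dict:
    """Placeholder for the model-specific read_smiles / tokenisation step."""
    return {"smiles": smiles}


def collect_parsable(smiles_list):
    token_dicts, index_map, could_not_parse = [], {}, []
    for i, smiles in enumerate(smiles_list):
        if not smiles:                        # guard 1: missing or empty input
            could_not_parse.append(i)
            continue
        try:
            d = tokenize(smiles)
            # Sanity check: RDKit must at least be able to read the string.
            if Chem.MolFromSmiles(smiles, sanitize=False) is None:
                could_not_parse.append(i)     # guard 2: RDKit cannot read it
                continue
        except Exception:
            could_not_parse.append(i)         # guard 3: tokenisation failed
            continue
        index_map[i] = len(token_dicts)       # map input position -> batch position
        token_dicts.append(d)
    return token_dicts, index_map, could_not_parse


# Example: the empty string and the unreadable string end up in could_not_parse.
tokens, idx_map, bad = collect_parsable(["CCO", "", "not-a-smiles"])
```

Guard clauses keep the happy path at a single indentation level, which is the main readability gain of this kind of restructure.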
