Improve typing of reaction metadata (#86)

kmaziarz · web-flow · commit 0de1ca29bee4 · 2024-04-10T12:50:27.000+01:00
Following the discussion in #80, this PR changes the type hints for reaction metadata from the generic `Dict[str, Any]` to `ReactionMetaData`, allowing more precise type checking.
diff --git a/syntheseus/reaction_prediction/inference/chemformer.py b/syntheseus/reaction_prediction/inference/chemformer.py
@@ -15,6 +15,7 @@
 from syntheseus.interface.bag import Bag
 from syntheseus.interface.models import InputType, ReactionType
 from syntheseus.interface.molecule import Molecule
+from syntheseus.interface.reaction import ReactionMetaData
 from syntheseus.reaction_prediction.inference.base import ExternalReactionModel
 from syntheseus.reaction_prediction.utils.inference import (
     get_module_path,
@@ -135,7 +136,7 @@ def _get_reactions(
         # and [InputType, ReactionType] is not visible to mypy.
         if self.is_forward():
             process_fn: Callable[
-                [InputType, List[str], List[Dict[str, Any]]], Sequence[ReactionType]
+                [InputType, List[str], List[ReactionMetaData]], Sequence[ReactionType]
             ] = process_raw_smiles_outputs_forwards  # type: ignore[assignment]
         else:
             process_fn = process_raw_smiles_outputs_backwards  # type: ignore[assignment]
diff --git a/syntheseus/reaction_prediction/inference/root_aligned.py b/syntheseus/reaction_prediction/inference/root_aligned.py
@@ -13,13 +13,13 @@
 import random
 import warnings
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, List, Optional, Sequence
 
 import yaml
 from rdkit import Chem
 
 from syntheseus.interface.molecule import Molecule
-from syntheseus.interface.reaction import SingleProductReaction
+from syntheseus.interface.reaction import ReactionMetaData, SingleProductReaction
 from syntheseus.reaction_prediction.inference.base import ExternalBackwardReactionModel
 from syntheseus.reaction_prediction.utils.inference import (
     get_unique_file_in_dir,
@@ -85,7 +85,7 @@ def _mols_to_batch(self, inputs) -> List[bytes]:
         # Example outcome: b'C C ( = O ) c 1 c c c 2 c ( c c n 2 C ( = O ) O C ( C ) ( C ) C ) c 1\n'.
         return [bytes(smi_tokenizer(input.smiles) + "\n", "utf-8") for input in inputs]
 
-    def _build_kwargs_from_scores(self, scores: List[float]) -> List[Dict[str, Any]]:
+    def _build_kwargs_from_scores(self, scores: List[float]) -> List[ReactionMetaData]:
         """Compute kwargs to save in the predictions given raw scores from the RootAligned model.
 
         The scores we get from the model cannot be directly interpreted as a (log) probability.
@@ -111,7 +111,7 @@ def _build_kwargs_from_scores(self, scores: List[float]) -> List[Dict[str, Any]]
             1.0 / (k + 1) for k in range(self.beam_size)
         )
 
-        kwargs_list: List[Dict[str, Any]] = []
+        kwargs_list: List[ReactionMetaData] = []
         for score in scores:
             best_pos = -math.floor(score / 1e8)
             total_rr = score + best_pos * 1e8
@@ -121,14 +121,15 @@ def _build_kwargs_from_scores(self, scores: List[float]) -> List[Dict[str, Any]]
 
             new_score = total_rr - (best_pos + 1) * max_possible_total_rr
             assert new_score <= 0.0
-            metadata = {
-                "original_score": score,
-                "best_pos": best_pos,
-                "total_rr": total_rr,
-                "score": new_score,
-            }
-
-            kwargs_list.append(metadata)
+
+            kwargs_list.append(
+                {  # type: ignore[typeddict-unknown-key]
+                    "original_score": score,
+                    "best_pos": best_pos,
+                    "total_rr": total_rr,
+                    "score": new_score,
+                }
+            )
 
         # Make sure the new scores produce the same ranking.
         for kwargs, next_kwargs in zip(kwargs_list, kwargs_list[1:]):
diff --git a/syntheseus/reaction_prediction/utils/inference.py b/syntheseus/reaction_prediction/utils/inference.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Any, Dict, List, Sequence, Union, cast
+from typing import Any, List, Sequence, Union
 
 from syntheseus.interface.bag import Bag
 from syntheseus.interface.molecule import Molecule
@@ -12,7 +12,7 @@
 
 
 def process_raw_smiles_outputs_backwards(
-    input: Molecule, output_list: List[str], metadata_list: List[Dict[str, Any]]
+    input: Molecule, output_list: List[str], metadata_list: List[ReactionMetaData]
 ) -> Sequence[SingleProductReaction]:
     """Convert raw SMILES outputs into a list of `SingleProductReaction` objects.
 
@@ -33,16 +33,14 @@ def process_raw_smiles_outputs_backwards(
         # Only consider the prediction if the SMILES can be parsed.
         if reactants is not None:
             predictions.append(
-                SingleProductReaction(
-                    product=input, reactants=reactants, metadata=cast(ReactionMetaData, metadata)
-                )
+                SingleProductReaction(product=input, reactants=reactants, metadata=metadata)
             )
 
     return predictions
 
 
 def process_raw_smiles_outputs_forwards(
-    input: Bag[Molecule], output_list: List[str], metadata_list: List[Dict[str, Any]]
+    input: Bag[Molecule], output_list: List[str], metadata_list: List[ReactionMetaData]
 ) -> Sequence[Reaction]:
     """Convert raw SMILES outputs into a list of `Reaction` objects.
     Like method `process_raw_smiles_outputs_backwards`, but for forward models.
@@ -63,11 +61,7 @@ def process_raw_smiles_outputs_forwards(
 
         # Only consider the prediction if the SMILES can be parsed.
         if products is not None:
-            predictions.append(
-                Reaction(
-                    products=products, reactants=input, metadata=cast(ReactionMetaData, metadata)
-                )
-            )
+            predictions.append(Reaction(products=products, reactants=input, metadata=metadata))
 
     return predictions