fix: performance issue code formula model (#78)

Matteo-Omenetti · web-flow · commit b78d81fe1ef6 · 2025-02-06T07:09:28.000+01:00
Signed-off-by: Matteo Omenetti &lt;omenetti.matteo@gmail.com&gt;
diff --git a/docling_ibm_models/code_formula_model/code_formula_predictor.py b/docling_ibm_models/code_formula_model/code_formula_predictor.py
@@ -8,7 +8,7 @@
 import numpy as np
 import torch
 from PIL import Image
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
 
 from docling_ibm_models.code_formula_model.models.sam_opt import SamOPTForCausalLM
 from docling_ibm_models.code_formula_model.models.sam_opt_image_processor import (
@@ -18,6 +18,22 @@
 _log = logging.getLogger(__name__)
 
 
+class StopOnString(StoppingCriteria):
+    def __init__(self, tokenizer, stop_string):
+        self.stop_token_ids = tokenizer.encode(stop_string, add_special_tokens=False)
+
+    def __call__(self, input_ids, scores, **kwargs):
+        for sequence in input_ids:
+            sequence_list = sequence.tolist()
+            for i in range(len(sequence_list) - len(self.stop_token_ids) + 1):
+                if (
+                    sequence_list[i : i + len(self.stop_token_ids)]
+                    == self.stop_token_ids
+                ):
+                    return True
+        return False
+
+
 class CodeFormulaPredictor:
     """
     Code and Formula Predictor using a multi-modal vision-language model.
@@ -127,12 +143,37 @@ def _get_prompt(self, label: str) -> str:
 
         return prompt
 
+    def _strip(self, text: str):
+        """
+        Removes any occurrences of the substrings in remove_list from the end of text.
+
+        Parameters
+        ----------
+        text : str
+            The original string.
+
+        Returns
+        -------
+        str
+            The trimmed string.
+        """
+        remove_list = [r"\quad", r"\\", r"\,", " c c c c", " l l l l l"]
+        changed = True
+        while changed:
+            changed = False
+            for substr in remove_list:
+                if text.endswith(substr):
+                    text = text[: -len(substr)]
+                    changed = True
+
+        return text.strip()
+
     @torch.inference_mode()
     def predict(
         self,
         images: List[Union[Image.Image, np.ndarray]],
         labels: List[str],
-        temperature: Optional[float] = 0.1,
+        temperature: Optional[float] = 0.0,
     ) -> List[str]:
         """
         Predicts the textual representation of input images (code or LaTeX).
@@ -144,7 +185,7 @@ def predict(
         labels : List[str]
             List of labels indicating the type of each image ('code' or 'formula').
         temperature : Optional[float]
-            Sampling temperature for generation, by default set to 0.1.
+            Sampling temperature for generation, by default set to 0.0.
 
         Returns
         -------
@@ -198,6 +239,16 @@ def predict(
         prompt_ids = tokenized["input_ids"]
         attention_mask = tokenized["attention_mask"]
 
+        stopping_criteria = StoppingCriteriaList(
+            [
+                StopOnString(self._tokenizer, r" \quad \quad \quad \quad"),
+                StopOnString(self._tokenizer, r" \\ \\ \\ \\"),
+                StopOnString(self._tokenizer, r" \, \, \, \,"),
+                StopOnString(self._tokenizer, r" c c c c c c c c c c c c c c c c"),
+                StopOnString(self._tokenizer, r" l l l l l l l l l l l l l l l l l"),
+            ]
+        )
+
         if self._device == "cpu":
             output_ids_list = self._model.generate(
                 input_ids=prompt_ids,
@@ -207,7 +258,8 @@ def predict(
                 temperature=temperature,
                 max_new_tokens=4096 - prompt_ids.shape[1],
                 use_cache=True,
-                no_repeat_ngram_size=300,
+                no_repeat_ngram_size=200,
+                stopping_criteria=stopping_criteria,
             )
         else:
             with torch.autocast(device_type=self._device, dtype=torch.bfloat16):
@@ -218,11 +270,13 @@ def predict(
                     temperature=temperature,
                     max_new_tokens=4096 - prompt_ids.shape[1],
                     use_cache=True,
-                    no_repeat_ngram_size=300,
+                    no_repeat_ngram_size=200,
+                    stopping_criteria=stopping_criteria,
                 )
 
         outputs = self._tokenizer.batch_decode(
             output_ids_list[:, prompt_ids.shape[1] :], skip_special_tokens=True
         )
+        outputs = [self._strip(output) for output in outputs]
 
         return outputs