Skip to content

Commit 9426cd6

Browse files
committed
add some more tokenizer max length checks
Signed-off-by: HenryL27 <[email protected]>
1 parent 077e7e8 commit 9426cd6

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

opensearch_py_ml/ml_models/crossencodermodel.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,17 @@ def zip_model(self, framework: str = "pt", zip_fname: str = "model.zip") -> Path
120120
# save tokenizer file
121121
tk_path = Path(f"/tmp/{mname}-tokenizer")
122122
tk.save_pretrained(tk_path)
123+
if tk.model_max_length > model.get_max_length():
124+
model_config = AutoConfig.from_pretrained(self._hf_model_id)
125+
if hasattr(model_config, "max_position_embeddings"):
126+
tk.model_max_length = model_config.max_position_embeddings
127+
elif hasattr(model_config, "n_positions"):
128+
tk.model_max_length = model_config.n_positions
129+
else:
130+
tk.model_max_length = 2**15 # =32768. Set to something big I guess
131+
print(
132+
f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {tk.model_max_length}"
133+
)
123134
_fix_tokenizer(tk.model_max_length, tk_path)
124135

125136
# get apache license

0 commit comments

Comments (0)