From f6ae513d106c4d6465a94fa78c546ee8b59d7f50 Mon Sep 17 00:00:00 2001
From: Jebq <jb.oger2312@gmail.com>
Date: Thu, 22 Feb 2024 22:40:16 +0000
Subject: [PATCH 1/7] refactor: use self.model_id where applicable

Signed-off-by: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
---
 .../ml_models/sentencetransformermodel.py     | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 10f1174b5..0b81ec974 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -419,7 +419,7 @@ def train_model(
         """
 
         if model_id is None:
-            model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+            model_id = self.model_id
         if output_model_name is None:
             output_model_name = str(self.model_id.split("/")[-1] + ".pt")
 
@@ -765,7 +765,7 @@ def _fill_null_truncation_field(
     def save_as_pt(
         self,
         sentences: [str],
-        model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
+        model_id: str = None,
         model_name: str = None,
         save_json_folder_path: str = None,
         model_output_path: str = None,
@@ -806,6 +806,9 @@ def save_as_pt(
         :rtype: string
         """
 
+        if model_id is None:
+            model_id =  self.model_id
+
         model = SentenceTransformer(model_id)
 
         if model_name is None:
@@ -877,7 +880,7 @@ def save_as_pt(
 
     def save_as_onnx(
         self,
-        model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
+        model_id: str = None,
         model_name: str = None,
         save_json_folder_path: str = None,
         model_output_path: str = None,
@@ -915,6 +918,9 @@ def save_as_onnx(
         :rtype: string
         """
 
+        if model_id is None:
+            model_id = self.model_id
+
         model = SentenceTransformer(model_id)
 
         if model_name is None:
@@ -1143,6 +1149,7 @@ def make_model_config_json(
         model_name: str = None,
         version_number: str = 1,
         model_format: str = "TORCH_SCRIPT",
+        config_out_path: str = None,
         model_zip_file_path: str = None,
         embedding_dimension: int = None,
         pooling_mode: str = None,
@@ -1163,6 +1170,10 @@ def make_model_config_json(
         :param model_format:
             Optional, the format of the model. Default is "TORCH_SCRIPT".
         :type model_format: string
+        :param config_output_path:
+             Optional, path to save model config json file. If None, default as
+             default_folder_path from the constructor
+        :type config_output_path: string
         :param model_zip_file_path:
             Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx
             depending on model_format. This zip file is used to compute model_content_size_in_bytes and
@@ -1198,6 +1209,8 @@ def make_model_config_json(
         :rtype: string
         """
         folder_path = self.folder_path
+        if config_output_path is None:
+            config_output_path = self.folder_path
         config_json_file_path = os.path.join(folder_path, "config.json")
         if model_name is None:
             model_name = self.model_id
@@ -1313,7 +1326,7 @@ def make_model_config_json(
             print(json.dumps(model_config_content, indent=4))
 
         model_config_file_path = os.path.join(
-            folder_path, "ml-commons_model_config.json"
+            config_output_path, "ml-commons_model_config.json"
         )
         os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True)
         with open(model_config_file_path, "w") as file:

From 455190406ba4ab5cfbce2f9651acd7f886d14d0f Mon Sep 17 00:00:00 2001
From: Jebq <jb.oger2312@gmail.com>
Date: Thu, 22 Feb 2024 22:49:51 +0000
Subject: [PATCH 2/7] doc: update CHANGELOG

Signed-off-by: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
---
 CHANGELOG.md                                           | 2 ++
 opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cc0a009fc..2854a0650 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Replaced usage of `is_datetime_or_timedelta_dtype` with `is_timedelta64_dtype` and `is_datetime64_any_dtype` by @rawwar ([#316](https://github.com/opensearch-project/opensearch-py-ml/pull/316))
 - use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
 - Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
+- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` in ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376))
+- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to chose where the saving location of the model config json file. ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376))
 
 ### Fixed
 - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 0b81ec974..7f2e148cc 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -1149,7 +1149,7 @@ def make_model_config_json(
         model_name: str = None,
         version_number: str = 1,
         model_format: str = "TORCH_SCRIPT",
-        config_out_path: str = None,
+        config_output_path: str = None,
         model_zip_file_path: str = None,
         embedding_dimension: int = None,
         pooling_mode: str = None,

From ee7b3d133af4882b67475bdb7f8f145aaf41371e Mon Sep 17 00:00:00 2001
From: Jebq <jb.oger2312@gmail.com>
Date: Thu, 22 Feb 2024 22:52:40 +0000
Subject: [PATCH 3/7] lint

Signed-off-by: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
---
 opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 7f2e148cc..a6d2da492 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -807,7 +807,7 @@ def save_as_pt(
         """
 
         if model_id is None:
-            model_id =  self.model_id
+            model_id = self.model_id
 
         model = SentenceTransformer(model_id)
 

From b2bc31e68252d51e386ce649d265cd5d6fdfe918 Mon Sep 17 00:00:00 2001
From: Jebq <jb.oger2312@gmail.com>
Date: Fri, 23 Feb 2024 17:56:05 +0000
Subject: [PATCH 4/7] test: update tests

Signed-off-by: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
---
 .../ml_models/sentencetransformermodel.py     |   4 +-
 .../test_sentencetransformermodel_pytest.py   | 123 ++++++++++++++++--
 2 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index a6d2da492..0c8259242 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -780,7 +780,7 @@ def save_as_pt(
             Required, for example  sentences = ['today is sunny']
         :type sentences: List of string [str]
         :param model_id:
-            sentence transformer model id to download model from sentence transformers.
+            Optional, sentence transformer model id to download model from sentence transformers.
             default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
         :type model_id: string
         :param model_name:
@@ -892,7 +892,7 @@ def save_as_onnx(
         zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster
 
         :param model_id:
-            sentence transformer model id to download model from sentence transformers.
+            Optional, sentence transformer model id to download model from sentence transformers.
             default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
         :type model_id: string
         :param model_name:
diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
index c9c9046ba..fcb82b8c4 100644
--- a/tests/ml_models/test_sentencetransformermodel_pytest.py
+++ b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -251,7 +251,7 @@ def test_make_model_config_json_for_torch_script():
         model_id=model_id,
     )
 
-    test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model5.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model5.make_model_config_json(
         model_format="TORCH_SCRIPT", verbose=True
     )
@@ -267,6 +267,38 @@ def test_make_model_config_json_for_torch_script():
     clean_test_folder(TEST_FOLDER)
 
 
+def test_make_model_config_json_set_path_for_torch_script():
+    model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    model_format = "TORCH_SCRIPT"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M  pairs from diverse sources."
+    expected_model_config_data = {
+        "embedding_dimension": 384,
+        "pooling_mode": "MEAN",
+        "normalize_result": True,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model5 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model5.save_as_pt(sentences=["today is sunny"])
+    model_config_path_torch = test_model5.make_model_config_json(
+        config_output_path=TEST_FOLDER, model_format="TORCH_SCRIPT", verbose=True
+    )
+
+    compare_model_config(
+        model_config_path_torch,
+        model_id,
+        model_format,
+        expected_model_description=expected_model_description,
+        expected_model_config_data=expected_model_config_data,
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
 def test_make_model_config_json_for_onnx():
     model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
     model_format = "ONNX"
@@ -283,7 +315,7 @@ def test_make_model_config_json_for_onnx():
         model_id=model_id,
     )
 
-    test_model6.save_as_onnx(model_id=model_id)
+    test_model6.save_as_onnx()
     model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX")
 
     compare_model_config(
@@ -297,6 +329,38 @@ def test_make_model_config_json_for_onnx():
     clean_test_folder(TEST_FOLDER)
 
 
+def test_make_model_config_json_set_path_for_onnx():
+    model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    model_format = "ONNX"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
+    expected_model_config_data = {
+        "embedding_dimension": 384,
+        "pooling_mode": "MEAN",
+        "normalize_result": False,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model6 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model6.save_as_onnx()
+    model_config_path_onnx = test_model6.make_model_config_json(
+        config_output_path=TEST_FOLDER, model_format="ONNX"
+    )
+
+    compare_model_config(
+        model_config_path_onnx,
+        model_id,
+        model_format,
+        expected_model_description=expected_model_description,
+        expected_model_config_data=expected_model_config_data,
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
 def test_overwrite_fields_in_model_config():
     model_id = "sentence-transformers/all-distilroberta-v1"
     model_format = "TORCH_SCRIPT"
@@ -318,7 +382,7 @@ def test_overwrite_fields_in_model_config():
         model_id=model_id,
     )
 
-    test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model7.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model7.make_model_config_json(
         model_format="TORCH_SCRIPT"
     )
@@ -337,7 +401,7 @@ def test_overwrite_fields_in_model_config():
         model_id=model_id,
     )
 
-    test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model8.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model8.make_model_config_json(
         model_format="TORCH_SCRIPT",
         embedding_dimension=overwritten_model_config_data["embedding_dimension"],
@@ -367,7 +431,7 @@ def test_missing_readme_md_file():
         model_id=model_id,
     )
 
-    test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model9.save_as_pt(sentences=["today is sunny"])
     temp_path = os.path.join(
         TEST_FOLDER,
         "README.md",
@@ -403,7 +467,7 @@ def test_missing_expected_description_in_readme_file():
         model_id=model_id,
     )
 
-    test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model10.save_as_pt(sentences=["today is sunny"])
     temp_path = os.path.join(
         TEST_FOLDER,
         "README.md",
@@ -440,7 +504,7 @@ def test_overwrite_description():
         model_id=model_id,
     )
 
-    test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model11.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model11.make_model_config_json(
         model_format=model_format, description=expected_model_description
     )
@@ -471,7 +535,7 @@ def test_long_description():
         model_id=model_id,
     )
 
-    test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model12.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model12.make_model_config_json(
         model_format=model_format
     )
@@ -501,7 +565,7 @@ def test_truncation_parameter():
         model_id=model_id,
     )
 
-    test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model13.save_as_pt(sentences=["today is sunny"])
 
     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try:
@@ -534,7 +598,7 @@ def test_undefined_model_max_length_in_tokenizer_for_torch_script():
         model_id=model_id,
     )
 
-    test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model14.save_as_pt(sentences=["today is sunny"])
 
     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try:
@@ -563,7 +627,7 @@ def test_undefined_model_max_length_in_tokenizer_for_onnx():
         model_id=model_id,
     )
 
-    test_model14.save_as_onnx(model_id=model_id)
+    test_model14.save_as_onnx()
 
     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try:
@@ -598,7 +662,6 @@ def test_save_as_pt_with_license():
     )
 
     test_model15.save_as_pt(
-        model_id=model_id,
         sentences=["today is sunny"],
         add_apache_license=True,
     )
@@ -622,7 +685,7 @@ def test_save_as_onnx_with_license():
         model_id=model_id,
     )
 
-    test_model16.save_as_onnx(model_id=model_id, add_apache_license=True)
+    test_model16.save_as_onnx(add_apache_license=True)
 
     compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format)
 
@@ -649,7 +712,7 @@ def test_zip_model_with_license():
         model_id=model_id,
     )
 
-    test_model17.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model17.save_as_pt(sentences=["today is sunny"])
     compare_model_zip_file(zip_file_path, expected_filenames_wo_license, model_format)
 
     test_model17.zip_model(add_apache_license=True)
@@ -658,5 +721,37 @@ def test_zip_model_with_license():
     clean_test_folder(TEST_FOLDER)
 
 
+def test_save_as_pt_model_with_different_id():
+    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+    model_id2 = "sentence-transformers/all-MiniLM-L6-v2"
+    model_format = "TORCH_SCRIPT"
+    zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip")
+    zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2")
+    expected_filenames_wo_model_id = {
+        "msmarco-distilbert-base-tas-b.pt",
+        "tokenizer.json",
+    }
+    expected_filenames_with_model_id = {
+        "msmarco-distilbert-base-tas-b.pt",
+        "tokenizer.json",
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model17 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model17.save_as_pt(sentences=["today is sunny"])
+    compare_model_zip_file(zip_file_path, expected_filenames_wo_model_id, model_format)
+
+    test_model17.save_as_pt(model_id=model_id2, sentences=["today is sunny"])
+    compare_model_zip_file(
+        zip_file_path2, expected_filenames_with_model_id, model_format
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
 clean_test_folder(TEST_FOLDER)
 clean_test_folder(TESTDATA_UNZIP_FOLDER)

From 6c1e47633926f31fd43e85a5adb635b0b2fed243 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
Date: Fri, 23 Feb 2024 18:17:42 +0000
Subject: [PATCH 5/7] test: fix test_save_as_pt_model_with_different_id

Signed-off-by: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
---
 tests/ml_models/test_sentencetransformermodel_pytest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
index fcb82b8c4..b8dd59786 100644
--- a/tests/ml_models/test_sentencetransformermodel_pytest.py
+++ b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -726,13 +726,13 @@ def test_save_as_pt_model_with_different_id():
     model_id2 = "sentence-transformers/all-MiniLM-L6-v2"
     model_format = "TORCH_SCRIPT"
     zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip")
-    zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2")
+    zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip")
     expected_filenames_wo_model_id = {
         "msmarco-distilbert-base-tas-b.pt",
         "tokenizer.json",
     }
     expected_filenames_with_model_id = {
-        "msmarco-distilbert-base-tas-b.pt",
+        "all-MiniLM-L6-v2.pt",
         "tokenizer.json",
     }
 

From c58e75b59499c69883d474b032f00488ab18bd5b Mon Sep 17 00:00:00 2001
From: Jebq <jb.oger2312@gmail.com>
Date: Mon, 26 Feb 2024 21:39:15 +0000
Subject: [PATCH 6/7] Update CHANGELOG.md

Signed-off-by: Jebq <jb.oger2312@gmail.com>
---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fff8037bd..cc584c84a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,8 +38,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
 - Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
 - Switch AL2 to AL2023 agent and DockerHub to ECR images in ml-models.JenkinsFile ([#377](https://github.com/opensearch-project/opensearch-py-ml/pull/377))
-- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` in ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376))
-- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to chose where the saving location of the model config json file. ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376))
+- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` in ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))
+- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to chose where the saving location of the model config json file. ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))
 
 ### Fixed
 - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))

From 12e6351a8d82cf4d79673d81738edab5589157cf Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
Date: Tue, 5 Mar 2024 22:18:35 +0000
Subject: [PATCH 7/7] fix: use model_id instead of self.model_id when
 output_model_name is None in train_model

Signed-off-by: Jean-Baptiste Oger <contact@jeanbaptisteoger.com>
---
 opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 0c8259242..b4cd03007 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -421,7 +421,7 @@ def train_model(
         if model_id is None:
             model_id = self.model_id
         if output_model_name is None:
-            output_model_name = str(self.model_id.split("/")[-1] + ".pt")
+            output_model_name = str(model_id.split("/")[-1] + ".pt")
 
         # declare variables before assignment for training
         corp_len = []