From f6ae513d106c4d6465a94fa78c546ee8b59d7f50 Mon Sep 17 00:00:00 2001 From: Jebq Date: Thu, 22 Feb 2024 22:40:16 +0000 Subject: [PATCH 1/7] refactor: use self.model_id where applicable Signed-off-by: Jean-Baptiste Oger --- .../ml_models/sentencetransformermodel.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 10f1174b5..0b81ec974 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -419,7 +419,7 @@ def train_model( """ if model_id is None: - model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + model_id = self.model_id if output_model_name is None: output_model_name = str(self.model_id.split("/")[-1] + ".pt") @@ -765,7 +765,7 @@ def _fill_null_truncation_field( def save_as_pt( self, sentences: [str], - model_id="sentence-transformers/msmarco-distilbert-base-tas-b", + model_id: str = None, model_name: str = None, save_json_folder_path: str = None, model_output_path: str = None, @@ -806,6 +806,9 @@ def save_as_pt( :rtype: string """ + if model_id is None: + model_id = self.model_id + model = SentenceTransformer(model_id) if model_name is None: @@ -877,7 +880,7 @@ def save_as_pt( def save_as_onnx( self, - model_id="sentence-transformers/msmarco-distilbert-base-tas-b", + model_id: str = None, model_name: str = None, save_json_folder_path: str = None, model_output_path: str = None, @@ -915,6 +918,9 @@ def save_as_onnx( :rtype: string """ + if model_id is None: + model_id = self.model_id + model = SentenceTransformer(model_id) if model_name is None: @@ -1143,6 +1149,7 @@ def make_model_config_json( model_name: str = None, version_number: str = 1, model_format: str = "TORCH_SCRIPT", + config_out_path: str = None, model_zip_file_path: str = None, embedding_dimension: int = None, pooling_mode: str = None, @@ -1163,6 +1170,10 @@ def make_model_config_json( :param model_format: Optional, the format of the model. Default is "TORCH_SCRIPT". :type model_format: string + :param config_output_path: + Optional, path to save model config json file. If None, default as + default_folder_path from the constructor + :type config_output_path: string :param model_zip_file_path: Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx depending on model_format. This zip file is used to compute model_content_size_in_bytes and @@ -1198,6 +1209,8 @@ def make_model_config_json( :rtype: string """ folder_path = self.folder_path + if config_output_path is None: + config_output_path = self.folder_path config_json_file_path = os.path.join(folder_path, "config.json") if model_name is None: model_name = self.model_id @@ -1313,7 +1326,7 @@ def make_model_config_json( print(json.dumps(model_config_content, indent=4)) model_config_file_path = os.path.join( - folder_path, "ml-commons_model_config.json" + config_output_path, "ml-commons_model_config.json" ) os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) with open(model_config_file_path, "w") as file: From 455190406ba4ab5cfbce2f9651acd7f886d14d0f Mon Sep 17 00:00:00 2001 From: Jebq Date: Thu, 22 Feb 2024 22:49:51 +0000 Subject: [PATCH 2/7] doc: update CHANGELOG Signed-off-by: Jean-Baptiste Oger --- CHANGELOG.md | 2 ++ opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc0a009fc..2854a0650 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Replaced usage of `is_datetime_or_timedelta_dtype` with `is_timedelta64_dtype` and `is_datetime64_any_dtype` by @rawwar ([#316](https://github.com/opensearch-project/opensearch-py-ml/pull/316)) - use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370)) - Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368)) +- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` in ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376)) +- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to chose where the saving location of the model config json file. ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376)) ### Fixed - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 0b81ec974..7f2e148cc 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -1149,7 +1149,7 @@ def make_model_config_json( model_name: str = None, version_number: str = 1, model_format: str = "TORCH_SCRIPT", - config_out_path: str = None, + config_output_path: str = None, model_zip_file_path: str = None, embedding_dimension: int = None, pooling_mode: str = None, From ee7b3d133af4882b67475bdb7f8f145aaf41371e Mon Sep 17 00:00:00 2001 From: Jebq Date: Thu, 22 Feb 2024 22:52:40 +0000 Subject: [PATCH 3/7] lint Signed-off-by: Jean-Baptiste Oger --- opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 7f2e148cc..a6d2da492 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -807,7 +807,7 @@ def save_as_pt( """ if model_id is None: - model_id = self.model_id + model_id = self.model_id model = SentenceTransformer(model_id) From b2bc31e68252d51e386ce649d265cd5d6fdfe918 Mon Sep 17 00:00:00 2001 From: Jebq Date: Fri, 23 Feb 2024 17:56:05 +0000 Subject: [PATCH 4/7] test: update tests Signed-off-by: Jean-Baptiste Oger --- .../ml_models/sentencetransformermodel.py | 4 +- .../test_sentencetransformermodel_pytest.py | 123 ++++++++++++++++-- 2 files changed, 111 insertions(+), 16 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index a6d2da492..0c8259242 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -780,7 +780,7 @@ def save_as_pt( Required, for example sentences = ['today is sunny'] :type sentences: List of string [str] :param model_id: - sentence transformer model id to download model from sentence transformers. + Optional, sentence transformer model id to download model from sentence transformers. default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" :type model_id: string :param model_name: @@ -892,7 +892,7 @@ def save_as_onnx( zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster :param model_id: - sentence transformer model id to download model from sentence transformers. + Optional, sentence transformer model id to download model from sentence transformers. default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" :type model_id: string :param model_name: diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index c9c9046ba..fcb82b8c4 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -251,7 +251,7 @@ def test_make_model_config_json_for_torch_script(): model_id=model_id, ) - test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model5.save_as_pt(sentences=["today is sunny"]) model_config_path_torch = test_model5.make_model_config_json( model_format="TORCH_SCRIPT", verbose=True ) @@ -267,6 +267,38 @@ def test_make_model_config_json_for_torch_script(): clean_test_folder(TEST_FOLDER) +def test_make_model_config_json_set_path_for_torch_script(): + model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + model_format = "TORCH_SCRIPT" + expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources." + expected_model_config_data = { + "embedding_dimension": 384, + "pooling_mode": "MEAN", + "normalize_result": True, + } + + clean_test_folder(TEST_FOLDER) + test_model5 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model5.save_as_pt(sentences=["today is sunny"]) + model_config_path_torch = test_model5.make_model_config_json( + config_output_path=TEST_FOLDER, model_format="TORCH_SCRIPT", verbose=True + ) + + compare_model_config( + model_config_path_torch, + model_id, + model_format, + expected_model_description=expected_model_description, + expected_model_config_data=expected_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + def test_make_model_config_json_for_onnx(): model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2" model_format = "ONNX" @@ -283,7 +315,7 @@ def test_make_model_config_json_for_onnx(): model_id=model_id, ) - test_model6.save_as_onnx(model_id=model_id) + test_model6.save_as_onnx() model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX") compare_model_config( @@ -297,6 +329,38 @@ def test_make_model_config_json_for_onnx(): clean_test_folder(TEST_FOLDER) +def test_make_model_config_json_set_path_for_onnx(): + model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2" + model_format = "ONNX" + expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search." + expected_model_config_data = { + "embedding_dimension": 384, + "pooling_mode": "MEAN", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model6 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model6.save_as_onnx() + model_config_path_onnx = test_model6.make_model_config_json( + config_output_path=TEST_FOLDER, model_format="ONNX" + ) + + compare_model_config( + model_config_path_onnx, + model_id, + model_format, + expected_model_description=expected_model_description, + expected_model_config_data=expected_model_config_data, + ) + + clean_test_folder(TEST_FOLDER) + + def test_overwrite_fields_in_model_config(): model_id = "sentence-transformers/all-distilroberta-v1" model_format = "TORCH_SCRIPT" @@ -318,7 +382,7 @@ def test_overwrite_fields_in_model_config(): model_id=model_id, ) - test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model7.save_as_pt(sentences=["today is sunny"]) model_config_path_torch = test_model7.make_model_config_json( model_format="TORCH_SCRIPT" ) @@ -337,7 +401,7 @@ def test_overwrite_fields_in_model_config(): model_id=model_id, ) - test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model8.save_as_pt(sentences=["today is sunny"]) model_config_path_torch = test_model8.make_model_config_json( model_format="TORCH_SCRIPT", embedding_dimension=overwritten_model_config_data["embedding_dimension"], @@ -367,7 +431,7 @@ def test_missing_readme_md_file(): model_id=model_id, ) - test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model9.save_as_pt(sentences=["today is sunny"]) temp_path = os.path.join( TEST_FOLDER, "README.md", @@ -403,7 +467,7 @@ def test_missing_expected_description_in_readme_file(): model_id=model_id, ) - test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model10.save_as_pt(sentences=["today is sunny"]) temp_path = os.path.join( TEST_FOLDER, "README.md", @@ -440,7 +504,7 @@ def test_overwrite_description(): model_id=model_id, ) - test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model11.save_as_pt(sentences=["today is sunny"]) model_config_path_torch = test_model11.make_model_config_json( model_format=model_format, description=expected_model_description ) @@ -471,7 +535,7 @@ def test_long_description(): model_id=model_id, ) - test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model12.save_as_pt(sentences=["today is sunny"]) model_config_path_torch = test_model12.make_model_config_json( model_format=model_format ) @@ -501,7 +565,7 @@ def test_truncation_parameter(): model_id=model_id, ) - test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model13.save_as_pt(sentences=["today is sunny"]) tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") try: @@ -534,7 +598,7 @@ def test_undefined_model_max_length_in_tokenizer_for_torch_script(): model_id=model_id, ) - test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model14.save_as_pt(sentences=["today is sunny"]) tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") try: @@ -563,7 +627,7 @@ def test_undefined_model_max_length_in_tokenizer_for_onnx(): model_id=model_id, ) - test_model14.save_as_onnx(model_id=model_id) + test_model14.save_as_onnx() tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") try: @@ -598,7 +662,6 @@ def test_save_as_pt_with_license(): ) test_model15.save_as_pt( - model_id=model_id, sentences=["today is sunny"], add_apache_license=True, ) @@ -622,7 +685,7 @@ def test_save_as_onnx_with_license(): model_id=model_id, ) - test_model16.save_as_onnx(model_id=model_id, add_apache_license=True) + test_model16.save_as_onnx(add_apache_license=True) compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format) @@ -649,7 +712,7 @@ def test_zip_model_with_license(): model_id=model_id, ) - test_model17.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model17.save_as_pt(sentences=["today is sunny"]) compare_model_zip_file(zip_file_path, expected_filenames_wo_license, model_format) test_model17.zip_model(add_apache_license=True) @@ -658,5 +721,37 @@ def test_zip_model_with_license(): clean_test_folder(TEST_FOLDER) +def test_save_as_pt_model_with_different_id(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + model_id2 = "sentence-transformers/all-MiniLM-L6-v2" + model_format = "TORCH_SCRIPT" + zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip") + zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2") + expected_filenames_wo_model_id = { + "msmarco-distilbert-base-tas-b.pt", + "tokenizer.json", + } + expected_filenames_with_model_id = { + "msmarco-distilbert-base-tas-b.pt", + "tokenizer.json", + } + + clean_test_folder(TEST_FOLDER) + test_model17 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model17.save_as_pt(sentences=["today is sunny"]) + compare_model_zip_file(zip_file_path, expected_filenames_wo_model_id, model_format) + + test_model17.save_as_pt(model_id=model_id2, sentences=["today is sunny"]) + compare_model_zip_file( + zip_file_path2, expected_filenames_with_model_id, model_format + ) + + clean_test_folder(TEST_FOLDER) + + clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER) From 6c1e47633926f31fd43e85a5adb635b0b2fed243 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Oger Date: Fri, 23 Feb 2024 18:17:42 +0000 Subject: [PATCH 5/7] test: fix test_save_as_pt_model_with_different_id Signed-off-by: Jean-Baptiste Oger --- tests/ml_models/test_sentencetransformermodel_pytest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index fcb82b8c4..b8dd59786 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -726,13 +726,13 @@ def test_save_as_pt_model_with_different_id(): model_id2 = "sentence-transformers/all-MiniLM-L6-v2" model_format = "TORCH_SCRIPT" zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip") - zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2") + zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip") expected_filenames_wo_model_id = { "msmarco-distilbert-base-tas-b.pt", "tokenizer.json", } expected_filenames_with_model_id = { - "msmarco-distilbert-base-tas-b.pt", + "all-MiniLM-L6-v2.pt", "tokenizer.json", } From c58e75b59499c69883d474b032f00488ab18bd5b Mon Sep 17 00:00:00 2001 From: Jebq Date: Mon, 26 Feb 2024 21:39:15 +0000 Subject: [PATCH 6/7] Update CHANGELOG.md Signed-off-by: Jebq --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fff8037bd..cc584c84a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,8 +38,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370)) - Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368)) - Switch AL2 to AL2023 agent and DockerHub to ECR images in ml-models.JenkinsFile ([#377](https://github.com/opensearch-project/opensearch-py-ml/pull/377)) -- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` in ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376)) -- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to chose where the saving location of the model config json file. ([#376](https://github.com/opensearch-project/opensearch-py-ml/pull/376)) +- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` in ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381)) +- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to chose where the saving location of the model config json file. ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381)) ### Fixed - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) From 12e6351a8d82cf4d79673d81738edab5589157cf Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Oger Date: Tue, 5 Mar 2024 22:18:35 +0000 Subject: [PATCH 7/7] fix: use model_id instead of self.model_id when output_model_name is None in train_model Signed-off-by: Jean-Baptiste Oger --- opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 0c8259242..b4cd03007 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -421,7 +421,7 @@ def train_model( if model_id is None: model_id = self.model_id if output_model_name is None: - output_model_name = str(self.model_id.split("/")[-1] + ".pt") + output_model_name = str(model_id.split("/")[-1] + ".pt") # declare variables before assignment for training corp_len = []