diff --git a/CHANGELOG.md b/CHANGELOG.md
index 133b0abcb..cc584c84a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
 - Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
 - Switch AL2 to AL2023 agent and DockerHub to ECR images in ml-models.JenkinsFile ([#377](https://github.com/opensearch-project/opensearch-py-ml/pull/377))
+- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))
+- Add a `config_output_path` param in `make_model_config_json` in `sentencetransformermodel.py` to choose the save location of the model config JSON file ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))

 ### Fixed
 - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 10f1174b5..b4cd03007 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -419,9 +419,9 @@ def train_model(
         """
         if model_id is None:
-            model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+            model_id = self.model_id
         if output_model_name is None:
-            output_model_name = str(self.model_id.split("/")[-1] + ".pt")
+            output_model_name = str(model_id.split("/")[-1] + ".pt")

         # declare variables before assignment for training
         corp_len = []
@@ -765,7 +765,7 @@ def _fill_null_truncation_field(
     def save_as_pt(
         self,
         sentences: [str],
-        model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
+        model_id: str = None,
         model_name: str = None,
         save_json_folder_path: str = None,
         model_output_path: str = None,
@@ -780,7 +780,7 @@
             Required, for example sentences = ['today is sunny']
         :type sentences: List of string [str]
         :param model_id:
-            sentence transformer model id to download model from sentence transformers.
+            Optional, sentence transformer model id to download model from sentence transformers.
             default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
         :type model_id: string
         :param model_name:
@@ -806,6 +806,9 @@
         :rtype: string
         """

+        if model_id is None:
+            model_id = self.model_id
+
         model = SentenceTransformer(model_id)

         if model_name is None:
@@ -877,7 +880,7 @@
     def save_as_onnx(
         self,
-        model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
+        model_id: str = None,
         model_name: str = None,
         save_json_folder_path: str = None,
         model_output_path: str = None,
@@ -889,7 +892,7 @@
         zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster

         :param model_id:
-            sentence transformer model id to download model from sentence transformers.
+            Optional, sentence transformer model id to download model from sentence transformers.
default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" :type model_id: string :param model_name: @@ -915,6 +918,9 @@ def save_as_onnx( :rtype: string """ + if model_id is None: + model_id = self.model_id + model = SentenceTransformer(model_id) if model_name is None: @@ -1143,6 +1149,7 @@ def make_model_config_json( model_name: str = None, version_number: str = 1, model_format: str = "TORCH_SCRIPT", + config_output_path: str = None, model_zip_file_path: str = None, embedding_dimension: int = None, pooling_mode: str = None, @@ -1163,6 +1170,10 @@ def make_model_config_json( :param model_format: Optional, the format of the model. Default is "TORCH_SCRIPT". :type model_format: string + :param config_output_path: + Optional, path to save model config json file. If None, default as + default_folder_path from the constructor + :type config_output_path: string :param model_zip_file_path: Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx depending on model_format. This zip file is used to compute model_content_size_in_bytes and @@ -1198,6 +1209,8 @@ def make_model_config_json( :rtype: string """ folder_path = self.folder_path + if config_output_path is None: + config_output_path = self.folder_path config_json_file_path = os.path.join(folder_path, "config.json") if model_name is None: model_name = self.model_id @@ -1313,7 +1326,7 @@ def make_model_config_json( print(json.dumps(model_config_content, indent=4)) model_config_file_path = os.path.join( - folder_path, "ml-commons_model_config.json" + config_output_path, "ml-commons_model_config.json" ) os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) with open(model_config_file_path, "w") as file: diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index c9c9046ba..b8dd59786 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -251,7 +251,7 @@ def test_make_model_config_json_for_torch_script(): model_id=model_id, ) - test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + test_model5.save_as_pt(sentences=["today is sunny"]) model_config_path_torch = test_model5.make_model_config_json( model_format="TORCH_SCRIPT", verbose=True ) @@ -267,6 +267,38 @@ def test_make_model_config_json_for_torch_script(): clean_test_folder(TEST_FOLDER) +def test_make_model_config_json_set_path_for_torch_script(): + model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + model_format = "TORCH_SCRIPT" + expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources." 
+    expected_model_config_data = {
+        "embedding_dimension": 384,
+        "pooling_mode": "MEAN",
+        "normalize_result": True,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model5 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model5.save_as_pt(sentences=["today is sunny"])
+    model_config_path_torch = test_model5.make_model_config_json(
+        config_output_path=TEST_FOLDER, model_format="TORCH_SCRIPT", verbose=True
+    )
+
+    compare_model_config(
+        model_config_path_torch,
+        model_id,
+        model_format,
+        expected_model_description=expected_model_description,
+        expected_model_config_data=expected_model_config_data,
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
 def test_make_model_config_json_for_onnx():
     model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
     model_format = "ONNX"
@@ -283,7 +315,7 @@
         model_id=model_id,
     )

-    test_model6.save_as_onnx(model_id=model_id)
+    test_model6.save_as_onnx()
     model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX")

     compare_model_config(
@@ -297,6 +329,38 @@
     clean_test_folder(TEST_FOLDER)


+def test_make_model_config_json_set_path_for_onnx():
+    model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    model_format = "ONNX"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
+    expected_model_config_data = {
+        "embedding_dimension": 384,
+        "pooling_mode": "MEAN",
+        "normalize_result": False,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model6 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model6.save_as_onnx()
+    model_config_path_onnx = test_model6.make_model_config_json(
+        config_output_path=TEST_FOLDER, model_format="ONNX"
+    )
+
+    compare_model_config(
+        model_config_path_onnx,
+        model_id,
+        model_format,
+        expected_model_description=expected_model_description,
+        expected_model_config_data=expected_model_config_data,
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
 def test_overwrite_fields_in_model_config():
     model_id = "sentence-transformers/all-distilroberta-v1"
     model_format = "TORCH_SCRIPT"
@@ -318,7 +382,7 @@
         model_id=model_id,
     )

-    test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model7.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model7.make_model_config_json(
         model_format="TORCH_SCRIPT"
     )
@@ -337,7 +401,7 @@
         model_id=model_id,
     )

-    test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model8.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model8.make_model_config_json(
         model_format="TORCH_SCRIPT",
         embedding_dimension=overwritten_model_config_data["embedding_dimension"],
@@ -367,7 +431,7 @@
         model_id=model_id,
     )

-    test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model9.save_as_pt(sentences=["today is sunny"])
     temp_path = os.path.join(
         TEST_FOLDER,
         "README.md",
@@ -403,7 +467,7 @@
         model_id=model_id,
     )

-    test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model10.save_as_pt(sentences=["today is sunny"])
     temp_path = os.path.join(
         TEST_FOLDER,
         "README.md",
@@ -440,7 +504,7 @@ def test_overwrite_description():
         model_id=model_id,
     )

-    test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model11.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model11.make_model_config_json(
         model_format=model_format, description=expected_model_description
     )
@@ -471,7 +535,7 @@ def test_long_description():
         model_id=model_id,
     )

-    test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model12.save_as_pt(sentences=["today is sunny"])
     model_config_path_torch = test_model12.make_model_config_json(
         model_format=model_format
     )
@@ -501,7 +565,7 @@ def test_truncation_parameter():
         model_id=model_id,
     )

-    test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model13.save_as_pt(sentences=["today is sunny"])

     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try:
@@ -534,7 +598,7 @@ def test_undefined_model_max_length_in_tokenizer_for_torch_script():
         model_id=model_id,
     )

-    test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model14.save_as_pt(sentences=["today is sunny"])

     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try:
@@ -563,7 +627,7 @@ def test_undefined_model_max_length_in_tokenizer_for_onnx():
         model_id=model_id,
     )

-    test_model14.save_as_onnx(model_id=model_id)
+    test_model14.save_as_onnx()

     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try:
@@ -598,7 +662,6 @@ def test_save_as_pt_with_license():
     )

     test_model15.save_as_pt(
-        model_id=model_id,
         sentences=["today is sunny"],
         add_apache_license=True,
     )
@@ -622,7 +685,7 @@ def test_save_as_onnx_with_license():
         model_id=model_id,
     )

-    test_model16.save_as_onnx(model_id=model_id, add_apache_license=True)
+    test_model16.save_as_onnx(add_apache_license=True)

     compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format)

@@ -649,7 +712,7 @@ def test_zip_model_with_license():
         model_id=model_id,
     )

-    test_model17.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model17.save_as_pt(sentences=["today is sunny"])
     compare_model_zip_file(zip_file_path, expected_filenames_wo_license, model_format)

     test_model17.zip_model(add_apache_license=True)
@@ -658,5 +721,37 @@
     clean_test_folder(TEST_FOLDER)


+def test_save_as_pt_model_with_different_id():
+    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+    model_id2 = "sentence-transformers/all-MiniLM-L6-v2"
+    model_format = "TORCH_SCRIPT"
+    zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip")
+    zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip")
+    expected_filenames_wo_model_id = {
+        "msmarco-distilbert-base-tas-b.pt",
+        "tokenizer.json",
+    }
+    expected_filenames_with_model_id = {
+        "all-MiniLM-L6-v2.pt",
+        "tokenizer.json",
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model17 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model17.save_as_pt(sentences=["today is sunny"])
+    compare_model_zip_file(zip_file_path, expected_filenames_wo_model_id, model_format)
+
+    test_model17.save_as_pt(model_id=model_id2, sentences=["today is sunny"])
+    compare_model_zip_file(
+        zip_file_path2, expected_filenames_with_model_id, model_format
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
 clean_test_folder(TEST_FOLDER)
 clean_test_folder(TESTDATA_UNZIP_FOLDER)
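For reference, a minimal usage sketch of the changed API, based on the tests in this PR. The folder path below is illustrative, and the package-level import is assumed to match the one used by the test suite.

```python
# Sketch only: the folder path is illustrative; the import path is assumed
# to be the package-level export used elsewhere in opensearch-py-ml.
from opensearch_py_ml.ml_models import SentenceTransformerModel

pre_trained_model = SentenceTransformerModel(
    folder_path="/tmp/sentence-transformer-test",  # working folder (illustrative)
    model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
)

# model_id no longer needs to be repeated: save_as_pt / save_as_onnx now
# default to the id passed to the constructor.
pre_trained_model.save_as_pt(sentences=["today is sunny"])

# config_output_path chooses where ml-commons_model_config.json is written;
# when omitted, it falls back to the constructor's folder_path.
config_path = pre_trained_model.make_model_config_json(
    model_format="TORCH_SCRIPT",
    config_output_path="/tmp/sentence-transformer-test",  # illustrative output dir
)
print(config_path)
```

Passing an explicit model_id to save_as_pt still overrides the constructor value, as exercised by test_save_as_pt_model_with_different_id above.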