1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358))
- Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365))
- adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378))
- Add support for Cross Encoders - Trace, Config, Upload by @HenryL27 in ([#375](https://github.com/opensearch-project/opensearch-py-ml/pull/375))


### Changed
3 changes: 2 additions & 1 deletion opensearch_py_ml/ml_models/__init__.py
@@ -5,7 +5,8 @@
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

from .crossencodermodel import CrossEncoderModel
from .metrics_correlation.mcorr import MCorr
from .sentencetransformermodel import SentenceTransformerModel

__all__ = ["SentenceTransformerModel", "MCorr"]
__all__ = ["SentenceTransformerModel", "MCorr", "CrossEncoderModel"]
324 changes: 324 additions & 0 deletions opensearch_py_ml/ml_models/crossencodermodel.py
@@ -0,0 +1,324 @@
# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

import json
import os
import shutil
from pathlib import Path
from zipfile import ZipFile

import requests
import torch
from opensearchpy import OpenSearch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

from opensearch_py_ml.ml_commons import ModelUploader
from opensearch_py_ml.ml_commons.ml_common_utils import (
_generate_model_content_hash_value,
)


def _fix_tokenizer(max_len: int, path: Path):
"""
Add truncation parameters to the tokenizer file. Edits the file in place.

:param max_len: max number of tokens to truncate to
:type max_len: int
:param path: path to the directory containing the tokenizer.json file
:type path: Path
"""
with open(Path(path) / "tokenizer.json", "r") as f:
parsed = json.load(f)
if "truncation" not in parsed or parsed["truncation"] is None:
parsed["truncation"] = {
"direction": "Right",
"max_length": max_len,
"strategy": "LongestFirst",
"stride": 0,
}
with open(Path(path) / "tokenizer.json", "w") as f:
json.dump(parsed, f, indent=2)
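
As an illustration, the helper might be exercised like this against a tokenizer saved locally; the model id and temporary path below are only examples, not part of the diff:

from pathlib import Path

from transformers import AutoTokenizer

# Example only: save a tokenizer to disk, then patch its tokenizer.json in place
# so that inputs longer than 512 tokens get truncated.
tok = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
tmp_path = Path("/tmp/example-tokenizer")
tok.save_pretrained(tmp_path)
_fix_tokenizer(512, tmp_path)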


class CrossEncoderModel:
"""
Class for configuring and uploading cross encoder models for OpenSearch.
"""

def __init__(
self, hf_model_id: str, folder_path: str = None, overwrite: bool = False
) -> None:
"""
Initialize a new CrossEncoder model from a huggingface id

:param hf_model_id: huggingface id of the model to load
:type hf_model_id: str
:param folder_path: folder path to save the model
default is /tmp/models/hf_model_id
:type folder_path: str
:param overwrite: whether to overwrite the existing model
Collaborator: whether to overwrite the existing model folder_path?

:type overwrite: bool
:return: None
"""
default_folder_path = Path(f"/tmp/models/{hf_model_id}")

if folder_path is None:
self._folder_path = default_folder_path
else:
self._folder_path = Path(folder_path)

if self._folder_path.exists() and not overwrite:
raise Exception(
f"Folder {self._folder_path} already exists. To overwrite it, set `overwrite=True`."
)

self._hf_model_id = hf_model_id
self._framework = None
self._folder_path.mkdir(parents=True, exist_ok=True)
self._model_zip = None
self._model_config = None

def zip_model(
self, framework: str = "torch_script", zip_fname: str = "model.zip"
) -> Path:
"""
Compiles and zips the model to {self._folder_path}/{zip_fname}

:param framework: one of "torch_script", "onnx". The framework to zip the model as.
default: "torch_script"
:type framework: str
:param zip_fname: file name of the resulting zip file, placed inside self._folder_path.
Example: if folder_path is "/tmp/models" and zip_fname is "zipped_up.zip" then
the file can be found at "/tmp/models/zipped_up.zip"
Default: "model.zip"
:type zip_fname: str
:return: the path with the zipped model
:rtype: Path
"""
tk = AutoTokenizer.from_pretrained(self._hf_model_id)
model = AutoModelForSequenceClassification.from_pretrained(self._hf_model_id)
features = tk([["dummy sentence 1", "dummy sentence 2"]], return_tensors="pt")
mname = Path(self._hf_model_id).name
Collaborator: let's follow snake casing? model_name?


# bge models don't generate token type ids
Collaborator: do we have any issue to reference here?

Author: I arrived at this conclusion by trying to do the thing and failing, so there might be an issue somewhere out there, but it's more of a fundamental architectural feature, not a bug.

if mname.startswith("bge"):
features["token_type_ids"] = torch.zeros_like(features["input_ids"])

if framework == "torch_script":
self._framework = "torch_script"
model_loc = CrossEncoderModel._trace_pytorch(model, features, mname)
elif framework == "onnx":
self._framework = "onnx"
model_loc = CrossEncoderModel._trace_onnx(model, features, mname)
else:
raise Exception(
f"Unrecognized framework {framework}. Accepted values are `torch_script`, `onnx`"
)

# save tokenizer file
tk_path = Path(f"/tmp/{mname}-tokenizer")
tk.save_pretrained(tk_path)
if tk.model_max_length is None:
model_config = AutoConfig.from_pretrained(self._hf_model_id)
if hasattr(model_config, "max_position_embeddings"):
tk.model_max_length = model_config.max_position_embeddings
elif hasattr(model_config, "n_positions"):
tk.model_max_length = model_config.n_positions
else:
tk.model_max_length = 2**15 # =32768. Set to something big I guess
Collaborator: Setting an arbitrary value doesn't seem like a good solution. What do you think about following this: https://github.com/opensearch-project/opensearch-py-ml/blob/main/opensearch_py_ml/ml_models/sentencetransformermodel.py#L936-L942

Author: Would love to. Unfortunately, model.get_max_seq_length() is not a method of most hf transformer ModelForSequenceClassification models - that's a special thing from the sentence-transformers model interface, and I'm not sure it is the guarantee that it claims to be: [implementation]

Collaborator: I'm seeing two problems here:

1. tk.model_max_length is None: [Bug] tokenizer.model_max_length is different when loading model from shortcut or local path huggingface/transformers#14561; if you see this, model_max_length will be a large value which the model can't process.
2. I'm not a big fan of setting a static big value like tk.model_max_length = 2**15. What exactly are we achieving here?

It's important to align model_max_length with the model's actual capability (max_position_embeddings). Setting model_max_length to a higher value than max_position_embeddings could lead to errors or unexpected behavior, as the model won't be able to handle inputs longer than its architectural limit. Conversely, setting a lower model_max_length can be useful for optimizing performance or adhering to specific task requirements.

Author (HenryL27, Feb 28, 2024): I don't understand these issues. tk.model_max_length is None is exactly the condition that triggers setting it to something reasonable. I agree that it's important to set it to a value that agrees with the model's capability: that's why the first condition here checks for and uses max_position_embeddings. We only pick the Very Large Number when we don't have a bound. For instance, suppose someone trained a mamba-based cross encoder with infinite context because it's an RNN.

Collaborator: Sorry for the confusion. What I tried to say is that the tk.model_max_length is None condition will not even trigger; without it triggering, model_max_length could still be a large value which the model can't process.

tokenizer = GPT2Tokenizer.from_pretrained("path/to/local/gpt2")
print(tokenizer.model_max_length)
# 1000000000000000019884624838656

So here model_max_length isn't None, right? But do we still want that?

Author: Interesting. Can we just let huggingface/transformers fix this bug? It seems like a them problem, and from what I can tell the only time we're going to hit it is if someone is trying to use a very old tokenizer file. At that point I hope we can assume the user is proficient enough with transformers to debug if necessary.
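
For what it's worth, a minimal sketch of the clamping idea discussed in this thread (a hypothetical helper, not part of this PR), combining the None check with a guard against the huge sentinel value shown above:

from transformers import AutoConfig

# Sketch only: pick a usable truncation length, never exceeding the model's
# architectural limit when the config exposes one.
VERY_LARGE_INTEGER = int(1e30)  # mirrors the sentinel transformers uses for "no limit recorded"

def resolve_max_length(tk, hf_model_id, fallback=2**15):
    cfg = AutoConfig.from_pretrained(hf_model_id)
    arch_limit = getattr(cfg, "max_position_embeddings", None) or getattr(
        cfg, "n_positions", None
    )
    max_len = tk.model_max_length
    if max_len is None or max_len >= VERY_LARGE_INTEGER:
        # No usable limit on the tokenizer: fall back to the architectural
        # bound, then to a hard-coded default.
        max_len = arch_limit if arch_limit is not None else fallback
    elif arch_limit is not None:
        # Never advertise more tokens than the architecture can handle.
        max_len = min(max_len, arch_limit)
    return max_len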

print(
f"The model_max_length is not found in tokenizer_config.json. Setting it to be {tk.model_max_length}"
)
_fix_tokenizer(tk.model_max_length, tk_path)
Collaborator: tk.model_max_length --> not sure if we will get this value all the time. Please take a look at this PR: #219

Author: k. going with the strategy proposed in huggingface/transformers#14561 - look for max_position_embeddings or n_positions in the config object, and if I miss in both those places, I set to 32k and hope it's fine. (The fix using sentence_transformers model.get_max_seq_length() also has the potential to return None.)


# get apache license
r = requests.get(
"https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE"
)
self._model_zip = self._folder_path / zip_fname
with ZipFile(self._model_zip, "w") as f:
f.write(model_loc, arcname=model_loc.name)
f.write(tk_path / "tokenizer.json", arcname="tokenizer.json")
f.writestr("LICENSE", r.content)

# clean up temp files
shutil.rmtree(tk_path)
os.remove(model_loc)
return self._model_zip

@staticmethod
def _trace_pytorch(model, features, mname) -> Path:
"""
Compiles the model to TORCHSCRIPT format.

:param model: the loaded model to trace
:param features: model input features (tokenizer output)
:param mname: short model name, used for the temporary file name
:return: Path to the traced TorchScript model
"""
# compile
compiled = torch.jit.trace(
model,
example_kwarg_inputs={
"input_ids": features["input_ids"],
"attention_mask": features["attention_mask"],
"token_type_ids": features["token_type_ids"],
},
strict=False,
)
save_loc = Path(f"/tmp/{mname}.pt")
torch.jit.save(compiled, f"/tmp/{mname}.pt")
return save_loc

@staticmethod
def _trace_onnx(model, features, mname):
"""
Compiles the model to ONNX format.

:param model: the loaded model to export
:param features: model input features (tokenizer output)
:param mname: short model name, used for the temporary file name
:return: Path to the exported ONNX model
"""
# export to onnx
save_loc = Path(f"/tmp/{mname}.onnx")
torch.onnx.export(
Collaborator: I think we should add onnx to requirements.txt. In addition, can we also add it to requirements-dev.txt?

Author: looks like it's there already?

model=model,
args=(
features["input_ids"],
features["attention_mask"],
features["token_type_ids"],
),
f=str(save_loc),
input_names=["input_ids", "attention_mask", "token_type_ids"],
output_names=["output"],
dynamic_axes={
"input_ids": {0: "batch_size", 1: "sequence_length"},
"attention_mask": {0: "batch_size", 1: "sequence_length"},
"token_type_ids": {0: "batch_size", 1: "sequence_length"},
"output": {0: "batch_size"},
},
verbose=True,
)
return save_loc

def make_model_config_json(
self,
config_fname: str = "config.json",
model_name: str = None,
version_number: str = "1.0.0",
description: str = None,
all_config: str = None,
model_type: str = None,
verbose: bool = False,
):
"""
Parse the config.json file of the pre-trained hugging-face model to generate an ml-commons model config file.
If all required fields are given by the user, the given parameters are used and reading config.json is skipped.

:param config_fname:
Optional, File name of model json config file. Default is "config.json".
Controls where the config file generated by this function will appear -
"{self._folder_path}/{config_fname}"
:type config_fname: str
:param model_name:
Optional, The name of the model. If None, default is model id, for example,
'sentence-transformers/msmarco-distilbert-base-tas-b'
:type model_name: string
:param version_number:
Optional, The version number of the model. Default is "1.0.0"
:type version_number: string
:param description: Optional, the description of the model. If None, a default description is generated
from the model name.
:type description: str
:param all_config:
Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained
hugging-face model
:type all_config: dict
:param model_type:
Optional, the model_type of the model. If None, parse model_type from the config file of pre-trained
hugging-face model
:type model_type: string
:param verbose:
Optional, whether to print more logs. Default is False
:type verbose: bool
:return: the file path where the model config file is saved
:rtype: string
"""
if self._model_zip is None:
raise Exception(
"No model zip file. Generate the model zip file before generating the config."
)
if not self._model_zip.exists():
raise Exception(f"Model zip file {self._model_zip} could not be found")
hash_value = _generate_model_content_hash_value(str(self._model_zip))
if model_name is None:
model_name = Path(self._hf_model_id).name
if description is None:
description = f"Cross Encoder Model {model_name}"
Collaborator: Can we follow the way we are showing description for embedding models?

if all_config is None:
cfg = AutoConfig.from_pretrained(self._hf_model_id)
all_config = cfg.to_json_string()
if model_type is None:
model_type = "bert"
model_format = None
if self._framework is not None:
model_format = {"torch_script": "TORCH_SCRIPT", "onnx": "ONNX"}.get(
self._framework
)
if model_format is None:
raise Exception(
"Model format either not found or not supported. Zip the model before generating the config"
)
model_config_content = {
"name": model_name,
"version": version_number,
"description": description,
"model_format": model_format,
"function_name": "TEXT_SIMILARITY",
Collaborator: we need model_task_type. Let's follow sentencetransformer model example method.

"model_task_type": "TEXT_SIMILARITY",
"model_content_hash_value": hash_value,
"model_config": {
"model_type": model_type,
"embedding_dimension": 1,
Collaborator: Is this correct?

Collaborator: If you look here: https://huggingface.co/BAAI/bge-reranker-base/blob/main/config.json, max_position_embeddings = 514

Author: yes, this is what I did. Artifact of the implementation (depends heavily on the embedding model code). Can probably be cleaned up a bit.

"framework_type": "huggingface_transformers",
"all_config": all_config,
},
}
self._model_config = self._folder_path / config_fname
if verbose:
print(json.dumps(model_config_content, indent=2))
with open(self._model_config, "w") as f:
json.dump(model_config_content, f)
return self._model_config

def upload(
self,
client: OpenSearch,
framework: str = "torch_script",
model_group_id: str = "",
verbose: bool = False,
):
"""
Upload the model to OpenSearch

:param client: OpenSearch client
:type client: OpenSearch
:param framework: either 'torch_script' or 'onnx'
:type framework: str
:param model_group_id: model group id to upload this model to
:type model_group_id: str
:param verbose: whether to print verbose logs
:type verbose: bool
"""
gen_cfg = False
if (
self._model_zip is None
or not self._model_zip.exists()
or self._framework != framework
):
gen_cfg = True
self.zip_model(framework)
if self._model_config is None or not self._model_config.exists() or gen_cfg:
self.make_model_config_json()
uploader = ModelUploader(client)
uploader._register_model(
str(self._model_zip), str(self._model_config), model_group_id, verbose
)
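
For reference, a minimal usage sketch of the new class; the cluster settings, credentials, and model group id below are placeholders, not values from this PR:

from opensearchpy import OpenSearch

from opensearch_py_ml.ml_models import CrossEncoderModel

# Placeholder connection details; adjust for your cluster.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "<admin-password>"),
    use_ssl=True,
    verify_certs=False,
)

# Trace, zip, and configure a cross encoder, then register it through ml-commons.
model = CrossEncoderModel("BAAI/bge-reranker-base", overwrite=True)
model.zip_model(framework="torch_script")
model.make_model_config_json()
model.upload(client, framework="torch_script", model_group_id="<model-group-id>")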
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,3 +12,4 @@ sentence_transformers
tqdm
transformers
deprecated
requests