Commit f5ee769

add cross-encoder tracing, config-generating, and uploading
Signed-off-by: HenryL27 <[email protected]>
1 parent e0b1bcf commit f5ee769

File tree

3 files changed: +292 −1 lines changed


opensearch_py_ml/ml_models/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -7,5 +7,6 @@
 
 from .metrics_correlation.mcorr import MCorr
 from .sentencetransformermodel import SentenceTransformerModel
+from .crossencodermodel import CrossEncoderModel
 
-__all__ = ["SentenceTransformerModel", "MCorr"]
+__all__ = ["SentenceTransformerModel", "MCorr", "CrossEncoderModel"]
opensearch_py_ml/ml_models/crossencodermodel.py

Lines changed: 289 additions & 0 deletions

@@ -0,0 +1,289 @@
# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

import json
import os
import shutil
from pathlib import Path
from zipfile import ZipFile

import requests
import torch
from opensearchpy import OpenSearch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

from opensearch_py_ml.ml_commons import ModelUploader
from opensearch_py_ml.ml_commons.ml_common_utils import (
    _generate_model_content_hash_value,
)

def _fix_tokenizer(max_len: int, path: Path):
    """
    Add truncation parameters to the tokenizer file. Edits the file in place.

    :param max_len: maximum number of tokens to truncate to
    :type max_len: int
    :param path: path to the directory containing tokenizer.json
    :type path: Path
    """
    with open(Path(path) / "tokenizer.json", "r") as f:
        parsed = json.load(f)
    if "truncation" not in parsed or parsed["truncation"] is None:
        parsed["truncation"] = {
            "direction": "Right",
            "max_length": max_len,
            "strategy": "LongestFirst",
            "stride": 0,
        }
    with open(Path(path) / "tokenizer.json", "w") as f:
        json.dump(parsed, f, indent=2)

class CrossEncoderModel:
    """
    Class for configuring and uploading cross-encoder models to OpenSearch.
    """

    def __init__(
        self,
        hf_model_id: str,
        folder_path: str = None,
        overwrite: bool = False,
    ) -> None:
        """
        Initialize a new CrossEncoder model from a Hugging Face id.

        :param hf_model_id: Hugging Face id of the model to load
        :type hf_model_id: str
        :param folder_path: folder path to save the model.
            Default is /tmp/models/hf_model_id
        :type folder_path: str
        :param overwrite: whether to overwrite an existing model folder
        :type overwrite: bool
        :return: None
        """
        default_folder_path = Path(f"/tmp/models/{hf_model_id}")

        if folder_path is None:
            self._folder_path = default_folder_path
        else:
            self._folder_path = Path(folder_path)

        if self._folder_path.exists() and not overwrite:
            raise Exception(
                f"Folder {self._folder_path} already exists. To overwrite it, set `overwrite=True`."
            )

        self._hf_model_id = hf_model_id
        self._framework = None
        self._folder_path.mkdir(parents=True, exist_ok=True)

    def zip_model(self, framework: str = "pt") -> Path:
        """
        Compiles and zips the model to {self._folder_path}/model.zip.

        :param framework: one of "pt", "onnx"; the framework to zip the model as.
            Default: "pt"
        :type framework: str
        :return: the path of the zipped model
        :rtype: Path
        """
        if framework == "pt":
            self._framework = "pt"
            return self._zip_model_pytorch()
        if framework == "onnx":
            self._framework = "onnx"
            return self._zip_model_onnx()
        raise Exception(
            f"Unrecognized framework {framework}. Accepted values are `pt`, `onnx`"
        )

    def _zip_model_pytorch(self) -> Path:
        """
        Compiles the model to TorchScript format.
        """
        tk = AutoTokenizer.from_pretrained(self._hf_model_id)
        model = AutoModelForSequenceClassification.from_pretrained(self._hf_model_id)
        features = tk([["dummy sentence 1", "dummy sentence 2"]], return_tensors="pt")
        mname = Path(self._hf_model_id).name

        # bge models don't generate token type ids
        if mname.startswith("bge"):
            features["token_type_ids"] = torch.zeros_like(features["input_ids"])

        # compile
        compiled = torch.jit.trace(
            model,
            example_kwarg_inputs={
                "input_ids": features["input_ids"],
                "attention_mask": features["attention_mask"],
                "token_type_ids": features["token_type_ids"],
            },
            strict=False,
        )
        torch.jit.save(compiled, f"/tmp/{mname}.pt")

        # save tokenizer file
        tk_path = f"/tmp/{mname}-tokenizer"
        tk.save_pretrained(tk_path)
        _fix_tokenizer(tk.model_max_length, tk_path)

        # get apache license
        r = requests.get(
            "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE"
        )
        with ZipFile(self._folder_path / "model.zip", "w") as f:
            f.write(f"/tmp/{mname}.pt", arcname=f"{mname}.pt")
            f.write(tk_path + "/tokenizer.json", arcname="tokenizer.json")
            f.writestr("LICENSE", r.content)

        # clean up temp files
        shutil.rmtree(tk_path)
        os.remove(f"/tmp/{mname}.pt")
        return self._folder_path / "model.zip"

    def _zip_model_onnx(self) -> Path:
        """
        Compiles the model to ONNX format.
        """
        tk = AutoTokenizer.from_pretrained(self._hf_model_id)
        model = AutoModelForSequenceClassification.from_pretrained(self._hf_model_id)
        features = tk([["dummy sentence 1", "dummy sentence 2"]], return_tensors="pt")
        mname = Path(self._hf_model_id).name

        # bge models don't generate token type ids
        if mname.startswith("bge"):
            features["token_type_ids"] = torch.zeros_like(features["input_ids"])

        # export to onnx
        onnx_model_path = f"/tmp/{mname}.onnx"
        torch.onnx.export(
            model=model,
            args=(
                features["input_ids"],
                features["attention_mask"],
                features["token_type_ids"],
            ),
            f=onnx_model_path,
            input_names=["input_ids", "attention_mask", "token_type_ids"],
            output_names=["output"],
            dynamic_axes={
                "input_ids": {0: "batch_size", 1: "sequence_length"},
                "attention_mask": {0: "batch_size", 1: "sequence_length"},
                "token_type_ids": {0: "batch_size", 1: "sequence_length"},
                "output": {0: "batch_size"},
            },
            verbose=True,
        )

        # save tokenizer file
        tk_path = f"/tmp/{mname}-tokenizer"
        tk.save_pretrained(tk_path)
        _fix_tokenizer(tk.model_max_length, tk_path)

        # get apache license
        r = requests.get(
            "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE"
        )
        with ZipFile(self._folder_path / "model.zip", "w") as f:
            f.write(onnx_model_path, arcname=f"{mname}.onnx")
            f.write(tk_path + "/tokenizer.json", arcname="tokenizer.json")
            f.writestr("LICENSE", r.content)

        # clean up temp files
        shutil.rmtree(tk_path)
        os.remove(onnx_model_path)
        return self._folder_path / "model.zip"

    def make_model_config_json(
        self,
        model_name: str = None,
        version_number: str = "1",
        description: str = None,
        all_config: str = None,
        model_type: str = None,
        verbose: bool = False,
    ):
        """
        Parse the config.json file of the pre-trained Hugging Face model to generate an
        ml-commons model config file. If all required fields are given by the user, the
        given parameters are used and reading config.json is skipped.

        :param model_name:
            Optional, the name of the model. If None, the default is the model id, for example,
            'sentence-transformers/msmarco-distilbert-base-tas-b'
        :type model_name: string
        :param version_number:
            Optional, the version number of the model. Default is 1
        :type version_number: string
        :param description: Optional, the description of the model. If None, a default
            description is generated from the model name.
        :type description: str
        :param all_config:
            Optional, the all_config of the model. If None, parse all contents from the
            config file of the pre-trained Hugging Face model
        :type all_config: str
        :param model_type:
            Optional, the model_type of the model. If None, default to "bert"
        :type model_type: string
        :param verbose:
            Optional, print more logs. Default is False
        :type verbose: bool
        :return: the file path where the model config file is saved
        :rtype: string
        """
        if not (self._folder_path / "model.zip").exists():
            raise Exception("Generate the model zip before generating the config")
        hash_value = _generate_model_content_hash_value(
            str(self._folder_path / "model.zip")
        )
        if model_name is None:
            model_name = Path(self._hf_model_id).name
        if description is None:
            description = f"Cross Encoder Model {model_name}"
        if all_config is None:
            cfg = AutoConfig.from_pretrained(self._hf_model_id)
            all_config = cfg.to_json_string()
        if model_type is None:
            model_type = "bert"
        model_format = None
        if self._framework is not None:
            model_format = {"pt": "TORCH_SCRIPT", "onnx": "ONNX"}.get(self._framework)
        if model_format is None:
            raise Exception(
                "Model format either not found or not supported. Zip the model before generating the config"
            )
        model_config_content = {
            "name": model_name,
            "version": f"1.0.{version_number}",
            "description": description,
            "model_format": model_format,
            "function_name": "TEXT_SIMILARITY",
            "model_content_hash_value": hash_value,
            "model_config": {
                "model_type": model_type,
                "embedding_dimension": 1,
                "framework_type": "huggingface_transformers",
                "all_config": all_config,
            },
        }
        if verbose:
            print(json.dumps(model_config_content, indent=2))
        with open(self._folder_path / "config.json", "w") as f:
            json.dump(model_config_content, f)
        return self._folder_path / "config.json"

    def upload(
        self,
        client: OpenSearch,
        framework: str = "pt",
        model_group_id: str = "",
        verbose: bool = False,
    ):
        """
        Upload the model to OpenSearch, zipping it and generating the config first if needed.

        :param client: OpenSearch client
        :type client: OpenSearch
        :param framework: either "pt" or "onnx"
        :type framework: str
        :param model_group_id: model group id to upload this model to
        :type model_group_id: str
        :param verbose: whether to print verbose logs
        :type verbose: bool
        """
        config_path = self._folder_path / "config.json"
        model_path = self._folder_path / "model.zip"
        gen_cfg = False
        if not model_path.exists() or self._framework != framework:
            gen_cfg = True
            self.zip_model(framework)
        if not config_path.exists() or gen_cfg:
            self.make_model_config_json()
        uploader = ModelUploader(client)
        uploader._register_model(str(model_path), str(config_path), model_group_id, verbose)
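
A minimal usage sketch of the new class (hedged: the connection settings and model group id below are placeholders, and "BAAI/bge-reranker-base" merely stands in for any Hugging Face cross-encoder id; `upload` zips the model and generates the config itself when they are missing):

from opensearchpy import OpenSearch

from opensearch_py_ml.ml_models import CrossEncoderModel

# Placeholder connection details; point these at a real cluster.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
)

# Trace to TorchScript, generate the ml-commons config, and register in one call.
model = CrossEncoderModel(hf_model_id="BAAI/bge-reranker-base", overwrite=True)
model.upload(client, framework="pt", model_group_id="<model-group-id>", verbose=True)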

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -12,3 +12,4 @@ sentence_transformers
 tqdm
 transformers
 deprecated
+requests
