from pathlib import Path
from types import MethodType
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, PretrainedConfig
from transformers.file_utils import add_start_docstrings

from .modeling import MODEL_START_DOCSTRING, OVModel


@add_start_docstrings(
    """
    OpenVINO model for feature extraction tasks with Sentence Transformers.
    """,
    MODEL_START_DOCSTRING,
)
class OVSentenceTransformer(OVModel):
    export_feature = "feature-extraction"
    _library_name = "sentence_transformers"

    def __init__(self, model=None, config=None, tokenizer=None, **kwargs):
        super().__init__(model, config, **kwargs)

        # Borrow encoding helpers from SentenceTransformer so `encode` works on this class
        self.encode = MethodType(SentenceTransformer.encode, self)
        self._text_length = MethodType(SentenceTransformer._text_length, self)
        self.default_prompt_name = None
        self.truncate_dim = None
        self.tokenizer = tokenizer

    def _save_pretrained(self, save_directory: Union[str, Path]):
        super()._save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    def forward(self, inputs: Dict[str, torch.Tensor]):
        self.compile()
        input_ids = inputs.get("input_ids")
        attention_mask = inputs.get("attention_mask")
        token_type_ids = inputs.get("token_type_ids")

        # OpenVINO inference runs on numpy arrays, so convert torch tensors when needed
        np_inputs = isinstance(input_ids, np.ndarray)
        if not np_inputs:
            input_ids = np.array(input_ids)
            attention_mask = np.array(attention_mask)
            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

        # Add the token_type_ids when needed
        if "token_type_ids" in self.input_names:
            inputs["token_type_ids"] = token_type_ids if token_type_ids is not None else np.zeros_like(input_ids)

        outputs = self._inference(inputs)
        return {
            "token_embeddings": torch.from_numpy(outputs["token_embeddings"]).to(self.device),
            "sentence_embedding": torch.from_numpy(outputs["sentence_embedding"]).to(self.device),
        }

    @classmethod
    def _from_pretrained(
        cls,
        model_id: Union[str, Path],
        config: PretrainedConfig,
        token: Optional[Union[bool, str]] = None,
        revision: Optional[str] = None,
        force_download: bool = False,
        cache_dir: str = HUGGINGFACE_HUB_CACHE,
        file_name: Optional[str] = None,
        subfolder: str = "",
        from_onnx: bool = False,
        local_files_only: bool = False,
        **kwargs,
    ):
        trust_remote_code = kwargs.pop("trust_remote_code", False)
        tokenizer_kwargs = kwargs.pop("tokenizer_kwargs", None)

        tokenizer_args = {
            "token": token,
            "trust_remote_code": trust_remote_code,
            "revision": revision,
            "local_files_only": local_files_only,
        }
        if tokenizer_kwargs:
            tokenizer_args.update(tokenizer_kwargs)

        tokenizer = AutoTokenizer.from_pretrained(model_id, **tokenizer_args)

        return super()._from_pretrained(
            model_id=model_id,
            config=config,
            token=token,
            revision=revision,
            force_download=force_download,
            cache_dir=cache_dir,
            file_name=file_name,
            subfolder=subfolder,
            from_onnx=from_onnx,
            local_files_only=local_files_only,
            tokenizer=tokenizer,
            **kwargs,
        )

    def tokenize(
        self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]], padding: Union[str, bool] = True
    ) -> Dict[str, torch.Tensor]:
        """Tokenizes the texts and maps tokens to token ids."""
        output = {}
        if isinstance(texts[0], str):
            to_tokenize = [texts]
        elif isinstance(texts[0], dict):
            # Dict inputs: remember the keys so callers can recover which field each text came from
            to_tokenize = []
            output["text_keys"] = []
            for lookup in texts:
                text_key, text = next(iter(lookup.items()))
                to_tokenize.append(text)
                output["text_keys"].append(text_key)
            to_tokenize = [to_tokenize]
        else:
            # Tuple inputs: tokenize the two texts of each pair as separate columns
            batch1, batch2 = [], []
            for text_tuple in texts:
                batch1.append(text_tuple[0])
                batch2.append(text_tuple[1])
            to_tokenize = [batch1, batch2]

        # Strip leading/trailing whitespace
        to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize]

        output.update(
            self.tokenizer(
                *to_tokenize,
                padding=padding,
                truncation="longest_first",
                return_tensors="pt",
            )
        )
        return output
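

# A minimal usage sketch (illustrative, not part of the module above). It assumes that
# `OVSentenceTransformer` is exposed from `optimum.intel` and that `from_pretrained`
# accepts `export=True` to convert the checkpoint to OpenVINO on the fly; adjust the
# model id and import path to your setup.
#
#     from optimum.intel import OVSentenceTransformer
#
#     model = OVSentenceTransformer.from_pretrained(
#         "sentence-transformers/all-MiniLM-L6-v2", export=True
#     )
#     embeddings = model.encode(["He drove to the stadium.", "The game starts at noon."])
#     print(embeddings.shape)  # (2, embedding_dim)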