Skip to content

Commit f2dd398

Browse files
committed
Add LLM input_embeddings layer to enable text-to-vector capabilities, and upload a script for extracting the GPT-NeoX embedding layer.
1 parent f46615b commit f2dd398

File tree

3 files changed

+54
-0
lines changed

3 files changed

+54
-0
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
import torch
3+
import numpy as np
4+
from transformers import AutoModelForCausalLM
5+
6+
7+
# Path handed to `from_pretrained`; left empty here as a placeholder to be
# filled in before the script is run.
model_path = ''
# Use the GPU when one is present, otherwise stay on the CPU. The weights are
# copied back to host memory below, so CUDA is optional for this extraction and
# a CPU-only machine should not crash here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True).to(device)
# Detach from autograd first, then copy to the CPU and convert to a NumPy array.
embedding_weights = model.get_input_embeddings().weight.detach().cpu().numpy()
# Persist the input-embedding matrix as a .npy file in the working directory.
np.save('gpt-neox-embedding.npy', embedding_weights)

modelcache/embedding/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from modelcache.utils.lazy_import import LazyImport
33
huggingface = LazyImport("huggingface", globals(), "modelcache.embedding.huggingface")
44
data2vec = LazyImport("data2vec", globals(), "modelcache.embedding.data2vec")
5+
llmEmb = LazyImport("llmEmb", globals(), "modelcache.embedding.llmEmb")
56

67

78
def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
@@ -10,3 +11,7 @@ def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
1011

1112
def Data2VecAudio(model="facebook/data2vec-audio-base-960h"):
    """Create a Data2VecAudio embedding object.

    Delegates to ``Data2VecAudio`` in the lazily imported
    ``modelcache.embedding.data2vec`` module.

    :param model: model identifier forwarded unchanged to the constructor.
    :return: the constructed ``data2vec.Data2VecAudio`` instance.
    """
    embedder = data2vec.Data2VecAudio(model)
    return embedder
14+
15+
16+
def LlmEmb2vecAudio():
    """Create an ``LlmEmb2Vec`` embedding object.

    Delegates to ``LlmEmb2Vec`` in the lazily imported
    ``modelcache.embedding.llmEmb`` module; no arguments are forwarded.

    NOTE(review): the "Audio" suffix looks like a leftover from the
    neighbouring Data2VecAudio factory -- confirm before relying on the name.

    :return: the constructed ``llmEmb.LlmEmb2Vec`` instance.
    """
    embedder = llmEmb.LlmEmb2Vec()
    return embedder

modelcache/embedding/llmEmb.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# -*- coding: utf-8 -*-
2+
import numpy as np
3+
from modelcache.embedding.base import BaseEmbedding
4+
from transformers import AutoTokenizer
5+
from transformers import AutoConfig
6+
7+
8+
class LlmEmb2Vec(BaseEmbedding):
    """Text embedding backed by a pre-extracted LLM input-embedding matrix.

    The matrix is loaded from a ``.npy`` file. A text is embedded by
    tokenizing it and averaging the matrix rows selected by the token ids.
    """

    def __init__(self, model_path='', model_name=''):
        """Load the embedding matrix, the model config and the tokenizer.

        Both parameters default to the empty string, so calling the
        constructor without arguments behaves as it did when these values
        were hard-coded.

        :param model_path: path passed to ``AutoConfig.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``; also the directory that holds
            the ``.npy`` file.
        :param model_name: file name of the ``.npy`` embedding matrix,
            e.g. ``13b-mft-embedding.npy``.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        import os

        self.model_name = model_name
        # os.path.join inserts the separator when `model_path` lacks a
        # trailing slash; plain string concatenation would fuse the names.
        model_file = os.path.join(model_path, model_name)
        # local_files_only mirrors the tokenizer load below.
        config = AutoConfig.from_pretrained(model_path, local_files_only=True)
        self.__dimension = config.hidden_size
        self.model = np.load(model_file)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

    def to_embeddings(self, data, **_):
        """Generate embedding given text input

        :param data: text in string.
        :return: a text embedding in shape of (dim,): the mean of the
            embedding rows of the text's token ids, or an all-zero vector
            when tokenization yields no ids.
        """
        input_ids = self.tokenizer.encode(data, add_special_tokens=True)
        if not input_ids:
            # The mean over zero rows would be a NaN vector (plus a NumPy
            # RuntimeWarning); return a well-defined zero vector instead.
            return np.zeros(self.model.shape[1], dtype=self.model.dtype)
        embedding_array = self.model[input_ids].mean(axis=0)
        return embedding_array

    def post_proc(self, token_embeddings, inputs):
        """No-op: this embedding does no post-processing."""
        pass

    @property
    def dimension(self):
        """Embedding dimension.

        :return: embedding dimension, taken from ``hidden_size`` of the
            loaded model config.
        """
        return self.__dimension

0 commit comments

Comments
 (0)