File tree Expand file tree Collapse file tree 3 files changed +54
-0
lines changed Expand file tree Collapse file tree 3 files changed +54
-0
lines changed Original file line number Diff line number Diff line change
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+ import numpy as np
4
+ from transformers import AutoModelForCausalLM
5
+
6
+
7
# Export the input-embedding matrix of a causal LM checkpoint to a .npy file.
model_path = ''  # checkpoint location handed to from_pretrained; empty here, set before running
# Prefer the GPU when one is present, otherwise stay on the CPU. The weights
# are copied back to host memory below, so CUDA is not required for the export
# and hard-coding 'cuda' would only make the script fail on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True).to(device)
# detach() drops the autograd graph so the tensor can be converted to NumPy.
embedding_weights = model.get_input_embeddings().weight.to('cpu').detach().numpy()
np.save('gpt-neox-embedding.npy', embedding_weights)
Original file line number Diff line number Diff line change 2
2
from modelcache .utils .lazy_import import LazyImport
3
3
huggingface = LazyImport ("huggingface" , globals (), "modelcache.embedding.huggingface" )
4
4
data2vec = LazyImport ("data2vec" , globals (), "modelcache.embedding.data2vec" )
5
+ llmEmb = LazyImport ("llmEmb" , globals (), "modelcache.embedding.llmEmb" )
5
6
6
7
7
8
def Huggingface (model = "sentence-transformers/all-mpnet-base-v2" ):
@@ -10,3 +11,7 @@ def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
10
11
11
12
def Data2VecAudio(model="facebook/data2vec-audio-base-960h"):
    """Create a ``data2vec.Data2VecAudio`` embedding object.

    :param model: model identifier forwarded unchanged to
        ``data2vec.Data2VecAudio``.
    :return: the object returned by ``data2vec.Data2VecAudio(model)``.
    """
    # ``data2vec`` is the lazily imported module declared at the top of this file.
    embedder = data2vec.Data2VecAudio(model)
    return embedder
14
+
15
+
16
def LlmEmb2vecAudio():
    """Create an ``llmEmb.LlmEmb2Vec`` embedding object.

    Takes no arguments; the embedding class is constructed with its defaults.

    :return: the object returned by ``llmEmb.LlmEmb2Vec()``.
    """
    # ``llmEmb`` is the lazily imported module declared at the top of this file.
    embedder = llmEmb.LlmEmb2Vec()
    return embedder
Original file line number Diff line number Diff line change
1
+ # -*- coding: utf-8 -*-
2
+ import numpy as np
3
+ from modelcache .embedding .base import BaseEmbedding
4
+ from transformers import AutoTokenizer
5
+ from transformers import AutoConfig
6
+
7
+
8
class LlmEmb2Vec(BaseEmbedding):
    """Embed text by averaging rows of a pre-exported token-embedding matrix.

    The matrix is loaded from a ``.npy`` file and indexed with the token ids
    produced by a Hugging Face tokenizer; no model forward pass is performed.
    """

    def __init__(self):
        """Load the embedding matrix, the tokenizer and the embedding dimension.

        ``self.model_name`` and ``model_path`` are empty in the repository and
        have to be filled in before the class can be instantiated.
        """
        # Local import keeps this block self-contained; the module header is
        # not modified.
        import os

        self.model_name = ''  # 13b-mft-embedding.npy
        model_path = ''  # .npy file storage path
        # Join with the platform separator so the directory does not need a
        # trailing slash (plain string concatenation silently required one).
        model_file = os.path.join(model_path, self.model_name)  # .npy file
        # Same offline constraint as the tokenizer load below.
        config = AutoConfig.from_pretrained(model_path, local_files_only=True)
        self.__dimension = config.hidden_size
        self.model = np.load(model_file)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

    def to_embeddings(self, data, **_):
        """Generate embedding given text input

        :param data: text in string.
        :return: a text embedding in shape of (dim,). When the tokenizer
            produces no tokens for ``data``, a zero vector of the same shape
            and dtype is returned instead of the NaN vector that averaging
            zero rows would give.
        """
        input_ids = self.tokenizer.encode(data, add_special_tokens=True)
        if not input_ids:
            # Mean over an empty selection is NaN (with a RuntimeWarning);
            # return a well-defined vector so callers never see NaNs.
            return np.zeros(self.model.shape[1:], dtype=self.model.dtype)
        # Look up one row per token id and average them over the token axis.
        embedding_array = self.model[input_ids].mean(axis=0)
        return embedding_array

    def post_proc(self, token_embeddings, inputs):
        """No-op; this class does not post-process token embeddings."""
        pass

    @property
    def dimension(self):
        """Embedding dimension.

        :return: embedding dimension
        """
        return self.__dimension
You can’t perform that action at this time.
0 commit comments