Commit 8c89e90

add paddlenlp for embedding
1 parent fdd0c29 commit 8c89e90

3 files changed: +86 −1 lines changed

modelcache/embedding/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -3,7 +3,8 @@
 huggingface = LazyImport("huggingface", globals(), "modelcache.embedding.huggingface")
 data2vec = LazyImport("data2vec", globals(), "modelcache.embedding.data2vec")
 llmEmb = LazyImport("llmEmb", globals(), "modelcache.embedding.llmEmb")
-fasttext = LazyImport("fasttext", globals(), "gptcache.embedding.fasttext")
+fasttext = LazyImport("fasttext", globals(), "modelcache.embedding.fasttext")
+paddlenlp = LazyImport("paddlenlp", globals(), "modelcache.embedding.paddlenlp")
 
 
 def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
@@ -20,3 +21,7 @@ def LlmEmb2vecAudio():
 
 def FastText(model="en", dim=None):
     return fasttext.FastText(model, dim)
+
+
+def PaddleNLP(model="ernie-3.0-medium-zh"):
+    return paddlenlp.PaddleNLP(model)
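Two things happen in this file: the fasttext registration is corrected to point at modelcache.embedding.fasttext instead of the gptcache package it mistakenly referenced, and a paddlenlp entry is registered through the same LazyImport wrapper, so the heavy paddle stack is only imported when PaddleNLP() is actually called. The sketch below illustrates the general lazy-module pattern behind such a wrapper; it is an illustration under assumptions, not ModelCache's actual LazyImport (which is presumably imported near the top of this file and may differ in detail).

# Minimal sketch of a lazy-import wrapper (assumed behavior, not the
# project's real implementation).
import importlib
import types


class LazyImport(types.ModuleType):
    """Defers importing `module_name` until an attribute is first accessed."""

    def __init__(self, local_name, parent_globals, module_name):
        super().__init__(local_name)
        self._parent_globals = parent_globals  # caller's globals(), patched on load
        self._module_name = module_name        # e.g. "modelcache.embedding.paddlenlp"

    def _load(self):
        module = importlib.import_module(self._module_name)
        self._parent_globals[self.__name__] = module  # replace the stub for next time
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item):
        return getattr(self._load(), item)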

modelcache/embedding/paddlenlp.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+"""
+Alipay.com Inc.
+Copyright (c) 2004-2023 All Rights Reserved.
+------------------------------------------------------
+File Name : paddlenlp.py
+Author : fuhui.phe
+Create Time : 2023/12/7 20:43
+Description : PaddleNLP embedding generator for ModelCache
+Change Activity:
+version0 : 2023/12/7 20:43 by fuhui.phe init
+"""
+import numpy as np
+
+from modelcache.embedding.base import BaseEmbedding
+from modelcache.utils import import_paddlenlp, import_paddle
+
+import_paddle()
+import_paddlenlp()
+
+
+import paddle  # pylint: disable=C0413
+from paddlenlp.transformers import AutoModel, AutoTokenizer  # pylint: disable=C0413
+
+
+class PaddleNLP(BaseEmbedding):
+    def __init__(self, model: str = "ernie-3.0-medium-zh"):
+        self.model = AutoModel.from_pretrained(model)
+        self.model.eval()
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = "<pad>"
+        self.__dimension = None
+
+    def to_embeddings(self, data, **_):
+        """Generate embedding given text input
+
+        :param data: text in string.
+        :type data: str
+
+        :return: a text embedding in shape of (dim,).
+        """
+        if not isinstance(data, list):
+            data = [data]
+        inputs = self.tokenizer(
+            data, padding=True, truncation=True, return_tensors="pd"
+        )
+        outs = self.model(**inputs)[0]
+        emb = self.post_proc(outs, inputs).squeeze(0).detach().numpy()
+        return np.array(emb).astype("float32")
+
+    def post_proc(self, token_embeddings, inputs):
+        attention_mask = paddle.ones(inputs["token_type_ids"].shape)
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(token_embeddings.shape).astype("float32")
+        )
+        sentence_embs = paddle.sum(
+            token_embeddings * input_mask_expanded, 1
+        ) / paddle.clip(input_mask_expanded.sum(1), min=1e-9)
+        return sentence_embs
+
+    @property
+    def dimension(self):
+        """Embedding dimension.
+
+        :return: embedding dimension
+        """
+        if not self.__dimension:
+            self.__dimension = len(self.to_embeddings("foo"))
+        return self.__dimension
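to_embeddings tokenizes the input (padding and truncating as needed), runs the ERNIE encoder, and mean-pools the token embeddings in post_proc: the embeddings are multiplied by an expanded mask, summed over the sequence axis, and divided by the clipped mask sum. Note that the mask is built with paddle.ones over token_type_ids rather than the tokenizer's attention mask, so for batched inputs padding tokens would be averaged in; on the single-string path used here there is no padding, so the result is unaffected. A minimal usage sketch, assuming paddlepaddle and paddlenlp are installed and the model weights can be downloaded:

# Usage sketch for the new embedding class; weights download on first run.
from modelcache.embedding import PaddleNLP

encoder = PaddleNLP(model="ernie-3.0-medium-zh")
vec = encoder.to_embeddings("hello world")

print(vec.shape)          # (dim,) — a single float32 sentence vector
print(encoder.dimension)  # same dim, computed lazily by embedding a probe string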

modelcache/utils/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -52,3 +52,12 @@ def import_torch():
 
 def import_fasttext():
     _check_library("fasttext")
+
+
+def import_paddle():
+    prompt_install("protobuf==3.20.0")
+    _check_library("paddlepaddle")
+
+
+def import_paddlenlp():
+    _check_library("paddlenlp")
