add paddlenlp for embedding

peng3307165 · peng3307165 · commit 8c89e90ac93e · 2023-12-07T20:49:25.000+08:00
diff --git a/modelcache/embedding/__init__.py b/modelcache/embedding/__init__.py
@@ -3,7 +3,8 @@
 huggingface = LazyImport("huggingface", globals(), "modelcache.embedding.huggingface")
 data2vec = LazyImport("data2vec", globals(), "modelcache.embedding.data2vec")
 llmEmb = LazyImport("llmEmb", globals(), "modelcache.embedding.llmEmb")
-fasttext = LazyImport("fasttext", globals(), "gptcache.embedding.fasttext")
+fasttext = LazyImport("fasttext", globals(), "modelcache.embedding.fasttext")
+paddlenlp = LazyImport("paddlenlp", globals(), "modelcache.embedding.paddlenlp")
 
 
 def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
@@ -20,3 +21,7 @@ def LlmEmb2vecAudio():
 
 def FastText(model="en", dim=None):
     return fasttext.FastText(model, dim)
+
+
+def PaddleNLP(model="ernie-3.0-medium-zh"):  # factory for the lazily imported paddlenlp.PaddleNLP embedding
+    return paddlenlp.PaddleNLP(model)
diff --git a/modelcache/embedding/paddlenlp.py b/modelcache/embedding/paddlenlp.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+"""
+   Alipay.com Inc.
+   Copyright (c) 2004-2023 All Rights Reserved.
+   ------------------------------------------------------
+   File Name : paddlenlp.py
+   Author : fuhui.phe
+   Create Time : 2023/12/7 20:43
+   Description : text embedding backed by PaddleNLP transformer models (mean-pooled token outputs)
+   Change Activity: 
+        version0 : 2023/12/7 20:43 by fuhui.phe  init
+"""
+import numpy as np
+
+from modelcache.embedding.base import BaseEmbedding
+from modelcache.utils import import_paddlenlp, import_paddle
+
+import_paddle()
+import_paddlenlp()
+
+
+import paddle  # pylint: disable=C0413
+from paddlenlp.transformers import AutoModel, AutoTokenizer  # pylint: disable=C0413
+
+
+class PaddleNLP(BaseEmbedding):  # text embedding backed by a PaddleNLP AutoModel/AutoTokenizer pair
+    def __init__(self, model: str = "ernie-3.0-medium-zh"):  # model: name handed to from_pretrained
+        self.model = AutoModel.from_pretrained(model)
+        self.model.eval()  # switch the model to evaluation mode
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = "<pad>"  # fallback pad token when the tokenizer defines none
+        self.__dimension = None  # filled in on first access of `dimension`
+
+    def to_embeddings(self, data, **_):
+        """Embed text by mean-pooling the model's token outputs.
+
+        :param data: a single input or a list of inputs; a non-list is wrapped in a list.
+        :type data: str or list (presumably a list of str -- confirm against callers)
+
+        :return: float32 numpy array; the leading axis is squeezed away only when its size is 1.
+        """
+        if not isinstance(data, list):
+            data = [data]
+        inputs = self.tokenizer(
+            data, padding=True, truncation=True, return_tensors="pd"
+        )
+        outs = self.model(**inputs)[0]  # first model output, treated as per-token embeddings
+        emb = self.post_proc(outs, inputs).squeeze(0).detach().numpy()
+        return np.array(emb).astype("float32")
+
+    def post_proc(self, token_embeddings, inputs):  # averages token embeddings along axis 1
+        attention_mask = paddle.ones(inputs["token_type_ids"].shape)  # NOTE(review): all-ones mask, so padded positions are averaged too -- confirm intended
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(token_embeddings.shape).astype("float32")
+        )
+        sentence_embs = paddle.sum(
+            token_embeddings * input_mask_expanded, 1
+        ) / paddle.clip(input_mask_expanded.sum(1), min=1e-9)  # clip keeps the divisor away from zero
+        return sentence_embs
+
+    @property
+    def dimension(self):
+        """Embedding dimension, computed lazily and cached.
+
+        :return: length of the embedding produced for the probe text "foo"
+        """
+        if not self.__dimension:
+            self.__dimension = len(self.to_embeddings("foo"))
+        return self.__dimension
diff --git a/modelcache/utils/__init__.py b/modelcache/utils/__init__.py
@@ -52,3 +52,12 @@ def import_torch():
 
 def import_fasttext():
     _check_library("fasttext")
+
+
+def import_paddle():  # dependency check run before `import paddle`
+    prompt_install("protobuf==3.20.0")  # NOTE(review): called unconditionally, before the paddle check -- confirm the protobuf pin is still required
+    _check_library("paddlepaddle")  # NOTE(review): the importable module is `paddle`; confirm _check_library expects the pip name here
+
+
+def import_paddlenlp():  # dependency check for the `paddlenlp` package
+    _check_library("paddlenlp")