Skip to content

Commit b1e4622

Browse files
authored
Merge pull request #15 from codefuse-ai/modelcache_localDB_dev
add fasttext for embedding
2 parents 22daf83 + 59ba93f commit b1e4622

File tree

3 files changed

+47
-0
lines changed

3 files changed

+47
-0
lines changed

modelcache/embedding/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Each name below is a LazyImport handle; the third argument is the dotted path
# of the submodule that the handle stands for.
huggingface = LazyImport("huggingface", globals(), "modelcache.embedding.huggingface")
data2vec = LazyImport("data2vec", globals(), "modelcache.embedding.data2vec")
llmEmb = LazyImport("llmEmb", globals(), "modelcache.embedding.llmEmb")
# Must reference modelcache/embedding/fasttext.py from this package; the
# previous "gptcache.embedding.fasttext" target named a different project.
fasttext = LazyImport("fasttext", globals(), "modelcache.embedding.fasttext")
67

78

89
def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
@@ -15,3 +16,7 @@ def Data2VecAudio(model="facebook/data2vec-audio-base-960h"):
1516

1617
def LlmEmb2vecAudio():
    """Instantiate ``LlmEmb2Vec`` from the ``llmEmb`` module handle and return it."""
    embedder = llmEmb.LlmEmb2Vec()
    return embedder
19+
20+
21+
def FastText(model="en", dim=None):
    """Instantiate ``FastText`` from the ``fasttext`` module handle and return it.

    :param model: forwarded unchanged as the first positional argument
        (defaults to ``"en"``).
    :param dim: forwarded unchanged as the second positional argument
        (defaults to ``None``).
    :return: whatever ``fasttext.FastText(model, dim)`` returns.
    """
    embedder = fasttext.FastText(model, dim)
    return embedder

modelcache/embedding/fasttext.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Alipay.com Inc.
4+
Copyright (c) 2004-2023 All Rights Reserved.
5+
------------------------------------------------------
6+
File Name : fasttext.py
7+
Author : fuhui.phe
8+
Create Time : 2023/12/3 15:40
9+
Description : FastText embedding class (a BaseEmbedding subclass) that turns a string into a float32 sentence vector
10+
Change Activity:
11+
version0 : 2023/12/3 15:40 by fuhui.phe init
12+
"""
13+
import numpy as np
14+
import os
15+
from modelcache.utils import import_fasttext
16+
from modelcache.embedding.base import BaseEmbedding
17+
import_fasttext()
18+
import fasttext.util
19+
20+
21+
class FastText(BaseEmbedding):
    """Sentence embedding backed by a pre-trained fastText model.

    :param model: language id passed to ``fasttext.util.download_model``
        (defaults to ``"en"``).
    :param dim: when truthy, the loaded model is reduced to this dimension via
        ``fasttext.util.reduce_model``; otherwise the model's own dimension is
        kept.
    """

    def __init__(self, model: str = "en", dim: int = None):
        # With the default ``if_exists="strict"``, download_model returns None
        # when the model file is already on disk, which would make
        # os.path.abspath() raise on every construction after the first
        # download. ``"ignore"`` returns the existing file name instead.
        model_file = fasttext.util.download_model(model, if_exists="ignore")
        self.model_path = os.path.abspath(model_file)
        self.ft = fasttext.load_model(self.model_path)

        if dim:
            fasttext.util.reduce_model(self.ft, dim)
        # Read the dimension after the optional reduction so it reflects the
        # model that is actually used for embedding.
        self.__dimension = self.ft.get_dimension()

    def to_embeddings(self, data, **_):
        """Return the sentence vector of ``data`` as a float32 numpy array.

        :param data: the text to embed; must be a ``str``.
        :param _: extra keyword arguments are accepted and ignored.
        :raises AssertionError: if ``data`` is not a ``str``.
        """
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; exception type and message are
        # unchanged for existing callers.
        if not isinstance(data, str):
            raise AssertionError("Only allow string as input.")
        emb = self.ft.get_sentence_vector(data)
        return np.array(emb).astype("float32")

    @property
    def dimension(self):
        """Dimension reported by the loaded model at construction time."""
        return self.__dimension
38+

modelcache/utils/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,7 @@ def import_faiss():
4848

4949
def import_torch():
    """Run the module's ``_check_library`` helper for the ``torch`` package."""
    required = "torch"
    _check_library(required)
51+
52+
53+
def import_fasttext():
    """Run the module's ``_check_library`` helper for the ``fasttext`` package."""
    required = "fasttext"
    _check_library(required)

0 commit comments

Comments
 (0)