1+ import spacy
2+ import numpy as np
3+ import os
4+ from zhconv import convert
5+ import re
6+ import random
7+
8+ # добавьте специфическую для русского языка модель
9+ import ru_core_news_sm
10+
# Ordered (label, pattern) pairs: the first pattern that matches a character
# claims it, so the order here is significant.
_LANG_PATTERNS = (
    ('zh-cn', re.compile(r'[\u4e00-\u9fa5]')),
    # NOTE(review): the 'zh-cn' range above already covers U+4E00-U+9FA5, so
    # this entry can only ever match U+9FA6-U+9FFF. If 'zh-tw' is meant to
    # identify Traditional Chinese, a range test cannot do that -- confirm
    # the intended detection strategy.
    ('zh-tw', re.compile(r'[\u4e00-\u9fff]')),
    ('en', re.compile(r'[a-zA-Z]')),
    # 'ё'/'Ё' sit outside the contiguous а-я / А-Я ranges and must be listed
    # explicitly, otherwise they are counted as 'other'.
    ('ru', re.compile(r'[а-яА-ЯёЁ]')),
)


def detect_lang(text):
    """Guess the dominant language of *text* from a random character sample.

    At most ten characters are drawn at random (without replacement) from
    *text*. Each sampled character is assigned to the first language bucket
    whose character class matches it, or to ``'other'`` if none does, and the
    bucket with the most hits is returned.

    Because the sample is random, texts longer than ten characters that mix
    scripts may be classified differently from call to call.

    Args:
        text: The string to classify.

    Returns:
        One of ``'zh-cn'``, ``'zh-tw'``, ``'en'``, ``'ru'`` or ``'other'``.
        Ties are resolved in that order, so an empty string yields
        ``'zh-cn'``.
    """
    # Hit counters per language; insertion order doubles as tie-break order
    # because max() returns the first maximal key it encounters.
    lang_dict = {'zh-cn': 0, 'zh-tw': 0, 'en': 0, 'ru': 0, 'other': 0}

    # Randomly sample at most ten characters.
    sample = random.sample(text, min(10, len(text)))

    # Tally which language bucket each sampled character falls into.
    for char in sample:
        for label, pattern in _LANG_PATTERNS:
            if pattern.search(char):
                lang_dict[label] += 1
                break
        else:
            lang_dict['other'] += 1

    # Return the language with the highest count.
    return max(lang_dict, key=lang_dict.get)
30+
class embedding_processing:
    """Produce OpenAI-style embedding responses from spaCy document vectors.

    A text's embedding is the document vector from the Chinese pipeline
    concatenated with the document vector from the English pipeline.
    """

    def __init__(self, model_path='./model'):
        """Load the spaCy pipelines used for embedding.

        Args:
            model_path: Not used by this class; the pipelines are loaded by
                package name.
        """
        self.en_model = spacy.load('en_core_web_sm')
        self.zh_model = spacy.load('zh_core_web_sm')
        # NOTE(review): the Russian pipeline is loaded here but ``model``
        # does not use it -- confirm whether it should contribute a vector.
        self.ru_model = ru_core_news_sm.load()

    def model(self, text):
        """Return the embedding of *text* as a flat list.

        Args:
            text: The string to embed.

        Returns:
            The Chinese-pipeline vector followed by the English-pipeline
            vector, both converted with ``tolist()`` and concatenated.
        """
        if detect_lang(text) == "zh-tw":
            # zhconv.convert() requires the target locale as its second
            # argument; calling it with the text alone raises TypeError.
            # Convert to Simplified Chinese before running the zh pipeline.
            zh_input = convert(text, 'zh-cn')
        else:
            zh_input = text
        ans_cn = self.zh_model(zh_input).vector.tolist()
        ans = self.en_model(text).vector.tolist()
        return ans_cn + ans

    def embedding(self, text_list):
        """Embed every text in *text_list* and wrap the result in a response.

        Args:
            text_list: Iterable of strings to embed; it is traversed more
                than once, so pass a list rather than a one-shot iterator.

        Returns:
            The dictionary built by ``transform_embedding_to_dict``.
        """
        embeddings_list = [self.model(text) for text in text_list]
        return self.transform_embedding_to_dict(embeddings_list, text_list)

    def transform_embedding_to_dict(self, embedding_list, text_list,
                                    model_name="text-embedding-elmo-002"):
        """Package embeddings into a response dictionary.

        Args:
            embedding_list: Embeddings, one per input text.
            text_list: The input texts the embeddings were computed from.
            model_name: Value reported under the ``"model"`` key.

        Returns:
            A dict with keys ``"data"`` (one entry per embedding holding the
            embedding, its index and ``"object": "embedding"``), ``"model"``,
            ``"object"`` (``"list"``) and ``"usage"``.
        """
        # "prompt_tokens" is the total number of characters across the texts.
        prompt_tokens = sum(len(text) for text in text_list)
        # NOTE(review): "total_tokens" is the summed length of the embedding
        # lists, not a token count -- confirm this is what consumers expect.
        total_tokens = sum(len(embedding) for embedding in embedding_list)

        data = [
            {
                "embedding": embedding,
                "index": index,
                "object": "embedding",
            }
            for index, embedding in enumerate(embedding_list)
        ]
        return {
            "data": data,
            "model": model_name,
            "object": "list",
            "usage": {
                "prompt_tokens": prompt_tokens,
                "total_tokens": total_tokens,
            },
        }
0 commit comments