Skip to content

Commit fef48ee

Browse files
committed
Добавлены embeddings
1 parent cb7ba99 commit fef48ee

File tree

5 files changed

+154
-1
lines changed

5 files changed

+154
-1
lines changed

backend.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,13 @@
2323
import async_timeout
2424

2525
from fp.fp import FreeProxy
26+
from embedding_processing import embedding_processing
2627
import concurrent.futures
2728

2829
app = FastAPI()
30+
embedding_proc = embedding_processing()
31+
LOG = logging.getLogger(__name__)
32+
2933
app.add_middleware(GZipMiddleware)
3034
app.add_middleware(
3135
CORSMiddleware,
@@ -118,6 +122,19 @@ def streaming():
118122

119123
return StreamingResponse(streaming(), media_type='text/event-stream')
120124

@app.post('/v1/embeddings')
async def create_embedding(request: Request):
    """OpenAI-compatible embeddings endpoint.

    Reads a JSON body with an ``input`` field (a string or a list of
    strings, mirroring the OpenAI API), embeds it with the module-level
    ``embedding_proc`` instance, and returns the OpenAI-style response
    dict produced by ``embedding_processing.embedding`` as JSON.
    """
    j_input = await request.json()
    text_input = j_input['input']
    # The OpenAI embeddings API accepts either a single string or a list
    # of strings; normalize so the processor always receives a list.
    if isinstance(text_input, str):
        text_input = [text_input]
    embedding = embedding_proc.embedding(text_list=text_input)
    await log_event()
    return JSONResponse(embedding)


async def log_event():
    # Minimal serving log for the embeddings endpoint.
    LOG.info('served')
121138
@app.get("/v1/dashboard/billing/subscription")
122139
@app.get("/dashboard/billing/subscription")
123140
async def billing_subscription():

embedding_processing.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import spacy
2+
import numpy as np
3+
import os
4+
from zhconv import convert
5+
import re
6+
import random
7+
8+
# добавьте специфическую для русского языка модель
9+
import ru_core_news_sm
10+
11+
# Character-class patterns, compiled once at import time instead of on
# every character of every call.
_CN_RE = re.compile(r'[\u4e00-\u9fa5]')   # CJK range checked first → counted as simplified
_TW_RE = re.compile(r'[\u4e00-\u9fff]')   # wider CJK range; only \u9fa6-\u9fff reaches this branch
_EN_RE = re.compile(r'[a-zA-Z]')
_RU_RE = re.compile(r'[а-яА-Я]')


def detect_lang(text):
    """Guess the dominant language of *text* by sampling its characters.

    Up to 10 characters are sampled at random and classified by Unicode
    range; the most frequent class wins.  Returns one of ``'zh-cn'``,
    ``'zh-tw'``, ``'en'``, ``'ru'`` or ``'other'``.

    NOTE(review): sampling is random, so mixed-language input may
    classify differently between calls — acceptable for this heuristic.
    """
    if not text:
        # Previously an empty string fell through to max() over an
        # all-zero dict, arbitrarily reporting 'zh-cn'.
        return 'other'
    lang_dict = {'zh-cn': 0, 'zh-tw': 0, 'en': 0, 'ru': 0, 'other': 0}
    # Sample at most ten characters to keep the check cheap on long texts.
    sample = random.sample(text, min(10, len(text)))
    for char in sample:
        if _CN_RE.search(char):
            lang_dict['zh-cn'] += 1
        elif _TW_RE.search(char):
            lang_dict['zh-tw'] += 1
        elif _EN_RE.search(char):
            lang_dict['en'] += 1
        elif _RU_RE.search(char):
            lang_dict['ru'] += 1
        else:
            lang_dict['other'] += 1
    # Return the class with the highest sampled count.
    return max(lang_dict, key=lang_dict.get)
30+
31+
class embedding_processing:
    """Produce OpenAI-style embedding responses from spaCy model vectors.

    Each text is embedded as the concatenation of the Chinese model's
    document vector and the English model's document vector.
    """

    def __init__(self, model_path='./model'):
        # model_path is currently unused (models come from installed
        # spaCy packages); kept for backward compatibility with callers.
        self.en_model = spacy.load('en_core_web_sm')
        self.zh_model = spacy.load('zh_core_web_sm')
        self.ru_model = ru_core_news_sm.load()  # Russian-specific model

    def model(self, text):
        """Return the concatenated zh+en document vector for *text*."""
        lang = detect_lang(text)
        if lang == "zh-tw":
            # zhconv.convert requires a target locale; the original call
            # convert(text) omitted it and raised TypeError at runtime.
            # Traditional Chinese is converted to simplified before
            # embedding with the simplified-Chinese model.
            ans_cn = self.zh_model(convert(text, 'zh-cn')).vector.tolist()
        else:
            ans_cn = self.zh_model(text).vector.tolist()
        ans = self.en_model(text).vector.tolist()
        return ans_cn + ans

    def embedding(self, text_list):
        """Embed every text in *text_list* and wrap the result in an
        OpenAI-compatible response dict."""
        embeddings_list = [self.model(text) for text in text_list]
        return self.transform_embedding_to_dict(embeddings_list, text_list)

    def transform_embedding_to_dict(self, embedding_list, text_list,
                                    model_name="text-embedding-elmo-002"):
        """Build an OpenAI-style embeddings response.

        Token counts are rough approximations: prompt_tokens is the total
        character count of the inputs, total_tokens the total number of
        vector components returned.
        """
        prompt_tokens = sum(len(text) for text in text_list)
        total_tokens = sum(len(embedding) for embedding in embedding_list)

        return {
            "data": [
                {
                    "embedding": embedding,
                    "index": index,
                    "object": "embedding"
                }
                for index, embedding in enumerate(embedding_list)
            ],
            "model": model_name,
            "object": "list",
            "usage": {
                "prompt_tokens": prompt_tokens,
                "total_tokens": total_tokens
            }
        }

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,6 @@ free-proxy
5959
watchdog~=3.0.0
6060
js2py
6161
quickjs
httpx
spacy
zhconv

start.bat

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
@echo off
echo Opening NeuroGPT...

set HIDE_LOCAL_MODELS=true

echo Checking for updates...
REM Keep a temporary copy of config.json so the hard reset below does not
REM destroy user settings. Guarded: a missing config.json previously made
REM the unconditional copy/del steps fail.
if exist config.json copy /Y config.json config_temp.json
git checkout main
git fetch --all
git reset --hard origin/main
git pull
REM Restore the original config.json from the temporary copy, if one was made.
if exist config_temp.json (
    copy /Y config_temp.json config.json
    del config_temp.json
)

python -m venv venv
call venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install -U setuptools
python -m pip install -r requirements.txt
python -m spacy download en_core_web_sm
python -m spacy download zh_core_web_sm
python -m spacy download ru_core_news_sm

echo Completed.
echo Running NeuroGPT...

python webui.py
pause

:: Packaged and assembled by the Neurogen News telegram channel: https://t.me/neurogen_news

start_endpoint.bat

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
@echo off
echo Opening NeuroGPT endpoint...

echo Checking for updates...
REM Keep a temporary copy of config.json so the hard reset below does not
REM destroy user settings. Guarded: a missing config.json previously made
REM the unconditional copy/del steps fail.
if exist config.json copy /Y config.json config_temp.json
git checkout main
git fetch --all
git reset --hard origin/main
git pull
REM Restore the original config.json from the temporary copy, if one was made.
if exist config_temp.json (
    copy /Y config_temp.json config.json
    del config_temp.json
)

python -m venv venv
call venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install -U setuptools
python -m pip install -r requirements.txt
python -m spacy download en_core_web_sm
python -m spacy download zh_core_web_sm
python -m spacy download ru_core_news_sm

echo Completed.
echo Running NeuroGPT...

python endpoint.py
pause

:: Packaged and assembled by the Neurogen News telegram channel: https://t.me/neurogen_news

0 commit comments

Comments
 (0)