-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathemb_save_db.py
More file actions
49 lines (38 loc) · 1.64 KB
/
emb_save_db.py
File metadata and controls
49 lines (38 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import asyncio
from chromadbinit import chromadb_collection
from to_chunk import splitter
from embedding import embed_chunks_concurrent
from docstore import doc_store
async def process_and_save(
    doc_file: str,
    *,
    batch_size: int = 50,
    max_concurrency: int = 20,
) -> None:
    """Split a document into parent/child chunks and persist both.

    Steps, in order:

    1. ``splitter.split_main(doc_file)`` yields ``(parents, children)``.
    2. ``parents`` are passed to ``doc_store.save_parents`` as-is; they are
       not embedded.
    3. ``children`` are embedded batch by batch via
       ``embed_chunks_concurrent`` and each batch is added to
       ``chromadb_collection`` with its ids, texts, vectors and a
       ``parent_id`` metadata entry.

    Each child is read as a mapping with the keys ``'id'``, ``'content'``
    and ``'parent_id'``.

    Args:
        doc_file: Path of the document to ingest.
        batch_size: Number of child chunks embedded and written per round.
            Keeps only one batch of vectors in memory at a time.
        max_concurrency: Forwarded unchanged to ``embed_chunks_concurrent``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail before touching any store: a zero step would make range() raise
    # later, and a negative one would silently skip every child chunk.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("1. 开始切分父子块...")
    parents, children = splitter.split_main(doc_file)
    print(f" - 生成父块: {len(parents)} 个")
    print(f" - 生成子块: {len(children)} 个")

    # 2. Parent chunks go to the doc store as plain text (no vectors needed).
    print("2. 保存父块到 DocStore...")
    doc_store.save_parents(parents)

    # 3. Embed the child chunks one batch at a time so that the texts and
    #    vectors of the whole document are never materialized together.
    print("3. 正在并发向量化子块...")
    total = len(children)
    for start in range(0, total, batch_size):
        # Clamp the upper bound so the last (partial) batch is reported with
        # its real end index rather than start + batch_size.
        end = min(start + batch_size, total)
        batch_children = children[start:end]
        batch_texts = [child['content'] for child in batch_children]

        # Fetch the vectors for this batch concurrently.
        embeddings = await embed_chunks_concurrent(
            batch_texts, max_concurrency=max_concurrency
        )

        # NOTE(review): add() is invoked synchronously from this coroutine;
        # confirm that blocking the event loop here is acceptable.
        chromadb_collection.add(
            ids=[child['id'] for child in batch_children],
            documents=batch_texts,
            embeddings=embeddings,
            metadatas=[
                {"parent_id": child['parent_id']} for child in batch_children
            ],
        )
        print(f" - 已处理批次 {start} - {end}")

    print("入库完成!")
if __name__ == "__main__":
    # Supported document formats (per the original author's note): txt, pdf,
    # docx, xlsx, markdown, html, and all kinds of programming script files.
    asyncio.run(process_and_save(doc_file="软件服务协议.docx"))