Skip to content

Commit 5f4382e

Browse files
Merge pull request #31 from DUT-Team-21TCLC-DT3/feat/embedding_index
feat: Index embedding
2 parents ab52bdd + 282717f commit 5f4382e

File tree

2 files changed

+596
-0
lines changed

2 files changed

+596
-0
lines changed
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "016f4d06",
6+
"metadata": {},
7+
"source": [
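8+
"Chỉ cần chạy notebook này 1 lần để tạo vector index cho `:Chunk.embedding` trong Neo4j (chạy lại cũng an toàn vì index được tạo với `IF NOT EXISTS`)."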
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": null,
14+
"id": "95b00098",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"# Cell 1: Cài thư viện nếu môi trường mới (có thể bỏ qua nếu đã cài)\n",
19+
"# Chạy 1 lần khi setup môi trường.\n",
20+
"\n",
21+
"# %pip install neo4j python-dotenv\n"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 1,
27+
"id": "be46576f",
28+
"metadata": {},
29+
"outputs": [
30+
{
31+
"name": "stdout",
32+
"output_type": "stream",
33+
"text": [
34+
"NEO4J_URI : neo4j+s://5c2beee6.databases.neo4j.io\n",
35+
"NEO4J_USER : neo4j\n",
36+
"INDEX_NAME : chunk_embedding_index\n",
37+
"EMBED_DIM (dim) : 768\n"
38+
]
39+
}
40+
],
41+
"source": [
42+
"# Cell 2: Đọc cấu hình từ .env\n",
43+
"\n",
44+
"import os\n",
45+
"from dotenv import load_dotenv\n",
46+
"\n",
47+
"# Load file .env trong cùng thư mục notebook\n",
48+
"load_dotenv()\n",
49+
"\n",
50+
"NEO4J_URI = os.getenv(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
51+
"NEO4J_USER = os.getenv(\"NEO4J_USER\", \"neo4j\")\n",
52+
"NEO4J_PASSWORD = os.getenv(\"NEO4J_PASSWORD\", \"password\")\n",
53+
"\n",
54+
"# Tên index và dimension embedding\n",
55+
"INDEX_NAME = os.getenv(\"NEO4J_EMBED_INDEX_NAME\", \"chunk_embedding_index\")\n",
56+
"\n",
57+
"# dangvantuan/vietnamese-embedding dùng 768 chiều\n",
58+
"# Nếu sau này đổi model khác, có thể chỉnh ENV EMBED_DIM hoặc sửa con số này\n",
59+
"EMBED_DIM = int(os.getenv(\"EMBED_DIM\", \"768\"))\n",
60+
"\n",
61+
"print(\"NEO4J_URI :\", NEO4J_URI)\n",
62+
"print(\"NEO4J_USER :\", NEO4J_USER)\n",
63+
"print(\"INDEX_NAME :\", INDEX_NAME)\n",
64+
"print(\"EMBED_DIM (dim) :\", EMBED_DIM)\n"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": 2,
70+
"id": "4158ab6b",
71+
"metadata": {},
72+
"outputs": [
73+
{
74+
"name": "stdout",
75+
"output_type": "stream",
76+
"text": [
77+
"Kết nối Neo4j OK, test result: 1\n"
78+
]
79+
}
80+
],
81+
"source": [
82+
"# Cell 3: Kết nối Neo4j\n",
83+
"\n",
84+
"from neo4j import GraphDatabase, basic_auth\n",
85+
"\n",
86+
"driver = GraphDatabase.driver(\n",
87+
" NEO4J_URI,\n",
88+
" auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD)\n",
89+
")\n",
90+
"\n",
91+
"# Test nhẹ\n",
92+
"with driver.session() as session:\n",
93+
" result = session.run(\"RETURN 1 AS ok\").single()\n",
94+
" print(\"Kết nối Neo4j OK, test result:\", result[\"ok\"])\n"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 3,
100+
"id": "81bd4247",
101+
"metadata": {},
102+
"outputs": [],
103+
"source": [
104+
"# Cell 4: Hàm tạo / đảm bảo tồn tại vector index cho :Chunk.embedding\n",
105+
"\n",
106+
"from neo4j.exceptions import Neo4jError\n",
107+
"\n",
108+
"def create_embedding_vector_index(\n",
109+
" driver,\n",
110+
" index_name: str = INDEX_NAME,\n",
111+
" dim: int = EMBED_DIM\n",
112+
"):\n",
113+
" \"\"\"\n",
114+
" Tạo VECTOR INDEX cho property `embedding` trên label :Chunk.\n",
115+
" - Dùng IF NOT EXISTS nên chạy nhiều lần cũng không sao.\n",
116+
" - dim là số chiều embedding (dangvantuan = 768).\n",
117+
" \"\"\"\n",
118+
" print(f\"➡️ Đang đảm bảo index '{index_name}' tồn tại (dim={dim})...\")\n",
119+
"\n",
120+
" cypher = f\"\"\"\n",
121+
" CREATE VECTOR INDEX {index_name} IF NOT EXISTS\n",
122+
" FOR (c:Chunk) ON (c.embedding)\n",
123+
" OPTIONS {{\n",
124+
" indexConfig: {{\n",
125+
" `vector.dimensions`: {dim},\n",
126+
" `vector.similarity_function`: 'cosine'\n",
127+
" }}\n",
128+
" }}\n",
129+
" \"\"\"\n",
130+
"\n",
131+
" try:\n",
132+
" with driver.session() as session:\n",
133+
" session.run(cypher)\n",
134+
" print(f\"✅ Đã đảm bảo tồn tại VECTOR INDEX '{index_name}' (dim={dim})\")\n",
135+
" except Neo4jError as e:\n",
136+
" print(\"❌ Lỗi khi tạo vector index:\")\n",
137+
" print(e)\n"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": 4,
143+
"id": "dd561919",
144+
"metadata": {},
145+
"outputs": [
146+
{
147+
"name": "stdout",
148+
"output_type": "stream",
149+
"text": [
150+
"➡️ Đang đảm bảo index 'chunk_embedding_index' tồn tại (dim=768)...\n",
151+
"✅ Đã đảm bảo tồn tại VECTOR INDEX 'chunk_embedding_index' (dim=768)\n"
152+
]
153+
}
154+
],
155+
"source": [
156+
"# Cell 5: Thực thi tạo index\n",
157+
"\n",
158+
"create_embedding_vector_index(\n",
159+
" driver=driver,\n",
160+
" index_name=INDEX_NAME,\n",
161+
" dim=EMBED_DIM\n",
162+
")\n"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": 5,
168+
"id": "e31b7375",
169+
"metadata": {},
170+
"outputs": [
171+
{
172+
"name": "stdout",
173+
"output_type": "stream",
174+
"text": [
175+
"Hiện có 1 VECTOR INDEX:\n",
176+
"📌 Index: chunk_embedding_index\n",
177+
" state : ONLINE\n",
178+
" entityType : NODE\n",
179+
" labels : ['Chunk']\n",
180+
" properties : ['embedding']\n",
181+
" options : {'indexConfig': {'vector.dimensions': 768, 'vector.hnsw.m': 16, 'vector.quantization.enabled': True, 'vector.similarity_function': 'COSINE', 'vector.hnsw.ef_construction': 100}, 'indexProvider': 'vector-3.0'}\n",
182+
"------------------------------------------------------------\n"
183+
]
184+
}
185+
],
186+
"source": [
187+
"# Cell 6: Kiểm tra các VECTOR INDEX trong Neo4j (tuỳ chọn)\n",
188+
"\n",
189+
"with driver.session() as session:\n",
190+
" result = session.run(\"\"\"\n",
191+
" SHOW INDEXES\n",
192+
" YIELD name, type, entityType, labelsOrTypes, properties, options, state\n",
193+
" WHERE type = 'VECTOR'\n",
194+
" RETURN name, entityType, labelsOrTypes, properties, options, state\n",
195+
" \"\"\")\n",
196+
" rows = list(result)\n",
197+
"\n",
198+
"if not rows:\n",
199+
" print(\"⚠️ Chưa có VECTOR INDEX nào.\")\n",
200+
"else:\n",
201+
" print(f\"Hiện có {len(rows)} VECTOR INDEX:\")\n",
202+
" for r in rows:\n",
203+
" print(\"📌 Index:\", r[\"name\"])\n",
204+
" print(\" state :\", r[\"state\"])\n",
205+
" print(\" entityType :\", r[\"entityType\"])\n",
206+
" print(\" labels :\", r[\"labelsOrTypes\"])\n",
207+
" print(\" properties :\", r[\"properties\"])\n",
208+
" print(\" options :\", r[\"options\"])\n",
209+
" print(\"-\" * 60)\n"
210+
]
211+
},
212+
{
213+
"cell_type": "code",
214+
"execution_count": 6,
215+
"id": "018a87e0",
216+
"metadata": {},
217+
"outputs": [
218+
{
219+
"name": "stdout",
220+
"output_type": "stream",
221+
"text": [
222+
"Đã đóng kết nối Neo4j.\n"
223+
]
224+
}
225+
],
226+
"source": [
227+
"# Cell 7: Đóng driver để giải phóng kết nối (chạy sau cùng; muốn dùng lại `driver` thì chạy lại Cell 3)\n",
228+
"\n",
229+
"driver.close()\n",
230+
"print(\"Đã đóng kết nối Neo4j.\")\n"
231+
]
232+
},
233+
{
234+
"cell_type": "code",
235+
"execution_count": null,
236+
"id": "a3976e84",
237+
"metadata": {},
238+
"outputs": [],
239+
"source": []
240+
}
241+
],
242+
"metadata": {
243+
"kernelspec": {
244+
"display_name": ".venv",
245+
"language": "python",
246+
"name": "python3"
247+
},
248+
"language_info": {
249+
"codemirror_mode": {
250+
"name": "ipython",
251+
"version": 3
252+
},
253+
"file_extension": ".py",
254+
"mimetype": "text/x-python",
255+
"name": "python",
256+
"nbconvert_exporter": "python",
257+
"pygments_lexer": "ipython3",
258+
"version": "3.11.5"
259+
}
260+
},
261+
"nbformat": 4,
262+
"nbformat_minor": 5
263+
}

0 commit comments

Comments
 (0)