1
+ from typing import List , Dict , Any
2
+ import json
3
+ import argparse
4
+ from sentence_transformers import SentenceTransformer
5
+ import array
6
+ import oracledb
7
+ import yaml
8
+ import os
9
+ from pathlib import Path
10
+
11
+
12
class OraDBVectorStore:
    """Vector store backed by Oracle Database 23ai VECTOR columns.

    Maintains four collections (PDF, Web, Repo, General), one table each,
    with columns: id, text, JSON-encoded metadata, and a VECTOR embedding
    produced by a SentenceTransformer model. The same model embeds both
    documents and queries, so nearest-neighbour distances are comparable.
    """

    # Embedding model; must be identical for ingestion and querying.
    _MODEL_NAME = 'all-MiniLM-L12-v2'

    # All collection tables share this schema.
    _TABLE_NAMES = ("PDFCollection", "WebCollection",
                    "RepoCollection", "GeneralCollection")

    def __init__(self, persist_directory: str = "embeddings"):
        """Initialize Oracle DB Vector Store.

        Args:
            persist_directory: Not used for Oracle DB connection but kept for
                compatibility with file-based vector-store backends.

        Raises:
            ValueError: If ORACLE_DB_PASSWORD or ORACLE_DB_DSN is missing
                from config.yaml.
            Exception: Re-raised oracledb connection errors.
        """
        # Load Oracle DB credentials from config.yaml
        credentials = self._load_config()

        username = credentials.get("ORACLE_DB_USERNAME", "ADMIN")
        password = credentials.get("ORACLE_DB_PASSWORD", "")
        dsn = credentials.get("ORACLE_DB_DSN", "")

        if not password or not dsn:
            raise ValueError("Oracle DB credentials not found in config.yaml. Please set ORACLE_DB_USERNAME, ORACLE_DB_PASSWORD, and ORACLE_DB_DSN.")

        # Connect to the database
        try:
            connection = oracledb.connect(user=username, password=password, dsn=dsn)
            print("Oracle DB Connection successful!")
        except Exception as e:
            print("Oracle DB Connection failed!", e)
            raise

        self.connection = connection
        self.cursor = connection.cursor()

        # Ensure one table per collection exists. Oracle 23ai supports both
        # CREATE TABLE IF NOT EXISTS and the VECTOR data type, so this is
        # idempotent across restarts.
        for table_name in self._TABLE_NAMES:
            self.cursor.execute(
                f"""CREATE TABLE IF NOT EXISTS {table_name} (
                    id VARCHAR2(4000 BYTE) PRIMARY KEY,
                    text VARCHAR2(4000 BYTE),
                    metadata VARCHAR2(4000 BYTE),
                    embedding VECTOR
                )"""
            )

        self.encoder = SentenceTransformer(self._MODEL_NAME)

    def _load_config(self) -> Dict[str, str]:
        """Load configuration from config.yaml in the current directory.

        Returns:
            The parsed mapping, or an empty dict if the file is missing,
            empty, or unreadable (a warning is printed instead of raising,
            so the caller can decide how to handle missing credentials).
        """
        try:
            config_path = Path("config.yaml")
            if not config_path.exists():
                print("Warning: config.yaml not found. Using empty configuration.")
                return {}

            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            # safe_load returns None for an empty file; normalize to {}.
            return config if config else {}
        except Exception as e:
            print(f"Warning: Error loading config: {str(e)}")
            return {}

    def _sanitize_metadata(self, metadata: Dict) -> Dict:
        """Sanitize metadata so every value is a type Oracle DB can store.

        Scalars (str/int/float/bool) pass through unchanged; lists and any
        other object are stringified; None becomes the empty string.
        """
        sanitized = {}
        for key, value in metadata.items():
            if isinstance(value, (str, int, float, bool)):
                sanitized[key] = value
            elif isinstance(value, list):
                # Convert list to string representation
                sanitized[key] = str(value)
            elif value is None:
                # Replace None with empty string
                sanitized[key] = ""
            else:
                # Convert any other type to string
                sanitized[key] = str(value)
        return sanitized

    def _add_chunks(self, table_name: str, chunks: List[Dict[str, Any]],
                    id_prefix: str, truncate: bool = False) -> None:
        """Embed ``chunks`` and insert them into ``table_name``.

        Args:
            table_name: One of the collection tables (trusted constant,
                never user input — it is interpolated into SQL).
            chunks: Dicts with "text" and "metadata" keys.
            id_prefix: Row ids become f"{id_prefix}_{index}".
            truncate: When True, empty the table before inserting
                (replace semantics instead of append).
        """
        if not chunks:
            return

        texts = [chunk["text"] for chunk in chunks]
        metadatas = [self._sanitize_metadata(chunk["metadata"]) for chunk in chunks]
        ids = [f"{id_prefix}_{i}" for i in range(len(chunks))]

        # Encode all texts in a single batch for throughput.
        embeddings = self.encoder.encode(texts, batch_size=32, show_progress_bar=True)

        if truncate:
            self.cursor.execute(f"truncate table {table_name}")

        # array("f") maps to Oracle's float32 VECTOR binding; executemany
        # sends all rows in one round trip instead of one execute per row.
        rows = [
            (doc_id, text, json.dumps(metadata), array.array("f", embedding))
            for doc_id, text, metadata, embedding
            in zip(ids, texts, metadatas, embeddings)
        ]
        self.cursor.executemany(
            f"INSERT INTO {table_name} (id, text, metadata, embedding) VALUES (:1, :2, :3, :4)",
            rows,
        )
        self.connection.commit()

    def add_pdf_chunks(self, chunks: List[Dict[str, Any]], document_id: str):
        """Add chunks from a PDF document, replacing the whole collection."""
        # PDF ingestion truncates first: the collection always reflects the
        # most recently added document set.
        self._add_chunks("PDFCollection", chunks, document_id, truncate=True)

    def add_web_chunks(self, chunks: List[Dict[str, Any]], source_id: str):
        """Add chunks from web content to the vector store (append-only)."""
        self._add_chunks("WebCollection", chunks, source_id)

    def add_general_knowledge(self, chunks: List[Dict[str, Any]], source_id: str):
        """Add general knowledge chunks to the vector store (append-only)."""
        self._add_chunks("GeneralCollection", chunks, source_id)

    def add_repo_chunks(self, chunks: List[Dict[str, Any]], document_id: str):
        """Add chunks from a repository to the vector store (append-only)."""
        self._add_chunks("RepoCollection", chunks, document_id)

    def _query_collection(self, table_name: str, query: str,
                          n_results: int) -> List[Dict[str, Any]]:
        """Return the ``n_results`` chunks nearest to ``query``.

        Fixes the original behavior of hard-coding ``FETCH FIRST 10 ROWS``:
        the caller-supplied ``n_results`` is now honored. int() guards the
        interpolated row count; the query vector is bound as a parameter.
        """
        embedding = self.encoder.encode(query, batch_size=32, show_progress_bar=True)
        query_vector = array.array("f", embedding)

        sql = f"""
        SELECT id, text, metadata
        FROM {table_name}
        ORDER BY VECTOR_DISTANCE(embedding, :nv, EUCLIDEAN)
        FETCH FIRST {int(n_results)} ROWS ONLY
        """
        self.cursor.execute(sql, {"nv": query_vector})

        formatted_results = []
        for _doc_id, text, metadata in self.cursor.fetchall():
            formatted_results.append({
                "content": text,
                # Metadata was stored as a JSON string; decode if needed.
                "metadata": json.loads(metadata) if isinstance(metadata, str) else metadata,
            })
        return formatted_results

    def query_pdf_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
        """Query the PDF documents collection."""
        return self._query_collection("PDFCollection", query, n_results)

    def query_web_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
        """Query the web documents collection."""
        return self._query_collection("WebCollection", query, n_results)

    def query_general_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
        """Query the general knowledge collection."""
        return self._query_collection("GeneralCollection", query, n_results)

    def query_repo_collection(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
        """Query the repository documents collection."""
        return self._query_collection("RepoCollection", query, n_results)
341
+
342
def main():
    """CLI entry point: ingest PDF/web chunk files and/or run a search."""
    parser = argparse.ArgumentParser(description="Manage Oracle DB vector store")
    parser.add_argument("--add", help="JSON file containing chunks to add")
    parser.add_argument("--add-web", help="JSON file containing web chunks to add")
    parser.add_argument("--query", help="Query to search for")
    args = parser.parse_args()

    store = OraDBVectorStore()

    if args.add:
        with open(args.add, 'r', encoding='utf-8') as handle:
            pdf_chunks = json.load(handle)
        store.add_pdf_chunks(pdf_chunks, document_id=args.add)
        print(f"✓ Added {len(pdf_chunks)} PDF chunks to Oracle DB vector store")

    if args.add_web:
        with open(args.add_web, 'r', encoding='utf-8') as handle:
            web_chunks = json.load(handle)
        store.add_web_chunks(web_chunks, source_id=args.add_web)
        print(f"✓ Added {len(web_chunks)} web chunks to Oracle DB vector store")

    if args.query:
        # Search both collections and print each hit with its source plus
        # one collection-specific detail field.
        def show(label, hits, detail_label, detail_key, detail_default):
            print(f"\n{label} Results:")
            print("-" * 50)
            for hit in hits:
                print(f"Content: {hit['content'][:200]}...")
                print(f"Source: {hit['metadata'].get('source', 'Unknown')}")
                print(f"{detail_label}: {hit['metadata'].get(detail_key, detail_default)}")
                print("-" * 50)

        show("PDF", store.query_pdf_collection(args.query),
             "Pages", "page_numbers", [])
        show("Web", store.query_web_collection(args.query),
             "Title", "title", "Unknown")


if __name__ == "__main__":
    main()