@@ -229,18 +229,28 @@ def init_vectordb() -> None:
229229 vectors_config = VectorParams (size = embedding_model .embedding_size , distance = Distance .COSINE ),
230230 )
231231
232- # Generate embeddings with the fastembed `TextEmbedding` instance and upload directly to Qdrant
232+ # Generate embeddings with the fastembed `TextEmbedding` instance and upload directly to Qdrant in batches
233233 # https://qdrant.tech/documentation/fastembed/fastembed-rerankers/
234- embeddings = list (embedding_model .embed ([d .page_content for d in docs ]))
235- qdrant_client .upsert (
236- collection_name = settings .docs_collection_name ,
237- points = models .Batch (
238- ids = list (range (1 , len (docs ) + 1 )),
239- vectors = [emb .tolist () for emb in embeddings ],
240- payloads = [doc .metadata for doc in docs ],
241- ),
234+ batch_size = 500
235+ total_docs = len (docs )
236+ for batch_start in range (0 , total_docs , batch_size ):
237+ batch_end = min (batch_start + batch_size , total_docs )
238+ batch_docs = docs [batch_start :batch_end ]
239+ # Generate embeddings for this batch
240+ embeddings = embedding_model .embed ([doc .page_content for doc in batch_docs ])
241+ qdrant_client .upsert (
242+ collection_name = settings .docs_collection_name ,
243+ points = models .Batch (
244+ ids = list (range (batch_start + 1 , batch_end + 1 )),
245+ vectors = [emb .tolist () for emb in embeddings ],
246+ payloads = [doc .metadata for doc in batch_docs ],
247+ ),
248+ )
249+ print (f"Indexed documents { batch_start + 1 } -{ batch_end } " )
250+
251+ print (
252+ f"Done generating and indexing { total_docs } documents into the vectordb in { time .time () - start_time } seconds"
242253 )
243- print (f"Done generating and indexing { len (docs )} documents into the vectordb in { time .time () - start_time } seconds" )
244254
245255 # Using langchain vectorstore wrapper
246256 # from langchain_qdrant import QdrantVectorStore
0 commit comments