@@ -10,6 +10,7 @@
 from azure.storage.filedatalake import DataLakeServiceClient
 from azure.search.documents.indexes import SearchIndexClient
 
+
 key_vault_name = 'kv_to-be-replaced'
 managed_identity_client_id = 'mici_to-be-replaced'
 file_system_client_name = "data"
@@ -58,7 +59,7 @@ def clean_spaces_with_regex(text):
 
 
 def chunk_data(text):
-    tokens_per_chunk = 1024  # 500
+    tokens_per_chunk = 256  # 1024  # 500
     text = clean_spaces_with_regex(text)
 
     sentences = text.split('. ')  # Split text into sentences
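The hunk above cuts the chunk budget from 1024 to 256 tokens, but the rest of `chunk_data` falls outside the diff. For context, a minimal sketch of a sentence-packing chunker under that budget; the whitespace word count is an assumption, since the actual token counting is not visible here:

```python
import re

def clean_spaces_with_regex(text):
    # Collapse runs of whitespace, matching the helper named in the hunk header.
    return re.sub(r'\s+', ' ', text).strip()

def chunk_data(text, tokens_per_chunk=256):
    # Sketch only: tokens are approximated as whitespace-separated words,
    # an assumption; the real tokenizer is not shown in this diff.
    text = clean_spaces_with_regex(text)
    sentences = text.split('. ')  # Split text into sentences
    chunks, current, current_len = [], [], 0
    for sentence in sentences:
        n_tokens = len(sentence.split())
        if current and current_len + n_tokens > tokens_per_chunk:
            chunks.append('. '.join(current))   # close the current chunk
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_tokens
    if current:
        chunks.append('. '.join(current))       # trailing partial chunk
    return chunks
```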
@@ -115,6 +116,7 @@ def chunk_data(text):
 
 def prepare_search_doc(content, document_id):
     chunks = chunk_data(content)
+    results = []
     chunk_num = 0
     for chunk in chunks:
         chunk_num += 1
@@ -138,7 +140,8 @@ def prepare_search_doc(content, document_id):
             "sourceurl": path.name.split('/')[-1],
             "contentVector": v_contentVector
         }
-    return result
+        results.append(result)
+    return results
 
 
 # conversationIds = []
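This is the core bug fix: `result` was rebuilt once per chunk, but the old `return result` sat outside the loop, so only the last chunk's document was ever returned. Appending each `result` to `results` and returning the list yields one search document per chunk. A sketch of the fixed shape (field names other than `sourceurl` and `contentVector` are illustrative, and `get_embedding` is a hypothetical stand-in for however `v_contentVector` is actually computed):

```python
def prepare_search_doc(content, document_id):
    chunks = chunk_data(content)
    results = []
    for chunk_num, chunk in enumerate(chunks, start=1):
        result = {
            "id": f"{document_id}_{chunk_num}",     # illustrative key
            "content": chunk,
            "sourceurl": path.name.split('/')[-1],  # `path` comes from the enclosing file loop
            "contentVector": get_embedding(chunk),  # hypothetical embedding helper
        }
        results.append(result)  # collect every chunk's document...
    return results              # ...and return them all, not just the last
```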
@@ -163,13 +166,14 @@ def prepare_search_doc(content, document_id):
         page = pdf_reader.pages[page_num]
         text += page.extract_text()
     result = prepare_search_doc(text, document_id)
-    docs.append(result)
+    docs.extend(result)
 
     counter += 1
     if docs != [] and counter % 10 == 0:
         result = search_client.upload_documents(documents=docs)
         docs = []
-        print(f'{str(counter)} uploaded')
 
 if docs != []:
     results = search_client.upload_documents(documents=docs)
+
+print(f'{str(counter)} files processed.')
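On the caller side, `docs.extend(result)` replaces `docs.append(result)` to match the new list return type; appending would nest lists inside `docs` and break the upload. The per-batch print is also dropped in favor of a single summary line after the final flush. A sketch of the batching pattern, assuming `search_client` is an `azure.search.documents.SearchClient` (the visible imports only show `SearchIndexClient`) and that `file_paths` and `extract_pdf_text` stand in for the data-lake listing and PDF extraction done elsewhere in the script:

```python
docs, counter = [], 0
for path in file_paths:                    # hypothetical iterable of PDF paths
    text = extract_pdf_text(path)          # hypothetical wrapper around PDF text extraction
    docs.extend(prepare_search_doc(text, document_id=path.name))
    counter += 1
    if docs and counter % 10 == 0:         # flush a batch every 10 files
        search_client.upload_documents(documents=docs)
        docs = []

if docs:                                   # flush whatever is left after the loop
    search_client.upload_documents(documents=docs)

print(f'{counter} files processed.')
```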