@@ -10,6 +10,7 @@
 from azure.storage.filedatalake import DataLakeServiceClient
 from azure.search.documents.indexes import SearchIndexClient
 
+
 key_vault_name = 'kv_to-be-replaced'
 managed_identity_client_id = 'mici_to-be-replaced'
 file_system_client_name = "data"
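For context, a minimal sketch of how these placeholders are typically wired up with a user-assigned managed identity; `account_name` and the client construction are assumptions for illustration, not part of this diff:

```python
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

managed_identity_client_id = 'mici_to-be-replaced'  # from the diff above
file_system_client_name = "data"                    # from the diff above
account_name = 'account_to-be-replaced'             # assumed placeholder, same style as the diff

# Authenticate with the user-assigned managed identity.
credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

# Open the Data Lake file system that holds the source documents.
service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=credential,
)
file_system_client = service_client.get_file_system_client(file_system_client_name)
```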
@@ -58,7 +59,7 @@ def clean_spaces_with_regex(text):
 
 
 def chunk_data(text):
-    tokens_per_chunk = 1024  # 500
+    tokens_per_chunk = 256  # 1024 # 500
     text = clean_spaces_with_regex(text)
 
     sentences = text.split('. ')  # Split text into sentences
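The hunk above only shows the budget change (1024 down to 256 tokens per chunk). A minimal sketch of the surrounding sentence-packing logic, assuming tiktoken for token counting — the script itself may count tokens differently:

```python
import re
import tiktoken

def chunk_data(text, tokens_per_chunk=256):
    """Greedily pack whole sentences into chunks of at most tokens_per_chunk tokens."""
    enc = tiktoken.get_encoding("cl100k_base")   # assumed tokenizer
    text = re.sub(r'\s+', ' ', text).strip()     # same idea as clean_spaces_with_regex
    sentences = text.split('. ')                 # split text into sentences, as in the diff
    chunks, current, current_tokens = [], [], 0
    for sentence in sentences:
        n = len(enc.encode(sentence))
        # Close the current chunk once adding this sentence would exceed the budget.
        if current and current_tokens + n > tokens_per_chunk:
            chunks.append('. '.join(current))
            current, current_tokens = [], 0
        current.append(sentence)
        current_tokens += n
    if current:
        chunks.append('. '.join(current))
    return chunks
```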
@@ -115,6 +116,7 @@ def chunk_data(text):
 
 def prepare_search_doc(content, document_id):
     chunks = chunk_data(content)
+    results = []
     chunk_num = 0
     for chunk in chunks:
         chunk_num += 1
@@ -138,7 +140,8 @@ def prepare_search_doc(content, document_id):
             "sourceurl": path.name.split('/')[-1],
             "contentVector": v_contentVector
         }
-        return result
+        results.append(result)
+    return results
 
 
 # conversationIds = []
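The change above fixes an early return: `return result` sat inside the chunk loop, so only the first chunk of each file ever became a search document. A sketch of the corrected shape, with `get_embedding` standing in for whatever populates `contentVector` (a hypothetical helper, not from the diff):

```python
def prepare_search_doc(content, document_id):
    results = []
    for chunk_num, chunk in enumerate(chunk_data(content), start=1):
        result = {
            "id": f"{document_id}_{chunk_num}",     # assumed key scheme
            "content": chunk,
            "sourceurl": path.name.split('/')[-1],  # `path` comes from the caller's scope in the script
            "contentVector": get_embedding(chunk),  # hypothetical embedding helper
        }
        results.append(result)  # collect every chunk ...
    return results              # ... and return them all after the loop
```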
@@ -163,13 +166,14 @@ def prepare_search_doc(content, document_id):
         page = pdf_reader.pages[page_num]
         text += page.extract_text()
     result = prepare_search_doc(text, document_id)
-    docs.append(result)
+    docs.extend(result)
 
     counter += 1
     if docs != [] and counter % 10 == 0:
         result = search_client.upload_documents(documents=docs)
         docs = []
-        print(f'{str(counter)} uploaded')
 
 if docs != []:
     results = search_client.upload_documents(documents=docs)
+
+print(f'{str(counter)} files processed.')
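Since `prepare_search_doc` now returns a list, the caller switches from `append` to `extend`, and the per-batch progress print is replaced by a single end-of-run message. Taken together, the main loop follows a standard batch-and-flush pattern; this sketch assumes `search_client` is an `azure.search.documents.SearchClient`, and `pdf_paths` / `read_pdf_text` are stand-ins for the Data Lake file listing and the PyPDF2 extraction:

```python
docs, counter = [], 0
for path in pdf_paths:                   # stand-in for the Data Lake file listing
    text = read_pdf_text(path)           # hypothetical PyPDF2 wrapper
    docs.extend(prepare_search_doc(text, document_id=path.name))

    counter += 1
    if docs and counter % 10 == 0:       # flush a batch every 10 files
        search_client.upload_documents(documents=docs)
        docs = []

if docs:                                 # final flush for the remainder
    search_client.upload_documents(documents=docs)

print(f'{counter} files processed.')
```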