
Commit f1686b7

Merge pull request #436 from microsoft/psl-hb-bug-17212
fix: update AI model deployment versions and adjust data processing chunk size
2 parents 4e58b5c + 00a3025 commit f1686b7

File tree

infra/deploy_ai_foundry.bicep
infra/scripts/index_scripts/02_process_data.py

2 files changed: +12 -4


infra/deploy_ai_foundry.bicep

Lines changed: 4 additions & 0 deletions
@@ -35,6 +35,7 @@ var aiModelDeployments = [
       name: deploymentType
       capacity: gptDeploymentCapacity
     }
+    version: '2024-05-13'
     raiPolicyName: 'Microsoft.Default'
   }
   {
@@ -44,6 +45,7 @@ var aiModelDeployments = [
       name: 'Standard'
       capacity: embeddingDeploymentCapacity
     }
+    version: '2'
     raiPolicyName: 'Microsoft.Default'
   }
 ]
@@ -159,8 +161,10 @@ resource aiServicesDeployments 'Microsoft.CognitiveServices/accounts/deployments
     model: {
       format: 'OpenAI'
       name: aiModeldeployment.model
+      version: aiModeldeployment.version
     }
     raiPolicyName: aiModeldeployment.raiPolicyName
+    versionUpgradeOption: 'OnceCurrentVersionExpired'
   }
   sku:{
     name: aiModeldeployment.sku.name

infra/scripts/index_scripts/02_process_data.py

Lines changed: 8 additions & 4 deletions
@@ -10,6 +10,7 @@
 from azure.storage.filedatalake import DataLakeServiceClient
 from azure.search.documents.indexes import SearchIndexClient
 
+
 key_vault_name = 'kv_to-be-replaced'
 managed_identity_client_id = 'mici_to-be-replaced'
 file_system_client_name = "data"
@@ -58,7 +59,7 @@ def clean_spaces_with_regex(text):
 
 
 def chunk_data(text):
-    tokens_per_chunk = 1024 # 500
+    tokens_per_chunk = 256 # 1024 # 500
     text = clean_spaces_with_regex(text)
 
     sentences = text.split('. ')  # Split text into sentences
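
Note: the full body of chunk_data is not shown in this diff, so the sketch below is only a rough, assumption-laden illustration of sentence-based chunking with the new, smaller budget. It approximates tokens by whitespace-separated words; the real script may count tokens differently and may add overlap.

# Hedged sketch of sentence-based chunking with the new tokens_per_chunk = 256.
# Assumption: "tokens" are approximated by whitespace-separated words.
def chunk_data_sketch(text, tokens_per_chunk=256):
    sentences = text.split('. ')  # split into sentences, as in the diff above
    chunks, current, current_len = [], [], 0
    for sentence in sentences:
        sentence_len = len(sentence.split())
        if current and current_len + sentence_len > tokens_per_chunk:
            chunks.append('. '.join(current))
            current, current_len = [], 0
        current.append(sentence)
        current_len += sentence_len
    if current:
        chunks.append('. '.join(current))
    return chunks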
@@ -115,6 +116,7 @@ def chunk_data(text):
 
 def prepare_search_doc(content, document_id):
     chunks = chunk_data(content)
+    results = []
     chunk_num = 0
     for chunk in chunks:
         chunk_num += 1
@@ -138,7 +140,8 @@ def prepare_search_doc(content, document_id):
             "sourceurl": path.name.split('/')[-1],
             "contentVector": v_contentVector
         }
-        return result
+        results.append(result)
+    return results
 
 
 # conversationIds = []
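
Note: the change above fixes a real bug. The old "return result" sat inside the per-chunk loop, so only the first chunk's document was ever returned and indexed; the function now accumulates one search document per chunk and returns the full list. Below is a minimal sketch of the corrected shape, with chunker and embed passed in as stand-ins for the script's own chunking and embedding calls; field names beyond those visible in the diff are placeholders.

# Sketch only: one search document per chunk, all of them returned.
def prepare_search_doc_sketch(content, document_id, chunker, embed):
    results = []
    for chunk_num, chunk in enumerate(chunker(content), start=1):
        results.append({
            "id": f"{document_id}_{chunk_num}",        # placeholder key scheme
            "content": chunk,
            "contentVector": embed(chunk),             # stand-in for the real embedding call
        })
    return results  # previously the return happened inside the loop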
@@ -163,13 +166,14 @@ def prepare_search_doc(content, document_id):
         page = pdf_reader.pages[page_num]
         text += page.extract_text()
     result = prepare_search_doc(text, document_id)
-    docs.append(result)
+    docs.extend(result)
 
     counter += 1
     if docs != [] and counter % 10 == 0:
         result = search_client.upload_documents(documents=docs)
         docs = []
-        print(f' {str(counter)} uploaded')
 
 if docs != []:
     results = search_client.upload_documents(documents=docs)
+
+print(f'{str(counter)} files processed.')
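
Note: the switch from docs.append(result) to docs.extend(result) in the hunk above follows directly from prepare_search_doc now returning a list. A short illustration with made-up placeholder documents:

# append() would nest the per-file list inside docs, giving upload_documents()
# malformed items; extend() keeps docs as one flat list of documents.
docs = []
result = [{"id": "1_1"}, {"id": "1_2"}]  # placeholder documents for illustration
docs.append(result)   # -> [[{...}, {...}]]  wrong shape
docs = []
docs.extend(result)   # -> [{...}, {...}]    flat list, ready to upload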
