Skip to content

Commit 54ecfcb

Browse files
Merge pull request #297 from microsoft/psl-pylintREsolved
style: Solved PyLint issues
2 parents 83f7b62 + e1c4338 commit 54ecfcb

File tree

2 files changed

+96
-130
lines changed

2 files changed

+96
-130
lines changed
Lines changed: 36 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,30 @@
1-
from azure.keyvault.secrets import SecretClient
1+
from azure.keyvault.secrets import SecretClient
22
from azure.identity import DefaultAzureCredential
33

44
key_vault_name = 'kv_to-be-replaced'
55
managed_identity_client_id = 'mici_to-be-replaced'
66
index_name = "pdf_index"
77

8-
def get_secrets_from_kv(kv_name, secret_name):
    """Retrieve a secret value from Azure Key Vault.

    Args:
        kv_name: Key Vault name (the `<name>` in https://<name>.vault.azure.net/).
        secret_name: Name of the secret to fetch.

    Returns:
        The secret's value as a string.
    """
    key_vault_name = kv_name
    # Authenticates via the user-assigned managed identity configured at module level.
    credential = DefaultAzureCredential(
        managed_identity_client_id=managed_identity_client_id
    )
    secret_client = SecretClient(
        vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
    )
    return secret_client.get_secret(secret_name).value
1319

14-
# Create a secret client object using the credential and Key Vault name
15-
secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
1620

17-
# Retrieve the secret value
18-
return(secret_client.get_secret(secret_name).value)
21+
# Resolve the Azure AI Search connection settings from Key Vault at import time.
search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
2224

23-
# Create the search index
2425
def create_search_index():
    """Create (or update) the Azure AI Search index for PDF chunks.

    Defines the index schema (id, chunk_id, content, sourceurl, contentVector),
    an HNSW vector-search profile, and a semantic configuration, then issues a
    create-or-update request against the search service. Uses the module-level
    `search_endpoint`, `search_key`, and `index_name`.
    """
    # SDK imports are kept function-local, matching the file's existing style.
    from azure.core.credentials import AzureKeyCredential
    from azure.search.documents.indexes import SearchIndexClient
    from azure.search.documents.indexes.models import (
        SimpleField,
        SearchFieldDataType,
        SearchableField,
        SearchField,
        VectorSearch,
        HnswAlgorithmConfiguration,
        VectorSearchProfile,
        SemanticConfiguration,
        SemanticPrioritizedFields,
        SemanticField,
        SemanticSearch,
        SearchIndex,
    )

    search_credential = AzureKeyCredential(search_key)
    index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SimpleField(name="chunk_id", type=SearchFieldDataType.String),
        SearchField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="sourceurl", type=SearchFieldDataType.String),
        SearchField(
            name="contentVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            vector_search_dimensions=1536,  # text-embedding-ada-002 output size
            vector_search_profile_name="myHnswProfile",
        ),
    ]

    # Vector search: a single HNSW algorithm exposed through one profile.
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(name="myHnsw")
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
            )
        ],
    )

    # Semantic ranking: chunk_id as keywords, content as the body field.
    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            keywords_fields=[SemanticField(field_name="chunk_id")],
            content_fields=[SemanticField(field_name="content")],
        ),
    )
    semantic_search = SemanticSearch(configurations=[semantic_config])

    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_search=semantic_search,
    )
    result = index_client.create_or_update_index(index)
    print(f'{result.name} created')


create_search_index()
Lines changed: 60 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,76 @@
1-
import json
21
from azure.core.credentials import AzureKeyCredential
3-
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
4-
from azure.keyvault.secrets import SecretClient
2+
from azure.identity import DefaultAzureCredential
3+
from azure.keyvault.secrets import SecretClient
54
from openai import AzureOpenAI
6-
import pandas as pd
7-
import re
5+
import re
86
import time
7+
import pypdf
8+
from io import BytesIO
9+
from azure.search.documents import SearchClient
10+
from azure.storage.filedatalake import DataLakeServiceClient
11+
from azure.search.documents.indexes import SearchIndexClient
912

1013
# Deployment-time placeholders: the 'to-be-replaced' values are substituted
# with real resource names during provisioning.
key_vault_name = 'kv_to-be-replaced'
managed_identity_client_id = 'mici_to-be-replaced'
file_system_client_name = "data"  # ADLS file-system (container) holding the PDFs
directory = 'pdf'  # directory inside the file system to ingest
1417

1518

16-
1719
def get_secrets_from_kv(kv_name, secret_name):
    """Retrieve a secret value from Azure Key Vault.

    Args:
        kv_name: Key Vault name (the `<name>` in https://<name>.vault.azure.net/).
        secret_name: Name of the secret to fetch.

    Returns:
        The secret's value as a string.
    """
    key_vault_name = kv_name
    credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

    # Create a secret client object using the credential and Key Vault name.
    secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
    return secret_client.get_secret(secret_name).value
2527

2628

27-
# Connection settings pulled from Key Vault once at startup.
search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")  # "gpt-4o-mini"
3436

35-
36-
# Function: Get Embeddings
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
    """Embed *text* with Azure OpenAI's text-embedding-ada-002 model.

    Args:
        text: The text to embed.
        openai_api_base: Azure OpenAI endpoint URL.
        openai_api_version: API version string.
        openai_api_key: API key for the endpoint.

    Returns:
        The embedding vector for the first (only) input.
    """
    model_id = "text-embedding-ada-002"
    client = AzureOpenAI(
        api_version=openai_api_version,
        azure_endpoint=openai_api_base,
        api_key=openai_api_key
    )

    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding

    return embedding
4849

49-
# Function: Clean Spaces with Regex -
50+
51+
# Function: Clean Spaces with Regex -
5052
def clean_spaces_with_regex(text):
    """Normalize whitespace and ellipses in *text*.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and every run of two or more dots into a single dot.
    """
    # Replace multiple whitespace characters with a single space.
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Replace consecutive dots with a single dot.
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    return cleaned_text
5658

59+
5760
def chunk_data(text):
58-
tokens_per_chunk = 1024 #500
61+
tokens_per_chunk = 1024 # 500
5962
text = clean_spaces_with_regex(text)
60-
SENTENCE_ENDINGS = [".", "!", "?"]
61-
WORDS_BREAKS = ['\n', '\t', '}', '{', ']', '[', ')', '(', ' ', ':', ';', ',']
6263

63-
sentences = text.split('. ') # Split text into sentences
64+
sentences = text.split('. ') # Split text into sentences
6465
chunks = []
6566
current_chunk = ''
6667
current_chunk_token_count = 0
67-
68+
6869
# Iterate through each sentence
6970
for sentence in sentences:
7071
# Split sentence into tokens
7172
tokens = sentence.split()
72-
73+
7374
# Check if adding the current sentence exceeds tokens_per_chunk
7475
if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
7576
# Add the sentence to the current chunk
@@ -83,121 +84,92 @@ def chunk_data(text):
8384
chunks.append(current_chunk)
8485
current_chunk = sentence
8586
current_chunk_token_count = len(tokens)
86-
87+
8788
# Add the last chunk
8889
if current_chunk:
8990
chunks.append(current_chunk)
90-
91-
return chunks
9291

93-
from azure.search.documents import SearchClient
94-
from azure.storage.filedatalake import (
95-
DataLakeServiceClient,
96-
DataLakeDirectoryClient,
97-
FileSystemClient
98-
)
92+
return chunks
9993

10094

101-
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
95+
# Connect to the ADLS Gen2 account that holds the source PDFs, then set up
# the Azure AI Search clients used for uploading chunked documents.
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")

account_url = f"https://{account_name}.dfs.core.windows.net"

credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')

file_system_client = service_client.get_file_system_client(file_system_client_name)

directory_name = directory
# Enumerate every path (file) under the pdf directory.
paths = file_system_client.get_paths(path=directory_name)
print(paths)

index_name = "pdf_index"

search_credential = AzureKeyCredential(search_key)

search_client = SearchClient(search_endpoint, index_name, search_credential)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
136114

137115

138-
def prepare_search_doc(content, document_id):
    """Chunk *content* and build a search document with its embedding.

    Args:
        content: Full extracted text of one PDF.
        document_id: Identifier derived from the PDF filename.

    Returns:
        A dict shaped for the pdf_index schema (id, chunk_id, content,
        sourceurl, contentVector).

    NOTE(review): only the document for the LAST chunk is returned — the dict
    built for each earlier chunk is overwritten on the next iteration. Confirm
    whether all chunk documents should be collected and uploaded instead.
    NOTE(review): `path` is read from module scope (the file-iteration loop
    below), not passed in — prefer passing the source filename explicitly.
    """
    chunks = chunk_data(content)
    chunk_num = 0
    for chunk in chunks:
        chunk_num += 1
        chunk_id = document_id + '_' + str(chunk_num).zfill(2)

        try:
            v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
        except Exception as e:
            # Transient failures (e.g. rate limiting): wait and retry once.
            print(f"Error occurred: {e}. Retrying after 30 seconds...")
            time.sleep(30)
            try:
                v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
            except Exception as e:
                # Best-effort: index the chunk without a vector rather than abort.
                print(f"Retry failed: {e}. Setting v_contentVector to an empty list.")
                v_contentVector = []

        result = {
            "id": chunk_id,
            "chunk_id": chunk_id,
            "content": chunk,
            "sourceurl": path.name.split('/')[-1],
            "contentVector": v_contentVector
        }
    return result
161-
142+
143+
162144
# conversationIds = []
docs = []
counter = 0

# Stream each PDF out of ADLS, extract its text, build a search document,
# and upload to the index in batches of 10.
for path in paths:
    file_client = file_system_client.get_file_client(path.name)
    pdf_file = file_client.download_file()

    stream = BytesIO()
    pdf_file.readinto(stream)
    pdf_reader = pypdf.PdfReader(stream)
    filename = path.name.split('/')[-1]
    # Filename convention: assumes '<prefix>_<document_id>.pdf' — TODO confirm.
    document_id = filename.split('_')[1].replace('.pdf', '')

    text = ''
    num_pages = len(pdf_reader.pages)
    for page_num in range(num_pages):

        page = pdf_reader.pages[page_num]
        text += page.extract_text()
    result = prepare_search_doc(text, document_id)
    docs.append(result)

    counter += 1
    # Flush a batch every 10 documents to bound memory and request size.
    if docs != [] and counter % 10 == 0:
        result = search_client.upload_documents(documents=docs)
        docs = []
        print(f' {str(counter)} uploaded')

# Upload any remaining documents from the final partial batch.
if docs != []:
    results = search_client.upload_documents(documents=docs)
200-
201-
202-
203-

0 commit comments

Comments
 (0)