Skip to content

Commit 627a82e

Browse files
author
Harmanpreet Kaur
committed
Resolved pylint issues
1 parent 9f3452a commit 627a82e

File tree

2 files changed

+93
-171
lines changed

2 files changed

+93
-171
lines changed
Lines changed: 36 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,30 @@
1-
from azure.keyvault.secrets import SecretClient
1+
from azure.keyvault.secrets import SecretClient
22
from azure.identity import DefaultAzureCredential
33

44
key_vault_name = 'kv_to-be-replaced'
55
managed_identity_client_id = 'mici_to-be-replaced'
66
index_name = "pdf_index"
77

8-
def get_secrets_from_kv(kv_name, secret_name):
98

10-
# Set the name of the Azure Key Vault
11-
key_vault_name = kv_name
12-
credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)
9+
def get_secrets_from_kv(kv_name, secret_name):
10+
"""Retrieve a secret from Azure Key Vault."""
11+
key_vault_name = kv_name
12+
credential = DefaultAzureCredential(
13+
managed_identity_client_id=managed_identity_client_id
14+
)
15+
secret_client = SecretClient(
16+
vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
17+
)
18+
return secret_client.get_secret(secret_name).value
1319

14-
# Create a secret client object using the credential and Key Vault name
15-
secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
1620

17-
# Retrieve the secret value
18-
return(secret_client.get_secret(secret_name).value)
21+
search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
22+
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
1923

20-
search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
21-
search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")
2224

23-
# Create the search index
2425
def create_search_index():
25-
from azure.core.credentials import AzureKeyCredential
26-
search_credential = AzureKeyCredential(search_key)
27-
26+
"""Create an Azure Search index."""
27+
from azure.core.credentials import AzureKeyCredential
2828
from azure.search.documents.indexes import SearchIndexClient
2929
from azure.search.documents.indexes.models import (
3030
SimpleField,
@@ -38,61 +38,55 @@ def create_search_index():
3838
SemanticPrioritizedFields,
3939
SemanticField,
4040
SemanticSearch,
41-
SearchIndex
41+
SearchIndex,
4242
)
4343

44-
# Create a search index
44+
search_credential = AzureKeyCredential(search_key)
4545
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
4646

47-
# fields = [
48-
# SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
49-
# SearchableField(name="chunk_id", type=SearchFieldDataType.String),
50-
# SearchableField(name="content", type=SearchFieldDataType.String),
51-
# SearchableField(name="sourceurl", type=SearchFieldDataType.String),
52-
# SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
53-
# searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
54-
# ]
55-
5647
fields = [
5748
SimpleField(name="id", type=SearchFieldDataType.String, key=True),
5849
SimpleField(name="chunk_id", type=SearchFieldDataType.String),
5950
SearchField(name="content", type=SearchFieldDataType.String),
6051
SearchableField(name="sourceurl", type=SearchFieldDataType.String),
61-
SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), \
62-
vector_search_dimensions=1536,vector_search_profile_name="myHnswProfile"
63-
)
52+
SearchField(
53+
name="contentVector",
54+
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
55+
vector_search_dimensions=1536,
56+
vector_search_profile_name="myHnswProfile",
57+
),
6458
]
6559

66-
# Configure the vector search configuration
6760
vector_search = VectorSearch(
6861
algorithms=[
69-
HnswAlgorithmConfiguration(
70-
name="myHnsw"
71-
)
62+
HnswAlgorithmConfiguration(name="myHnsw")
7263
],
7364
profiles=[
7465
VectorSearchProfile(
7566
name="myHnswProfile",
7667
algorithm_configuration_name="myHnsw",
7768
)
78-
]
69+
],
7970
)
8071

8172
semantic_config = SemanticConfiguration(
8273
name="my-semantic-config",
8374
prioritized_fields=SemanticPrioritizedFields(
8475
keywords_fields=[SemanticField(field_name="chunk_id")],
85-
content_fields=[SemanticField(field_name="content")]
86-
)
76+
content_fields=[SemanticField(field_name="content")],
77+
),
8778
)
8879

89-
# Create the semantic settings with the configuration
9080
semantic_search = SemanticSearch(configurations=[semantic_config])
9181

92-
# Create the search index with the semantic settings
93-
index = SearchIndex(name=index_name, fields=fields,
94-
vector_search=vector_search, semantic_search=semantic_search)
82+
index = SearchIndex(
83+
name=index_name,
84+
fields=fields,
85+
vector_search=vector_search,
86+
semantic_search=semantic_search,
87+
)
9588
result = index_client.create_or_update_index(index)
96-
print(f' {result.name} created')
89+
print(f'{result.name} created')
90+
9791

98-
create_search_index()
92+
create_search_index()
Lines changed: 57 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -1,203 +1,131 @@
1-
import json
21
from azure.core.credentials import AzureKeyCredential
3-
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
4-
from azure.keyvault.secrets import SecretClient
2+
from azure.identity import DefaultAzureCredential
3+
from azure.keyvault.secrets import SecretClient
54
from openai import AzureOpenAI
6-
import pandas as pd
7-
import re
5+
import re
86
import time
7+
from azure.search.documents import SearchClient
8+
from azure.storage.filedatalake import DataLakeServiceClient
9+
import pypdf
10+
from io import BytesIO
11+
from azure.search.documents.indexes import SearchIndexClient
912

1013
key_vault_name = 'kv_to-be-replaced'
1114
managed_identity_client_id = 'mici_to-be-replaced'
12-
file_system_client_name = "data"
15+
file_system_client_name = "data"
1316
directory = 'pdf'
1417

1518

16-
1719
def get_secrets_from_kv(kv_name, secret_name):
18-
# Set the name of the Azure Key Vault
19-
key_vault_name = kv_name
2020
credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)
21+
secret_client = SecretClient(vault_url=f"https://{kv_name}.vault.azure.net/", credential=credential)
22+
return secret_client.get_secret(secret_name).value
2123

22-
# Create a secret client object using the credential and Key Vault name
23-
secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
24-
return(secret_client.get_secret(secret_name).value)
2524

25+
search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
26+
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
27+
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
28+
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
29+
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
30+
deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")
2631

27-
search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
28-
search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")
2932

30-
openai_api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
31-
openai_api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
32-
openai_api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
33-
deployment = get_secrets_from_kv(key_vault_name,"AZURE-OPEN-AI-DEPLOYMENT-MODEL") #"gpt-4o-mini"
34-
35-
36-
# Function: Get Embeddings
37-
def get_embeddings(text: str,openai_api_base,openai_api_version,openai_api_key):
33+
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
3834
model_id = "text-embedding-ada-002"
3935
client = AzureOpenAI(
4036
api_version=openai_api_version,
4137
azure_endpoint=openai_api_base,
42-
api_key = openai_api_key
38+
api_key=openai_api_key
4339
)
44-
45-
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
40+
return client.embeddings.create(input=text, model=model_id).data[0].embedding
4641

47-
return embedding
4842

49-
# Function: Clean Spaces with Regex -
5043
def clean_spaces_with_regex(text):
51-
# Use a regular expression to replace multiple spaces with a single space
5244
cleaned_text = re.sub(r'\s+', ' ', text)
53-
# Use a regular expression to replace consecutive dots with a single dot
5445
cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
5546
return cleaned_text
5647

48+
5749
def chunk_data(text):
58-
tokens_per_chunk = 1024 #500
50+
tokens_per_chunk = 1024
5951
text = clean_spaces_with_regex(text)
60-
SENTENCE_ENDINGS = [".", "!", "?"]
61-
WORDS_BREAKS = ['\n', '\t', '}', '{', ']', '[', ')', '(', ' ', ':', ';', ',']
62-
63-
sentences = text.split('. ') # Split text into sentences
52+
sentences = text.split('. ')
6453
chunks = []
6554
current_chunk = ''
6655
current_chunk_token_count = 0
67-
68-
# Iterate through each sentence
56+
6957
for sentence in sentences:
70-
# Split sentence into tokens
7158
tokens = sentence.split()
72-
73-
# Check if adding the current sentence exceeds tokens_per_chunk
7459
if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
75-
# Add the sentence to the current chunk
7660
if current_chunk:
7761
current_chunk += '. ' + sentence
7862
else:
7963
current_chunk += sentence
8064
current_chunk_token_count += len(tokens)
8165
else:
82-
# Add current chunk to chunks list and start a new chunk
8366
chunks.append(current_chunk)
8467
current_chunk = sentence
8568
current_chunk_token_count = len(tokens)
86-
87-
# Add the last chunk
69+
8870
if current_chunk:
8971
chunks.append(current_chunk)
90-
91-
return chunks
92-
93-
from azure.search.documents import SearchClient
94-
from azure.storage.filedatalake import (
95-
DataLakeServiceClient,
96-
DataLakeDirectoryClient,
97-
FileSystemClient
98-
)
9972

73+
return chunks
10074

101-
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
10275

76+
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
10377
account_url = f"https://{account_name}.dfs.core.windows.net"
104-
10578
credential = DefaultAzureCredential()
106-
service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
107-
108-
file_system_client = service_client.get_file_system_client(file_system_client_name)
109-
110-
directory_name = directory
111-
paths = file_system_client.get_paths(path=directory_name)
112-
print(paths)
113-
79+
service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')
80+
file_system_client = service_client.get_file_system_client(file_system_client_name)
81+
paths = file_system_client.get_paths(path=directory)
11482
index_name = "pdf_index"
115-
116-
117-
from azure.search.documents.indexes import SearchIndexClient
118-
from azure.search.documents.indexes.models import (
119-
SimpleField,
120-
SearchFieldDataType,
121-
SearchableField,
122-
SearchField,
123-
VectorSearch,
124-
HnswAlgorithmConfiguration,
125-
VectorSearchProfile,
126-
SemanticConfiguration,
127-
SemanticPrioritizedFields,
128-
SemanticField,
129-
SemanticSearch,
130-
SearchIndex
131-
)
13283
search_credential = AzureKeyCredential(search_key)
133-
13484
search_client = SearchClient(search_endpoint, index_name, search_credential)
13585
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
13686

13787

138-
def prepare_search_doc(content, document_id):
88+
def prepare_search_doc(content, document_id):
13989
chunks = chunk_data(content)
140-
chunk_num = 0
141-
for chunk in chunks:
142-
chunk_num += 1
143-
chunk_id = document_id + '_' + str(chunk_num).zfill(2)
144-
90+
docs = []
91+
for chunk_num, chunk in enumerate(chunks, start=1):
92+
chunk_id = f"{document_id}_{str(chunk_num).zfill(2)}"
14593
try:
146-
v_contentVector = get_embeddings(str(chunk),openai_api_base,openai_api_version,openai_api_key)
147-
except:
94+
v_contentVector = get_embeddings(chunk, openai_api_base, openai_api_version, openai_api_key)
95+
except Exception:
14896
time.sleep(30)
149-
try:
150-
v_contentVector = get_embeddings(str(chunk),openai_api_base,openai_api_version,openai_api_key)
151-
except:
97+
try:
98+
v_contentVector = get_embeddings(chunk, openai_api_base, openai_api_version, openai_api_key)
99+
except Exception:
152100
v_contentVector = []
153101
result = {
154-
"id": chunk_id,
155-
"chunk_id": chunk_id,
156-
"content": chunk,
157-
"sourceurl": path.name.split('/')[-1],
158-
"contentVector": v_contentVector
159-
}
160-
return result
161-
162-
# conversationIds = []
102+
"id": chunk_id,
103+
"chunk_id": chunk_id,
104+
"content": chunk,
105+
"sourceurl": path.name.split('/')[-1],
106+
"contentVector": v_contentVector
107+
}
108+
docs.append(result)
109+
return docs
110+
111+
163112
docs = []
164113
counter = 0
165-
from datetime import datetime, timedelta
166-
import pypdf
167-
from io import BytesIO
168-
169114
for path in paths:
170115
file_client = file_system_client.get_file_client(path.name)
171116
pdf_file = file_client.download_file()
172-
173117
stream = BytesIO()
174118
pdf_file.readinto(stream)
175119
pdf_reader = pypdf.PdfReader(stream)
176120
filename = path.name.split('/')[-1]
177-
document_id = filename.split('_')[1].replace('.pdf','')
178-
179-
180-
text = ''
181-
num_pages = len(pdf_reader.pages)
182-
for page_num in range(num_pages):
183-
184-
page = pdf_reader.pages[page_num]
185-
text += page.extract_text()
186-
187-
188-
189-
result = prepare_search_doc(text, document_id)
190-
docs.append(result)
191-
121+
document_id = filename.split('_')[1].replace('.pdf', '')
122+
text = ''.join(page.extract_text() for page in pdf_reader.pages)
123+
docs.extend(prepare_search_doc(text, document_id))
192124
counter += 1
193-
if docs != [] and counter % 10 == 0:
194-
result = search_client.upload_documents(documents=docs)
125+
if docs and counter % 10 == 0:
126+
search_client.upload_documents(documents=docs)
195127
docs = []
196-
print(f' {str(counter)} uploaded')
197-
198-
if docs != []:
199-
results = search_client.upload_documents(documents=docs)
200-
201-
128+
print(f'{counter} uploaded')
202129

203-
130+
if docs:
131+
search_client.upload_documents(documents=docs)

0 commit comments

Comments (0)