from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
import re
import time
import pypdf
from io import BytesIO
from azure.search.documents import SearchClient
from azure.storage.filedatalake import DataLakeServiceClient
from azure.search.documents.indexes import SearchIndexClient

key_vault_name = 'kv_to-be-replaced'
managed_identity_client_id = 'mici_to-be-replaced'
file_system_client_name = "data"
directory = 'pdf'


def get_secrets_from_kv(kv_name, secret_name):
    # Set the name of the Azure Key Vault
    key_vault_name = kv_name
    credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

    # Create a secret client object using the credential and Key Vault name
    secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
    return secret_client.get_secret(secret_name).value


search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")  # "gpt-4o-mini"


# Function: Get Embeddings
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
    model_id = "text-embedding-ada-002"
    client = AzureOpenAI(
        api_version=openai_api_version,
        azure_endpoint=openai_api_base,
        api_key=openai_api_key
    )

    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding

    return embedding


# Function: Clean Spaces with Regex
def clean_spaces_with_regex(text):
    # Use a regular expression to replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Use a regular expression to replace consecutive dots with a single dot
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    return cleaned_text

def chunk_data(text):
    tokens_per_chunk = 1024  # 500
    text = clean_spaces_with_regex(text)

    sentences = text.split('. ')  # Split text into sentences
    chunks = []
    current_chunk = ''
    current_chunk_token_count = 0

    # Iterate through each sentence
    for sentence in sentences:
        # Split sentence into tokens
        tokens = sentence.split()

        # Check if adding the current sentence exceeds tokens_per_chunk
        if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
            # Add the sentence to the current chunk
            if current_chunk:
                current_chunk += '. ' + sentence
            else:
                current_chunk = sentence
            current_chunk_token_count += len(tokens)
        else:
            # Start a new chunk
            chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_token_count = len(tokens)

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    return chunks
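
# Illustrative behavior of chunk_data (assuming the greedy sentence-packing
# reconstruction above): sentences are packed until the ~1024-token budget is
# exceeded, then a new chunk starts at a sentence boundary, e.g.
#   chunk_data("First point.  Second point... Third point.")
#   -> ['First point. Second point. Third point.']  # cleaned, one chunk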

account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")

account_url = f"https://{account_name}.dfs.core.windows.net"

credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')

file_system_client = service_client.get_file_system_client(file_system_client_name)

directory_name = directory
paths = file_system_client.get_paths(path=directory_name)
print(paths)

index_name = "pdf_index"

search_credential = AzureKeyCredential(search_key)

search_client = SearchClient(search_endpoint, index_name, search_credential)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
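
# The uploads later in this script assume the "pdf_index" index already
# exists. A minimal creation sketch under that assumption (field names mirror
# the documents built in prepare_search_doc below; "hnsw-config" and
# "hnsw-profile" are illustrative names; 1536 is the output dimension of
# text-embedding-ada-002):
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SearchIndex
)

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="chunk_id", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SimpleField(name="sourceurl", type=SearchFieldDataType.String),
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="hnsw-profile"
    ),
]
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="hnsw-config")],
    profiles=[VectorSearchProfile(name="hnsw-profile", algorithm_configuration_name="hnsw-config")]
)
index_client.create_or_update_index(SearchIndex(name=index_name, fields=fields, vector_search=vector_search))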


def prepare_search_doc(content, document_id):
    chunks = chunk_data(content)
    chunk_num = 0
    results = []
    for chunk in chunks:
        chunk_num += 1
        chunk_id = document_id + '_' + str(chunk_num).zfill(2)

        try:
            v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
        except Exception as e:
            print(f"Error occurred: {e}. Retrying after 30 seconds...")
            time.sleep(30)
            try:
                v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
            except Exception as e:
                print(f"Retry failed: {e}. Setting v_contentVector to an empty list.")
                v_contentVector = []

        results.append({
            "id": chunk_id,
            "chunk_id": chunk_id,
            "content": chunk,
            # 'path' is the loop variable of the ingestion loop below
            "sourceurl": path.name.split('/')[-1],
            "contentVector": v_contentVector
        })

    # Return one search document per chunk so no chunk is dropped
    return results
# conversationIds = []
docs = []
counter = 0

for path in paths:
    file_client = file_system_client.get_file_client(path.name)
    pdf_file = file_client.download_file()

    stream = BytesIO()
    pdf_file.readinto(stream)
    pdf_reader = pypdf.PdfReader(stream)
    filename = path.name.split('/')[-1]
    # Filenames are assumed to look like '<prefix>_<id>.pdf'
    document_id = filename.split('_')[1].replace('.pdf', '')

    text = ''
    num_pages = len(pdf_reader.pages)
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

    # One search document per chunk
    docs.extend(prepare_search_doc(text, document_id))

    counter += 1
    # Upload in batches, roughly every 10 source files
    if docs != [] and counter % 10 == 0:
        result = search_client.upload_documents(documents=docs)
        docs = []
        print(f' {str(counter)} uploaded')

if docs != []:
    results = search_client.upload_documents(documents=docs)
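
# Optional sanity check of what was just indexed (an illustrative sketch):
# embed a sample question and run a vector query against the contentVector
# field. Assumes azure-search-documents >= 11.4; the question is hypothetical.
from azure.search.documents.models import VectorizedQuery

sample_question = "What does the contract say about termination?"  # hypothetical
sample_vector = get_embeddings(sample_question, openai_api_base, openai_api_version, openai_api_key)
hits = search_client.search(
    search_text=None,
    vector_queries=[VectorizedQuery(vector=sample_vector, k_nearest_neighbors=3, fields="contentVector")],
    select=["chunk_id", "content", "sourceurl"]
)
for hit in hits:
    print(hit["chunk_id"], hit["sourceurl"])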