from openai import AzureOpenAI
import re
import time
import pypdf
from io import BytesIO
from azure.search.documents import SearchClient
from azure.storage.filedatalake import DataLakeServiceClient
from azure.search.documents.indexes import SearchIndexClient
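# Credential and secret helpers used below; if they are already imported at the top of the
# script these lines are redundant but harmless.
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.core.credentials import AzureKeyCredential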

key_vault_name = 'kv_to-be-replaced'

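# managed_identity_client_id, file_system_client_name and directory are assumed to be defined
# alongside this configuration (managed identity client id, ADLS container name and source folder).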
def get_secrets_from_kv(kv_name, secret_name):
    # Set the name of the Azure Key Vault
    key_vault_name = kv_name
    credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

    # Create a secret client object using the credential and Key Vault name
    secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
    return secret_client.get_secret(secret_name).value


search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
deployment = get_secrets_from_kv(key_vault_name, "AZURE-OPEN-AI-DEPLOYMENT-MODEL")  # "gpt-4o-mini"


# Function: Get Embeddings
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
    model_id = "text-embedding-ada-002"
    client = AzureOpenAI(
        api_version=openai_api_version,
        azure_endpoint=openai_api_base,
        api_key=openai_api_key
    )

    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding

    return embedding

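# text-embedding-ada-002 returns 1536-dimensional vectors, so the contentVector field of the
# target search index must be defined with the same dimensionality.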

# Function: Clean Spaces with Regex
def clean_spaces_with_regex(text):
    # Use a regular expression to replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Use a regular expression to replace consecutive dots with a single dot
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    return cleaned_text


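# Note: chunk_data counts whitespace-separated words as "tokens", which only approximates the
# embedding model's tokenizer, so chunks are capped at roughly 1024 words rather than exact tokens.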
def chunk_data(text):
    tokens_per_chunk = 1024  # 500
    text = clean_spaces_with_regex(text)

    sentences = text.split('. ')  # Split text into sentences
    chunks = []
    current_chunk = ''
    current_chunk_token_count = 0

    # Iterate through each sentence
    for sentence in sentences:
        # Split sentence into tokens
        tokens = sentence.split()

        # Check if adding the current sentence exceeds tokens_per_chunk
        if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
            # Add the sentence to the current chunk
            if current_chunk:
                current_chunk += '. ' + sentence
            else:
                current_chunk += sentence
            current_chunk_token_count += len(tokens)
        else:
            # Add current chunk to chunks list and start a new chunk
            chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_token_count = len(tokens)

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

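# Connect to Azure Data Lake Storage, list the files to index, and set up the Azure AI Search clients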
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")

account_url = f"https://{account_name}.dfs.core.windows.net"

credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(account_url, credential=credential, api_version='2023-01-03')

file_system_client = service_client.get_file_system_client(file_system_client_name)

directory_name = directory
paths = file_system_client.get_paths(path=directory_name)
print(paths)

index_name = "pdf_index"

search_credential = AzureKeyCredential(search_key)

search_client = SearchClient(search_endpoint, index_name, search_credential)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

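# Function: Prepare Search Documents
# Builds one search document per chunk of the input text: chunk text, a per-chunk id,
# the source file name and the content embedding vector.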
def prepare_search_doc(content, document_id):
    chunks = chunk_data(content)
    docs = []
    chunk_num = 0
    for chunk in chunks:
        chunk_num += 1
        chunk_id = document_id + '_' + str(chunk_num).zfill(2)

        try:
            v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
        except Exception as e:
            print(f"Error occurred: {e}. Retrying after 30 seconds...")
            time.sleep(30)
            try:
                v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
            except Exception as e:
                print(f"Retry failed: {e}. Setting v_contentVector to an empty list.")
                v_contentVector = []

        result = {
            "id": chunk_id,
            "chunk_id": chunk_id,
            "content": chunk,
            "sourceurl": path.name.split('/')[-1],  # file name of the PDF being processed (loop variable `path`)
            "contentVector": v_contentVector
        }
        docs.append(result)

    return docs

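# Main loop: download each PDF from ADLS, extract its text, chunk and embed it, and upload
# the resulting documents to Azure AI Search in batches (flushed every 10 files).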
# conversationIds = []
docs = []
counter = 0

for path in paths:
    file_client = file_system_client.get_file_client(path.name)
    pdf_file = file_client.download_file()

    stream = BytesIO()
    pdf_file.readinto(stream)
    pdf_reader = pypdf.PdfReader(stream)
    filename = path.name.split('/')[-1]
    document_id = filename.split('_')[1].replace('.pdf', '')  # assumes file names like "<prefix>_<id>.pdf"

    text = ''
    num_pages = len(pdf_reader.pages)
    for page_num in range(num_pages):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

    docs.extend(prepare_search_doc(text, document_id))

    counter += 1
    if docs and counter % 10 == 0:
        search_client.upload_documents(documents=docs)
        docs = []
        print(f'{counter} uploaded')

if docs:
    search_client.upload_documents(documents=docs)
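
# Hedged sketch, not called by this script: the upload above assumes the "pdf_index" index
# already exists. If it does not, a minimal definition matching the fields uploaded here could
# be created with index_client along these lines (class names assume azure-search-documents
# >= 11.4; the real index schema may differ).
def create_pdf_index_sketch():
    from azure.search.documents.indexes.models import (
        SearchIndex,
        SimpleField,
        SearchableField,
        SearchField,
        SearchFieldDataType,
        VectorSearch,
        VectorSearchProfile,
        HnswAlgorithmConfiguration,
    )

    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SimpleField(name="chunk_id", type=SearchFieldDataType.String),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SimpleField(name="sourceurl", type=SearchFieldDataType.String),
        SearchField(
            name="contentVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,  # matches text-embedding-ada-002
            vector_search_profile_name="vector-profile",
        ),
    ]
    vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="hnsw-config")],
        profiles=[VectorSearchProfile(name="vector-profile", algorithm_configuration_name="hnsw-config")],
    )
    index_client.create_or_update_index(SearchIndex(name=index_name, fields=fields, vector_search=vector_search))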