-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindexation.py
More file actions
208 lines (169 loc) · 7.49 KB
/
indexation.py
File metadata and controls
208 lines (169 loc) · 7.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
import json
import pathlib
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
SearchableField,
SearchField,
SimpleField,
SearchFieldDataType,
SemanticField,
VectorSearchProfile,
SemanticSettings,
VectorSearchAlgorithmKind,
HnswVectorSearchAlgorithmConfiguration,
SemanticConfiguration,
SearchIndex,
PrioritizedFields,
VectorSearch,
HnswParameters,
)
# Load the pre-trained SentenceTransformer model once at import time; every
# embedding produced in this module is computed with this instance.
MODEL = SentenceTransformer('all-MiniLM-L6-v2')

# Load environment variables from the .env file (override=True is passed so the
# .env values are applied even when a variable is already set in the process).
load_dotenv(override=True)

# Fetch the Azure service endpoint, index name, and API key from the environment.
# "SEACRH_ENDPOINT" is the historical, misspelled variable name: it is still read
# first so existing .env files keep working, and the correctly spelled
# "SEARCH_ENDPOINT" is accepted as a fallback when the legacy name is not set.
search_endpoint = os.environ.get("SEACRH_ENDPOINT") or os.environ.get("SEARCH_ENDPOINT")
index_name = os.environ.get("INDEX_NAME")
api_key = os.environ.get("API_KEY")

# Shared credential object used by every Azure client created in this module.
credential = AzureKeyCredential(api_key)
def creating_index_from_excel(file: pd.DataFrame, knowledge_base: list) -> list:
    """
    Build knowledge-base documents from the rows of an Excel-derived DataFrame.

    Every row whose 'DEF' cell is present becomes one document holding a
    sequential string id, the label, the definition, and the embedding of
    "<label>: <definition>" computed with the module-level MODEL. Rows whose
    'DEF' cell is missing are skipped and do not consume an id.

    Args:
        file (pd.DataFrame): The DataFrame containing the data loaded from Excel.
            It must provide the 'LIBELLE' and 'DEF' columns.
        knowledge_base (list): The list to which each document will be appended.
            It is mutated in place.

    Returns:
        list: The same knowledge base list, updated with the generated documents.
    """
    # Ids always start at 0 for each call, whatever knowledge_base already holds.
    doc_id = 0
    for _, row in file.iterrows():
        definition = row["DEF"]
        # pd.isna recognises real missing values; comparing the stringified cell
        # to "nan" would also discard a definition whose text is literally "nan".
        if pd.isna(definition):
            continue

        # A missing label is stored as an empty string, and the same cleaned value
        # is embedded so the vector is never computed from the text "nan: ...".
        label = "" if pd.isna(row["LIBELLE"]) else row["LIBELLE"]

        knowledge_base.append({
            "id": f'{doc_id}',
            "Label": label,
            "Definition": definition,
            "Label_def_vector": MODEL.encode(f'{label}: {definition}').tolist(),
        })
        doc_id += 1
    return knowledge_base
def knowledge_base_to_json(knowledge_base):
    """
    Save each document in the knowledge base to a separate JSON file.

    Files are written to the 'eurovoc' directory under the current working
    directory and are named 'eurovoc_<n>.json'. The directory is created when it
    does not exist yet, so a first run no longer fails with FileNotFoundError.

    Args:
        knowledge_base (list): The list of documents to be saved as JSON files.
            Each document must be JSON-serialisable.
    """
    output_dir = os.path.join(os.getcwd(), 'eurovoc')
    os.makedirs(output_dir, exist_ok=True)

    # Numbering deliberately starts at 2: that is what this function has always
    # produced, and keeping it means a re-run overwrites the files of an earlier
    # run instead of leaving a differently numbered set next to them.
    for file_number, doc in enumerate(knowledge_base, start=2):
        file_path = os.path.join(output_dir, f'eurovoc_{file_number}.json')
        # Write document data to JSON file
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(doc, json_file)
def create_or_update_index_on_Azure():
    """
    Create or update the Azure AI Search index used by this module.

    The index has a string key, two searchable text fields, and one vector
    field. It also carries an HNSW vector search configuration and a semantic
    configuration. The definition is submitted to the service named by the
    module-level endpoint and credential, and the resulting index name is printed.
    """
    # Vector search: a single HNSW algorithm and the profile that points at it.
    hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
        name="myHnsw",
        kind=VectorSearchAlgorithmKind.HNSW,
        parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric="cosine"),
    )
    hnsw_profile = VectorSearchProfile(name="myHnswProfile", algorithm="myHnsw")
    vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

    # Index schema. The vector field refers to the profile above by name.
    # NOTE(review): 384 presumably matches the embedding size of the model used
    # to fill "Label_def_vector" - confirm if the model is ever changed.
    id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True, retrievable=True)
    label_field = SearchableField(
        name="Label", type=SearchFieldDataType.String, filterable=True, retrievable=True
    )
    definition_field = SearchableField(
        name="Definition", type=SearchFieldDataType.String, filterable=True, retrievable=True
    )
    vector_field = SearchField(
        name="Label_def_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=384,
        vector_search_profile="myHnswProfile",
    )

    # Semantic configuration: "Label" is both title and keyword field,
    # "Definition" is the content field.
    prioritized = PrioritizedFields(
        title_field=SemanticField(field_name="Label"),
        prioritized_content_fields=[SemanticField(field_name="Definition")],
        prioritized_keywords_fields=[SemanticField(field_name="Label")],
    )
    semantic_settings = SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name="eurovoc-poc-semantic-tagging",
                prioritized_fields=prioritized,
            )
        ]
    )

    # Assemble the index definition and send it to Azure.
    index_definition = SearchIndex(
        name=index_name,
        fields=[id_field, label_field, definition_field, vector_field],
        vector_search=vector_search,
        semantic_settings=semantic_settings,
    )
    client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
    created = client.create_or_update_index(index_definition)
    print(f" {created.name} created")
def get_search_client():
    """
    Build a SearchClient for the index configured in the environment.

    Returns:
        SearchClient: A client bound to the module-level endpoint, index name,
        and credential.
    """
    client = SearchClient(
        endpoint=search_endpoint,
        index_name=index_name,
        credential=credential,
    )
    return client
def upload_index_to_Azure(folder_path: str):
    """
    Upload JSON documents from a specified folder to Azure AI Search.

    Every entry of the folder is read as one JSON document and uploaded on its
    own, so one bad file does not stop the others. An entry counts as failed when
    it cannot be read or parsed, when the upload call raises, or when the service
    reports the document as not succeeded. The failed paths are printed at the end.

    Args:
        folder_path (str): The path to the folder containing the JSON files to upload.
    """
    search_client = get_search_client()
    failed_to_upload = []  # Paths of the entries that could not be uploaded

    for entry_path in pathlib.Path(folder_path).glob("*"):
        print(entry_path)
        try:
            with open(entry_path, encoding="utf-8") as json_file:
                document = json.load(json_file)
            print(f"Uploading {document['id']}")
            results = search_client.upload_documents(documents=[document])
            # The call can return normally while the service rejected the
            # document; the per-document result carries that outcome.
            for result in results:
                if not result.succeeded:
                    raise RuntimeError(
                        f"Document {result.key} was rejected: {result.error_message}"
                    )
        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so any failure
            # is reported and recorded and the loop moves on to the next entry.
            print(entry_path)
            print(e)
            failed_to_upload.append(entry_path)

    print(failed_to_upload)
def create_index():
    """
    Run the whole indexing pipeline from the Excel source to the Azure index.

    Steps, in order: read the 'desc_en' sheet of 'EuroVoc.xlsx' from the current
    working directory, turn its rows into embedded documents, write them as JSON
    files, create or update the Azure search index, and upload the JSON files
    found in the 'eurovoc' folder of the current working directory.
    """
    working_dir = os.getcwd()
    excel_path = os.path.join(working_dir, 'EuroVoc.xlsx')
    documents_dir = os.path.join(working_dir, 'eurovoc')

    # Read the source spreadsheet
    eurovoc_frame = pd.read_excel(excel_path, sheet_name="desc_en")

    # Build the documents and persist them as JSON
    print("DATA PREPARATION: create embedded_index")
    documents = creating_index_from_excel(eurovoc_frame, [])
    knowledge_base_to_json(documents)

    # Make sure the index exists with the expected schema
    print("DATA PREPARATION: create_or_update_index_on_azure")
    create_or_update_index_on_Azure()

    # Push the JSON documents to the index
    print("DATA PREPARATION: upload_index_to_azure")
    upload_index_to_Azure(documents_dir)
# Run the full indexing pipeline only when this file is executed as a script.
if __name__ == "__main__":
    create_index()