
Commit 6754f9e

Merge branch 'main' into feature/text2sql-modulisation

2 parents: 1e43911 + 2edb05c

27 files changed: +368 −514 lines

deploy_ai_search/.env

Lines changed: 1 addition & 0 deletions

```diff
@@ -15,6 +15,7 @@ StorageAccount__FQEndpoint=<Fully qualified endpoint in form ResourceId=resource
 StorageAccount__ConnectionString=<connectionString if using non managed identity. In format: DefaultEndpointsProtocol=https;AccountName=<STG NAME>;AccountKey=<ACCOUNT KEY>;EndpointSuffix=core.windows.net>
 StorageAccount__RagDocuments__Container=<containerName>
 StorageAccount__Text2SqlSchemaStore__Container=<containerName>
+StorageAccount__Text2SqlQueryCache__Container=<containerName>
 OpenAI__ApiKey=<openAIKey if using non managed identity>
 OpenAI__Endpoint=<openAIEndpoint>
 OpenAI__EmbeddingModel=<openAIEmbeddingModelName>
```
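For context, the deployment scripts presumably read these flat, double-underscore-named settings straight from the environment. A minimal sketch of consuming the new value, assuming the scripts load `.env` via `python-dotenv` (the variable name comes from the diff above; everything else is illustrative):

```python
import os

from dotenv import load_dotenv  # assumes python-dotenv is available

load_dotenv()

# The double-underscore names are flat environment variables, not nested config.
# This is the container that will hold the query cache JSON blobs.
query_cache_container = os.environ["StorageAccount__Text2SqlQueryCache__Container"]
print(f"Query cache blobs will be read from: {query_cache_container}")
```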

deploy_ai_search/README.md

Lines changed: 4 additions & 2 deletions

```diff
@@ -24,17 +24,19 @@ The associated scripts in this portion of the repository contains pre-built scri
    - `index_type text_2_sql_schema_store`. This selects the `Text2SQLSchemaStoreAISearch` sub class.
    - `rebuild`. Whether to delete and rebuild the index.
    - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
-   - `single_data_dictionary`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.
+   - `single_data_dictionary_file`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.
 
 ### Query Cache Index
 
 1. Update the `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key-based authentication.
-2. Adjust `text_2_sql_query_cache.py` with any changes to the index. **There is no provided indexer or skillset for this cache; it is expected that application code will write directly to it. See the details in the Text2SQL README for different cache strategies.**
+2. Adjust `text_2_sql_query_cache.py` with any changes to the index. **An optional indexer and skillset are provided for this cache. You may instead want your application code to write directly to it. See the details in the Text2SQL README for different cache strategies.**
 3. Run `deploy.py` with the following args:
 
    - `index_type text_2_sql_query_cache`. This selects the `Text2SQLQueryCacheAISearch` sub class.
    - `rebuild`. Whether to delete and rebuild the index.
    - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want to deploy a test version before overwriting the main version.
+   - `enable_query_cache_indexer`. Optional parameter that will enable the query cache indexer. Defaults to False.
+   - `single_query_cache_file`. Optional parameter that controls whether you will be uploading a single query cache file, or a cache file per question. By default, this is set to False.
 
 ## ai_search.py & environment.py
 
```
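To illustrate the "write directly to it" cache strategy mentioned in step 2, here is a hypothetical sketch using the `azure-search-documents` SDK. The field names mirror the index definition in `text_2_sql_query_cache.py` below; the endpoint, key, index name, and embedding are placeholders, not values from this commit:

```python
import base64

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Endpoint, key, and index name are placeholders: use your deployed service and
# the cache index name (plus any suffix) created by deploy.py.
search_client = SearchClient(
    endpoint="https://<search-service>.search.windows.net",
    index_name="text-2-sql-query-cache-index",
    credential=AzureKeyCredential("<admin-key>"),
)

question = "How many orders were placed last month?"

cache_entry = {
    # Approximates the indexer's base64Encode field mapping (Question -> Id) so
    # direct writes and indexer-ingested documents agree on document keys.
    "Id": base64.urlsafe_b64encode(question.encode()).decode(),
    "Question": question,
    "QuestionEmbedding": [0.0] * 1536,  # placeholder; supply your embedding model's output
    "SqlQueryDecomposition": [
        {
            "SubQuestion": "Count orders placed in the previous calendar month",
            "SqlQuery": "SELECT COUNT(*) AS OrderCount FROM Orders",
        }
    ],
}

search_client.merge_or_upload_documents(documents=[cache_entry])
```

If you mix both write paths, it is worth verifying the exact padding behavior of the service-side `base64Encode` mapping against keys produced this way.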

deploy_ai_search/deploy.py

Lines changed: 18 additions & 3 deletions

```diff
@@ -24,11 +24,14 @@ def deploy_config(arguments: argparse.Namespace):
         index_config = Text2SqlSchemaStoreAISearch(
             suffix=arguments.suffix,
             rebuild=arguments.rebuild,
-            single_data_dictionary=arguments.single_data_dictionary,
+            single_data_dictionary_file=arguments.single_data_dictionary_file,
         )
     elif arguments.index_type == "text_2_sql_query_cache":
         index_config = Text2SqlQueryCacheAISearch(
-            suffix=arguments.suffix, rebuild=arguments.rebuild
+            suffix=arguments.suffix,
+            rebuild=arguments.rebuild,
+            single_query_cache_file=arguments.single_query_cache_file,
+            enable_query_cache_indexer=arguments.enable_query_cache_indexer,
         )
     else:
         raise ValueError("Invalid Indexer Type")
@@ -60,11 +63,23 @@ def deploy_config(arguments: argparse.Namespace):
         help="Whether want to enable chunking by page in adi skill, if no value is passed considered False",
     )
     parser.add_argument(
-        "--single_data_dictionary",
+        "--single_data_dictionary_file",
         type=bool,
         required=False,
         help="Whether or not a single data dictionary file should be uploaded, or one per entity",
     )
+    parser.add_argument(
+        "--single_query_cache_file",
+        type=bool,
+        required=False,
+        help="Whether or not a single cache file should be uploaded, or one per question",
+    )
+    parser.add_argument(
+        "--enable_query_cache_indexer",
+        type=bool,
+        required=False,
+        help="Whether or not the sql query cache indexer should be enabled",
+    )
     parser.add_argument(
         "--suffix",
         type=str,
```
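One caveat worth flagging about the new flags (an editorial observation, not part of this commit): `argparse` with `type=bool` converts the raw string, and `bool("False")` is `True`, so `--single_query_cache_file False` would still evaluate truthy. A common workaround is a small string-to-bool converter, sketched here:

```python
import argparse


def str2bool(value: str) -> bool:
    """Parse common true/false spellings; plain bool() treats any non-empty string as True."""
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_query_cache_indexer",
    type=str2bool,
    default=False,
    required=False,
    help="Whether or not the sql query cache indexer should be enabled",
)

print(parser.parse_args(["--enable_query_cache_indexer", "False"]))
# Namespace(enable_query_cache_indexer=False)
```

Alternatively, `action="store_true"` avoids taking a value at all.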

deploy_ai_search/text_2_sql_query_cache.py

Lines changed: 127 additions & 3 deletions

```diff
@@ -5,12 +5,20 @@
     SearchFieldDataType,
     SearchField,
     SearchableField,
-    SimpleField,
-    ComplexField,
     SemanticField,
     SemanticPrioritizedFields,
     SemanticConfiguration,
     SemanticSearch,
+    SearchIndexer,
+    FieldMapping,
+    SimpleField,
+    ComplexField,
+    IndexingParameters,
+    IndexingParametersConfiguration,
+    BlobIndexerDataToExtract,
+    IndexerExecutionEnvironment,
+    BlobIndexerParsingMode,
+    FieldMappingFunction,
 )
 from ai_search import AISearch
 from environment import (
@@ -21,16 +29,30 @@
 class Text2SqlQueryCacheAISearch(AISearch):
     """This class is used to deploy the sql index."""
 
-    def __init__(self, suffix: str | None = None, rebuild: bool | None = False):
+    def __init__(
+        self,
+        suffix: str | None = None,
+        rebuild: bool | None = False,
+        single_query_cache_file: bool | None = False,
+        enable_query_cache_indexer: bool | None = False,
+    ):
         """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index.
 
         Args:
             suffix (str, optional): The suffix for the indexer. Defaults to None. If a suffix is provided, it is assumed to be a test indexer.
             rebuild (bool, optional): Whether to rebuild the index. Defaults to False.
+            single_query_cache_file (bool, optional): Whether to use a single cache file. Defaults to False. Only applies if the cache indexer is enabled.
+            enable_query_cache_indexer (bool, optional): Whether to enable the cache indexer. Defaults to False.
         """
         self.indexer_type = IndexerType.TEXT_2_SQL_QUERY_CACHE
+        self.enable_query_cache_indexer = enable_query_cache_indexer
         super().__init__(suffix, rebuild)
 
+        if single_query_cache_file:
+            self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY
+        else:
+            self.parsing_mode = BlobIndexerParsingMode.JSON
+
     def get_index_fields(self) -> list[SearchableField]:
         """This function returns the index fields for sql index.
 
@@ -56,6 +78,11 @@ def get_index_fields(self) -> list[SearchableField]:
             name="SqlQueryDecomposition",
             collection=True,
             fields=[
+                SearchableField(
+                    name="SubQuestion",
+                    type=SearchFieldDataType.String,
+                    filterable=True,
+                ),
                 SearchableField(
                     name="SqlQuery",
                     type=SearchFieldDataType.String,
@@ -130,3 +157,100 @@ def get_semantic_search(self) -> SemanticSearch:
         semantic_search = SemanticSearch(configurations=[semantic_config])
 
         return semantic_search
+
+    def get_skills(self) -> list:
+        """Get the skillset for the indexer.
+
+        Returns:
+            list: The skillsets used in the indexer"""
+
+        if self.enable_query_cache_indexer is False:
+            return []
+
+        embedding_skill = self.get_vector_skill(
+            "/document", "/document/Question", target_name="QuestionEmbedding"
+        )
+
+        skills = [embedding_skill]
+
+        return skills
+
+    def get_indexer(self) -> SearchIndexer:
+        """This function returns the indexer for sql.
+
+        Returns:
+            SearchIndexer: The indexer for sql"""
+
+        if self.enable_query_cache_indexer is False:
+            return None
+
+        # Only place on a schedule if it is not a test deployment
+        if self.test:
+            schedule = None
+            batch_size = 4
+        else:
+            schedule = {"interval": "PT24H"}
+            batch_size = 16
+
+        if self.environment.use_private_endpoint:
+            execution_environment = IndexerExecutionEnvironment.PRIVATE
+        else:
+            execution_environment = IndexerExecutionEnvironment.STANDARD
+
+        indexer_parameters = IndexingParameters(
+            batch_size=batch_size,
+            configuration=IndexingParametersConfiguration(
+                data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA,
+                query_timeout=None,
+                execution_environment=execution_environment,
+                fail_on_unprocessable_document=False,
+                fail_on_unsupported_content_type=False,
+                index_storage_metadata_only_for_oversized_documents=True,
+                indexed_file_name_extensions=".json",
+                parsing_mode=self.parsing_mode,
+            ),
+            max_failed_items=5,
+        )
+
+        indexer = SearchIndexer(
+            name=self.indexer_name,
+            description="Indexer to index sql entities and generate embeddings",
+            skillset_name=self.skillset_name,
+            target_index_name=self.index_name,
+            data_source_name=self.data_source_name,
+            schedule=schedule,
+            field_mappings=[
+                FieldMapping(
+                    source_field_name="metadata_storage_last_modified",
+                    target_field_name="DateLastModified",
+                )
+            ],
+            output_field_mappings=[
+                FieldMapping(
+                    source_field_name="/document/Question",
+                    target_field_name="Id",
+                    mapping_function=FieldMappingFunction(
+                        name="base64Encode",
+                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
+                    ),
+                ),
+                FieldMapping(
+                    source_field_name="/document/Question", target_field_name="Question"
+                ),
+                FieldMapping(
+                    source_field_name="/document/QuestionEmbedding",
+                    target_field_name="QuestionEmbedding",
+                ),
+                FieldMapping(
+                    source_field_name="/document/SqlQueryDecomposition",
+                    target_field_name="SqlQueryDecomposition",
+                ),
+                FieldMapping(
+                    source_field_name="/document/DateLastModified",
+                    target_field_name="DateLastModified",
+                ),
+            ],
+            parameters=indexer_parameters,
+        )
+
+        return indexer
```
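Given the `JSON` / `JSON_ARRAY` parsing modes and the field mappings above, a cache blob this indexer ingests would presumably look like the sketch below. The question and SQL are illustrative, not taken from the repository:

```python
import json

# One cache entry per blob when parsing_mode=JSON (the default). With
# single_query_cache_file=True (parsing_mode=JSON_ARRAY), one blob would
# instead hold a JSON array of these objects.
cache_entry = {
    "Question": "How many orders were placed last month?",
    "SqlQueryDecomposition": [
        {
            "SubQuestion": "Count orders placed in the previous calendar month",
            "SqlQuery": "SELECT COUNT(*) AS OrderCount FROM Orders",
        }
    ],
}

# "Id" and "QuestionEmbedding" are absent on purpose: the output field mappings
# derive Id by base64-encoding Question, the embedding skill generates
# QuestionEmbedding, and DateLastModified comes from blob storage metadata.
with open("example_cache_entry.json", "w") as f:
    json.dump(cache_entry, f, indent=2)
```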

deploy_ai_search/text_2_sql_schema_store.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -43,7 +43,7 @@ def __init__(
         self,
         suffix: str | None = None,
         rebuild: bool | None = False,
-        single_data_dictionary: bool | None = False,
+        single_data_dictionary_file: bool | None = False,
     ):
         """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index.
 
@@ -57,7 +57,7 @@ def __init__(
         ]
         super().__init__(suffix, rebuild)
 
-        if single_data_dictionary:
+        if single_data_dictionary_file:
             self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY
         else:
             self.parsing_mode = BlobIndexerParsingMode.JSON
```
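The renamed `single_data_dictionary_file` flag only switches the blob parsing mode, which implies two possible blob layouts. A sketch with hypothetical entity records (the actual data dictionary schema is defined elsewhere in the repository and is not part of this diff):

```python
import json

# Hypothetical entity records; field names are illustrative only.
entities = [
    {"Entity": "Orders", "Description": "Sales orders placed by customers."},
    {"Entity": "Customers", "Description": "Customer master data."},
]

# single_data_dictionary_file=True -> one blob, parsed as a JSON array.
with open("data_dictionary.json", "w") as f:
    json.dump(entities, f, indent=2)

# single_data_dictionary_file=False (default) -> one JSON blob per entity.
for entity in entities:
    with open(f"{entity['Entity'].lower()}.json", "w") as f:
        json.dump(entity, f, indent=2)
```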

text_2_sql/autogen/agentic_text_2_sql.ipynb renamed to text_2_sql/autogen/Iteration 5 - Agentic Vector Based Text2SQL.ipynb

File renamed without changes.

text_2_sql/autogen/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -20,7 +20,7 @@ As the query cache is shared between users (no data is stored in the cache), a n
 
 ## Provided Notebooks & Scripts
 
-- `./agentic_text_2_sql.ipynb` provides an example of how to utilise the Agentic Vector Based Text2SQL approach to query the database. The query cache plugin will be enabled or disabled depending on the environmental parameters.
+- `./Iteration 5 - Agentic Vector Based Text2SQL.ipynb` provides an example of how to utilise the Agentic Vector Based Text2SQL approach to query the database. The query cache plugin will be enabled or disabled depending on the environmental parameters.
 
 ## Agents
 
```
text_2_sql/autogen/agentic_text_2_sql.py

Lines changed: 11 additions & 7 deletions

```diff
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 from autogen_agentchat.task import TextMentionTermination, MaxMessageTermination
 from autogen_agentchat.teams import SelectorGroupChat
-from utils.models import MINI_MODEL
+from utils.llm_model_creator import LLMModelCreator
 from utils.llm_agent_creator import LLMAgentCreator
 import logging
 from agents.custom_agents.sql_query_cache_agent import SqlQueryCacheAgent
@@ -86,13 +86,17 @@ def selector(messages):
             and messages[-1].content is not None
         ):
             cache_result = json.loads(messages[-1].content)
-            if cache_result.get("cached_questions_and_schemas") is not None:
+            if cache_result.get(
+                "cached_questions_and_schemas"
+            ) is not None and cache_result.get("contains_pre_run_results"):
                 decision = "sql_query_correction_agent"
+            elif (
+                cache_result.get("cached_questions_and_schemas") is not None
+                and cache_result.get("contains_pre_run_results") is False
+            ):
+                decision = "sql_query_generation_agent"
             else:
-                decision = "sql_schema_selection_agent"
-
-        elif messages[-1].source == "sql_query_cache_agent":
-            decision = "question_decomposition_agent"
+                decision = "question_decomposition_agent"
 
         elif messages[-1].source == "question_decomposition_agent":
             decomposition_result = json.loads(messages[-1].content)
@@ -129,7 +133,7 @@ def agentic_flow(self):
         agentic_flow = SelectorGroupChat(
             self.agents,
             allow_repeated_speaker=False,
-            model_client=MINI_MODEL,
+            model_client=LLMModelCreator.get_model("4o-mini"),
             termination_condition=self.termination_condition,
             selector_func=AgenticText2Sql.selector,
         )
```
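`utils/llm_model_creator.py` itself is not included in this diff, so the following is purely a speculative sketch of the factory shape the new `LLMModelCreator.get_model("4o-mini")` call implies. The class name and method come from the diff; the client type, deployment variable, and API version are assumptions:

```python
import os

# Assumed import path for the autogen preview SDK this repo appears to use.
from autogen_ext.models import AzureOpenAIChatCompletionClient


class LLMModelCreator:
    """Speculative sketch: resolve short model keys like "4o-mini" to model clients."""

    @classmethod
    def get_model(cls, model_name: str) -> AzureOpenAIChatCompletionClient:
        if model_name == "4o-mini":
            return AzureOpenAIChatCompletionClient(
                model="gpt-4o-mini",
                azure_deployment=os.environ["OpenAI__MiniCompletionDeployment"],  # hypothetical variable
                azure_endpoint=os.environ["OpenAI__Endpoint"],
                api_key=os.environ["OpenAI__ApiKey"],
                api_version="2024-08-01-preview",  # assumed
            )
        raise ValueError(f"Unknown model key: {model_name}")
```

A factory like this would also explain the paired YAML changes below, where agent configs now reference the short key `4o-mini` rather than the full model name.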

text_2_sql/autogen/agents/llm_agents/answer_agent.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,5 +1,5 @@
 model:
-  gpt-4o-mini
+  4o-mini
 description:
   "An agent that takes the final results from the SQL query and writes the answer to the user's question"
 system_message:
```

text_2_sql/autogen/agents/llm_agents/question_decomposition_agent.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,5 +1,5 @@
 model:
-  gpt-4o-mini
+  4o-mini
 description:
   "An agent that will decompose the user's question into smaller parts to be used in the SQL queries. Use this agent when the user's question is too complex to be answered in one SQL query. Only use if the user's question is too complex to be answered in one SQL query."
 system_message:
```
