Skip to content

Commit 87189c2

Browse files
committed
Add column value store
1 parent 54c2b3a commit 87189c2

File tree

6 files changed

+232
-10
lines changed

6 files changed

+232
-10
lines changed

deploy_ai_search/.env

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ StorageAccount__ConnectionString=<connectionString if using non managed identity
1616
StorageAccount__RagDocuments__Container=<containerName>
1717
StorageAccount__Text2SqlSchemaStore__Container=<containerName>
1818
StorageAccount__Text2SqlQueryCache__Container=<containerName>
19+
StorageAccount__Text2SqlColumnValueStore__Container=<containerName>
1920
OpenAI__ApiKey=<openAIKey if using non managed identity>
2021
OpenAI__Endpoint=<openAIEndpoint>
2122
OpenAI__EmbeddingModel=<openAIEmbeddingModelName>

deploy_ai_search/environment.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class IndexerType(Enum):
1414
RAG_DOCUMENTS = "rag-documents"
1515
TEXT_2_SQL_SCHEMA_STORE = "text-2-sql-schema-store"
1616
TEXT_2_SQL_QUERY_CACHE = "text-2-sql-query-cache"
17+
TEXT_2_SQL_COLUMN_VALUE_STORE = "text-2-sql-column-value-store"
1718

1819

1920
class IdentityType(Enum):

deploy_ai_search/pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ dependencies = [
1111
"azure-storage-blob>=12.24.0",
1212
"python-dotenv>=1.0.1",
1313
]
14+
15+
[tool.uv.sources]
16+
text-2-sql-core = { workspace = true }
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
from azure.search.documents.indexes.models import (
5+
SearchFieldDataType,
6+
SearchableField,
7+
SearchIndexer,
8+
FieldMapping,
9+
SimpleField,
10+
IndexingParameters,
11+
IndexingParametersConfiguration,
12+
BlobIndexerDataToExtract,
13+
IndexerExecutionEnvironment,
14+
BlobIndexerParsingMode,
15+
FieldMappingFunction,
16+
)
17+
from ai_search import AISearch
18+
from environment import (
19+
IndexerType,
20+
)
21+
import os
22+
from text_2_sql_core.utils.database import DatabaseEngine
23+
24+
25+
class Text2SqlSchemaStoreAISearch(AISearch):
    """Deploy the AI Search index and indexer for the Text2SQL column value store.

    NOTE(review): the class name matches the schema store deployment class even
    though ``indexer_type`` is ``TEXT_2_SQL_COLUMN_VALUE_STORE``. It is left
    unchanged so existing callers keep working -- confirm whether a rename is
    intended.
    """

    def __init__(
        self,
        suffix: str | None = None,
        rebuild: bool | None = False,
        single_data_dictionary_file: bool | None = False,
    ):
        """Initialize the column value store deployment.

        Args:
            suffix (str, optional): The suffix for the indexer. Defaults to None.
                If a suffix is provided, it is assumed to be a test indexer.
            rebuild (bool, optional): Whether to rebuild the index. Defaults to False.
            single_data_dictionary_file (bool, optional): Accepted for signature
                compatibility; it is not read anywhere in this class.

        Raises:
            KeyError: If the ``Text2Sql__DatabaseEngine`` environment variable is
                not set, or its upper-cased value is not a ``DatabaseEngine`` member.
        """
        self.indexer_type = IndexerType.TEXT_2_SQL_COLUMN_VALUE_STORE
        # The engine is looked up by member name, so the variable is upper-cased first.
        self.database_engine = DatabaseEngine[
            os.environ["Text2Sql__DatabaseEngine"].upper()
        ]
        super().__init__(suffix, rebuild)

    @property
    def excluded_fields_for_database_engine(self):
        """Return the engine-specific field names that do not apply to this engine.

        Returns:
            list[str]: The subset of ``["Warehouse", "Database", "Catalog"]`` that
            the configured database engine does not use.
        """

        all_engine_specific_fields = ["Warehouse", "Database", "Catalog"]
        if self.database_engine == DatabaseEngine.SNOWFLAKE:
            engine_specific_fields = ["Warehouse", "Database"]
        elif self.database_engine == DatabaseEngine.TSQL:
            engine_specific_fields = ["Database"]
        elif self.database_engine == DatabaseEngine.DATABRICKS:
            engine_specific_fields = ["Catalog"]
        else:
            # Any other engine has no engine-specific fields listed here, so all
            # of them are excluded instead of failing on an unbound local.
            engine_specific_fields = []

        return [
            field
            for field in all_engine_specific_fields
            if field not in engine_specific_fields
        ]

    def get_index_fields(self) -> list[SearchableField]:
        """Return the index fields for the column value store index.

        Fields listed in ``excluded_fields_for_database_engine`` are removed
        before returning.

        Returns:
            list[SearchableField]: The index fields for the column value store index"""

        fields = [
            SimpleField(
                name="Id",
                type=SearchFieldDataType.String,
                key=True,
                analyzer_name="keyword",
            ),
            SimpleField(
                name="Entity",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Database",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Warehouse",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Catalog",
                type=SearchFieldDataType.String,
            ),
            SimpleField(
                name="Column",
                type=SearchFieldDataType.String,
            ),
            # Value is the only full-text searchable field in this index.
            SearchableField(
                name="Value",
                type=SearchFieldDataType.String,
                hidden=False,
            ),
            SimpleField(
                name="Synonyms", type=SearchFieldDataType.String, collection=True
            ),
            SimpleField(
                name="DateLastModified",
                type=SearchFieldDataType.DateTimeOffset,
                filterable=True,
            ),
        ]

        # Remove fields that are not supported by the database engine
        excluded_fields = self.excluded_fields_for_database_engine
        fields = [field for field in fields if field.name not in excluded_fields]

        return fields

    def get_skills(self) -> list:
        """Get the skillset for the indexer.

        Returns:
            list: The skills used in the indexer; always empty for this indexer"""

        skills = []

        return skills

    def get_indexer(self) -> SearchIndexer:
        """Return the indexer that loads column values from ``.jsonl`` blobs.

        Reads ``self.test``, ``self.environment``, ``self.indexer_name``,
        ``self.skillset_name``, ``self.index_name`` and ``self.data_source_name``,
        none of which are set in this class (presumably provided by ``AISearch``).

        Returns:
            SearchIndexer: The indexer for the column value store"""

        # Only place on schedule if it is not a test deployment
        if self.test:
            schedule = None
            batch_size = 4
        else:
            schedule = {"interval": "PT24H"}
            batch_size = 16

        if self.environment.use_private_endpoint:
            execution_environment = IndexerExecutionEnvironment.PRIVATE
        else:
            execution_environment = IndexerExecutionEnvironment.STANDARD

        indexer_parameters = IndexingParameters(
            batch_size=batch_size,
            configuration=IndexingParametersConfiguration(
                data_to_extract=BlobIndexerDataToExtract.CONTENT_AND_METADATA,
                query_timeout=None,
                execution_environment=execution_environment,
                fail_on_unprocessable_document=False,
                fail_on_unsupported_content_type=False,
                index_storage_metadata_only_for_oversized_documents=True,
                indexed_file_name_extensions=".jsonl",
                parsing_mode=BlobIndexerParsingMode.JSON_LINES,
            ),
            max_failed_items=5,
        )

        indexer = SearchIndexer(
            name=self.indexer_name,
            description="Indexer to column values",
            skillset_name=self.skillset_name,
            target_index_name=self.index_name,
            data_source_name=self.data_source_name,
            schedule=schedule,
            field_mappings=[
                FieldMapping(
                    source_field_name="metadata_storage_last_modified",
                    target_field_name="DateLastModified",
                )
            ],
            output_field_mappings=[
                # NOTE(review): the document key is derived from Entity alone.
                # If several column values share an Entity they would map to the
                # same Id -- confirm the key is unique per indexed document.
                FieldMapping(
                    source_field_name="/document/Entity",
                    target_field_name="Id",
                    mapping_function=FieldMappingFunction(
                        name="base64Encode",
                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
                    ),
                ),
                FieldMapping(
                    source_field_name="/document/Entity", target_field_name="Entity"
                ),
                FieldMapping(
                    source_field_name="/document/Database",
                    target_field_name="Database",
                ),
                FieldMapping(
                    source_field_name="/document/Warehouse",
                    target_field_name="Warehouse",
                ),
                # Catalog is declared in the index fields, so it is mapped here
                # like the other engine-specific fields.
                FieldMapping(
                    source_field_name="/document/Catalog",
                    target_field_name="Catalog",
                ),
                FieldMapping(
                    source_field_name="/document/Column",
                    target_field_name="Column",
                ),
                FieldMapping(
                    source_field_name="/document/Value",
                    target_field_name="Value",
                ),
                FieldMapping(
                    source_field_name="/document/Synonyms",
                    target_field_name="Synonyms",
                ),
                FieldMapping(
                    source_field_name="/document/DateLastModified",
                    target_field_name="DateLastModified",
                ),
            ],
            parameters=indexer_parameters,
        )

        # Remove fields that are not supported by the database engine
        excluded_fields = self.excluded_fields_for_database_engine
        indexer.output_field_mappings = [
            field_mapping
            for field_mapping in indexer.output_field_mappings
            if field_mapping.target_field_name not in excluded_fields
        ]

        return indexer

deploy_ai_search/text_2_sql_schema_store.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,7 @@
2525
IndexerType,
2626
)
2727
import os
28-
from enum import StrEnum
29-
30-
31-
class DatabaseEngine(StrEnum):
32-
"""An enumeration to represent a database engine."""
33-
34-
SNOWFLAKE = "SNOWFLAKE"
35-
TSQL = "TSQL"
36-
DATABRICKS = "DATABRICKS"
28+
from text_2_sql_core.utils.database import DatabaseEngine
3729

3830

3931
class Text2SqlSchemaStoreAISearch(AISearch):

text_2_sql/autogen/pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,4 @@ dev = [
2626
]
2727

2828
[tool.uv.sources]
29-
autogen-text-2-sql = { workspace = true }
3029
text-2-sql-core = { workspace = true }

0 commit comments

Comments
 (0)