Skip to content

Commit 685e56f

Browse files
Prasanjeet-Microsoft, Vamshi-Microsoft, Prajwal-Microsoft
authored
fix: add missing 'text' and 'layoutText' fields for Azure Search integrated vectorization (#1839)
Co-authored-by: Vamshi-Microsoft <[email protected]>
Co-authored-by: Prajwal-Microsoft <[email protected]>
1 parent fc4e2a2 commit 685e56f

28 files changed

+746
-475
lines changed

.github/workflows/broken-links-checker.yml

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,9 @@ on:
66
- '**/*.md'
77
workflow_dispatch:
88

9+
permissions:
10+
contents: read
11+
912
jobs:
1013
markdown-link-check:
1114
name: Check Markdown Broken Links
@@ -26,14 +29,14 @@ jobs:
2629
echo "md_files<<EOF" >> $GITHUB_OUTPUT
2730
echo "$files" >> $GITHUB_OUTPUT
2831
echo "EOF" >> $GITHUB_OUTPUT
29-
3032
- name: Check Broken Links in Added/Modified Files (PR)
3133
if: github.event_name == 'pull_request' && steps.changed-files.outputs.md_files != ''
3234
uses: lycheeverse/[email protected]
3335
with:
3436
args: >
3537
--verbose --exclude-mail --no-progress --exclude ^https?://
3638
${{ steps.changed-files.outputs.md_files }}
39+
failIfEmpty: false
3740
env:
3841
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3942

@@ -44,6 +47,7 @@ jobs:
4447
args: >
4548
--verbose --exclude-mail --no-progress --exclude ^https?://
4649
'**/*.md'
50+
failIfEmpty: false
4751
output: lychee/out.md
4852
env:
4953
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

code/backend/batch/utilities/helpers/azure_search_helper.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,25 @@ def create_index(self):
132132
filterable=True,
133133
),
134134
]
135+
if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
136+
logger.info("Adding 'text' field for integrated vectorization.")
137+
fields.append(
138+
SearchableField(
139+
name=self.env_helper.AZURE_SEARCH_TEXT_COLUMN,
140+
type=SearchFieldDataType.String,
141+
filterable=False,
142+
sortable=False,
143+
)
144+
)
145+
logger.info("Adding 'layoutText' field for integrated vectorization.")
146+
fields.append(
147+
SearchableField(
148+
name=self.env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN,
149+
type=SearchFieldDataType.String,
150+
filterable=False,
151+
sortable=False,
152+
)
153+
)
135154

136155
if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:
137156
logger.info("Adding image_vector field to index")
@@ -274,7 +293,7 @@ def get_conversation_logger(self):
274293
embedding_function=self.llm_helper.get_embedding_model().embed_query,
275294
fields=fields,
276295
user_agent="langchain chatwithyourdata-sa",
277-
credential=credential # Add token credential or send none so it is auto handled by AzureSearch library
296+
credential=credential, # Add token credential or send none so it is auto handled by AzureSearch library
278297
)
279298
else:
280299
return AzureSearch(

code/backend/batch/utilities/helpers/env_helper.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,10 @@ def __load_config(self, **kwargs) -> None:
7676
self.AZURE_SEARCH_SOURCE_COLUMN = os.getenv(
7777
"AZURE_SEARCH_SOURCE_COLUMN", "source"
7878
)
79+
self.AZURE_SEARCH_TEXT_COLUMN = os.getenv("AZURE_SEARCH_TEXT_COLUMN", "text")
80+
self.AZURE_SEARCH_LAYOUT_TEXT_COLUMN = os.getenv(
81+
"AZURE_SEARCH_LAYOUT_TEXT_COLUMN", "layoutText"
82+
)
7983
self.AZURE_SEARCH_CHUNK_COLUMN = os.getenv("AZURE_SEARCH_CHUNK_COLUMN", "chunk")
8084
self.AZURE_SEARCH_OFFSET_COLUMN = os.getenv(
8185
"AZURE_SEARCH_OFFSET_COLUMN", "offset"
@@ -173,9 +177,7 @@ def __load_config(self, **kwargs) -> None:
173177
self.AZURE_OPENAI_MODEL_NAME = azure_openai_model_info.get("modelName", "")
174178
else:
175179
# Otherwise, fallback to individual environment variables
176-
self.AZURE_OPENAI_MODEL = os.getenv(
177-
"AZURE_OPENAI_MODEL", "gpt-4.1"
178-
)
180+
self.AZURE_OPENAI_MODEL = os.getenv("AZURE_OPENAI_MODEL", "gpt-4.1")
179181
self.AZURE_OPENAI_MODEL_NAME = os.getenv(
180182
"AZURE_OPENAI_MODEL_NAME", "gpt-4.1"
181183
)

code/backend/batch/utilities/integrated_vectorization/azure_search_index.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,26 @@ def create_or_update_index(self):
9999
),
100100
]
101101

102+
if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
103+
logger.info("Adding `text` field for integrated vectorization.")
104+
fields.append(
105+
SearchableField(
106+
name="text",
107+
type=SearchFieldDataType.String,
108+
filterable=False,
109+
sortable=False,
110+
)
111+
)
112+
logger.info("Adding `layoutText` field for integrated vectorization.")
113+
fields.append(
114+
SearchableField(
115+
name="layoutText",
116+
type=SearchFieldDataType.String,
117+
filterable=False,
118+
sortable=False,
119+
)
120+
)
121+
102122
vector_search = self.get_vector_search_config()
103123

104124
semantic_search = self.get_semantic_search_config()

code/create_app.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -209,6 +209,11 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper):
209209
env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN
210210
],
211211
"title_field": env_helper.AZURE_SEARCH_TITLE_COLUMN or None,
212+
"source_field": env_helper.AZURE_SEARCH_SOURCE_COLUMN
213+
or None,
214+
"text_field": env_helper.AZURE_SEARCH_TEXT_COLUMN or None,
215+
"layoutText_field": env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN
216+
or None,
212217
"url_field": env_helper.AZURE_SEARCH_FIELDS_METADATA
213218
or None,
214219
"filepath_field": (

code/tests/functional/tests/backend_api/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ existing set of tests, rather than creating a new set, as this removes the need
1313
starting up a new instance of the application on another port.
1414

1515
New environment variables common to all tests can be directly added to the `config`
16-
dict in [app_config.py](../app_config.py), while variables only needed for one set
16+
dict in [app_config.py](../../app_config.py), while variables only needed for one set
1717
of tests can be added to the `app_config` fixture in the respective `conftest.py`
1818
file, e.g. [./default/conftest.py](./default/conftest.py).
1919

code/tests/functional/tests/backend_api/with_byod/test_conversation_flow.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,9 @@ def test_post_makes_correct_call_to_azure_openai(
147147
"AZURE_SEARCH_FIELDS_METADATA"
148148
),
149149
"filepath_field": "filepath",
150+
"source_field": "source",
151+
"text_field": "text",
152+
"layoutText_field": "layoutText",
150153
},
151154
"filter": app_config.get("AZURE_SEARCH_FILTER"),
152155
"in_scope": True,

code/tests/functional/tests/functions/integrated_vectorization/test_integrated_vectorization_resource_creation.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,9 @@ def test_integrated_vectorization_datasouce_created(
9999
"credentials": {
100100
"connectionString": f"DefaultEndpointsProtocol=https;AccountName={app_config.get_from_json('AZURE_BLOB_STORAGE_INFO','accountName')};AccountKey={app_config.get_from_json('AZURE_BLOB_STORAGE_INFO','accountKey')};EndpointSuffix=core.windows.net"
101101
},
102-
"container": {"name": f"{app_config.get_from_json('AZURE_BLOB_STORAGE_INFO','containerName')}"},
102+
"container": {
103+
"name": f"{app_config.get_from_json('AZURE_BLOB_STORAGE_INFO','containerName')}"
104+
},
103105
"dataDeletionDetectionPolicy": {
104106
"@odata.type": "#Microsoft.Azure.Search.NativeBlobSoftDeleteDeletionDetectionPolicy"
105107
},
@@ -210,6 +212,26 @@ def test_integrated_vectorization_index_created(
210212
"facetable": True,
211213
"analyzer": "keyword",
212214
},
215+
{
216+
"name": "text",
217+
"type": "Edm.String",
218+
"key": False,
219+
"retrievable": True,
220+
"searchable": True,
221+
"filterable": False,
222+
"sortable": False,
223+
"facetable": False,
224+
},
225+
{
226+
"name": "layoutText",
227+
"type": "Edm.String",
228+
"key": False,
229+
"retrievable": True,
230+
"searchable": True,
231+
"filterable": False,
232+
"sortable": False,
233+
"facetable": False,
234+
},
213235
],
214236
"semantic": {
215237
"configurations": [

code/tests/test_app.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,9 @@
2525
AZURE_SEARCH_CONTENT_COLUMN = "field1|field2"
2626
AZURE_SEARCH_CONTENT_VECTOR_COLUMN = "vector-column"
2727
AZURE_SEARCH_TITLE_COLUMN = "title"
28+
AZURE_SEARCH_SOURCE_COLUMN = "source"
29+
AZURE_SEARCH_TEXT_COLUMN = "text"
30+
AZURE_SEARCH_LAYOUT_TEXT_COLUMN = "layoutText"
2831
AZURE_SEARCH_FILENAME_COLUMN = "filename"
2932
AZURE_SEARCH_URL_COLUMN = "metadata"
3033
AZURE_SEARCH_FILTER = "filter"
@@ -73,6 +76,9 @@ def env_helper_mock():
7376
AZURE_SEARCH_CONTENT_VECTOR_COLUMN
7477
)
7578
env_helper.AZURE_SEARCH_TITLE_COLUMN = AZURE_SEARCH_TITLE_COLUMN
79+
env_helper.AZURE_SEARCH_SOURCE_COLUMN = AZURE_SEARCH_SOURCE_COLUMN
80+
env_helper.AZURE_SEARCH_TEXT_COLUMN = AZURE_SEARCH_TEXT_COLUMN
81+
env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN = AZURE_SEARCH_LAYOUT_TEXT_COLUMN
7682
env_helper.AZURE_SEARCH_FILENAME_COLUMN = AZURE_SEARCH_FILENAME_COLUMN
7783
env_helper.AZURE_SEARCH_URL_COLUMN = AZURE_SEARCH_URL_COLUMN
7884
env_helper.AZURE_SEARCH_FILTER = AZURE_SEARCH_FILTER
@@ -684,6 +690,9 @@ def test_conversation_azure_byod_returns_correct_response_when_streaming_with_da
684690
"title_field": AZURE_SEARCH_TITLE_COLUMN,
685691
"url_field": env_helper_mock.AZURE_SEARCH_FIELDS_METADATA,
686692
"filepath_field": AZURE_SEARCH_FILENAME_COLUMN,
693+
"source_field": AZURE_SEARCH_SOURCE_COLUMN,
694+
"text_field": AZURE_SEARCH_TEXT_COLUMN,
695+
"layoutText_field": AZURE_SEARCH_LAYOUT_TEXT_COLUMN,
687696
},
688697
"filter": AZURE_SEARCH_FILTER,
689698
"in_scope": AZURE_SEARCH_ENABLE_IN_DOMAIN,

code/tests/utilities/helpers/test_azure_search_helper.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@
3232
AZURE_SEARCH_TITLE_COLUMN = "mock-title"
3333
AZURE_SEARCH_FIELDS_METADATA = "mock-metadata"
3434
AZURE_SEARCH_SOURCE_COLUMN = "mock-source"
35+
AZURE_SEARCH_TEXT_COLUMN = "mock-text"
36+
AZURE_SEARCH_LAYOUT_TEXT_COLUMN = "mock-layout-text"
3537
AZURE_SEARCH_CHUNK_COLUMN = "mock-chunk"
3638
AZURE_SEARCH_OFFSET_COLUMN = "mock-offset"
3739
AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG = "default"
@@ -78,6 +80,8 @@ def env_helper_mock():
7880
env_helper.AZURE_SEARCH_TITLE_COLUMN = AZURE_SEARCH_TITLE_COLUMN
7981
env_helper.AZURE_SEARCH_FIELDS_METADATA = AZURE_SEARCH_FIELDS_METADATA
8082
env_helper.AZURE_SEARCH_SOURCE_COLUMN = AZURE_SEARCH_SOURCE_COLUMN
83+
env_helper.AZURE_SEARCH_TEXT_COLUMN = AZURE_SEARCH_TEXT_COLUMN
84+
env_helper.AZURE_SEARCH_LAYOUT_TEXT_COLUMN = AZURE_SEARCH_LAYOUT_TEXT_COLUMN
8185
env_helper.AZURE_SEARCH_CHUNK_COLUMN = AZURE_SEARCH_CHUNK_COLUMN
8286
env_helper.AZURE_SEARCH_OFFSET_COLUMN = AZURE_SEARCH_OFFSET_COLUMN
8387
env_helper.AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG = (
@@ -232,6 +236,16 @@ def test_creates_search_index_if_not_exists(
232236
type=SearchFieldDataType.Int32,
233237
filterable=True,
234238
),
239+
SearchableField(
240+
name=AZURE_SEARCH_TEXT_COLUMN,
241+
type=SearchFieldDataType.String,
242+
filterable=False,
243+
),
244+
SearchableField(
245+
name=AZURE_SEARCH_LAYOUT_TEXT_COLUMN,
246+
type=SearchFieldDataType.String,
247+
filterable=False,
248+
),
235249
]
236250

237251
expected_index = SearchIndex(

0 commit comments

Comments
 (0)