|
5 | 5 | "cell_type": "markdown", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | | - "# Semantic search using the azure.search.documents library in the Azure SDK for Python" |
| 8 | + "# Semantic ranking using the azure.search.documents library in the Azure SDK for Python" |
9 | 9 | ] |
10 | 10 | }, |
11 | 11 | { |
12 | 12 | "attachments": {}, |
13 | 13 | "cell_type": "markdown", |
14 | 14 | "metadata": {}, |
15 | 15 | "source": [ |
16 | | - "This Jupyter Notebook adds semantic search, using pre-trained models from Microsoft to re-rank results based on a semantic match to the query. " |
| 16 | + "This notebook demonstrates a semantic configuration in a search index and the semantic query syntax for reranking search results." |
17 | 17 | ] |
18 | 18 | }, |
19 | 19 | { |
20 | | - "cell_type": "code", |
21 | | - "execution_count": null, |
22 | | - "metadata": {}, |
23 | | - "outputs": [], |
24 | | - "source": [ |
25 | | - "# Import the libraries needed to create a search index, upload documents, and query the index\n", |
26 | | - "%pip install azure-search-documents\n", |
27 | | - "%pip show azure-search-documents\n", |
28 | | - "%pip install python-dotenv\n", |
29 | | - "\n", |
30 | | - "import os\n", |
31 | | - "from azure.core.credentials import AzureKeyCredential\n", |
32 | | - "from azure.search.documents.indexes import SearchIndexClient \n", |
33 | | - "from azure.search.documents import SearchClient\n", |
34 | | - "from azure.search.documents.indexes.models import ( \n", |
35 | | - " SearchIndex, \n", |
36 | | - " SearchFieldDataType, \n", |
37 | | - " SimpleField, \n", |
38 | | - " SearchableField,\n", |
39 | | - " ComplexField,\n", |
40 | | - " SearchIndex, \n", |
41 | | - " SemanticConfiguration, \n", |
42 | | - " PrioritizedFields, \n", |
43 | | - " SemanticField, \n", |
44 | | - " SemanticSettings, \n", |
45 | | - ")" |
46 | | - ] |
47 | | - }, |
48 | | - { |
49 | | - "attachments": {}, |
50 | 20 | "cell_type": "markdown", |
51 | 21 | "metadata": {}, |
52 | 22 | "source": [ |
53 | | - "In this step, initialize the search client used to make each request. Provide the name and admin API key of your search service. If you get ConnectionError \"Failed to establish a new connection\", verify that the api-key is a primary or secondary admin key, and not a query key." |
| 23 | + "## Install packages and set variables" |
54 | 24 | ] |
55 | 25 | }, |
56 | 26 | { |
|
59 | 29 | "metadata": {}, |
60 | 30 | "outputs": [], |
61 | 31 | "source": [ |
62 | | - "# Set the service endpoint and API key from the environment\n", |
63 | | - "\n", |
64 | | - "service_name = \"<YOUR-SEARCH-SERVICE-NAME>\"\n", |
65 | | - "admin_key = \"<YOUR-SEARCH-SERVICE-ADMIN-KEY>\"\n", |
66 | | - "\n", |
67 | | - "index_name = \"hotels-quickstart\"\n", |
68 | | - "\n", |
69 | | - "# Create an SDK client\n", |
70 | | - "endpoint = \"https://{}.search.windows.net/\".format(service_name)\n", |
71 | | - "admin_client = SearchIndexClient(endpoint=endpoint,\n", |
72 | | - " index_name=index_name,\n", |
73 | | - " credential=AzureKeyCredential(admin_key))\n", |
74 | | - "\n", |
75 | | - "search_client = SearchClient(endpoint=endpoint,\n", |
76 | | - " index_name=index_name,\n", |
77 | | - " credential=AzureKeyCredential(admin_key))\n" |
78 | | - ] |
79 | | - }, |
80 | | - { |
81 | | - "attachments": {}, |
82 | | - "cell_type": "markdown", |
83 | | - "metadata": {}, |
84 | | - "source": [ |
85 | | - "In the next cell, the index \"hotels-quickstart\" will be deleted if it previously existed. This step allows you to reuse the index name." |
| 32 | + "! pip install azure-search-documents==11.6.0b1 --quiet\n", |
| 33 | + "! pip install azure-identity --quiet\n", |
| 34 | + "! pip install python-dotenv --quiet" |
86 | 35 | ] |
87 | 36 | }, |
88 | 37 | { |
|
91 | 40 | "metadata": {}, |
92 | 41 | "outputs": [], |
93 | 42 | "source": [ |
94 | | - "# Delete the index if it exists\n", |
95 | | - "try:\n", |
96 | | - " result = admin_client.delete_index(index_name)\n", |
97 | | - " print ('Index', index_name, 'Deleted')\n", |
98 | | - "except Exception as ex:\n", |
99 | | - " print (ex)\n" |
| 43 | + "# Provide variables\n", |
| 44 | + "search_endpoint: str = \"PUT-YOUR-SEARCH-ENDPOINT-HERE\"\n", |
| 45 | + "search_api_key: str = \"PUT-YOUR-SEARCH-API-KEY-HERE\"\n", |
| 46 | + "index_name: str = \"hotels-quickstart\"" |
100 | 47 | ] |
101 | 48 | }, |
102 | 49 | { |
103 | 50 | "attachments": {}, |
104 | 51 | "cell_type": "markdown", |
105 | 52 | "metadata": {}, |
106 | 53 | "source": [ |
107 | | - "Specify the index definition, including the fields that define each search document. This schema adds a semantic configuration that specifies how to use search fields during semantic ranking." |
| 54 | + "## Create an index" |
108 | 55 | ] |
109 | 56 | }, |
110 | 57 | { |
|
113 | 60 | "metadata": {}, |
114 | 61 | "outputs": [], |
115 | 62 | "source": [ |
116 | | - "# Specify the index schema\n", |
117 | | - "name = index_name\n", |
| 63 | + "from azure.core.credentials import AzureKeyCredential\n", |
| 64 | + "\n", |
| 65 | + "credential = AzureKeyCredential(search_api_key)\n", |
| 66 | + "\n", |
| 67 | + "from azure.search.documents.indexes import SearchIndexClient\n", |
| 68 | + "from azure.search.documents import SearchClient\n", |
| 69 | + "from azure.search.documents.indexes.models import (\n", |
| 70 | + " ComplexField,\n", |
| 71 | + " SimpleField,\n", |
| 72 | + " SearchFieldDataType,\n", |
| 73 | + " SearchableField,\n", |
| 74 | + " SearchIndex,\n", |
| 75 | + " SemanticConfiguration,\n", |
| 76 | + " SemanticField,\n", |
| 77 | + " SemanticPrioritizedFields,\n", |
| 78 | + " SemanticSearch\n", |
| 79 | + ")\n", |
| 80 | + "\n", |
| 81 | + "# Create a search schema\n", |
| 82 | + "index_client = SearchIndexClient(\n", |
| 83 | + " endpoint=search_endpoint, credential=credential)\n", |
118 | 84 | "fields = [\n", |
119 | 85 | " SimpleField(name=\"HotelId\", type=SearchFieldDataType.String, key=True),\n", |
120 | 86 | " SearchableField(name=\"HotelName\", type=SearchFieldDataType.String, sortable=True),\n", |
121 | 87 | " SearchableField(name=\"Description\", type=SearchFieldDataType.String, analyzer_name=\"en.lucene\"),\n", |
122 | 88 | " SearchableField(name=\"Description_fr\", type=SearchFieldDataType.String, analyzer_name=\"fr.lucene\"),\n", |
123 | 89 | " SearchableField(name=\"Category\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", |
124 | | - " \n", |
| 90 | + "\n", |
125 | 91 | " SearchableField(name=\"Tags\", collection=True, type=SearchFieldDataType.String, facetable=True, filterable=True),\n", |
126 | 92 | "\n", |
127 | 93 | " SimpleField(name=\"ParkingIncluded\", type=SearchFieldDataType.Boolean, facetable=True, filterable=True, sortable=True),\n", |
|
136 | 102 | " SearchableField(name=\"Country\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", |
137 | 103 | " ])\n", |
138 | 104 | " ]\n", |
| 105 | + "\n", |
139 | 106 | "semantic_config = SemanticConfiguration(\n", |
140 | 107 | " name=\"my-semantic-config\",\n", |
141 | | - " prioritized_fields=PrioritizedFields(\n", |
| 108 | + " prioritized_fields=SemanticPrioritizedFields(\n", |
142 | 109 | " title_field=SemanticField(field_name=\"HotelName\"),\n", |
143 | | - " prioritized_keywords_fields=[SemanticField(field_name=\"Category\")],\n", |
144 | | - " prioritized_content_fields=[SemanticField(field_name=\"Description\")]\n", |
| 110 | + " keywords_fields=[SemanticField(field_name=\"Category\")],\n", |
| 111 | + " content_fields=[SemanticField(field_name=\"Description\")]\n", |
145 | 112 | " )\n", |
146 | 113 | ")\n", |
147 | 114 | "\n", |
148 | | - "semantic_settings = SemanticSettings(configurations=[semantic_config])\n", |
| 115 | + "# Create the semantic settings with the configuration\n", |
| 116 | + "semantic_search = SemanticSearch(configurations=[semantic_config])\n", |
| 117 | + "\n", |
| 118 | + "semantic_settings = SemanticSearch(configurations=[semantic_config])\n", |
149 | 119 | "scoring_profiles = []\n", |
150 | | - "suggester = [{'name': 'sg', 'source_fields': ['Tags', 'Address/City', 'Address/Country']}]\n" |
151 | | - ] |
152 | | - }, |
153 | | - { |
154 | | - "attachments": {}, |
155 | | - "cell_type": "markdown", |
156 | | - "metadata": {}, |
157 | | - "source": [ |
158 | | - "Formulate the create_index request. This request targets the indexes collection of your search service and creates an index using the index schema from the previous cell." |
159 | | - ] |
160 | | - }, |
161 | | - { |
162 | | - "cell_type": "code", |
163 | | - "execution_count": null, |
164 | | - "metadata": {}, |
165 | | - "outputs": [], |
166 | | - "source": [ |
167 | | - "index = SearchIndex(\n", |
168 | | - " name=name,\n", |
169 | | - " fields=fields,\n", |
170 | | - " semantic_settings=semantic_settings,\n", |
171 | | - " scoring_profiles=scoring_profiles,\n", |
172 | | - " suggesters = suggester)\n", |
| 120 | + "suggester = [{'name': 'sg', 'source_fields': ['Tags', 'Address/City', 'Address/Country']}]\n", |
173 | 121 | "\n", |
174 | | - "try:\n", |
175 | | - " result = admin_client.create_index(index)\n", |
176 | | - " print ('Index', result.name, 'created')\n", |
177 | | - "except Exception as ex:\n", |
178 | | - " print (ex)" |
| 122 | + "# Create the search index with the semantic settings\n", |
| 123 | + "index = SearchIndex(name=index_name, fields=fields, suggesters=suggester, scoring_profiles=scoring_profiles, semantic_search=semantic_search)\n", |
| 124 | + "result = index_client.create_or_update_index(index)\n", |
| 125 | + "print(f' {result.name} created')" |
179 | 126 | ] |
180 | 127 | }, |
181 | 128 | { |
182 | 129 | "attachments": {}, |
183 | 130 | "cell_type": "markdown", |
184 | 131 | "metadata": {}, |
185 | 132 | "source": [ |
186 | | - "Next, set up documents to include four hotel documents conforming to the schema." |
| 133 | + "## Create a documents payload" |
187 | 134 | ] |
188 | 135 | }, |
189 | 136 | { |
|
192 | 139 | "metadata": {}, |
193 | 140 | "outputs": [], |
194 | 141 | "source": [ |
| 142 | + "# Create a documents payload\n", |
195 | 143 | "documents = [\n", |
196 | 144 | " {\n", |
197 | 145 | " \"@search.action\": \"upload\",\n", |
|
277 | 225 | "cell_type": "markdown", |
278 | 226 | "metadata": {}, |
279 | 227 | "source": [ |
280 | | - "Formulate the request. This upload_documents request targets the docs collection of the hotels-quickstart index and pushes the documents from the previous step into the search index." |
| 228 | + "## Upload documents" |
281 | 229 | ] |
282 | 230 | }, |
283 | 231 | { |
|
286 | 234 | "metadata": {}, |
287 | 235 | "outputs": [], |
288 | 236 | "source": [ |
| 237 | + "# Upload documents to the index\n", |
| 238 | + "search_client = SearchClient(endpoint=search_endpoint,\n", |
| 239 | + " index_name=index_name,\n", |
| 240 | + " credential=credential)\n", |
289 | 241 | "try:\n", |
290 | 242 | " result = search_client.upload_documents(documents=documents)\n", |
291 | 243 | " print(\"Upload of new document succeeded: {}\".format(result[0].succeeded))\n", |
292 | 244 | "except Exception as ex:\n", |
293 | | - " print (ex.message)" |
| 245 | + " print (ex.message)\n", |
| 246 | + "\n", |
| 247 | + "\n", |
| 248 | + " index_client = SearchIndexClient(\n", |
| 249 | + " endpoint=search_endpoint, credential=credential)" |
294 | 250 | ] |
295 | 251 | }, |
296 | 252 | { |
297 | 253 | "attachments": {}, |
298 | 254 | "cell_type": "markdown", |
299 | 255 | "metadata": {}, |
300 | 256 | "source": [ |
301 | | - "You're now ready to run some queries. For this operation, use search_client. \n", |
302 | | - "\n", |
303 | | - "### Empty query with unscored results\n", |
304 | | - "\n", |
305 | | - "The next cell contains a query expression that executes an empty search (`search=*`), returning an unranked list (search score = 1.0) of arbitrary documents. Because there is no criteria, all documents are included in results. This query prints fields from each document. It also adds `include_total_count=True` to get a count of all documents (4) in the results." |
| 257 | + "## Run your first query" |
306 | 258 | ] |
307 | 259 | }, |
308 | 260 | { |
|
311 | 263 | "metadata": {}, |
312 | 264 | "outputs": [], |
313 | 265 | "source": [ |
| 266 | + "# Run an empty query (returns selected fields, all documents)\n", |
314 | 267 | "results = search_client.search(query_type='simple',\n", |
315 | 268 | " search_text=\"*\" ,\n", |
316 | 269 | " select='HotelName,Description',\n", |
|
328 | 281 | "cell_type": "markdown", |
329 | 282 | "metadata": {}, |
330 | 283 | "source": [ |
331 | | - "### Full text search with BM25 ranking\n", |
332 | | - "\n", |
333 | | - "The previous query used an empty search string, which bypasses the search engine. In this query, search for \"what hotel has a good restaurant on site\". The query string undergoes lexical analysis and tokenization. The search engine scans for matches and assigns a search score based on term frequency and proximity. Higher scoring matches are returned first. In this query for \"what hotel has a good restaurant on site\", Sublime Cliff Hotel comes out on top because it's description includes \"site\". Terms that occur infrequently raise the search score of the document." |
| 284 | + "## Run a term query" |
334 | 285 | ] |
335 | 286 | }, |
336 | 287 | { |
|
339 | 290 | "metadata": {}, |
340 | 291 | "outputs": [], |
341 | 292 | "source": [ |
| 293 | + "# Run a text query (returns a BM25-scored result set)\n", |
342 | 294 | "results = search_client.search(query_type='simple',\n", |
343 | 295 | " search_text=\"what hotel has a good restaurant on site\" ,\n", |
344 | | - " select='HotelName,HotelId,Description')\n", |
345 | | - "\n", |
| 296 | + " select='HotelName,HotelId,Description',\n", |
| 297 | + " include_total_count=True)\n", |
| 298 | + " \n", |
346 | 299 | "for result in results:\n", |
347 | 300 | " print(result[\"@search.score\"])\n", |
348 | 301 | " print(result[\"HotelName\"])\n", |
|
354 | 307 | "cell_type": "markdown", |
355 | 308 | "metadata": {}, |
356 | 309 | "source": [ |
357 | | - "### Semantic search with captions\n", |
358 | | - "\n", |
359 | | - "Here's the same query, but with semantic ranking. Notice that the semantic ranker correctly identifies Triple Landscape Hotel as a more relevant result given the initial query. This query also returns captions generated by the models. The inputs are too minimal in this sample to create interesting captions, but the example succeeds in demonstrating the syntax." |
| 310 | + "## Run a semantic query" |
360 | 311 | ] |
361 | 312 | }, |
362 | 313 | { |
|
365 | 316 | "metadata": {}, |
366 | 317 | "outputs": [], |
367 | 318 | "source": [ |
| 319 | + "# Runs a semantic query (runs a BM25-ranked query and promotes the most relevant matches to the top)\n", |
368 | 320 | "results = search_client.search(query_type='semantic', semantic_configuration_name='my-semantic-config',\n", |
369 | 321 | " search_text=\"what hotel has a good restaurant on site\", \n", |
370 | 322 | " select='HotelName,Description,Category', query_caption='extractive')\n", |
|
373 | 325 | " print(result[\"@search.reranker_score\"])\n", |
374 | 326 | " print(result[\"HotelName\"])\n", |
375 | 327 | " print(f\"Description: {result['Description']}\")\n", |
376 | | - " \n", |
| 328 | + "\n", |
377 | 329 | " captions = result[\"@search.captions\"]\n", |
378 | 330 | " if captions:\n", |
379 | 331 | " caption = captions[0]\n", |
|
384 | 336 | ] |
385 | 337 | }, |
386 | 338 | { |
387 | | - "attachments": {}, |
388 | 339 | "cell_type": "markdown", |
389 | 340 | "metadata": {}, |
390 | 341 | "source": [ |
391 | | - "### Add semantic answers\n", |
392 | | - "\n", |
393 | | - "Semantic search can generate answers to a query string that has the characteristics of a question. The generated answer is extracted verbatim from your content. To get a semantic answer, the question and answer must be closely aligned, and the model must find content that clearly answers the question. If potential answers don't have a high enough confidence score, the model won't return an answer. For demonstration purposes, the question in this example is designed to get a response so that you can see the syntax." |
| 342 | + "## Return semantic answers" |
394 | 343 | ] |
395 | 344 | }, |
396 | 345 | { |
|
399 | 348 | "metadata": {}, |
400 | 349 | "outputs": [], |
401 | 350 | "source": [ |
| 351 | + "# Run a semantic query that returns semantic answers \n", |
402 | 352 | "results = search_client.search(query_type='semantic', semantic_configuration_name='my-semantic-config',\n", |
403 | | - " search_text=\"what hotel stands out for its gastronomic excellence\", \n", |
404 | | - " select='HotelName,Description,Category', query_caption='extractive', query_answer=\"extractive\",)\n", |
| 353 | + " search_text=\"what hotel is in a historic building\",\n", |
| 354 | + " select='HotelName,Description,Category', query_caption='extractive', query_answer=\"extractive\",)\n", |
405 | 355 | "\n", |
406 | 356 | "semantic_answers = results.get_answers()\n", |
407 | 357 | "for answer in semantic_answers:\n", |
|
415 | 365 | " print(result[\"@search.reranker_score\"])\n", |
416 | 366 | " print(result[\"HotelName\"])\n", |
417 | 367 | " print(f\"Description: {result['Description']}\")\n", |
418 | | - " \n", |
| 368 | + "\n", |
419 | 369 | " captions = result[\"@search.captions\"]\n", |
420 | 370 | " if captions:\n", |
421 | 371 | " caption = captions[0]\n", |
|
448 | 398 | "outputs": [], |
449 | 399 | "source": [ |
450 | 400 | "try:\n", |
451 | | - " result = admin_client.delete_index(index_name)\n", |
| 401 | + " result = index_client.delete_index(index_name)\n", |
452 | 402 | " print ('Index', index_name, 'Deleted')\n", |
453 | 403 | "except Exception as ex:\n", |
454 | 404 | " print (ex)" |
|
469 | 419 | "outputs": [], |
470 | 420 | "source": [ |
471 | 421 | "try:\n", |
472 | | - " result = admin_client.get_index(index_name)\n", |
| 422 | + " result = index_client.get_index(index_name)\n", |
473 | 423 | " print (result)\n", |
474 | 424 | "except Exception as ex:\n", |
475 | 425 | " print (ex)\n" |
|
0 commit comments