Skip to content

Commit 041fdf2

Browse files
committed
Added Python Cognitive API tutorial example + updated readme file
1 parent 2864f3b commit 041fdf2

File tree

2 files changed

+322
-1
lines changed

2 files changed

+322
-1
lines changed

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,8 @@ This repository contains Python sample code used in Azure Search quickstarts, tu
1515

1616
## Quickstart-Jupyter-Notebook
1717

18-
This sample is a .ipynb file containing a Python3 notebook used in [Quickstart: Create and query an Azure Search index using a Jupyter Python notebook](https://docs.microsoft.com/azure/search/search-get-started-python). There are two placeholder values for an Azure Search service and admin API key. Replace them with valid values to create, load, and query an index on your own service.
18+
This sample is a .ipynb file containing a Python3 notebook used in [Quickstart: Create and query an Azure Search index using a Jupyter Python notebook](https://docs.microsoft.com/azure/search/search-get-started-python). There are two placeholder values for an Azure Search service and admin API key. Replace them with valid values to create, load, and query an index on your own service.
19+
20+
## Tutorial-AI-Enrichment-Jupyter-Notebook
21+
22+
This sample is a .ipynb file containing a Python3 notebook used in [Python Tutorial: Call Cognitive Services APIs in an Azure Search indexing pipeline](https://docs.microsoft.com/azure/search/cognitive-search-tutorial-blob-python). There are three placeholder values to insert: an Azure Search service, an admin API key, and a connection string to a blob storage resource that you will create in the tutorial. Replace them with valid values to create an indexing pipeline that searches for and extracts text and text representations of images and scanned documents. This sample leverages cognitive skills from the Azure Cognitive Services API, such as entity recognition and language detection.
Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import json\n",
10+
"import requests\n",
11+
"from pprint import pprint"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"#Define the names for the data source, skillset, index and indexer\n",
21+
"datasource_name=\"cogsrch-py-datasource\"\n",
22+
"skillset_name=\"cogsrch-py-skillset\"\n",
23+
"index_name=\"cogsrch-py-index\"\n",
24+
"indexer_name=\"cogsrch-py-indexer\"\n"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": null,
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"#Setup the endpoint\n",
34+
"endpoint = 'https://<YOUR-SERVICE-NAME>.search.windows.net/'\n",
35+
"headers = {'Content-Type': 'application/json',\n",
36+
" 'api-key': '<YOUR-ADMIN-API-KEY>' }\n",
37+
"params = {\n",
38+
" 'api-version': '2019-05-06'\n",
39+
"}"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"#Create a data source\n",
49+
"datasourceConnectionString = \"<YOUR-BLOB-RESOURCE-CONNECTION-STRING>\"\n",
50+
"datasource_payload = {\n",
51+
" \"name\": datasource_name,\n",
52+
" \"description\": \"Demo files to demonstrate cognitive search capabilities.\",\n",
53+
" \"type\": \"azureblob\",\n",
54+
" \"credentials\": {\n",
55+
" \"connectionString\": datasourceConnectionString\n",
56+
" },\n",
57+
" \"container\": {\n",
58+
" \"name\": \"basic-demo-data-pr\"\n",
59+
" }\n",
60+
"}\n",
61+
"r = requests.put( endpoint + \"/datasources/\" + datasource_name, data=json.dumps(datasource_payload), headers=headers, params=params )\n",
62+
"pprint (r.json())"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": null,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
"#Create a skillset\n",
72+
"skillset_payload = {\n",
73+
" \"name\": skillset_name,\n",
74+
" \"description\":\n",
75+
" \"Extract entities, detect language and extract key-phrases\",\n",
76+
" \"skills\":\n",
77+
" [\n",
78+
" {\n",
79+
" \"@odata.type\": \"#Microsoft.Skills.Text.EntityRecognitionSkill\",\n",
80+
" \"categories\": [ \"Organization\" ],\n",
81+
" \"defaultLanguageCode\": \"en\",\n",
82+
" \"inputs\": [\n",
83+
" {\n",
84+
" \"name\": \"text\", \"source\": \"/document/content\"\n",
85+
" }\n",
86+
" ],\n",
87+
" \"outputs\": [\n",
88+
" {\n",
89+
" \"name\": \"organizations\", \"targetName\": \"organizations\"\n",
90+
" }\n",
91+
" ]\n",
92+
" },\n",
93+
" {\n",
94+
" \"@odata.type\": \"#Microsoft.Skills.Text.LanguageDetectionSkill\",\n",
95+
" \"inputs\": [\n",
96+
" {\n",
97+
" \"name\": \"text\", \"source\": \"/document/content\"\n",
98+
" }\n",
99+
" ],\n",
100+
" \"outputs\": [\n",
101+
" {\n",
102+
" \"name\": \"languageCode\",\n",
103+
" \"targetName\": \"languageCode\"\n",
104+
" }\n",
105+
" ]\n",
106+
" },\n",
107+
" {\n",
108+
" \"@odata.type\": \"#Microsoft.Skills.Text.SplitSkill\",\n",
109+
" \"textSplitMode\" : \"pages\",\n",
110+
" \"maximumPageLength\": 4000,\n",
111+
" \"inputs\": [\n",
112+
" {\n",
113+
" \"name\": \"text\",\n",
114+
" \"source\": \"/document/content\"\n",
115+
" },\n",
116+
" {\n",
117+
" \"name\": \"languageCode\",\n",
118+
" \"source\": \"/document/languageCode\"\n",
119+
" }\n",
120+
" ],\n",
121+
" \"outputs\": [\n",
122+
" {\n",
123+
" \"name\": \"textItems\",\n",
124+
" \"targetName\": \"pages\"\n",
125+
" }\n",
126+
" ]\n",
127+
" },\n",
128+
" {\n",
129+
" \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\n",
130+
" \"context\": \"/document/pages/*\",\n",
131+
" \"inputs\": [\n",
132+
" {\n",
133+
" \"name\": \"text\", \"source\": \"/document/pages/*\"\n",
134+
" },\n",
135+
" {\n",
136+
" \"name\":\"languageCode\", \"source\": \"/document/languageCode\"\n",
137+
" }\n",
138+
" ],\n",
139+
" \"outputs\": [\n",
140+
" {\n",
141+
" \"name\": \"keyPhrases\",\n",
142+
" \"targetName\": \"keyPhrases\"\n",
143+
" }\n",
144+
" ]\n",
145+
" }\n",
146+
" ]\n",
147+
"}\n",
148+
"\n",
149+
"r = requests.put(endpoint + \"/skillsets/\" + skillset_name, data=json.dumps(skillset_payload), headers=headers, params=params)\n",
150+
"pprint(r.json())"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"#Create an index\n",
160+
"index_payload = {\n",
161+
" \"name\": index_name,\n",
162+
" \"fields\": [\n",
163+
" {\n",
164+
" \"name\": \"id\",\n",
165+
" \"type\": \"Edm.String\",\n",
166+
" \"key\": \"true\",\n",
167+
" \"searchable\": \"true\",\n",
168+
" \"filterable\": \"false\",\n",
169+
" \"facetable\": \"false\",\n",
170+
" \"sortable\": \"true\"\n",
171+
" },\n",
172+
" {\n",
173+
" \"name\": \"content\",\n",
174+
" \"type\": \"Edm.String\",\n",
175+
" \"sortable\": \"false\",\n",
176+
" \"searchable\": \"true\",\n",
177+
" \"filterable\": \"false\",\n",
178+
" \"facetable\": \"false\"\n",
179+
" },\n",
180+
" {\n",
181+
" \"name\": \"languageCode\",\n",
182+
" \"type\": \"Edm.String\",\n",
183+
" \"searchable\": \"true\",\n",
184+
" \"filterable\": \"false\",\n",
185+
" \"facetable\": \"false\"\n",
186+
" },\n",
187+
" {\n",
188+
" \"name\": \"keyPhrases\",\n",
189+
" \"type\": \"Collection(Edm.String)\",\n",
190+
" \"searchable\": \"true\",\n",
191+
" \"filterable\": \"false\",\n",
192+
" \"facetable\": \"false\"\n",
193+
" },\n",
194+
" {\n",
195+
" \"name\": \"organizations\",\n",
196+
" \"type\": \"Collection(Edm.String)\",\n",
197+
" \"searchable\": \"true\",\n",
198+
" \"sortable\": \"false\",\n",
199+
" \"filterable\": \"false\",\n",
200+
" \"facetable\": \"false\"\n",
201+
" }\n",
202+
" ]\n",
203+
"}\n",
204+
"\n",
205+
"r = requests.put(endpoint + \"/indexes/\" + index_name, data=json.dumps(index_payload), headers=headers, params=params)\n",
206+
"pprint(r.json())"
207+
]
208+
},
209+
{
210+
"cell_type": "code",
211+
"execution_count": null,
212+
"metadata": {},
213+
"outputs": [],
214+
"source": [
215+
"# Create an indexer\n",
216+
"indexer_payload = {\n",
217+
" \"name\": indexer_name,\n",
218+
" \"dataSourceName\": datasource_name,\n",
219+
" \"targetIndexName\": index_name,\n",
220+
" \"skillsetName\": skillset_name,\n",
221+
" \"fieldMappings\" : [\n",
222+
" {\n",
223+
" \"sourceFieldName\" : \"metadata_storage_path\",\n",
224+
" \"targetFieldName\" : \"id\",\n",
225+
" \"mappingFunction\" :\n",
226+
" { \"name\" : \"base64Encode\" }\n",
227+
" },\n",
228+
" {\n",
229+
" \"sourceFieldName\" : \"content\",\n",
230+
" \"targetFieldName\" : \"content\"\n",
231+
" }\n",
232+
" ],\n",
233+
" \"outputFieldMappings\" :\n",
234+
" [\n",
235+
" {\n",
236+
" \"sourceFieldName\" : \"/document/organizations\",\n",
237+
" \"targetFieldName\" : \"organizations\"\n",
238+
" },\n",
239+
" {\n",
240+
" \"sourceFieldName\" : \"/document/pages/*/keyPhrases/*\",\n",
241+
" \"targetFieldName\" : \"keyPhrases\"\n",
242+
" },\n",
243+
" {\n",
244+
" \"sourceFieldName\": \"/document/languageCode\",\n",
245+
" \"targetFieldName\": \"languageCode\"\n",
246+
" }\n",
247+
" ],\n",
248+
" \"parameters\":\n",
249+
" {\n",
250+
" \"maxFailedItems\":-1,\n",
251+
" \"maxFailedItemsPerBatch\":-1,\n",
252+
" \"configuration\":\n",
253+
" {\n",
254+
" \"dataToExtract\": \"contentAndMetadata\",\n",
255+
" \"imageAction\": \"generateNormalizedImages\"\n",
256+
" }\n",
257+
" }\n",
258+
"}\n",
259+
"\n",
260+
"r = requests.put(endpoint + \"/indexers/\" + indexer_name, data=json.dumps(indexer_payload), headers=headers, params=params)\n",
261+
"pprint(r.json())\n"
262+
]
263+
},
264+
{
265+
"cell_type": "code",
266+
"execution_count": null,
267+
"metadata": {},
268+
"outputs": [],
269+
"source": [
270+
"#Query the index for all fields\n",
271+
"r = requests.get(endpoint + \"/indexes/\" + index_name, headers=headers,params=params)\n",
272+
"print(json.dumps(r.json(), indent=1))"
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": null,
278+
"metadata": {},
279+
"outputs": [],
280+
"source": [
281+
"#Query the index to return the contents of organizations\n",
282+
"#Note: Index creation may take time. If this step returns no data, wait a few minutes\n",
283+
"# and then try again\n",
284+
"r = requests.get(endpoint + \"/indexes/\" + index_name + \"/docs?&search=*&$select=organizations\", headers=headers, params=params)\n",
285+
"pprint(r.json())"
286+
]
287+
},
288+
{
289+
"cell_type": "code",
290+
"execution_count": null,
291+
"metadata": {},
292+
"outputs": [],
293+
"source": []
294+
}
295+
],
296+
"metadata": {
297+
"kernelspec": {
298+
"display_name": "Python 3",
299+
"language": "python",
300+
"name": "python3"
301+
},
302+
"language_info": {
303+
"codemirror_mode": {
304+
"name": "ipython",
305+
"version": 3
306+
},
307+
"file_extension": ".py",
308+
"mimetype": "text/x-python",
309+
"name": "python",
310+
"nbconvert_exporter": "python",
311+
"pygments_lexer": "ipython3",
312+
"version": "3.7.3"
313+
}
314+
},
315+
"nbformat": 4,
316+
"nbformat_minor": 2
317+
}

0 commit comments

Comments
 (0)