Skip to content

Commit 041fdf2

Browse files
committed
Added Python Cognitive API tutorial example + updated readme file
1 parent 2864f3b commit 041fdf2

File tree

2 files changed

+322
-1
lines changed

2 files changed

+322
-1
lines changed

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,8 @@ This repository contains Python sample code used in Azure Search quickstarts, tu
1515

1616
## Quickstart-Jupyter-Notebook
1717

18-
This sample is a .ipynb file containing a Python3 notebook used in [Quickstart: Create and query an Azure Search index using a Jupyter Python notebook](https://docs.microsoft.com/azure/search/search-get-started-python). There are two placeholder values for an Azure Search service and admin API key. Replace them with valid values to create, load, and query an index on your own service.
18+
This sample is a .ipynb file containing a Python3 notebook used in [Quickstart: Create and query an Azure Search index using a Jupyter Python notebook](https://docs.microsoft.com/azure/search/search-get-started-python). There are two placeholder values for an Azure Search service and admin API key. Replace them with valid values to create, load, and query an index on your own service.
19+
20+
## Tutorial-AI-Enrichment-Jupyter-Notebook
21+
22+
This sample is a .ipynb file containing a Python3 notebook used in [Python Tutorial: Call Cognitive Services APIs in an Azure Search indexing pipeline](https://docs.microsoft.com/azure/search/cognitive-search-tutorial-blob-python). There are three placeholder values to insert: an Azure Search service, an admin API key, and a connection string to a blob storage resource that you will create in the tutorial. Replace them with valid values to create an indexing pipeline that searches for and extracts text and text representations of images and scanned documents. This sample leverages cognitive skills from the Azure Cognitive Services API, such as entity recognition and language detection.
Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import json\n",
10+
"import requests\n",
11+
"from pprint import pprint"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"#Define the names for the data source, skillset, index and indexer\n",
21+
"datasource_name=\"cogsrch-py-datasource\"\n",
22+
"skillset_name=\"cogsrch-py-skillset\"\n",
23+
"index_name=\"cogsrch-py-index\"\n",
24+
"indexer_name=\"cogsrch-py-indexer\"\n"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": null,
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"#Setup the endpoint\n",
34+
"endpoint = 'https://<YOUR-SERVICE-NAME>.search.windows.net/'\n",
35+
"headers = {'Content-Type': 'application/json',\n",
36+
" 'api-key': '<YOUR-ADMIN-API-KEY>' }\n",
37+
"params = {\n",
38+
" 'api-version': '2019-05-06'\n",
39+
"}"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"#Create a data source\n",
49+
"datasourceConnectionString = \"<YOUR-BLOB-RESOURCE-CONNECTION-STRING>\"\n",
50+
"datasource_payload = {\n",
51+
" \"name\": datasource_name,\n",
52+
" \"description\": \"Demo files to demonstrate cognitive search capabilities.\",\n",
53+
" \"type\": \"azureblob\",\n",
54+
" \"credentials\": {\n",
55+
" \"connectionString\": datasourceConnectionString\n",
56+
" },\n",
57+
" \"container\": {\n",
58+
" \"name\": \"basic-demo-data-pr\"\n",
59+
" }\n",
60+
"}\n",
61+
"r = requests.put( endpoint + \"/datasources/\" + datasource_name, data=json.dumps(datasource_payload), headers=headers, params=params )\n",
62+
"pprint (r.json())"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": null,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
"#Create a skillset\n",
72+
"skillset_payload = {\n",
73+
" \"name\": skillset_name,\n",
74+
" \"description\":\n",
75+
" \"Extract entities, detect language and extract key-phrases\",\n",
76+
" \"skills\":\n",
77+
" [\n",
78+
" {\n",
79+
" \"@odata.type\": \"#Microsoft.Skills.Text.EntityRecognitionSkill\",\n",
80+
" \"categories\": [ \"Organization\" ],\n",
81+
" \"defaultLanguageCode\": \"en\",\n",
82+
" \"inputs\": [\n",
83+
" {\n",
84+
" \"name\": \"text\", \"source\": \"/document/content\"\n",
85+
" }\n",
86+
" ],\n",
87+
" \"outputs\": [\n",
88+
" {\n",
89+
" \"name\": \"organizations\", \"targetName\": \"organizations\"\n",
90+
" }\n",
91+
" ]\n",
92+
" },\n",
93+
" {\n",
94+
" \"@odata.type\": \"#Microsoft.Skills.Text.LanguageDetectionSkill\",\n",
95+
" \"inputs\": [\n",
96+
" {\n",
97+
" \"name\": \"text\", \"source\": \"/document/content\"\n",
98+
" }\n",
99+
" ],\n",
100+
" \"outputs\": [\n",
101+
" {\n",
102+
" \"name\": \"languageCode\",\n",
103+
" \"targetName\": \"languageCode\"\n",
104+
" }\n",
105+
" ]\n",
106+
" },\n",
107+
" {\n",
108+
" \"@odata.type\": \"#Microsoft.Skills.Text.SplitSkill\",\n",
109+
" \"textSplitMode\" : \"pages\",\n",
110+
" \"maximumPageLength\": 4000,\n",
111+
" \"inputs\": [\n",
112+
" {\n",
113+
" \"name\": \"text\",\n",
114+
" \"source\": \"/document/content\"\n",
115+
" },\n",
116+
" {\n",
117+
" \"name\": \"languageCode\",\n",
118+
" \"source\": \"/document/languageCode\"\n",
119+
" }\n",
120+
" ],\n",
121+
" \"outputs\": [\n",
122+
" {\n",
123+
" \"name\": \"textItems\",\n",
124+
" \"targetName\": \"pages\"\n",
125+
" }\n",
126+
" ]\n",
127+
" },\n",
128+
" {\n",
129+
" \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\n",
130+
" \"context\": \"/document/pages/*\",\n",
131+
" \"inputs\": [\n",
132+
" {\n",
133+
" \"name\": \"text\", \"source\": \"/document/pages/*\"\n",
134+
" },\n",
135+
" {\n",
136+
" \"name\":\"languageCode\", \"source\": \"/document/languageCode\"\n",
137+
" }\n",
138+
" ],\n",
139+
" \"outputs\": [\n",
140+
" {\n",
141+
" \"name\": \"keyPhrases\",\n",
142+
" \"targetName\": \"keyPhrases\"\n",
143+
" }\n",
144+
" ]\n",
145+
" }\n",
146+
" ]\n",
147+
"}\n",
148+
"\n",
149+
"r = requests.put(endpoint + \"/skillsets/\" + skillset_name, data=json.dumps(skillset_payload), headers=headers, params=params)\n",
150+
"pprint(r.json())"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"#Create an index\n",
160+
"index_payload = {\n",
161+
" \"name\": index_name,\n",
162+
" \"fields\": [\n",
163+
" {\n",
164+
" \"name\": \"id\",\n",
165+
" \"type\": \"Edm.String\",\n",
166+
" \"key\": \"true\",\n",
167+
" \"searchable\": \"true\",\n",
168+
" \"filterable\": \"false\",\n",
169+
" \"facetable\": \"false\",\n",
170+
" \"sortable\": \"true\"\n",
171+
" },\n",
172+
" {\n",
173+
" \"name\": \"content\",\n",
174+
" \"type\": \"Edm.String\",\n",
175+
" \"sortable\": \"false\",\n",
176+
" \"searchable\": \"true\",\n",
177+
" \"filterable\": \"false\",\n",
178+
" \"facetable\": \"false\"\n",
179+
" },\n",
180+
" {\n",
181+
" \"name\": \"languageCode\",\n",
182+
" \"type\": \"Edm.String\",\n",
183+
" \"searchable\": \"true\",\n",
184+
" \"filterable\": \"false\",\n",
185+
" \"facetable\": \"false\"\n",
186+
" },\n",
187+
" {\n",
188+
" \"name\": \"keyPhrases\",\n",
189+
" \"type\": \"Collection(Edm.String)\",\n",
190+
" \"searchable\": \"true\",\n",
191+
" \"filterable\": \"false\",\n",
192+
" \"facetable\": \"false\"\n",
193+
" },\n",
194+
" {\n",
195+
" \"name\": \"organizations\",\n",
196+
" \"type\": \"Collection(Edm.String)\",\n",
197+
" \"searchable\": \"true\",\n",
198+
" \"sortable\": \"false\",\n",
199+
" \"filterable\": \"false\",\n",
200+
" \"facetable\": \"false\"\n",
201+
" }\n",
202+
" ]\n",
203+
"}\n",
204+
"\n",
205+
"r = requests.put(endpoint + \"/indexes/\" + index_name, data=json.dumps(index_payload), headers=headers, params=params)\n",
206+
"pprint(r.json())"
207+
]
208+
},
209+
{
210+
"cell_type": "code",
211+
"execution_count": null,
212+
"metadata": {},
213+
"outputs": [],
214+
"source": [
215+
"# Create an indexer\n",
216+
"indexer_payload = {\n",
217+
" \"name\": indexer_name,\n",
218+
" \"dataSourceName\": datasource_name,\n",
219+
" \"targetIndexName\": index_name,\n",
220+
" \"skillsetName\": skillset_name,\n",
221+
" \"fieldMappings\" : [\n",
222+
" {\n",
223+
" \"sourceFieldName\" : \"metadata_storage_path\",\n",
224+
" \"targetFieldName\" : \"id\",\n",
225+
" \"mappingFunction\" :\n",
226+
" { \"name\" : \"base64Encode\" }\n",
227+
" },\n",
228+
" {\n",
229+
" \"sourceFieldName\" : \"content\",\n",
230+
" \"targetFieldName\" : \"content\"\n",
231+
" }\n",
232+
" ],\n",
233+
" \"outputFieldMappings\" :\n",
234+
" [\n",
235+
" {\n",
236+
" \"sourceFieldName\" : \"/document/organizations\",\n",
237+
" \"targetFieldName\" : \"organizations\"\n",
238+
" },\n",
239+
" {\n",
240+
" \"sourceFieldName\" : \"/document/pages/*/keyPhrases/*\",\n",
241+
" \"targetFieldName\" : \"keyPhrases\"\n",
242+
" },\n",
243+
" {\n",
244+
" \"sourceFieldName\": \"/document/languageCode\",\n",
245+
" \"targetFieldName\": \"languageCode\"\n",
246+
" }\n",
247+
" ],\n",
248+
" \"parameters\":\n",
249+
" {\n",
250+
" \"maxFailedItems\":-1,\n",
251+
" \"maxFailedItemsPerBatch\":-1,\n",
252+
" \"configuration\":\n",
253+
" {\n",
254+
" \"dataToExtract\": \"contentAndMetadata\",\n",
255+
" \"imageAction\": \"generateNormalizedImages\"\n",
256+
" }\n",
257+
" }\n",
258+
"}\n",
259+
"\n",
260+
"r = requests.put(endpoint + \"/indexers/\" + indexer_name, data=json.dumps(indexer_payload), headers=headers, params=params)\n",
261+
"pprint(r.json())\n"
262+
]
263+
},
264+
{
265+
"cell_type": "code",
266+
"execution_count": null,
267+
"metadata": {},
268+
"outputs": [],
269+
"source": [
270+
"#Query the index for all fields\n",
271+
"r = requests.get(endpoint + \"/indexes/\" + index_name, headers=headers,params=params)\n",
272+
"print(json.dumps(r.json(), indent=1))"
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": null,
278+
"metadata": {},
279+
"outputs": [],
280+
"source": [
281+
"#Query the index to return the contents of organizations\n",
282+
"#Note: Index creation may take time. If this step returns no data, wait a few minutes\n",
283+
"# and then try again\n",
284+
"r = requests.get(endpoint + \"/indexes/\" + index_name + \"/docs?&search=*&$select=organizations\", headers=headers, params=params)\n",
285+
"pprint(r.json())"
286+
]
287+
},
288+
{
289+
"cell_type": "code",
290+
"execution_count": null,
291+
"metadata": {},
292+
"outputs": [],
293+
"source": []
294+
}
295+
],
296+
"metadata": {
297+
"kernelspec": {
298+
"display_name": "Python 3",
299+
"language": "python",
300+
"name": "python3"
301+
},
302+
"language_info": {
303+
"codemirror_mode": {
304+
"name": "ipython",
305+
"version": 3
306+
},
307+
"file_extension": ".py",
308+
"mimetype": "text/x-python",
309+
"name": "python",
310+
"nbconvert_exporter": "python",
311+
"pygments_lexer": "ipython3",
312+
"version": "3.7.3"
313+
}
314+
},
315+
"nbformat": 4,
316+
"nbformat_minor": 2
317+
}

0 commit comments

Comments
 (0)