
Commit 2f43368

added PII redaction
1 parent 0a473e4 commit 2f43368

4 files changed (+137, -31 lines)

Image-Processing/BFR_Sample_Rest.ipynb

Lines changed: 72 additions & 18 deletions
@@ -193,7 +193,32 @@
 " }\n",
 " ]\n",
 " },\n",
-" \n",
+" {\n",
+" \"@odata.type\": \"#Microsoft.Skills.Text.PIIDetectionSkill\",\n",
+" \"name\": \"#1\",\n",
+" \"description\": None,\n",
+" \"context\": \"/document/merged_content\",\n",
+" \"defaultLanguageCode\": \"en\",\n",
+" \"minimumPrecision\": 0.5,\n",
+" \"maskingMode\": \"replace\",\n",
+" \"maskingCharacter\": \"*\",\n",
+" \"inputs\": [\n",
+" {\n",
+" \"name\": \"text\",\n",
+" \"source\": \"/document/merged_content\"\n",
+" }\n",
+" ],\n",
+" \"outputs\": [\n",
+" {\n",
+" \"name\": \"piiEntities\",\n",
+" \"targetName\": \"pii_entities\"\n",
+" },\n",
+" {\n",
+" \"name\": \"maskedText\",\n",
+" \"targetName\": \"masked_text\"\n",
+" }\n",
+" ]\n",
+" },\n",
 " {\n",
 " \"@odata.type\": \"#Microsoft.Skills.Custom.WebApiSkill\",\n",
 " \"name\": \"ImageSkill\",\n",
@@ -212,40 +237,69 @@
 " {\n",
 " \"name\": \"layoutText\",\n",
 " \"source\": \"/document/normalized_images/*/layoutText\"\n",
+" },\n",
+" {\n",
+" \"name\": \"pii_entities\",\n",
+" \"source\": \"/document/merged_content/pii_entities\"\n",
 " }\n",
 " ],\n",
 " \"outputs\": [\n",
 " {\n",
 " \"name\": \"slices\",\n",
 " \"targetName\": \"slices\"\n",
+" },\n",
+" {\n",
+" \"name\": \"original\",\n",
+" \"targetName\": \"original\"\n",
 " }\n",
 " ],\n",
 " \"httpHeaders\": {}\n",
 " }\n",
 " \n",
 " ],\n",
-" 'cognitiveServices': {\n",
-" '@odata.type': '#Microsoft.Azure.Search.CognitiveServicesByKey',\n",
-" 'description': '/subscriptions/subscription_id/resourceGroups/resource_group/providers/Microsoft.CognitiveServices/accounts/cog_svcs_acct',\n",
-" 'key': f'{cog_svcs_key}'\n",
-" },\n",
+" 'cognitiveServices': None,\n",
 " 'knowledgeStore': {\n",
 " 'storageConnectionString': f'{STORAGECONNSTRING}',\n",
 " 'projections': [\n",
 " {\n",
-" \"tables\": [],\n",
-" \"objects\": [],\n",
-" \"files\": [\n",
-" {\n",
-" \"storageContainer\": f'{know_store_container}',\n",
-" \"referenceKeyName\": None,\n",
-" \"generatedKeyName\": \"slicesKey\",\n",
-" \"source\": \"/document/normalized_images/*/slices/*\",\n",
-" \"sourceContext\": None,\n",
-" \"inputs\": []\n",
-" }\n",
-" ]\n",
+" \"tables\": [],\n",
+" \"objects\": [\n",
+" {\n",
+" \"storageContainer\": \"layout\",\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"layoutKey\",\n",
+" \"source\": \"/document/normalized_images/*/layoutText\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
 " }\n",
+" ],\n",
+" \"files\": [\n",
+" {\n",
+" \"storageContainer\": \"slices\",\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"slicesKey\",\n",
+" \"source\": \"/document/normalized_images/*/slices/*\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
+" },\n",
+" {\n",
+" \"storageContainer\": \"images\",\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"imageKey\",\n",
+" \"source\": \"/document/normalized_images/*\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
+" },\n",
+" {\n",
+" \"storageContainer\": f'{know_store_container}',\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"originalKey\",\n",
+" \"source\": \"/document/normalized_images/*/original\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
+" }\n",
+" ]\n",
+" }\n",
 " ]\n",
 " }\n",
 "}\n",

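The PIIDetectionSkill added above scans the merged OCR text (`/document/merged_content`) and hands the detected entities to the custom `ImageSkill` as `pii_entities`; the knowledge-store projections then write the layout text, the redacted slices, and the redacted originals to separate containers. The notebook presumably registers this skillset through the Search REST API; the following is a minimal sketch of that call, where the service name, skillset name, admin key, and the `skillset` stand-in are all illustrative placeholders:

```python
import requests

# Placeholder values for illustration only.
service_name = "my-search-service"
skillset_name = "image-pii-skillset"
api_version = "2020-06-30"
admin_key = "<search-admin-key>"

# Stand-in for the full skillset body assembled in the notebook cell above.
skillset = {"name": skillset_name, "description": "PII redaction sample", "skills": []}

url = (f"https://{service_name}.search.windows.net/"
       f"skillsets/{skillset_name}?api-version={api_version}")
headers = {"Content-Type": "application/json", "api-key": admin_key}

# PUT creates the skillset if it does not exist, or replaces it if it does.
response = requests.put(url, headers=headers, json=skillset)
response.raise_for_status()
```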
Image-Processing/Microsoft.jpg

241 KB

Image-Processing/README.md

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Image Processing Sample
+
+Cognitive Search can enrich images with text, or enrich images with other images. This sample demonstrates how to pass images to a custom skill and return images from the custom skill back to the skillset.
+
+## Redacting PII from images
+
+This sample deploys a skillset that obfuscates, or redacts, phone numbers found in images. The skillset contains three skills:
+1. OCR
+2. PII detection
+3. A custom skill to redact the PII
+
+The skillset OCRs the images and runs the extracted text through the PII detection skill to identify PII. The custom skill then takes the image, the layout text from OCR, and the identified PII, and obfuscates the matching regions of the image. The image with the PII obfuscated is returned to the skillset and projected to the knowledge store.
+
+## Configure the components
+
+This sample contains an Azure Function and a Jupyter (Python 3) notebook. Start by deploying the Azure Function and saving its URL and code.
+
+The folder also contains a sample image with a phone number. Save this image to a container in a storage account; this container will be the data source for the enrichment pipeline.
+
+Open the notebook in this folder, set the URL and other required variables in the first cell, then execute the cells to configure and run the solution.
+
+## Validation
+Once the indexer completes, you will see a container `obfuscated` in the knowledge store with the phone number redacted. For comparison, the original images are stored in a container `images`.
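For reference, a custom Web API skill like `ImageSkill` exchanges records in the Cognitive Search custom-skill contract: the service POSTs a `values` array, and the function must answer with one record per `recordId`. Below is a sketch of the shapes this sample exchanges, with made-up field values; the `image` input name is an assumption, while the other names appear in the skillset and function code.

```python
# Illustrative shapes only; real values come from the OCR and PII skills.
sample_request = {
    "values": [{
        "recordId": "0",
        "data": {
            "image": {"$type": "file", "data": "<base64-encoded JPEG>"},  # assumed input name
            "layoutText": {"lines": [{
                "text": "Call 425-555-0100",
                "boundingBox": [{"x": 10, "y": 20}, {"x": 220, "y": 20},
                                {"x": 220, "y": 44}, {"x": 10, "y": 44}],
            }]},
            "pii_entities": [{"type": "Phone Number", "text": "425-555-0100"}],
        },
    }]
}

sample_response = {
    "values": [{
        "recordId": "0",
        "data": {
            "slices": [{"$type": "file", "data": "<base64 of each blurred region>"}],
            "original": {"$type": "file", "data": "<base64 of the full redacted image>"},
        },
    }]
}
```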

Image-Processing/SplitImage/ImageSkill/__init__.py

Lines changed: 42 additions & 13 deletions
@@ -7,6 +7,26 @@
 import logging
 import azure.functions as func
 
+def base64EncodeImage(image):
+    is_success, im_buf_arr = cv2.imencode(".jpg", image)
+    byte_im = im_buf_arr.tobytes()
+    base64Bytes = base64.b64encode(byte_im)
+    base64String = base64Bytes.decode('utf-8')
+    return base64String
+
+def obfuscate_data(image, factor=3.0):
+    (h, w) = image.shape[:2]
+    kW = int(w / factor)
+    kH = int(h / factor)
+    # ensure the width of the kernel is odd
+    if kW % 2 == 0:
+        kW -= 1
+    # ensure the height of the kernel is odd
+    if kH % 2 == 0:
+        kH -= 1
+    # apply a Gaussian blur to the input image using our computed
+    # kernel size
+    return cv2.GaussianBlur(image, (kW, kH), 0)
 
 def main(req: func.HttpRequest) -> func.HttpResponse:
     logging.info('Python HTTP trigger function processed a request.')
@@ -60,17 +80,23 @@ def transform_value(value):
         jpg_as_np = np.frombuffer(inputBytes, dtype=np.uint8)
         originalImage = cv2.imdecode(jpg_as_np, flags=1)
         slices = []
-        for line in data["layoutText"]["lines"]:
-            slicedImage = originalImage[line["boundingBox"][0]["x"]:line["boundingBox"][0]["y"], line["boundingBox"][3]["x"]:line["boundingBox"][3]["y"]]
-            if(slicedImage.size > 0):
-                is_success, im_buf_arr = cv2.imencode(".jpg", slicedImage)
-                byte_im = im_buf_arr.tobytes()
-                base64Bytes = base64.b64encode(byte_im)
-                base64String = base64Bytes.decode('utf-8')
-                aslice = { "$type": "file",
-                           "data": base64String
-                         }
-                slices.append(aslice)
+        for pii_entity in data["pii_entities"]:
+            if(pii_entity["type"] == "Phone Number"):
+                for line in data["layoutText"]["lines"]:
+                    if(pii_entity["text"] in line["text"]):
+                        startX = line["boundingBox"][0]["x"]
+                        startY = line["boundingBox"][0]["y"]
+                        endX = line["boundingBox"][2]["x"]
+                        endY = line["boundingBox"][2]["y"]
+                        slicedImage = originalImage[startY:endY, startX:endX]
+                        if(slicedImage.size > 0):
+                            fuzzy = obfuscate_data(slicedImage)
+                            originalImage[startY:endY, startX:endX] = fuzzy
+                            base64String = base64EncodeImage(slicedImage)
+                            aslice = { "$type": "file",
+                                       "data": base64String
+                                     }
+                            slices.append(aslice)
 
 
     except AssertionError as error:
@@ -83,8 +109,11 @@ def transform_value(value):
 
 
     return ({
-            "recordId": recordId,
+        "recordId": recordId,
         "data": {
-            "slices": slices
+            "slices": slices,
+            "original": { "$type": "file",
+                          "data": base64EncodeImage(originalImage)
+                        }
         }
     })
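To see the redaction step in isolation, the Gaussian-blur trick from `obfuscate_data` can be exercised locally; here is a minimal sketch, assuming OpenCV is installed, with a hypothetical `sample.jpg` and bounding box standing in for the OCR output:

```python
import cv2

def obfuscate_data(image, factor=3.0):
    # Same approach as the skill: blur with a kernel sized relative
    # to the region, keeping both kernel dimensions odd.
    (h, w) = image.shape[:2]
    kW, kH = int(w / factor), int(h / factor)
    if kW % 2 == 0:
        kW -= 1
    if kH % 2 == 0:
        kH -= 1
    return cv2.GaussianBlur(image, (kW, kH), 0)

image = cv2.imread("sample.jpg")                 # hypothetical input image
startX, startY, endX, endY = 100, 200, 420, 260  # hypothetical phone-number box
region = image[startY:endY, startX:endX]
image[startY:endY, startX:endX] = obfuscate_data(region)
cv2.imwrite("redacted.jpg", image)
```

Note that in the function code, `slicedImage` is a NumPy view into `originalImage`, so writing the blurred pixels back into the original also updates the slice that gets base64-encoded into `slices`.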
