
Commit 2f43368

added PII redaction
1 parent 0a473e4 commit 2f43368

4 files changed (+137, -31 lines)

Image-Processing/BFR_Sample_Rest.ipynb

Lines changed: 72 additions & 18 deletions
@@ -193,7 +193,32 @@
 " }\n",
 " ]\n",
 " },\n",
-" \n",
+" {\n",
+" \"@odata.type\": \"#Microsoft.Skills.Text.PIIDetectionSkill\",\n",
+" \"name\": \"#1\",\n",
+" \"description\": None,\n",
+" \"context\": \"/document/merged_content\",\n",
+" \"defaultLanguageCode\": \"en\",\n",
+" \"minimumPrecision\": 0.5,\n",
+" \"maskingMode\": \"replace\",\n",
+" \"maskingCharacter\": \"*\",\n",
+" \"inputs\": [\n",
+" {\n",
+" \"name\": \"text\",\n",
+" \"source\": \"/document/merged_content\"\n",
+" }\n",
+" ],\n",
+" \"outputs\": [\n",
+" {\n",
+" \"name\": \"piiEntities\",\n",
+" \"targetName\": \"pii_entities\"\n",
+" },\n",
+" {\n",
+" \"name\": \"maskedText\",\n",
+" \"targetName\": \"masked_text\"\n",
+" }\n",
+" ]\n",
+" },\n",
 " {\n",
 " \"@odata.type\": \"#Microsoft.Skills.Custom.WebApiSkill\",\n",
 " \"name\": \"ImageSkill\",\n",
@@ -212,40 +237,69 @@
 " {\n",
 " \"name\": \"layoutText\",\n",
 " \"source\": \"/document/normalized_images/*/layoutText\"\n",
+" },\n",
+" {\n",
+" \"name\": \"pii_entities\",\n",
+" \"source\": \"/document/merged_content/pii_entities\"\n",
 " }\n",
 " ],\n",
 " \"outputs\": [\n",
 " {\n",
 " \"name\": \"slices\",\n",
 " \"targetName\": \"slices\"\n",
+" },\n",
+" {\n",
+" \"name\": \"original\",\n",
+" \"targetName\": \"original\"\n",
 " }\n",
 " ],\n",
 " \"httpHeaders\": {}\n",
 " }\n",
 " \n",
 " ],\n",
-" 'cognitiveServices': {\n",
-" '@odata.type': '#Microsoft.Azure.Search.CognitiveServicesByKey',\n",
-" 'description': '/subscriptions/subscription_id/resourceGroups/resource_group/providers/Microsoft.CognitiveServices/accounts/cog_svcs_acct',\n",
-" 'key': f'{cog_svcs_key}'\n",
-" },\n",
+" 'cognitiveServices': None,\n",
 " 'knowledgeStore': {\n",
 " 'storageConnectionString': f'{STORAGECONNSTRING}',\n",
 " 'projections': [\n",
 " {\n",
-" \"tables\": [],\n",
-" \"objects\": [],\n",
-" \"files\": [\n",
-" {\n",
-" \"storageContainer\": f'{know_store_container}',\n",
-" \"referenceKeyName\": None,\n",
-" \"generatedKeyName\": \"slicesKey\",\n",
-" \"source\": \"/document/normalized_images/*/slices/*\",\n",
-" \"sourceContext\": None,\n",
-" \"inputs\": []\n",
-" }\n",
-" ]\n",
+" \"tables\": [],\n",
+" \"objects\": [\n",
+" {\n",
+" \"storageContainer\": \"layout\",\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"layoutKey\",\n",
+" \"source\": \"/document/normalized_images/*/layoutText\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
 " }\n",
+" ],\n",
+" \"files\": [\n",
+" {\n",
+" \"storageContainer\": \"slices\",\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"slicesKey\",\n",
+" \"source\": \"/document/normalized_images/*/slices/*\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
+" },\n",
+" {\n",
+" \"storageContainer\": \"images\",\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"imageKey\",\n",
+" \"source\": \"/document/normalized_images/*\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
+" },\n",
+" {\n",
+" \"storageContainer\": f'{know_store_container}',\n",
+" \"referenceKeyName\": None,\n",
+" \"generatedKeyName\": \"originalKey\",\n",
+" \"source\": \"/document/normalized_images/*/original\",\n",
+" \"sourceContext\": None,\n",
+" \"inputs\": []\n",
+" }\n",
+" ]\n",
+" }\n",
 " ]\n",
 " }\n",
 "}\n",

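The PIIDetectionSkill added above scans the merged OCR text (`/document/merged_content`) and hands the detected entities to the custom `ImageSkill` as `pii_entities`; the knowledge-store projections then write the layout text, the redacted slices, and the redacted originals to separate containers. The notebook presumably registers this skillset through the Search REST API; the following is a minimal sketch of that call, where the service name, skillset name, admin key, and the `skillset` stand-in are all illustrative placeholders:

```python
import requests

# Placeholder values for illustration only.
service_name = "my-search-service"
skillset_name = "image-pii-skillset"
api_version = "2020-06-30"
admin_key = "<search-admin-key>"

# Stand-in for the full skillset body assembled in the notebook cell above.
skillset = {"name": skillset_name, "description": "PII redaction sample", "skills": []}

url = (f"https://{service_name}.search.windows.net/"
       f"skillsets/{skillset_name}?api-version={api_version}")
headers = {"Content-Type": "application/json", "api-key": admin_key}

# PUT creates the skillset if it does not exist, or replaces it if it does.
response = requests.put(url, headers=headers, json=skillset)
response.raise_for_status()
```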
Image-Processing/Microsoft.jpg

241 KB

Image-Processing/README.md

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Image Processing Sample
+
+Cognitive Search can enrich images with text, or enrich images with other images. This sample demonstrates how to pass images to a custom skill and return images from the custom skill back to the skillset.
+
+## Redacting PII from images
+
+This sample deploys a skillset that obfuscates, or redacts, phone numbers found in images. The skillset contains three skills:
+1. OCR
+2. PII detection
+3. A custom skill to redact the PII
+
+The skillset OCRs the images and runs the extracted text through the PII detection skill to identify PII. The custom skill then takes the image, the layout text from OCR, and the identified PII, and obfuscates the matching regions of the image. The image with the PII obfuscated is returned to the skillset and projected to the knowledge store.
+
+## Configure the components
+
+This sample contains an Azure Function and a Jupyter (Python 3) notebook. Start by deploying the Azure Function and saving its URL and code.
+
+The folder also contains a sample image with a phone number. Save this image to a container in a storage account; this container will be the data source for the enrichment pipeline.
+
+Open the notebook in this folder, set the URL and other required variables in the first cell, then execute the cells to configure and run the solution.
+
+## Validation
+Once the indexer completes, you will see a container `obfuscated` in the knowledge store with the phone number redacted. For comparison, the original images are stored in a container `images`.
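For reference, a custom Web API skill like `ImageSkill` exchanges records in the Cognitive Search custom-skill contract: the service POSTs a `values` array, and the function must answer with one record per `recordId`. Below is a sketch of the shapes this sample exchanges, with made-up field values; the `image` input name is an assumption, while the other names appear in the skillset and function code.

```python
# Illustrative shapes only; real values come from the OCR and PII skills.
sample_request = {
    "values": [{
        "recordId": "0",
        "data": {
            "image": {"$type": "file", "data": "<base64-encoded JPEG>"},  # assumed input name
            "layoutText": {"lines": [{
                "text": "Call 425-555-0100",
                "boundingBox": [{"x": 10, "y": 20}, {"x": 220, "y": 20},
                                {"x": 220, "y": 44}, {"x": 10, "y": 44}],
            }]},
            "pii_entities": [{"type": "Phone Number", "text": "425-555-0100"}],
        },
    }]
}

sample_response = {
    "values": [{
        "recordId": "0",
        "data": {
            "slices": [{"$type": "file", "data": "<base64 of each blurred region>"}],
            "original": {"$type": "file", "data": "<base64 of the full redacted image>"},
        },
    }]
}
```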

Image-Processing/SplitImage/ImageSkill/__init__.py

Lines changed: 42 additions & 13 deletions
@@ -7,6 +7,26 @@
 import logging
 import azure.functions as func
 
+def base64EncodeImage(image):
+    is_success, im_buf_arr = cv2.imencode(".jpg", image)
+    byte_im = im_buf_arr.tobytes()
+    base64Bytes = base64.b64encode(byte_im)
+    base64String = base64Bytes.decode('utf-8')
+    return base64String
+
+def obfuscate_data(image, factor=3.0):
+    (h, w) = image.shape[:2]
+    kW = int(w / factor)
+    kH = int(h / factor)
+    # ensure the width of the kernel is odd
+    if kW % 2 == 0:
+        kW -= 1
+    # ensure the height of the kernel is odd
+    if kH % 2 == 0:
+        kH -= 1
+    # apply a Gaussian blur to the input image using our computed
+    # kernel size
+    return cv2.GaussianBlur(image, (kW, kH), 0)
 
 def main(req: func.HttpRequest) -> func.HttpResponse:
     logging.info('Python HTTP trigger function processed a request.')
@@ -60,17 +80,23 @@ def transform_value(value):
         jpg_as_np = np.frombuffer(inputBytes, dtype=np.uint8)
         originalImage = cv2.imdecode(jpg_as_np, flags=1)
         slices = []
-        for line in data["layoutText"]["lines"]:
-            slicedImage = originalImage[line["boundingBox"][0]["x"]:line["boundingBox"][0]["y"], line["boundingBox"][3]["x"]:line["boundingBox"][3]["y"]]
-            if(slicedImage.size > 0):
-                is_success, im_buf_arr = cv2.imencode(".jpg", slicedImage)
-                byte_im = im_buf_arr.tobytes()
-                base64Bytes = base64.b64encode(byte_im)
-                base64String = base64Bytes.decode('utf-8')
-                aslice = { "$type": "file",
-                           "data": base64String
-                         }
-                slices.append(aslice)
+        for pii_entity in data["pii_entities"]:
+            if(pii_entity["type"] == "Phone Number"):
+                for line in data["layoutText"]["lines"]:
+                    if(pii_entity["text"] in line["text"]):
+                        startX = line["boundingBox"][0]["x"]
+                        startY = line["boundingBox"][0]["y"]
+                        endX = line["boundingBox"][2]["x"]
+                        endY = line["boundingBox"][2]["y"]
+                        slicedImage = originalImage[startY:endY, startX:endX]
+                        if(slicedImage.size > 0):
+                            fuzzy = obfuscate_data(slicedImage)
+                            originalImage[startY:endY, startX:endX] = fuzzy
+                            base64String = base64EncodeImage(slicedImage)
+                            aslice = { "$type": "file",
+                                       "data": base64String
+                                     }
+                            slices.append(aslice)
 
 
     except AssertionError as error:
@@ -83,8 +109,11 @@ def transform_value(value):
 
 
     return ({
-            "recordId": recordId,
+        "recordId": recordId,
         "data": {
-            "slices": slices
+            "slices": slices,
+            "original": { "$type": "file",
+                          "data": base64EncodeImage(originalImage)
+                        }
         }
     })
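To see the redaction step in isolation, the Gaussian-blur trick from `obfuscate_data` can be exercised locally; here is a minimal sketch, assuming OpenCV is installed, with a hypothetical `sample.jpg` and bounding box standing in for the OCR output:

```python
import cv2

def obfuscate_data(image, factor=3.0):
    # Same approach as the skill: blur with a kernel sized relative
    # to the region, keeping both kernel dimensions odd.
    (h, w) = image.shape[:2]
    kW, kH = int(w / factor), int(h / factor)
    if kW % 2 == 0:
        kW -= 1
    if kH % 2 == 0:
        kH -= 1
    return cv2.GaussianBlur(image, (kW, kH), 0)

image = cv2.imread("sample.jpg")                 # hypothetical input image
startX, startY, endX, endY = 100, 200, 420, 260  # hypothetical phone-number box
region = image[startY:endY, startX:endX]
image[startY:endY, startX:endX] = obfuscate_data(region)
cv2.imwrite("redacted.jpg", image)
```

Note that in the function code, `slicedImage` is a NumPy view into `originalImage`, so writing the blurred pixels back into the original also updates the slice that gets base64-encoded into `slices`.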
