1+ import io
12import re
23from collections import defaultdict
3- from typing import Dict , List , Union
4+ from typing import Annotated , Dict , List , Tuple , Union
45
6+ import matplotlib .pyplot as plt
57from datasets import Dataset
8+ from PIL import Image
69from zenml import log_artifact_metadata , step
710
811
@@ -209,8 +212,68 @@ def scan_dataset(
209212 return results
210213
211214
def plot_pii_results(
    train_results: Dict[str, Dict], test_results: Dict[str, Dict]
) -> Image.Image:
    """Render the combined train/test PII findings as a pie chart image.

    For each PII category the train and test counts are added together.
    Categories whose combined count is zero are left out of the chart. If
    every category is zero, a figure containing the text "No PII Found" is
    produced instead.

    Args:
        train_results: Scan results for the training split. Must provide
            ``["statistics"]["total_findings"]`` with the keys ``emails``,
            ``phones``, ``ssns``, ``credit_cards``, ``dates`` and ``ips``.
        test_results: Scan results for the test split, same layout.

    Returns:
        The chart as a PIL image decoded from an in-memory PNG.

    Raises:
        KeyError: If either results dict lacks one of the expected keys.
    """
    # Chart label -> key under results["statistics"]["total_findings"].
    categories = {
        "Emails": "emails",
        "Phone Numbers": "phones",
        "SSNs": "ssns",
        "Credit Cards": "credit_cards",
        "Dates": "dates",
        "IP Addresses": "ips",
    }

    train_counts = train_results["statistics"]["total_findings"]
    test_counts = test_results["statistics"]["total_findings"]
    total_findings = {
        label: train_counts[key] + test_counts[key]
        for label, key in categories.items()
    }

    # Zero-sized wedges only clutter the pie, so keep positive counts only.
    non_zero = {
        label: count for label, count in total_findings.items() if count > 0
    }

    # Use an explicit figure/axes pair rather than pyplot's implicit
    # "current figure" so exactly this figure is drawn on and closed.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        if non_zero:
            ax.pie(
                list(non_zero.values()),
                labels=[f"{k}\n({v})" for k, v in non_zero.items()],
                autopct="%1.1f%%",
            )
            ax.set_title("Distribution of PII Findings in Dataset")
        else:
            ax.text(
                0.5,
                0.5,
                "No PII Found",
                horizontalalignment="center",
                verticalalignment="center",
            )

        # Serialize the figure to PNG bytes in memory.
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving failed;
        # pyplot keeps figures alive until they are closed.
        plt.close(fig)

    buf.seek(0)
    # Image.open reads lazily, so the buffer is deliberately left open; the
    # returned image keeps a reference to it.
    return Image.open(buf)
268+
212269@step
213- def eval_pii (train_dataset : Dataset , test_dataset : Dataset ) -> None :
270+ def eval_pii (
271+ train_dataset : Dataset , test_dataset : Dataset
272+ ) -> Tuple [
273+ Annotated [Dict [str , Dict ], "train_results" ],
274+ Annotated [Dict [str , Dict ], "test_results" ],
275+ Annotated [Image , "PII chart" ],
276+ ]:
214277 detector = PIIDetector ()
215278 train_results = detector .scan_dataset (
216279 dataset = train_dataset ,
@@ -222,7 +285,7 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
222285 test_results = detector .scan_dataset (
223286 dataset = test_dataset , columns = ["text" ], max_samples = 1000
224287 )
225- # Log train results
288+
226289 train_metadata = {
227290 "samples_scanned" : train_results ["statistics" ][
228291 "total_samples_scanned"
@@ -244,7 +307,6 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
244307 metadata = train_metadata , artifact_name = "train_pii_results"
245308 )
246309
247- # Log test results
248310 test_metadata = {
249311 "samples_scanned" : test_results ["statistics" ]["total_samples_scanned" ],
250312 "emails_found" : test_results ["statistics" ]["total_findings" ]["emails" ],
@@ -260,4 +322,6 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
260322 metadata = test_metadata , artifact_name = "test_pii_results"
261323 )
262324
263- return train_results , test_results
325+ pii_chart = plot_pii_results (train_results , test_results )
326+
327+ return train_results , test_results , pii_chart
0 commit comments