Skip to content

Commit fb2e15b

Browse files
committed
add image export as well
1 parent b19b0e5 commit fb2e15b

File tree

1 file changed

+69
-5
lines changed

1 file changed

+69
-5
lines changed

llm-complete-guide/steps/eval_pii.py

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
import io
12
import re
23
from collections import defaultdict
3-
from typing import Dict, List, Union
4+
from typing import Annotated, Dict, List, Tuple, Union
45

6+
import matplotlib.pyplot as plt
57
from datasets import Dataset
8+
from PIL import Image
69
from zenml import log_artifact_metadata, step
710

811

@@ -209,8 +212,68 @@ def scan_dataset(
209212
return results
210213

211214

215+
def plot_pii_results(
    train_results: Dict[str, Dict], test_results: Dict[str, Dict]
) -> Image.Image:
    """Render a pie chart of the combined train/test PII findings.

    Args:
        train_results: Scan results for the training split. Must provide
            ``["statistics"]["total_findings"]`` with numeric counts under
            the keys ``emails``, ``phones``, ``ssns``, ``credit_cards``,
            ``dates`` and ``ips``.
        test_results: Scan results for the test split, same layout.

    Returns:
        A fully decoded PIL image of the chart rendered as PNG. Categories
        with a combined count of zero are left out of the pie; if every
        category is zero the image contains the text "No PII Found" instead.

    Raises:
        KeyError: If either result dict lacks one of the expected keys.
    """
    # Result key -> human-readable label, in the order slices are drawn.
    category_labels = {
        "emails": "Emails",
        "phones": "Phone Numbers",
        "ssns": "SSNs",
        "credit_cards": "Credit Cards",
        "dates": "Dates",
        "ips": "IP Addresses",
    }
    train_counts = train_results["statistics"]["total_findings"]
    test_counts = test_results["statistics"]["total_findings"]
    total_findings = {
        label: train_counts[key] + test_counts[key]
        for key, label in category_labels.items()
    }
    # Zero-sized slices only clutter the chart, so drop them up front.
    found = {label: count for label, count in total_findings.items() if count > 0}

    # Draw through the figure/axes objects rather than pyplot's implicit
    # "current figure", and always close this exact figure -- even when
    # plotting or saving raises -- so it cannot leak in pyplot's registry.
    fig = plt.figure(figsize=(10, 8))
    try:
        ax = fig.add_subplot(111)
        if found:  # Only create pie chart if there are findings
            ax.pie(
                list(found.values()),
                labels=[f"{label}\n({count})" for label, count in found.items()],
                autopct="%1.1f%%",
            )
            ax.set_title("Distribution of PII Findings in Dataset")
        else:
            ax.text(
                0.5,
                0.5,
                "No PII Found",
                horizontalalignment="center",
                verticalalignment="center",
            )

        # Convert plot to PIL Image
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
    finally:
        plt.close(fig)

    buf.seek(0)
    image = Image.open(buf)
    # Image.open is lazy; decode now so the returned image no longer depends
    # on the in-memory buffer staying open.
    image.load()
    return image
267+
268+
212269
@step
213-
def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
270+
def eval_pii(
271+
train_dataset: Dataset, test_dataset: Dataset
272+
) -> Tuple[
273+
Annotated[Dict[str, Dict], "train_results"],
274+
Annotated[Dict[str, Dict], "test_results"],
275+
Annotated[Image, "PII chart"],
276+
]:
214277
detector = PIIDetector()
215278
train_results = detector.scan_dataset(
216279
dataset=train_dataset,
@@ -222,7 +285,7 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
222285
test_results = detector.scan_dataset(
223286
dataset=test_dataset, columns=["text"], max_samples=1000
224287
)
225-
# Log train results
288+
226289
train_metadata = {
227290
"samples_scanned": train_results["statistics"][
228291
"total_samples_scanned"
@@ -244,7 +307,6 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
244307
metadata=train_metadata, artifact_name="train_pii_results"
245308
)
246309

247-
# Log test results
248310
test_metadata = {
249311
"samples_scanned": test_results["statistics"]["total_samples_scanned"],
250312
"emails_found": test_results["statistics"]["total_findings"]["emails"],
@@ -260,4 +322,6 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None:
260322
metadata=test_metadata, artifact_name="test_pii_results"
261323
)
262324

263-
return train_results, test_results
325+
pii_chart = plot_pii_results(train_results, test_results)
326+
327+
return train_results, test_results, pii_chart

0 commit comments

Comments
 (0)