Skip to content

Commit b19b0e5

Browse files
committed
add PII eval step
1 parent 1dc5b39 commit b19b0e5

File tree

2 files changed

+270
-0
lines changed

2 files changed

+270
-0
lines changed

llm-complete-guide/pipelines/distilabel_generation.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
EMBEDDINGS_MODEL_NAME_ZENML,
1919
)
2020
from steps.distilabel_generate_queries import generate_synthetic_queries
21+
from steps.eval_pii import eval_pii
2122
from steps.hf_dataset_loader import load_hf_dataset
2223
from steps.push_to_argilla import push_to_argilla
2324
from steps.push_to_hf import push_to_hf
@@ -47,16 +48,22 @@
4748
@pipeline(model=model_definition)
def generate_synthetic_data():
    """Generate synthetic query data from a HuggingFace dataset.

    Loads the train/test splits, scans them for PII, generates synthetic
    queries, and pushes the query-augmented splits to the HF Hub and Argilla.
    Both push steps are ordered after the PII scan via `after="eval_pii"`.
    """
    train_dataset, test_dataset = load_hf_dataset()
    # NOTE(review): eval_pii (steps/eval_pii.py) is annotated `-> None` but
    # returns two values — confirm the step declares two outputs, otherwise
    # this two-value unpacking fails at pipeline compilation time.
    train_pii_results, test_pii_results = eval_pii(
        train_dataset=train_dataset,
        test_dataset=test_dataset,
    )
    train_with_queries, test_with_queries = generate_synthetic_queries(
        train_dataset=train_dataset, test_dataset=test_dataset
    )
    push_to_hf(
        train_dataset=train_with_queries,
        test_dataset=test_with_queries,
        # Do not publish the dataset until the PII scan step has completed.
        after="eval_pii",
    )
    push_to_argilla(
        train_dataset=train_with_queries,
        test_dataset=test_with_queries,
        after="eval_pii",
    )
6168

6269

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
import re
2+
from collections import defaultdict
3+
from typing import Dict, List, Union
4+
5+
from datasets import Dataset
6+
from zenml import log_artifact_metadata, step
7+
8+
9+
class PIIDetector:
    """Regex-based detector for common PII categories in HuggingFace datasets.

    Detects emails, US phone numbers, SSNs, major-brand credit card numbers,
    IPv4 addresses, and common date formats. All patterns are compiled once
    at construction so repeated scans do not pay recompilation cost.

    NOTE(review): several patterns contain capturing groups, so
    `re.findall` returns per-match *group tuples* (or partial strings), not
    the full matched text. Match *counts* are still one entry per match,
    which is all `scan_dataset`'s statistics rely on — but the raw entries
    in `findings` are not directly human-readable. Confirm before exposing
    the findings lists to users.
    """

    def __init__(self):
        # Email regex pattern (RFC-5322-style, including the rarely used
        # quoted-local-part and IP-literal domain forms).
        self.email_pattern = re.compile(
            r"""
            (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")
            @
            (?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])
            """,
            re.VERBOSE | re.IGNORECASE,
        )

        # Phone number patterns (US formats)
        self.phone_pattern = re.compile(
            r"""
            (?:
                # Format: (123) 456-7890 or 123-456-7890
                (?:\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))|
                # Format: +1 123-456-7890 or +1 (123) 456-7890
                (?:\+1[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))|
                # Format: 1234567890
                (?:[0-9]{10})
            )
            """,
            re.VERBOSE,
        )

        # SSN pattern (XXX-XX-XXXX), with lookaheads rejecting the area/group/
        # serial values that the SSA never issues.
        # NOTE(review): the `7([0-6]\d)` alternative is already covered by
        # `[0-8]\d{2}` and appears redundant — harmless but worth confirming.
        self.ssn_pattern = re.compile(
            r"""
            (?!000|666|9\d{2})       # SSN cannot start with 000, 666, or 900-999
            ([0-8]\d{2}|7([0-6]\d))
            [-\s]?
            (?!00)                   # Cannot have 00 in the middle group
            ([0-9]{2})
            [-\s]?
            (?!0000)                 # Cannot end with 0000
            ([0-9]{4})
            """,
            re.VERBOSE,
        )

        # Credit card pattern (major card types)
        self.credit_card_pattern = re.compile(
            r"""
            (?:
                # Visa
                4[0-9]{12}(?:[0-9]{3})?|
                # Mastercard
                (?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}|
                # American Express
                3[47][0-9]{13}|
                # Discover
                6(?:011|5[0-9][0-9])[0-9]{12}
            )
            """,
            re.VERBOSE,
        )

        # IP address pattern (IPv4): four dot-separated octets, each 0-255.
        self.ip_pattern = re.compile(
            r"""
            \b
            (?:
                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)
            )
            \b
            """,
            re.VERBOSE,
        )

        # Date pattern (common formats)
        self.date_pattern = re.compile(
            r"""
            (?:
                # MM/DD/YYYY or MM-DD-YYYY
                (?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])[/-](?:19|20)\d\d|
                # YYYY/MM/DD or YYYY-MM-DD
                (?:19|20)\d\d[/-](?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])|
                # Month DD, YYYY
                (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|
                Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|
                Dec(?:ember)?)\s+(?:0[1-9]|[12][0-9]|3[01])(?:,|\s)+(?:19|20)\d\d
            )
            """,
            re.VERBOSE | re.IGNORECASE,
        )

    def find_pii(self, text: str) -> Dict[str, List[str]]:
        """
        Find all PII in a given text.

        Args:
            text (str): The text to search for PII. Non-string inputs are
                tolerated and yield empty results rather than raising.

        Returns:
            Dict[str, List[str]]: Dictionary of PII types and their findings.
                One list entry per regex match; see the class-level note about
                capture-group tuples in the entries themselves.
        """
        # Guard: datasets may contain None / non-string cells in a column.
        if not isinstance(text, str):
            return {
                "emails": [],
                "phones": [],
                "ssns": [],
                "credit_cards": [],
                "dates": [],
                "ips": [],
            }

        return {
            "emails": self.email_pattern.findall(text),
            "phones": self.phone_pattern.findall(text),
            "ssns": self.ssn_pattern.findall(text),
            "credit_cards": self.credit_card_pattern.findall(text),
            "dates": self.date_pattern.findall(text),
            "ips": self.ip_pattern.findall(text),
        }

    def scan_dataset(
        self,
        dataset: Dataset,
        columns: Union[List[str], None] = None,
        max_samples: Union[int, None] = None,
    ) -> Dict[str, Dict]:
        """Scan a HuggingFace dataset for all supported PII categories.

        Args:
            dataset (Dataset): HuggingFace dataset to scan
            columns (List[str], optional): Specific columns to scan. If None, scans all string columns
            max_samples (int, optional): Maximum number of samples to scan. If None, scans entire dataset

        Returns:
            Dict[str, Dict]: Dictionary containing:
                - 'statistics': Overall statistics about the scan
                - 'findings': Detailed findings per column
        """
        # Initialize results
        results = {
            "statistics": {
                "total_samples_scanned": 0,
                "columns_scanned": 0,
                "total_findings": {
                    "emails": 0,
                    "phones": 0,
                    "ssns": 0,
                    "credit_cards": 0,
                    "dates": 0,
                    "ips": 0,
                },
            },
            # defaultdict(list) so per-column findings append without setup.
            "findings": defaultdict(list),
        }

        # Determine which columns to scan
        if columns is None:
            # Get all columns that contain string data.
            # NOTE(review): assumes every feature is a flat Value with a
            # `.dtype` attribute — nested features (Sequence, dict) would
            # raise AttributeError here. Confirm against the datasets used.
            columns = [
                col
                for col in dataset.column_names
                if dataset.features[col].dtype in ["string", "str"]
            ]

        results["statistics"]["columns_scanned"] = len(columns)

        # Determine number of samples to scan
        n_samples = (
            len(dataset)
            if max_samples is None
            else min(max_samples, len(dataset))
        )
        results["statistics"]["total_samples_scanned"] = n_samples

        # Scan the dataset row by row.
        for idx in range(n_samples):
            sample = dataset[idx]

            for column in columns:
                # Caller-specified columns may not exist in every sample.
                if column not in sample:
                    continue

                text = sample[column]
                pii_findings = self.find_pii(text)

                # Check if any PII was found
                if any(findings for findings in pii_findings.values()):
                    # Update statistics
                    for pii_type, findings in pii_findings.items():
                        results["statistics"]["total_findings"][pii_type] += (
                            len(findings)
                        )

                    # Record detailed findings, keyed by column, with the row
                    # index so the offending sample can be located later.
                    results["findings"][column].append(
                        {"index": idx, "findings": pii_findings}
                    )

        return results
210+
211+
212+
@step
def eval_pii(
    train_dataset: Dataset, test_dataset: Dataset
) -> Tuple[
    Annotated[Dict, "train_pii_results"],
    Annotated[Dict, "test_pii_results"],
]:
    """Scan the train and test splits for PII and log summary metadata.

    Runs the regex-based ``PIIDetector`` over the ``text`` column of both
    splits (capped at 1000 samples each), logs per-category match counts as
    artifact metadata, and returns the full scan results.

    Fix over the original: the return annotation was ``-> None`` while the
    function returned two dicts. ZenML derives a step's outputs from the
    annotation, so ``-> None`` declared zero outputs — breaking the
    pipeline's two-value unpacking and leaving no artifacts named
    ``train_pii_results`` / ``test_pii_results`` for
    ``log_artifact_metadata`` to attach to. The ``Annotated`` names now
    match the ``artifact_name`` values used below.

    Args:
        train_dataset: Training split to scan.
        test_dataset: Test split to scan.

    Returns:
        Tuple of the two ``PIIDetector.scan_dataset`` result dicts
        (``statistics`` + per-column ``findings``), train first.
    """
    detector = PIIDetector()
    train_results = detector.scan_dataset(
        dataset=train_dataset,
        columns=["text"],  # scan only the free-text column
        max_samples=1000,  # cap scan cost on large datasets
    )
    test_results = detector.scan_dataset(
        dataset=test_dataset, columns=["text"], max_samples=1000
    )

    def _summary_metadata(results: Dict) -> Dict:
        """Flatten a scan result's statistics into a loggable metadata dict."""
        stats = results["statistics"]
        totals = stats["total_findings"]
        return {
            "samples_scanned": stats["total_samples_scanned"],
            "emails_found": totals["emails"],
            "phones_found": totals["phones"],
            "ssns_found": totals["ssns"],
            "credit_cards_found": totals["credit_cards"],
            "dates_found": totals["dates"],
            "ips_found": totals["ips"],
        }

    # Log train results
    log_artifact_metadata(
        metadata=_summary_metadata(train_results),
        artifact_name="train_pii_results",
    )

    # Log test results
    log_artifact_metadata(
        metadata=_summary_metadata(test_results),
        artifact_name="test_pii_results",
    )

    return train_results, test_results

0 commit comments

Comments
 (0)