
Commit 5a01673

Merge branch 'main' of github.com:zenml-io/zenml-projects

2 parents: 80ca67f + 9e2e1f7

File tree

4 files changed: +338 -1 lines changed

llm-complete-guide/pipelines/distilabel_generation.py
Lines changed: 7 additions & 0 deletions

@@ -18,6 +18,7 @@
     EMBEDDINGS_MODEL_NAME_ZENML,
 )
 from steps.distilabel_generate_queries import generate_synthetic_queries
+from steps.eval_pii import eval_pii
 from steps.hf_dataset_loader import load_hf_dataset
 from steps.push_to_argilla import push_to_argilla
 from steps.push_to_hf import push_to_hf
@@ -47,16 +48,22 @@
 @pipeline(model=model_definition)
 def generate_synthetic_data():
     train_dataset, test_dataset = load_hf_dataset()
+    _, _, _ = eval_pii(
+        train_dataset=train_dataset,
+        test_dataset=test_dataset,
+    )
     train_with_queries, test_with_queries = generate_synthetic_queries(
         train_dataset=train_dataset, test_dataset=test_dataset
     )
     push_to_hf(
         train_dataset=train_with_queries,
         test_dataset=test_with_queries,
+        after="eval_pii",
     )
     push_to_argilla(
         train_dataset=train_with_queries,
         test_dataset=test_with_queries,
+        after="eval_pii",
     )
llm-complete-guide/requirements.txt
Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ rerankers[flashrank]
 datasets
 torch
 gradio
+huggingface-hub

 # optional requirements for S3 artifact store
 # s3fs>2022.3.0
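Editor's note: the new `huggingface-hub` requirement backs the Hub interactions in this project (e.g. the `push_to_hf` step above). A hedged sketch of the kind of call that makes it a direct dependency; the repo id is a placeholder and the real step's internals are not shown in this diff:

    from huggingface_hub import HfApi

    api = HfApi()  # picks up your token from `huggingface-cli login` or HF_TOKEN
    # Hypothetical target repo; the actual dataset id is not part of this commit.
    api.create_repo("your-org/synthetic-queries", repo_type="dataset", exist_ok=True)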

llm-complete-guide/steps/distilabel_generate_queries.py
Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ def generate_synthetic_queries(

     with Pipeline(name="generate_embedding_queries") as pipeline:
         load_dataset = LoadDataFromHub(
-            # num_examples=20, # use this for demo purposes
+            num_examples=40, # use this for demo purposes
             output_mappings={"page_content": "anchor"},
         )
         generate_sentence_pair = GenerateSentencePair(
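Editor's note: in distilabel, `LoadDataFromHub` pulls rows from a Hub dataset and `num_examples` caps how many enter the pipeline, so this change switches the demo run from the full dataset to a 40-row sample. A minimal sketch under that assumption (the `repo_id` is a placeholder, and the downstream task steps are omitted):

    from distilabel.pipeline import Pipeline
    from distilabel.steps import LoadDataFromHub

    with Pipeline(name="generate_embedding_queries") as pipeline:
        load_dataset = LoadDataFromHub(
            repo_id="your-org/your-docs-dataset",  # placeholder Hub dataset id
            num_examples=40,  # only the first 40 rows are loaded
            output_mappings={"page_content": "anchor"},  # rename for downstream steps
        )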
llm-complete-guide/steps/eval_pii.py (new file)
Lines changed: 329 additions & 0 deletions

@@ -0,0 +1,329 @@
+import io
+import re
+from collections import defaultdict
+from typing import Annotated, Dict, List, Tuple, Union
+
+import matplotlib.pyplot as plt
+from datasets import Dataset
+from PIL import Image
+from zenml import log_artifact_metadata, step
+
+
+class PIIDetector:
+    """A class to detect PII in HuggingFace datasets."""
+
+    def __init__(self):
+        # Email regex pattern
+        self.email_pattern = re.compile(
+            r"""
+            (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")
+            @
+            (?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])
+            """,
+            re.VERBOSE | re.IGNORECASE,
+        )
+
+        # Phone number patterns (US formats)
+        self.phone_pattern = re.compile(
+            r"""
+            (?:
+                # Format: (123) 456-7890 or 123-456-7890
+                (?:\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))|
+                # Format: +1 123-456-7890 or +1 (123) 456-7890
+                (?:\+1[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))|
+                # Format: 1234567890
+                (?:[0-9]{10})
+            )
+            """,
+            re.VERBOSE,
+        )
+
+        # SSN pattern (XXX-XX-XXXX)
+        self.ssn_pattern = re.compile(
+            r"""
+            (?!000|666|9\d{2}) # SSN cannot start with 000, 666, or 900-999
+            ([0-8]\d{2}|7([0-6]\d))
+            [-\s]?
+            (?!00) # Cannot have 00 in the middle group
+            ([0-9]{2})
+            [-\s]?
+            (?!0000) # Cannot end with 0000
+            ([0-9]{4})
+            """,
+            re.VERBOSE,
+        )
+
+        # Credit card pattern (major card types)
+        self.credit_card_pattern = re.compile(
+            r"""
+            (?:
+                # Visa
+                4[0-9]{12}(?:[0-9]{3})?|
+                # Mastercard
+                (?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}|
+                # American Express
+                3[47][0-9]{13}|
+                # Discover
+                6(?:011|5[0-9][0-9])[0-9]{12}
+            )
+            """,
+            re.VERBOSE,
+        )
+
+        # IP address pattern (IPv4)
+        self.ip_pattern = re.compile(
+            r"""
+            \b
+            (?:
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)
+            )
+            \b
+            """,
+            re.VERBOSE,
+        )
+
+        # Date pattern (common formats)
+        self.date_pattern = re.compile(
+            r"""
+            (?:
+                # MM/DD/YYYY or MM-DD-YYYY
+                (?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])[/-](?:19|20)\d\d|
+                # YYYY/MM/DD or YYYY-MM-DD
+                (?:19|20)\d\d[/-](?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])|
+                # Month DD, YYYY
+                (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|
+                Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|
+                Dec(?:ember)?)\s+(?:0[1-9]|[12][0-9]|3[01])(?:,|\s)+(?:19|20)\d\d
+            )
+            """,
+            re.VERBOSE | re.IGNORECASE,
+        )
+
+    def find_pii(self, text: str) -> Dict[str, List[str]]:
+        """
+        Find all PII in a given text.
+
+        Args:
+            text (str): The text to search for PII
+
+        Returns:
+            Dict[str, List[str]]: Dictionary of PII types and their findings
+        """
+        if not isinstance(text, str):
+            return {
+                "emails": [],
+                "phones": [],
+                "ssns": [],
+                "credit_cards": [],
+                "dates": [],
+                "ips": [],
+            }
+
+        return {
+            "emails": self.email_pattern.findall(text),
+            "phones": self.phone_pattern.findall(text),
+            "ssns": self.ssn_pattern.findall(text),
+            "credit_cards": self.credit_card_pattern.findall(text),
+            "dates": self.date_pattern.findall(text),
+            "ips": self.ip_pattern.findall(text),
+        }
+
+    def scan_dataset(
+        self,
+        dataset: Dataset,
+        columns: Union[List[str], None] = None,
+        max_samples: int = None,
+    ) -> Dict[str, Dict]:
+        """Scan a HuggingFace dataset for PII (currently only emails).
+
+        Args:
+            dataset (Dataset): HuggingFace dataset to scan
+            columns (List[str], optional): Specific columns to scan. If None, scans all string columns
+            max_samples (int, optional): Maximum number of samples to scan. If None, scans entire dataset
+
+        Returns:
+            Dict[str, Dict]: Dictionary containing:
+                - 'statistics': Overall statistics about the scan
+                - 'findings': Detailed findings per column
+        """
+        # Initialize results
+        results = {
+            "statistics": {
+                "total_samples_scanned": 0,
+                "columns_scanned": 0,
+                "total_findings": {
+                    "emails": 0,
+                    "phones": 0,
+                    "ssns": 0,
+                    "credit_cards": 0,
+                    "dates": 0,
+                    "ips": 0,
+                },
+            },
+            "findings": defaultdict(list),
+        }
+
+        # Determine which columns to scan
+        if columns is None:
+            # Get all columns that contain string data
+            columns = [
+                col
+                for col in dataset.column_names
+                if dataset.features[col].dtype in ["string", "str"]
+            ]
+
+        results["statistics"]["columns_scanned"] = len(columns)
+
+        # Determine number of samples to scan
+        n_samples = (
+            len(dataset)
+            if max_samples is None
+            else min(max_samples, len(dataset))
+        )
+        results["statistics"]["total_samples_scanned"] = n_samples
+
+        # Scan the dataset
+        for idx in range(n_samples):
+            sample = dataset[idx]
+
+            for column in columns:
+                if column not in sample:
+                    continue
+
+                text = sample[column]
+                pii_findings = self.find_pii(text)
+
+                # Check if any PII was found
+                if any(findings for findings in pii_findings.values()):
+                    # Update statistics
+                    for pii_type, findings in pii_findings.items():
+                        results["statistics"]["total_findings"][pii_type] += (
+                            len(findings)
+                        )
+
+                    # Record detailed findings
+                    results["findings"][column].append(
+                        {"index": idx, "findings": pii_findings}
+                    )
+
+        return results
+
+
+def plot_pii_results(
+    train_results: Dict[str, Dict], test_results: Dict[str, Dict]
+) -> Image:
+    total_findings = {
+        "Emails": (
+            train_results["statistics"]["total_findings"]["emails"]
+            + test_results["statistics"]["total_findings"]["emails"]
+        ),
+        "Phone Numbers": (
+            train_results["statistics"]["total_findings"]["phones"]
+            + test_results["statistics"]["total_findings"]["phones"]
+        ),
+        "SSNs": (
+            train_results["statistics"]["total_findings"]["ssns"]
+            + test_results["statistics"]["total_findings"]["ssns"]
+        ),
+        "Credit Cards": (
+            train_results["statistics"]["total_findings"]["credit_cards"]
+            + test_results["statistics"]["total_findings"]["credit_cards"]
+        ),
+        "Dates": (
+            train_results["statistics"]["total_findings"]["dates"]
+            + test_results["statistics"]["total_findings"]["dates"]
+        ),
+        "IP Addresses": (
+            train_results["statistics"]["total_findings"]["ips"]
+            + test_results["statistics"]["total_findings"]["ips"]
+        ),
+    }
+
+    plt.figure(figsize=(10, 8))
+    labels = [f"{k}\n({v})" for k, v in total_findings.items() if v > 0]
+    values = [v for v in total_findings.values() if v > 0]
+
+    if values:  # Only create pie chart if there are findings
+        plt.pie(values, labels=labels, autopct="%1.1f%%")
+        plt.title("Distribution of PII Findings in Dataset")
+    else:
+        plt.text(
+            0.5,
+            0.5,
+            "No PII Found",
+            horizontalalignment="center",
+            verticalalignment="center",
+        )
+
+    # Convert plot to PIL Image
+    buf = io.BytesIO()
+    plt.savefig(buf, format="png", bbox_inches="tight")
+    buf.seek(0)
+    plt.close()  # Clean up matplotlib figure
+    return Image.open(buf)
+
+
+@step
+def eval_pii(
+    train_dataset: Dataset, test_dataset: Dataset
+) -> Tuple[
+    Annotated[Dict[str, Dict], "train_pii_results"],
+    Annotated[Dict[str, Dict], "test_pii_results"],
+    Annotated[Image.Image, "PII chart"],
+]:
+    detector = PIIDetector()
+    train_results = detector.scan_dataset(
+        dataset=train_dataset,
+        # columns=[
+        #     "text"
+        # ], # specify columns to scan, or None for all string columns
+        # max_samples=1000, # optional: limit number of samples to scan
+    )
+    test_results = detector.scan_dataset(
+        dataset=test_dataset,
+        # columns=["text"],
+        # max_samples=1000, # optional: limit number of samples to scan
+    )
+
+    train_metadata = {
+        "samples_scanned": train_results["statistics"][
+            "total_samples_scanned"
+        ],
+        "emails_found": train_results["statistics"]["total_findings"][
+            "emails"
+        ],
+        "phones_found": train_results["statistics"]["total_findings"][
+            "phones"
+        ],
+        "ssns_found": train_results["statistics"]["total_findings"]["ssns"],
+        "credit_cards_found": train_results["statistics"]["total_findings"][
+            "credit_cards"
+        ],
+        "dates_found": train_results["statistics"]["total_findings"]["dates"],
+        "ips_found": train_results["statistics"]["total_findings"]["ips"],
+    }
+    log_artifact_metadata(
+        metadata=train_metadata, artifact_name="train_pii_results"
+    )
+
+    test_metadata = {
+        "samples_scanned": test_results["statistics"]["total_samples_scanned"],
+        "emails_found": test_results["statistics"]["total_findings"]["emails"],
+        "phones_found": test_results["statistics"]["total_findings"]["phones"],
+        "ssns_found": test_results["statistics"]["total_findings"]["ssns"],
+        "credit_cards_found": test_results["statistics"]["total_findings"][
+            "credit_cards"
+        ],
+        "dates_found": test_results["statistics"]["total_findings"]["dates"],
+        "ips_found": test_results["statistics"]["total_findings"]["ips"],
+    }
+    log_artifact_metadata(
+        metadata=test_metadata, artifact_name="test_pii_results"
+    )
+
+    pii_chart = plot_pii_results(train_results, test_results)
+
+    return train_results, test_results, pii_chart
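Editor's note: `PIIDetector.scan_dataset` works on any in-memory `datasets.Dataset`, so the detector can be sanity-checked outside the pipeline. The snippet below is an illustration with made-up sample strings, not part of the commit:

    from datasets import Dataset

    detector = PIIDetector()  # imported from steps.eval_pii when run in-repo
    toy = Dataset.from_dict(
        {"text": ["Reach me at jane.doe@example.com or (555) 123-4567.", "No PII here."]}
    )
    results = detector.scan_dataset(dataset=toy, columns=["text"])
    print(results["statistics"]["total_findings"])
    # Expected shape: {'emails': 1, 'phones': 1, 'ssns': 0, 'credit_cards': 0, 'dates': 0, 'ips': 0}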
