From 1dc5b3958065eb59f374426818a215944643bfed Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Tue, 29 Oct 2024 14:25:15 +0100 Subject: [PATCH 1/5] add hf hub requirement --- llm-complete-guide/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llm-complete-guide/requirements.txt b/llm-complete-guide/requirements.txt index 506d03cc..13563b92 100644 --- a/llm-complete-guide/requirements.txt +++ b/llm-complete-guide/requirements.txt @@ -21,6 +21,7 @@ rerankers[flashrank] datasets torch gradio +huggingface-hub # optional requirements for S3 artifact store # s3fs>2022.3.0 From b19b0e55d3a9b2bd04293f965914d7b36b41062c Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Tue, 29 Oct 2024 14:25:43 +0100 Subject: [PATCH 2/5] add PII eval step --- .../pipelines/distilabel_generation.py | 7 + llm-complete-guide/steps/eval_pii.py | 263 ++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 llm-complete-guide/steps/eval_pii.py diff --git a/llm-complete-guide/pipelines/distilabel_generation.py b/llm-complete-guide/pipelines/distilabel_generation.py index 18ddc5d8..7c76c689 100644 --- a/llm-complete-guide/pipelines/distilabel_generation.py +++ b/llm-complete-guide/pipelines/distilabel_generation.py @@ -18,6 +18,7 @@ EMBEDDINGS_MODEL_NAME_ZENML, ) from steps.distilabel_generate_queries import generate_synthetic_queries +from steps.eval_pii import eval_pii from steps.hf_dataset_loader import load_hf_dataset from steps.push_to_argilla import push_to_argilla from steps.push_to_hf import push_to_hf @@ -47,16 +48,22 @@ @pipeline(model=model_definition) def generate_synthetic_data(): train_dataset, test_dataset = load_hf_dataset() + train_pii_results, test_pii_results = eval_pii( + train_dataset=train_dataset, + test_dataset=test_dataset, + ) train_with_queries, test_with_queries = generate_synthetic_queries( train_dataset=train_dataset, test_dataset=test_dataset ) push_to_hf( train_dataset=train_with_queries, test_dataset=test_with_queries, + after="eval_pii", ) push_to_argilla( train_dataset=train_with_queries, test_dataset=test_with_queries, + after="eval_pii", ) diff --git a/llm-complete-guide/steps/eval_pii.py b/llm-complete-guide/steps/eval_pii.py new file mode 100644 index 00000000..ee664156 --- /dev/null +++ b/llm-complete-guide/steps/eval_pii.py @@ -0,0 +1,263 @@ +import re +from collections import defaultdict +from typing import Dict, List, Union + +from datasets import Dataset +from zenml import log_artifact_metadata, step + + +class PIIDetector: + """A class to detect PII in HuggingFace datasets.""" + + def __init__(self): + # Email regex pattern + self.email_pattern = re.compile( + r""" + (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*") + @ + (?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]) + """, + re.VERBOSE | re.IGNORECASE, + ) + + # Phone number patterns (US formats) + self.phone_pattern = re.compile( + r""" + (?: + # Format: (123) 456-7890 or 123-456-7890 + (?:\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))| + # Format: +1 123-456-7890 or +1 (123) 456-7890 + (?:\+1[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))| + # Format: 1234567890 + (?:[0-9]{10}) + ) + """, + re.VERBOSE, + 
) + + # SSN pattern (XXX-XX-XXXX) + self.ssn_pattern = re.compile( + r""" + (?!000|666|9\d{2}) # SSN cannot start with 000, 666, or 900-999 + ([0-8]\d{2}|7([0-6]\d)) + [-\s]? + (?!00) # Cannot have 00 in the middle group + ([0-9]{2}) + [-\s]? + (?!0000) # Cannot end with 0000 + ([0-9]{4}) + """, + re.VERBOSE, + ) + + # Credit card pattern (major card types) + self.credit_card_pattern = re.compile( + r""" + (?: + # Visa + 4[0-9]{12}(?:[0-9]{3})?| + # Mastercard + (?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}| + # American Express + 3[47][0-9]{13}| + # Discover + 6(?:011|5[0-9][0-9])[0-9]{12} + ) + """, + re.VERBOSE, + ) + + # IP address pattern (IPv4) + self.ip_pattern = re.compile( + r""" + \b + (?: + (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\. + (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\. + (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\. + (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) + ) + \b + """, + re.VERBOSE, + ) + + # Date pattern (common formats) + self.date_pattern = re.compile( + r""" + (?: + # MM/DD/YYYY or MM-DD-YYYY + (?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])[/-](?:19|20)\d\d| + # YYYY/MM/DD or YYYY-MM-DD + (?:19|20)\d\d[/-](?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])| + # Month DD, YYYY + (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?| + Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?| + Dec(?:ember)?)\s+(?:0[1-9]|[12][0-9]|3[01])(?:,|\s)+(?:19|20)\d\d + ) + """, + re.VERBOSE | re.IGNORECASE, + ) + + def find_pii(self, text: str) -> Dict[str, List[str]]: + """ + Find all PII in a given text. + + Args: + text (str): The text to search for PII + + Returns: + Dict[str, List[str]]: Dictionary of PII types and their findings + """ + if not isinstance(text, str): + return { + "emails": [], + "phones": [], + "ssns": [], + "credit_cards": [], + "dates": [], + "ips": [], + } + + return { + "emails": self.email_pattern.findall(text), + "phones": self.phone_pattern.findall(text), + "ssns": self.ssn_pattern.findall(text), + "credit_cards": self.credit_card_pattern.findall(text), + "dates": self.date_pattern.findall(text), + "ips": self.ip_pattern.findall(text), + } + + def scan_dataset( + self, + dataset: Dataset, + columns: Union[List[str], None] = None, + max_samples: int = None, + ) -> Dict[str, Dict]: + """Scan a HuggingFace dataset for PII (currently only emails). + + Args: + dataset (Dataset): HuggingFace dataset to scan + columns (List[str], optional): Specific columns to scan. If None, scans all string columns + max_samples (int, optional): Maximum number of samples to scan. 
If None, scans entire dataset + + Returns: + Dict[str, Dict]: Dictionary containing: + - 'statistics': Overall statistics about the scan + - 'findings': Detailed findings per column + """ + # Initialize results + results = { + "statistics": { + "total_samples_scanned": 0, + "columns_scanned": 0, + "total_findings": { + "emails": 0, + "phones": 0, + "ssns": 0, + "credit_cards": 0, + "dates": 0, + "ips": 0, + }, + }, + "findings": defaultdict(list), + } + + # Determine which columns to scan + if columns is None: + # Get all columns that contain string data + columns = [ + col + for col in dataset.column_names + if dataset.features[col].dtype in ["string", "str"] + ] + + results["statistics"]["columns_scanned"] = len(columns) + + # Determine number of samples to scan + n_samples = ( + len(dataset) + if max_samples is None + else min(max_samples, len(dataset)) + ) + results["statistics"]["total_samples_scanned"] = n_samples + + # Scan the dataset + for idx in range(n_samples): + sample = dataset[idx] + + for column in columns: + if column not in sample: + continue + + text = sample[column] + pii_findings = self.find_pii(text) + + # Check if any PII was found + if any(findings for findings in pii_findings.values()): + # Update statistics + for pii_type, findings in pii_findings.items(): + results["statistics"]["total_findings"][pii_type] += ( + len(findings) + ) + + # Record detailed findings + results["findings"][column].append( + {"index": idx, "findings": pii_findings} + ) + + return results + + +@step +def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None: + detector = PIIDetector() + train_results = detector.scan_dataset( + dataset=train_dataset, + columns=[ + "text" + ], # specify columns to scan, or None for all string columns + max_samples=1000, # optional: limit number of samples to scan + ) + test_results = detector.scan_dataset( + dataset=test_dataset, columns=["text"], max_samples=1000 + ) + # Log train results + train_metadata = { + "samples_scanned": train_results["statistics"][ + "total_samples_scanned" + ], + "emails_found": train_results["statistics"]["total_findings"][ + "emails" + ], + "phones_found": train_results["statistics"]["total_findings"][ + "phones" + ], + "ssns_found": train_results["statistics"]["total_findings"]["ssns"], + "credit_cards_found": train_results["statistics"]["total_findings"][ + "credit_cards" + ], + "dates_found": train_results["statistics"]["total_findings"]["dates"], + "ips_found": train_results["statistics"]["total_findings"]["ips"], + } + log_artifact_metadata( + metadata=train_metadata, artifact_name="train_pii_results" + ) + + # Log test results + test_metadata = { + "samples_scanned": test_results["statistics"]["total_samples_scanned"], + "emails_found": test_results["statistics"]["total_findings"]["emails"], + "phones_found": test_results["statistics"]["total_findings"]["phones"], + "ssns_found": test_results["statistics"]["total_findings"]["ssns"], + "credit_cards_found": test_results["statistics"]["total_findings"][ + "credit_cards" + ], + "dates_found": test_results["statistics"]["total_findings"]["dates"], + "ips_found": test_results["statistics"]["total_findings"]["ips"], + } + log_artifact_metadata( + metadata=test_metadata, artifact_name="test_pii_results" + ) + + return train_results, test_results From fb2e15bb6082ad64b56150d904c39da7c2edb8c4 Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Tue, 29 Oct 2024 14:47:03 +0100 Subject: [PATCH 3/5] add image export as well --- 
llm-complete-guide/steps/eval_pii.py | 74 ++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/llm-complete-guide/steps/eval_pii.py b/llm-complete-guide/steps/eval_pii.py index ee664156..16237372 100644 --- a/llm-complete-guide/steps/eval_pii.py +++ b/llm-complete-guide/steps/eval_pii.py @@ -1,8 +1,11 @@ +import io import re from collections import defaultdict -from typing import Dict, List, Union +from typing import Annotated, Dict, List, Tuple, Union +import matplotlib.pyplot as plt from datasets import Dataset +from PIL import Image from zenml import log_artifact_metadata, step @@ -209,8 +212,68 @@ def scan_dataset( return results +def plot_pii_results( + train_results: Dict[str, Dict], test_results: Dict[str, Dict] +) -> Image: + total_findings = { + "Emails": ( + train_results["statistics"]["total_findings"]["emails"] + + test_results["statistics"]["total_findings"]["emails"] + ), + "Phone Numbers": ( + train_results["statistics"]["total_findings"]["phones"] + + test_results["statistics"]["total_findings"]["phones"] + ), + "SSNs": ( + train_results["statistics"]["total_findings"]["ssns"] + + test_results["statistics"]["total_findings"]["ssns"] + ), + "Credit Cards": ( + train_results["statistics"]["total_findings"]["credit_cards"] + + test_results["statistics"]["total_findings"]["credit_cards"] + ), + "Dates": ( + train_results["statistics"]["total_findings"]["dates"] + + test_results["statistics"]["total_findings"]["dates"] + ), + "IP Addresses": ( + train_results["statistics"]["total_findings"]["ips"] + + test_results["statistics"]["total_findings"]["ips"] + ), + } + + plt.figure(figsize=(10, 8)) + labels = [f"{k}\n({v})" for k, v in total_findings.items() if v > 0] + values = [v for v in total_findings.values() if v > 0] + + if values: # Only create pie chart if there are findings + plt.pie(values, labels=labels, autopct="%1.1f%%") + plt.title("Distribution of PII Findings in Dataset") + else: + plt.text( + 0.5, + 0.5, + "No PII Found", + horizontalalignment="center", + verticalalignment="center", + ) + + # Convert plot to PIL Image + buf = io.BytesIO() + plt.savefig(buf, format="png", bbox_inches="tight") + buf.seek(0) + plt.close() # Clean up matplotlib figure + return Image.open(buf) + + @step -def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None: +def eval_pii( + train_dataset: Dataset, test_dataset: Dataset +) -> Tuple[ + Annotated[Dict[str, Dict], "train_results"], + Annotated[Dict[str, Dict], "test_results"], + Annotated[Image, "PII chart"], +]: detector = PIIDetector() train_results = detector.scan_dataset( dataset=train_dataset, @@ -222,7 +285,7 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None: test_results = detector.scan_dataset( dataset=test_dataset, columns=["text"], max_samples=1000 ) - # Log train results + train_metadata = { "samples_scanned": train_results["statistics"][ "total_samples_scanned" @@ -244,7 +307,6 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None: metadata=train_metadata, artifact_name="train_pii_results" ) - # Log test results test_metadata = { "samples_scanned": test_results["statistics"]["total_samples_scanned"], "emails_found": test_results["statistics"]["total_findings"]["emails"], @@ -260,4 +322,6 @@ def eval_pii(train_dataset: Dataset, test_dataset: Dataset) -> None: metadata=test_metadata, artifact_name="test_pii_results" ) - return train_results, test_results + pii_chart = plot_pii_results(train_results, test_results) + + return train_results, 
test_results, pii_chart From 02c893e694ea77c4a81982c9c882a053df29cbb7 Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Tue, 29 Oct 2024 15:30:19 +0100 Subject: [PATCH 4/5] fix type hints --- llm-complete-guide/pipelines/distilabel_generation.py | 2 +- llm-complete-guide/steps/distilabel_generate_queries.py | 2 +- llm-complete-guide/steps/eval_pii.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llm-complete-guide/pipelines/distilabel_generation.py b/llm-complete-guide/pipelines/distilabel_generation.py index 7c76c689..4fb38906 100644 --- a/llm-complete-guide/pipelines/distilabel_generation.py +++ b/llm-complete-guide/pipelines/distilabel_generation.py @@ -48,7 +48,7 @@ @pipeline(model=model_definition) def generate_synthetic_data(): train_dataset, test_dataset = load_hf_dataset() - train_pii_results, test_pii_results = eval_pii( + _, _, _ = eval_pii( train_dataset=train_dataset, test_dataset=test_dataset, ) diff --git a/llm-complete-guide/steps/distilabel_generate_queries.py b/llm-complete-guide/steps/distilabel_generate_queries.py index 0b23a639..55372d1b 100644 --- a/llm-complete-guide/steps/distilabel_generate_queries.py +++ b/llm-complete-guide/steps/distilabel_generate_queries.py @@ -45,7 +45,7 @@ def generate_synthetic_queries( with Pipeline(name="generate_embedding_queries") as pipeline: load_dataset = LoadDataFromHub( - # num_examples=20, # use this for demo purposes + num_examples=40, # use this for demo purposes output_mappings={"page_content": "anchor"}, ) generate_sentence_pair = GenerateSentencePair( diff --git a/llm-complete-guide/steps/eval_pii.py b/llm-complete-guide/steps/eval_pii.py index 16237372..8e9e406b 100644 --- a/llm-complete-guide/steps/eval_pii.py +++ b/llm-complete-guide/steps/eval_pii.py @@ -272,7 +272,7 @@ def eval_pii( ) -> Tuple[ Annotated[Dict[str, Dict], "train_results"], Annotated[Dict[str, Dict], "test_results"], - Annotated[Image, "PII chart"], + Annotated[Image.Image, "PII chart"], ]: detector = PIIDetector() train_results = detector.scan_dataset( From 3711479bd29b412b5b0e50ba87b3ecec01b7731b Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Tue, 29 Oct 2024 15:43:52 +0100 Subject: [PATCH 5/5] finalise PII step --- llm-complete-guide/steps/eval_pii.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/llm-complete-guide/steps/eval_pii.py b/llm-complete-guide/steps/eval_pii.py index 8e9e406b..b81237f3 100644 --- a/llm-complete-guide/steps/eval_pii.py +++ b/llm-complete-guide/steps/eval_pii.py @@ -270,20 +270,22 @@ def plot_pii_results( def eval_pii( train_dataset: Dataset, test_dataset: Dataset ) -> Tuple[ - Annotated[Dict[str, Dict], "train_results"], - Annotated[Dict[str, Dict], "test_results"], + Annotated[Dict[str, Dict], "train_pii_results"], + Annotated[Dict[str, Dict], "test_pii_results"], Annotated[Image.Image, "PII chart"], ]: detector = PIIDetector() train_results = detector.scan_dataset( dataset=train_dataset, - columns=[ - "text" - ], # specify columns to scan, or None for all string columns - max_samples=1000, # optional: limit number of samples to scan + # columns=[ + # "text" + # ], # specify columns to scan, or None for all string columns + # max_samples=1000, # optional: limit number of samples to scan ) test_results = detector.scan_dataset( - dataset=test_dataset, columns=["text"], max_samples=1000 + dataset=test_dataset, + # columns=["text"], + # max_samples=1000, # optional: limit number of samples to scan ) train_metadata = {
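
---

Note for reviewers: below is a minimal sketch of how the `PIIDetector` class added in `steps/eval_pii.py` could be exercised standalone, outside the ZenML pipeline. The in-memory dataset, sample strings, and expected counts are illustrative assumptions, not taken from the repo, and it assumes you run from the `llm-complete-guide` directory so `steps.eval_pii` is importable:

    from datasets import Dataset

    from steps.eval_pii import PIIDetector

    # Hypothetical in-memory dataset; the real pipeline feeds datasets
    # loaded via load_hf_dataset().
    dataset = Dataset.from_dict(
        {"text": ["Contact me at jane.doe@example.com or (555) 123-4567."]}
    )

    detector = PIIDetector()
    # Same call shape the step uses: scan only the "text" column.
    results = detector.scan_dataset(dataset=dataset, columns=["text"])

    # Assumed outcome for this sample: 1 email and 1 phone number, others 0.
    print(results["statistics"]["total_findings"])

    # Per-column findings carry the row index and the matched strings.
    print(results["findings"]["text"])

On the pipeline wiring: the `after="eval_pii"` arguments added in PATCH 2/5 force `push_to_hf` and `push_to_argilla` to wait for the scan even though they consume none of its outputs, and from PATCH 4/5 onward the step's three outputs are discarded with `_, _, _ = eval_pii(...)`, so that explicit ordering constraint is what keeps the PII scan on the critical path before anything is pushed.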