diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..aaa42b4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,107 @@ +name: CI + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10', '3.11', '3.12'] + exclude: + # Reduce matrix size - test key combinations + - os: macos-latest + python-version: '3.9' + - os: macos-latest + python-version: '3.10' + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + version: "latest" + + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --all-extras --dev + + - name: Install just + uses: extractions/setup-just@v2 + + - name: Run linting + run: just lint-py + + - name: Run tests + run: just test + + - name: Check formatting + run: | + uv run ruff format --check src/ tests/ + + build: + runs-on: ubuntu-latest + needs: test + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + version: "latest" + + - name: Set up Python + run: uv python install 3.12 + + - name: Install dependencies + run: uv sync --dev + + - name: Build package + run: uv build + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + # Windows executable build (runs only on Windows) + build-exe: + runs-on: windows-latest + needs: test + if: github.event_name == 'push' && github.ref == 'refs/heads/master' + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + version: "latest" + + - name: Set up Python + run: uv python install 3.12 + + - name: Install dependencies + run: uv sync --dev + + - name: Install just + uses: extractions/setup-just@v2 + + - name: Build Windows executable + run: just build-exe + + - name: Upload executable + uses: actions/upload-artifact@v4 + with: + name: windows-executable + path: dist/ diff --git a/.gitignore b/.gitignore index e0e1dfb..7346c50 100644 --- a/.gitignore +++ b/.gitignore @@ -104,3 +104,31 @@ t.csv # exe packaging PII Charts Update.pdf *.msi + +# Modern Python tooling +dist/ +share/python-wheels/ +MANIFEST +.nox/ +*.py,cover +.pytest_cache/ +cover/ + +# uv +uv.lock + +# VS Code +.vscode/ + +# macOS +.DS_Store + +# Windows +Thumbs.db +ehthumbs.db +Desktop.ini + +# PII Detector specific +temp_files/ +output_files/ +test_data_private/ diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000..ea742f6 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,24 @@ +# docs: https://github.com/DavidAnson/markdownlint/blob/v0.32.1/README.md + +# default to true for all rules +default: true + +# MD007/unordered-list-indent +MD007: + indent: 2 + +# MD033/no-inline-html +MD033: false + +# MD041/first-line-h1 +MD041: false + +# MD013/line-length +MD013: false + +# MD024/no-duplicate-heading +MD024: + # Allow when nested under different parents e.g. CHANGELOG.md + siblings_only: true + +MD038: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..1614bc5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,31 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-yaml + - id: check-json + - id: check-toml + - id: check-merge-conflict + - id: trailing-whitespace + - id: end-of-file-fixer + + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.24.1 + hooks: + - id: validate-pyproject + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: + - tomli + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.13.1 + hooks: + - id: ruff-check + args: [--fix] + types_or: [python, pyi, jupyter] + - id: ruff-format + types_or: [python, pyi, jupyter] diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..7d5a58b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,253 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Overview + +This is a modern Python-based PII (Personally Identifiable Information) detection tool that identifies potential PII in datasets and helps create de-identified versions. The application provides both a GUI interface and CLI for analyzing CSV, Excel, and Stata files. Built with modern Python packaging using uv and pyproject.toml. + +## Commands + +### Environment Setup + +```bash +# Get started with development environment +just get-started + +# Or manually: +uv venv +uv sync +``` + +### Running the Application + +```bash +# Launch GUI +just run-gui +# or +uv run python -m pii_detector.gui.frontend + +# Launch CLI +just run-cli +# or +uv run python -m pii_detector.cli.main --help + +# Install Presidio for enhanced PII detection (default: English, small model) +just install-presidio + +# Install Presidio with specific language and model size +just install-presidio spanish md # Spanish, medium model +just install-presidio german lg # German, large model + +# Install specific spaCy model +just install-spacy-model en_core_web_md + +# List available spaCy models +just list-spacy-models + +# Run Presidio demonstration +uv run python examples/presidio_demo.py +``` + +### Development Workflow + +```bash +# Install dependencies +uv sync + +# Run tests +just test + +# Code formatting and linting +just fmt-all + +# Build package +just build +``` + +### Legacy Executable Creation + +```bash +# Create Windows executable (maintains backward compatibility) +just build-exe + +# Create Windows executable with Presidio support +just build-exe-presidio + +# Create installer +just create-installer +``` + +## Architecture + +### Modern Package Structure + +```text +src/pii_detector/ +├── __init__.py # Package initialization +├── core/ # Core PII detection logic +│ ├── processor.py # Main data processing engine (legacy methods) +│ ├── text_analysis.py # Basic text PII detection +│ ├── presidio_engine.py # NEW: Presidio ML-powered text analysis +│ ├── unified_processor.py # NEW: Hybrid structural + ML detection +│ ├── hybrid_anonymizer.py # NEW: Combined anonymization methods +│ ├── hash_utils.py # Basic hashing utilities +│ └── anonymization.py # Comprehensive anonymization techniques +├── data/ # Static data and configurations +│ ├── constants.py # Application constants +│ ├── restricted_words.py # Multi-language PII word lists +│ └── stopwords/ # Language-specific stopwords +├── gui/ # Graphical user interface +│ └── frontend.py # Modern tkinter GUI application +├── cli/ # Command-line interface +│ └── main.py # CLI entry point +└── api/ # External API integrations + └── queries.py # Location/population lookup services +``` + +### Core Components + +**GUI Layer:** + +- `src/pii_detector/gui/frontend.py` - Modern tkinter GUI with improved error handling, class-based design, and better UX + +**CLI Layer:** + +- `src/pii_detector/cli/main.py` - Command-line interface supporting both GUI launch and direct file processing + +**Data Processing Layer:** + +- `src/pii_detector/core/processor.py` - Core backend engine with type hints, improved error handling, and modern Python patterns +- `src/pii_detector/core/text_analysis.py` - Basic text-based PII detection with regex patterns +- `src/pii_detector/core/presidio_engine.py` - **NEW**: Microsoft Presidio integration for ML-powered text analysis +- `src/pii_detector/core/unified_processor.py` - **NEW**: Hybrid detection combining structural analysis with Presidio +- `src/pii_detector/core/hybrid_anonymizer.py` - **NEW**: Advanced anonymization using both statistical and ML methods + +**Configuration and Data:** + +- `src/pii_detector/data/constants.py` - Type-safe constants with clear organization +- `src/pii_detector/data/restricted_words.py` - Centralized word lists with proper typing and documentation +- `src/pii_detector/data/stopwords/` - Language-specific stopword files for text processing + +**External Integration:** + +- `src/pii_detector/api/queries.py` - Location population queries with improved error handling and API credential management + +**Utilities:** + +- `src/pii_detector/core/hash_utils.py` - Basic hashing utilities for pseudonymization +- `src/pii_detector/core/anonymization.py` - Comprehensive anonymization techniques based on academic research + +### PII Detection Methods + +The system uses four primary detection strategies implemented in `src/pii_detector/core/processor.py`: + +1. **Column Name/Label Matching** (`find_piis_based_on_column_name()`) - Matches column names against restricted word lists using strict or fuzzy matching +2. **Format Pattern Detection** (`find_piis_based_on_column_format()`) - Identifies phone numbers, dates, and other formatted data +3. **Sparsity Analysis** (`find_piis_based_on_sparse_entries()`) - Flags columns where most values are unique (open-ended questions) +4. **Location Population Checks** (`find_piis_based_on_locations_population()`) - Identifies small locations via external API queries + +### Comprehensive Anonymization Techniques + +The system provides extensive anonymization capabilities in `src/pii_detector/core/anonymization.py` based on FSD guidelines and academic research: + +**Removal Techniques:** +- **Variable Removal** - Complete deletion of identifying columns +- **Record Removal** - Elimination of records with unique quasi-identifier combinations +- **Selective Suppression** - Targeted removal of specific data points + +**Pseudonymization Methods:** +- **Hash-based Pseudonymization** - Consistent pseudonyms using cryptographic hashing +- **Name Replacement** - Systematic replacement with generic identifiers +- **Identifier Encoding** - Convert identifiers to non-reversible codes + +**Recoding/Categorization:** +- **Age Categorization** - Convert ages to broad age groups +- **Income Bracketing** - Group income values into ranges +- **Geographic Generalization** - Convert specific locations to broader regions +- **Date Generalization** - Reduce date precision (year, month, quarter) +- **Top/Bottom Coding** - Cap extreme values in continuous variables + +**Randomization Techniques:** +- **Noise Addition** - Add statistical noise (Gaussian or uniform) to numeric data +- **Permutation Swapping** - Randomly swap values between records +- **Data Perturbation** - Introduce controlled random variations + +**Statistical Disclosure Control:** +- **K-anonymity** - Ensure each record is indistinguishable from k-1 others +- **L-diversity** - Maintain diversity in sensitive attributes (mock implementation) +- **T-closeness** - Preserve overall distribution of sensitive attributes (mock) +- **Differential Privacy** - Add calibrated noise for privacy guarantees (mock) + +**Text Anonymization:** +- **Pattern Masking** - Replace PII patterns (emails, phones, SSNs) with placeholders +- **Selective Text Suppression** - Remove specific types of information from text +- **Named Entity Redaction** - Identify and mask person/location names in text + +**Quality Assurance:** +- **Anonymization Reporting** - Detailed reports on transformations applied +- **Data Utility Metrics** - Measure information loss from anonymization +- **Privacy Risk Assessment** - Evaluate remaining disclosure risks + +### Data Flow + +1. User selects dataset file through GUI (`app_frontend.py`) +2. File is loaded and parsed (`import_dataset()` in `PII_data_processor.py`) +3. PII detection algorithms are applied based on user-selected options +4. Results are presented in GUI for user review and action selection (Drop/Encode/Keep) +5. De-identified dataset and accompanying files are generated based on user choices + +### File Format Support + +- **CSV/Excel**: Direct pandas import +- **Stata (.dta)**: Preserves variable labels and value labels for comprehensive analysis + +### Key Dependencies + +- `pandas` - Primary data manipulation +- `tkinter` - GUI framework +- `requests` - API communication for location lookups +- `selenium` - Web scraping capabilities (likely for location data) +- PyInstaller ecosystem for executable creation + +## Development Notes + +### Modern Python Practices + +- **Type hints**: Core modules use type annotations for better code documentation and IDE support +- **Error handling**: Improved exception handling and user feedback throughout the application +- **Code organization**: Clear separation of concerns with dedicated modules for each functionality +- **Environment variables**: Secure handling of API keys and configuration through environment variables + +### Build System + +- **uv build backend**: Fast, modern build system replacing setuptools +- **pyproject.toml**: Centralized project configuration following PEP 518 standards +- **just task runner**: Simplified development workflow with cross-platform commands +- **pre-commit hooks**: Automated code quality checks with ruff formatting and linting + +### Testing and Quality + +- **pytest framework**: Modern testing setup with coverage reporting +- **ruff**: Fast Python linter and formatter replacing multiple tools +- **codespell**: Spell checking for documentation and code comments +- **CI/CD ready**: Configuration files support automated testing workflows + +### Backward Compatibility + +- **Executable creation**: Maintains PyInstaller workflow for Windows deployment +- **Asset handling**: Logo and template files preserved in `assets/` directory +- **Functionality preservation**: All original PII detection capabilities maintained + +### Deployment Options + +- **Package installation**: `uv pip install .` for local development +- **Executable distribution**: Traditional `.exe` creation for end users +- **PyPI ready**: Package structure supports publishing to Python Package Index +- **Cross-platform**: Works on Windows, macOS, and Linux (GUI requires display) + +### API Integration + +- **GeoNames API**: Location population lookup (requires `GEONAMES_USERNAME` environment variable) +- **Forebears API**: Name validation service (requires `FOREBEARS_API_KEY` environment variable) +- **Chrome/Selenium**: Google search fallback for population data (requires ChromeDriver) diff --git a/Justfile b/Justfile new file mode 100644 index 0000000..518f865 --- /dev/null +++ b/Justfile @@ -0,0 +1,245 @@ +# PII Detector Development Workflow +# Requires: just, uv + +set windows-shell := ["powershell.exe", "-NoLogo", "-Command"] + +# Set path to virtual environment's python + +python_dir := ".venv/" +python := python_dir + if os_family() == "windows" { "Script/python.exe" } else { "/python3" } + +# List available commands +default: + @just --list + +# Display system information +system-info: + @echo "CPU architecture: {{ arch() }}" + @echo "Operating system type: {{ os_family() }}" + @echo "Operating system: {{ os() }}" + +# Initial set up and global installations +get-started: pre-install venv activate-venv + +# Environment setup and management +clean: + @echo "Removing virtual environment..." + uv venv --rm || true + @echo "Environment cleaned." + +# create virtual environment +venv: + uv sync + uv tool install pre-commit + pre-commit install + +activate-venv: + @echo "To activate the virtual environment, run:" + @echo " .venv\\Scripts\\activate (Windows)" + @echo " source .venv/bin/activate (Unix)" + +update-reqs: + @echo "Updating dependencies and pre-commit hooks..." + uv sync --upgrade + uv run pre-commit autoupdate + +# Legacy application execution +run-gui-legacy: + @echo "Launching PII Detector GUI..." + uv run python -m pii_detector.gui.frontend + +# Application execution +run-gui: + @echo "Launching PII Detector GUI..." + uv run python -m pii_detector.gui.flet_main + +run-cli: + @echo "PII Detector CLI - Available commands:" + uv run python -m pii_detector.cli.main + +# CLI subcommands for direct usage +cli-help: + @echo "Available CLI commands:" + uv run python -m pii_detector.cli.main --help + +cli-analyze file *args: + @echo "Analyzing file: {{ file }}" + uv run python -m pii_detector.cli.main analyze {{ file }} {{ args }} + +cli-batch pattern *args: + @echo "Batch processing: {{ pattern }}" + uv run python -m pii_detector.cli.main batch {{ pattern }} {{ args }} + +cli-anonymize file *args: + @echo "Anonymizing file: {{ file }}" + uv run python -m pii_detector.cli.main anonymize {{ file }} {{ args }} + +cli-report file *args: + @echo "Generating report for: {{ file }}" + uv run python -m pii_detector.cli.main report {{ file }} {{ args }} + +# Development tools +test: + @echo "Running test suite..." + uv run pytest + +test-cov: + @echo "Running tests with coverage report..." + uv run pytest --cov-report=html + @echo "Coverage report generated in htmlcov/" + +# Test batch processing functionality specifically +test-batch: + @echo "Testing batch processing functionality..." + uv run python tests/test_runner.py + +# Test batch processing with minimal dependencies +test-batch-basic: + @echo "Testing batch processing with basic dependencies only..." + uv run python -c "import sys; sys.path.append('src'); from tests.test_runner import check_imports; check_imports()" + +# Run batch processing tests with pytest +test-batch-full: + @echo "Running full batch processing test suite..." + uv run pytest tests/test_batch_processing.py -v + +# Test presidio integration +test-presidio: + @echo "Running Presidio integration tests..." + uv run pytest tests/test_presidio_integration.py -v + +# Code quality +lint-py: + @echo "Linting Python code..." + uv run ruff check src/ tests/ + +fmt-python: + @echo "Formatting Python code..." + uv run ruff format src/ tests/ + +lint-fix: + @echo "Linting and fixing Python code..." + uv run ruff check --fix src/ tests/ + +spell-check: + @echo "Checking spelling..." + uv run codespell src/ tests/ docs/ README.md + +# Format all markdown and config files +fmt-markdown: + markdownlint --config .markdownlint.yaml "**/*.{md,qmd}" --fix + +# Format a single markdown file, "f" +fmt-md f: + markdownlint --config .markdownlint.yaml {{ f }} --fix + +# Check format of all markdown files +fmt-check-markdown: + markdownlint --config .markdownlint.yaml "**/*.{md,qmd}" + +fmt-all: fmt-python lint-fix spell-check fmt-markdown + @echo "All formatting and linting complete!" + +# Pre-commit hooks +pre-commit-install: + @echo "Installing pre-commit hooks..." + uv run pre-commit install + +pre-commit-run: + @echo "Running pre-commit hooks..." + uv run pre-commit run --all-files + +# Build and distribution +build: + @echo "Building distribution packages..." + uv build + +install-local: + @echo "Installing package locally in development mode..." + uv pip install -e . + +# Executable creation +build-exe: + @echo "Creating Windows executable with PyInstaller..." + uv run pyinstaller --windowed --name=pii_detector --icon=assets/app-icon.ico --add-data="assets/app-icon.ico;." --add-data="assets/ipa-logo.jpg;." --add-data="assets/anonymize_script_template_v2.do;." --additional-hooks-dir=assets --hiddenimport srsly.msgpack.util --noconfirm src/pii_detector/gui/frontend.py + +# Executable creation with Presidio support +build-exe-presidio: + @echo "Creating Windows executable with Presidio support..." + uv sync --extra presidio + uv run pyinstaller --windowed --name=pii_detector_presidio --icon=assets/app-icon.ico --add-data="assets/app-icon.ico;." --add-data="assets/ipa-logo.jpg;." --add-data="assets/anonymize_script_template_v2.do;." --additional-hooks-dir=assets --hiddenimport presidio_analyzer --hiddenimport presidio_anonymizer --hiddenimport spacy --hiddenimport en_core_web_sm --hiddenimport srsly.msgpack.util --noconfirm src/pii_detector/gui/frontend.py + +# Install Presidio dependencies +install-presidio language="en" model_size="sm": + @echo "Installing Presidio dependencies..." + uv sync --extra presidio + @echo "Installing spaCy model for {{ language }} ({{ model_size }} size)..." + uv run python -c "from pii_detector.core.model_manager import ensure_spacy_model; ensure_spacy_model('{{ language }}', '{{ model_size }}')" + @echo "Presidio installation complete!" + @echo "Test installation with: uv run python examples/presidio_demo.py" + +# Install specific spaCy model +install-spacy-model model_name: + @echo "Installing spaCy model: {{ model_name }}..." + uv run python -c "from pii_detector.core.model_manager import install_spacy_model; install_spacy_model('{{ model_name }}')" + +# List available spaCy models +list-spacy-models: + @echo "Available spaCy models:" + uv run python scripts/manage_models.py list + +# Model management utility +manage-models *args: + @echo "Running model management utility..." + uv run python scripts/manage_models.py {{ args }} + +# Install Presidio with structured data support for batch processing +install-presidio-batch: + @echo "Installing Presidio with batch processing support..." + uv sync --extra batch + uv run python -c "from pii_detector.core.model_manager import ensure_spacy_model; ensure_spacy_model('en', 'sm')" + @echo "Batch processing installation complete!" + @echo "Test with: just run-batch-demo" + +# Run batch processing demo +run-batch-demo: + @echo "Running batch processing demonstration..." + uv run python examples/run_batch_examples.py + +# Run presidio demo +run-presidio-demo: + @echo "Running Presidio demonstration..." + uv run python examples/presidio_demo.py + +# Documentation +docs-serve: + @echo "Serving documentation locally..." + @echo "Documentation serving not yet implemented" + +# Cleanup +clean-build: + @echo "Cleaning build artifacts..." + rm -rf dist/ build/ *.egg-info/ htmlcov/ .coverage .pytest_cache/ + +clean-all: clean clean-build + @echo "All clean!" + +# Platform-specific pre-install commands +[windows] +pre-install: + @echo "Installing Windows prerequisites..." + @echo "Ensure you have installed: just, uv" + winget install Git.Git Casey.Just astral-sh.uv OpenJS.NodeJS + npm install -g markdownlint-cli + +[linux] +pre-install: + @echo "Installing Unix prerequisites..." + @echo "Ensure you have Homebrew installed: https://brew.sh/" + brew install just uv markdownlint-cli + +[macos] +pre-install: + @echo "Installing macOS prerequisites..." + @echo "Ensure you have Homebrew installed: https://brew.sh/" + brew install just uv markdownlint-cli diff --git a/LICENSE b/LICENSE index 880515b..b2246f9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017 Innovations for Poverty Action +Copyright (c) 2025 Innovations for Poverty Action Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/PII_data_processor.py b/PII_data_processor.py deleted file mode 100644 index fe709d6..0000000 --- a/PII_data_processor.py +++ /dev/null @@ -1,807 +0,0 @@ -import restricted_words as restricted_words_list -import pandas as pd -# from nltk.stem.porter import PorterStemmer -import time -import numpy as np - -from constant_strings import * - -import urllib.request as urllib2 - -import api_queries - -import find_piis_in_unstructured_text as unstructured_text - -import fileinput -import shutil -import os -from datetime import date - -import hash_generator - -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - -import os -from os import listdir -from os.path import isfile, isdir, join -import ntpath -import shutil - -OUTPUTS_FOLDER = None -LOG_FILE_PATH = None - -def get_surveycto_restricted_vars(): - return restricted_words_list.get_surveycto_restricted_vars() - -def import_dataset(dataset_path): - - dataset, label_dict, value_label_dict = False, False, False - raise_error = False - status_message = False - - # if dataset_path.endswith(('"', "'")): - # dataset_path = dataset_path[1:-1] - - # dataset_path_l = dataset_path.lower() - - - #Check format - if(dataset_path.endswith(('xlsx', 'xls','csv','dta')) is False): - return (False, 'Supported files are .csv, .dta, .xlsx, .xls') - - try: - if dataset_path.endswith(('xlsx', 'xls')): - dataset = pd.read_excel(dataset_path) - elif dataset_path.endswith('csv'): - dataset = pd.read_csv(dataset_path) - elif dataset_path.endswith('dta'): - try: - dataset = pd.read_stata(dataset_path) - except ValueError: - dataset = pd.read_stata(dataset_path, convert_categoricals=False) - label_dict = pd.io.stata.StataReader(dataset_path).variable_labels() - try: - value_label_dict = pd.io.stata.StataReader(dataset_path).value_labels() - except AttributeError: - status_message = "No value labels detected. " # Not printed in the app, overwritten later. - elif dataset_path.endswith(('xpt', '.sas7bdat')): - dataset = pd.read_sas(dataset_path) - elif dataset_path.endswith('vc'): - status_message = "**ERROR**: This folder appears to be encrypted using VeraCrypt." - raise Exception - elif dataset_path.endswith('bc'): - status_message = "**ERROR**: This file appears to be encrypted using Boxcryptor. Sign in to Boxcryptor and then select the file in your X: drive." - raise Exception - else: - raise Exception - - except (FileNotFoundError, Exception): - if status_message is False: - status_message = '**ERROR**: This path appears to be invalid. If your folders or filename contain colons or commas, try renaming them or moving the file to a different location.' - raise - - if (status_message): - log_and_print("There was an error") - log_and_print(status_message) - return (False, status_message) - - log_and_print('The dataset has been read successfully.\n') - dataset_read_return = [dataset, dataset_path, label_dict, value_label_dict] - return (True, dataset_read_return) - -def word_match(column_name, restricted_word, type_of_matching=STRICT): - - if(type_of_matching == STRICT): - return column_name.lower() == restricted_word.lower() - else: # type_of_matching == FUZZY - #Check if restricted word is inside column_name - return restricted_word.lower() in column_name.lower() - - -def remove_other_refuse_and_dont_know(column): - - #List of values to remove. All numbers with 3 digits where all digits are the same - values_to_remove = [str(111*i) for i in range(-9,10) if i !=0] - - filtered_column = column[~column.isin(values_to_remove)] - - return filtered_column - - -def clean_column(column): - #Drop NaNs - column_filtered = column.dropna() - - #Remove empty entries - column_filtered = column_filtered[column_filtered!=''] - - #Remove other, refuses and dont knows - if len(column_filtered)!=0: - column_filtered = remove_other_refuse_and_dont_know(column_filtered) - - return column_filtered - -def column_is_sparse(dataset, column_name, sparse_threshold): - - column_filtered = clean_column(dataset[column_name]) - - #Check sparcity - n_entries = len(column_filtered) - n_unique_entries = column_filtered.nunique() - - if n_entries != 0 and n_unique_entries/n_entries > sparse_threshold: - return True - else: - return False - -def column_has_sufficiently_sparse_strings(dataset, column_name, sparse_threshold=0.2): - ''' - Checks if 'valid' column entries are sparse, defined as ratio between unique_entries/total_entries. - Consider only valid stands, aka, exludet NaN, '', Other, Refuse to respond, Not Know - ''' - - #Check if column type is string - if dataset[column_name].dtypes == 'object': - return column_is_sparse(dataset, column_name, sparse_threshold) - else: - return False - - -def column_has_sparse_value_label_dicts(column_name, value_label_dict, sparse_threshold = 10): - ''' - Check if for a given column, its values come encoded in a dictionary and are sufficiently sparse - ''' - if column_name in value_label_dict and value_label_dict[column_name] != '' and len(value_label_dict[column_name])>sparse_threshold: - return True - else: - return False - -def find_piis_based_on_column_name(dataset, label_dict, value_label_dict, columns_to_check, consider_locations_cols): - - #Identifies columns whose names or labels match (strict or fuzzy) any word in the predefined list of restricted words. Also considers that data entries must be sufficiently sparse strings (Ideally, this method will capture columns with people names) or value label dictionaries (for locations) - - pii_strict_restricted_words = restricted_words_list.get_strict_restricted_words() - pii_fuzzy_restricted_words = restricted_words_list.get_fuzzy_restricted_words() - - - #If consider_locations_cols = 1, then consider locations columns in the search - if(consider_locations_cols == 1): - #If we are not checking locations populations, then include locations columns as part of restricted words - locations_strict_restricted_words = restricted_words_list.get_locations_strict_restricted_words() - locations_fuzzy_restricted_words = restricted_words_list.get_locations_fuzzy_restricted_words() - - pii_strict_restricted_words = set(pii_strict_restricted_words + locations_strict_restricted_words) - pii_fuzzy_restricted_words = set(pii_fuzzy_restricted_words + locations_fuzzy_restricted_words) - - #We will save all restricted words in a dictionary, where the keys are the words and their values is if we are looking for a strict or fuzzy matching with that word - restricted_words = {} - for word in pii_strict_restricted_words: - restricted_words[word] = STRICT - for word in pii_fuzzy_restricted_words: - restricted_words[word] = FUZZY - - # Looks for matches between column names (and labels) to restricted words - possible_pii = {} - - #For every column name in our dataset - for column_name in columns_to_check: - #For every restricted word - for restricted_word, type_of_matching in restricted_words.items(): - #Check if restricted word is in the column name - column_name_match = word_match(column_name, restricted_word, type_of_matching) - - #If there is a dictionary of labels, check match with label - if label_dict is not False: #label_dict will be False in case of no labels - column_label = label_dict[column_name] - column_label_match = word_match(column_label, restricted_word, type_of_matching) - else: - column_label_match = False - - #If there was a match between column name or label with restricted word - if column_name_match or column_label_match: - - #If there was a strict match with restricted word - if type_of_matching == STRICT: - log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"'") - - possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"'" - - - #If column has strings and is sparse - elif column_has_sufficiently_sparse_strings(dataset, column_name): - - #Log result and save column as possible pii. Theres different log depending if match was with column or label - if(column_name_match): - log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has sufficiently sparse strings") - - possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has sufficiently sparse strings" - - elif(column_label_match): - log_and_print("Column '"+column_name+ "' considered possible pii given column label '"+column_label+"' had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has sufficiently sparse strings") - - possible_pii[column_name] = "Label had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has sufficiently sparse strings" - #If found, I dont need to keep checking this column with other restricted words - break - - #Else, check if column has values labels (locations are usually stores this way) - elif column_has_sparse_value_label_dicts(column_name, value_label_dict): - - if(column_name_match): - log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and values labels are sparse") - - possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and values labels are sparse" - - elif(column_label_match): - log_and_print("Column '"+column_name+ "' considered possible pii given column label '"+column_label+"' had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and values labels are sparse") - - possible_pii[column_name] = "Label had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and values labels are sparse" - #If found, I dont need to keep checking this column with other restricted words - break - - return possible_pii - - - -def column_has_locations_with_low_populations(dataset, column_name, country): - - column_filtered = clean_column(dataset[column_name]) - - #Get unique values - unique_locations = column_filtered.unique().tolist() - - return api_queries.get_locations_with_low_population(unique_locations, country=country, return_one=True) - - -def log_and_print(message): - file = open(LOG_FILE_PATH, "a") - file.write(message+'\n') - file.close() - print(message) - - -def log_and_print(message): - file = open(LOG_FILE_PATH, "a") - file.write(message+'\n') - file.close() - print(message) - -def find_piis_based_on_locations_population(dataset, label_dict, columns_to_check, country): - #Identifies columns whose names or labels match (strict or fuzzy) words related to locations. Then, check if for those columns, any value relates to a location with population under 20,000. If it is the case, then it flags the column. - - #Lots of repeated code respect to find_piis_based_on_column_name, could refactor. - - locations_strict_restricted_words = restricted_words_list.get_locations_strict_restricted_words() - locations_fuzzy_restricted_words = restricted_words_list.get_locations_fuzzy_restricted_words() - - #We will save all restricted words in a dictionary, where the keys are the words and their values is if we are looking for a strict or fuzzy matching with that word - restricted_words = {} - for word in locations_strict_restricted_words: - restricted_words[word] = STRICT - for word in locations_fuzzy_restricted_words: - restricted_words[word] = FUZZY - - # Looks for matches between column names (and labels) to restricted words - possible_pii = {} - - #For every column name in our dataset - for column_name in columns_to_check: - #For every restricted word - for restricted_word, type_of_matching in restricted_words.items(): - #Check if restricted word is in the column name - column_name_match = word_match(column_name, restricted_word, type_of_matching) - - #If there is a dictionary of labels, check match with label - if label_dict is not False: #label_dict will be False in case of no labels - column_label = label_dict[column_name] - column_label_match = word_match(column_label, restricted_word, type_of_matching) - else: - column_label_match = False - - #If there was a match between column name or label with restricted word - if column_name_match or column_label_match: - - location_with_low_population = column_has_locations_with_low_populations(dataset, column_name, country) - - if(location_with_low_population): - #Log result and save column as possible pii. Theres different log depending if match was with column or label - if(column_name_match): - log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has a location with population under 20,000: "+location_with_low_population) - - possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has a location with population under 20,000: "+location_with_low_population - - elif(column_label_match): - log_and_print("Column '"+column_name+ "' considered possible pii given column label '"+column_label+"' had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has a location with population under 20,000: "+location_with_low_population) - - possible_pii[column_name] = "Label had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has a location with population under 20,000: "+location_with_low_population - #If found, I dont need to keep checking this column with other restricted words - break - - return possible_pii - -def find_piis_based_on_sparse_entries(dataset, label_dict, columns_to_check, sparse_values_threshold=0.3): - #Identifies pii based on columns having sparse values - - possible_pii={} - for column_name in columns_to_check: - - if column_is_sparse(dataset, column_name, sparse_threshold=sparse_values_threshold): - - log_and_print("Column '"+column_name+"' considered possible pii given entries are sparse") - possible_pii[column_name] = "Column entries are too sparse" - - return possible_pii - - -def find_columns_with_specific_format(dataset, format_to_search, columns_to_check): - - columns_with_phone_numbers = {} - - if format_to_search == PHONE_NUMBER: - regex_expression = ".*(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}).*" - - elif format_to_search == DATE: - - #dd/mm/yy, (with -, / or .) - regex_date_1 = "((0[1-9]|[12]\d|3[01])(\/|-|\.)(0[1-9]|1[0-2])(\/|-|\.)[12]\d{3})" - #mm/dd/yyy, (with -, / or .) - regex_date_2 = "((0[1-9]|1[0-2])(\/|-|\.)(0[1-9]|[12]\d|3[01])(\/|-|\.)[12]\d{3})" - #yyyy/mm/dd, (with -, / or .) - regex_date_3 = "([12]\d{3}(\/|-|\.)(0[1-9]|1[0-2])(\/|-|\.)(0[1-9]|[12]\d|3[01]))" - - regex_expression = regex_date_1+'|'+regex_date_2+'|'+regex_date_3 - - for column in columns_to_check: - - #Check that all values in column are not NaN - if(pd.isnull(dataset[column]).all() == False): - - #Find first 10 values that are not NaN nor empty space '' - column_with_no_nan = dataset[column].dropna() - column_with_no_empty_valyes = column_with_no_nan[column_with_no_nan != ''] - first_10_values = column_with_no_empty_valyes.iloc[0:10] - - match_result = first_10_values.astype(str).str.match(pat = regex_expression) - - #If all not NaN values matched with regex, save column as PII candidate - if(any(match_result)): - log_and_print("Column '"+column+"' considered possible pii given column entries have "+format_to_search+" format") - columns_with_phone_numbers[column]= "Column entries have "+format_to_search+" format" - - return columns_with_phone_numbers - -def export_encoding(dataset_path, encoding_dict): - dataset_complete_file_name = ntpath.basename(dataset_path) - dataset_file_name_no_extension, dataset_type = os.path.splitext(dataset_complete_file_name) - - encoding_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_encodingmap.csv') - -def save_all_piis_in_txt_file(list_variables_to_drop, list_variables_to_encode): - - all_piis_txt_file = os.path.join(OUTPUTS_FOLDER,'all_piis_identified.txt') - delete_if_exists(all_piis_txt_file) - file = open(all_piis_txt_file, "a") - if len(list_variables_to_drop)>0: - file.write(f'Columns to drop: {" ".join(list_variables_to_drop)}\n') - if len(list_variables_to_encode)>0: - file.write(f'Columns to encode: {" ".join(list_variables_to_encode)}') - file.close() - - -def create_deidentifying_do_file(dataset_path, pii_candidates_to_action): - ''' - Using anonymize_script_tempalte.txt as a starting point, we create a .do file that deidentifies dataset according to pii_candidates_to_action - ''' - #Make a copy of the template file - template_file = 'anonymize_script_template_v2.do' - script_filename= os.path.join(OUTPUTS_FOLDER, 'anonymize_script.do') - - delete_if_exists(script_filename) - shutil.copyfile(template_file, script_filename) - - deidentified_dataset_path = dataset_path.split('.')[0] + '_deidentified.dta' - - #Create list of vars to drop and encode - list_variables_to_drop = [] - list_variables_to_encode = [] - for pii_candidate, action in pii_candidates_to_action.items(): - if action == 'Drop': - list_variables_to_drop.append(pii_candidate) - elif action == 'Encode': - list_variables_to_encode.append(pii_candidate) - - - #Read all lines and replace whenever we find one of the keywords - with fileinput.FileInput(script_filename, inplace=True) as file: #, backup='.bak' - today_string = date.today().strftime("%m/%d/%y") - for line in file: - #Create modified_line - modified_line = line - modified_line = modified_line.replace('[date]', today_string) - modified_line = modified_line.replace('[input_file_path]', dataset_path) - modified_line = modified_line.replace('[output_file_path]', deidentified_dataset_path) - - modified_line = modified_line.replace('[list_variables_to_drop_space_delimited]', " ".join(list_variables_to_drop)) - modified_line = modified_line.replace('[list_variables_to_hash_space_delimited]', " ".join(list_variables_to_encode)) - - #The template .do file has an option to only remove value labels, we are not using that option so we will by default select no variables for that. - modified_line = modified_line.replace('[list_variables_to_remove_value_labelling_space_delimited]', "") - - #Save modified line in file - #print here will print in the file, not actually printing in console - print(modified_line, end='') - - #Write down list of variables in a document - save_all_piis_in_txt_file(list_variables_to_drop, list_variables_to_encode) - -def delete_if_exists(file_path): - if os.path.exists(file_path): - os.remove(file_path) - -def export_encoding(dataset_path, encoding_dict): - encoding_file_path = dataset_path.split('.')[0] + '_encodingmap.csv' - - #Delete if file exists - delete_if_exists(encoding_file_path) - - encoding_df = pd.DataFrame(columns=['variable','orginial value', 'encoded value']) - - for variable, values_dict in encoding_dict.items(): - for original_value, encoded_value in values_dict.items(): - encoding_df.loc[-1] = [variable, original_value, encoded_value] - encoding_df.index = encoding_df.index + 1 - encoding_df.to_csv(encoding_file_path, index=False) - -def create_anonymized_dataset(dataset, label_dict, dataset_path, pii_candidate_to_action, columns_where_to_replace_piis = None, piis_found_in_ustructured_text = None): - - #Drop columns - columns_to_drop = [column for column in pii_candidate_to_action if pii_candidate_to_action[column]=='Drop'] - - dataset = dataset.drop(columns=columns_to_drop) - log_and_print("Dropped columns: "+ " ".join(columns_to_drop)) - - #Encode columns - columns_to_encode = [column for column in pii_candidate_to_action if pii_candidate_to_action[column]=='Encode'] - - if(len(columns_to_encode)>0): - log_and_print("Hashed columns: "+ " ".join(columns_to_encode)) - dataset, encoding_used = recode(dataset, columns_to_encode) - log_and_print("Map file for encoded values created.") - export_encoding(dataset_path, encoding_used) - - #Replace piis in unstructured text - if(columns_where_to_replace_piis and piis_found_in_ustructured_text): - for c in columns_where_to_replace_piis: - dataset[c].replace(piis_found_in_ustructured_text, 'XXXX', regex=True, inplace=True) - - exported_file_path = export(dataset, dataset_path, label_dict) - - return exported_file_path - -def find_survey_cto_vars(dataset): - surveycto_vars = restricted_words_list.get_surveycto_vars() - - possible_pii = {} - #For every column name in our dataset - for column_name in dataset.columns: - #For every restricted word - for restricted_word in surveycto_vars: - #Check if restricted word is in the column name - if word_match(column_name, restricted_word): - possible_pii[column_name] = 'SurveyCTO variable' - - return possible_pii - - -def find_piis_based_on_column_format(dataset, label_dict, columns_to_check): - - all_piis_detected = {} - - #Find columns with phone numbers formats - columns_with_phone_numbers = find_columns_with_specific_format(dataset, PHONE_NUMBER, columns_to_check) - all_piis_detected.update(columns_with_phone_numbers) - - columns_with_dates = find_columns_with_specific_format(dataset, DATE, columns_to_check) - all_piis_detected.update(columns_with_dates) - - return all_piis_detected - -def create_outputs_folder(dataset_path): - directory_path = os.path.dirname(dataset_path) - - global OUTPUTS_FOLDER - OUTPUTS_FOLDER = directory_path+'/pii_detection_outputs' - - if os.path.exists(OUTPUTS_FOLDER): - shutil.rmtree(OUTPUTS_FOLDER) - os.mkdir(OUTPUTS_FOLDER) - - -def create_log_file_path(dataset_path): - - global LOG_FILE_PATH - LOG_FILE_PATH = OUTPUTS_FOLDER+"/log.txt" - delete_if_exists(LOG_FILE_PATH) - -def import_file(dataset_path): - - #Create outputs folder and log file - create_outputs_folder(dataset_path) - - #Create log file - create_log_file_path(dataset_path) - - #Read file - import_status, import_result = import_dataset(dataset_path) - - #Check if error ocurr - if import_status is False: - return import_status, import_result - - #If no error, decouple import result - dataset, dataset_path, label_dict, value_label_dict = import_result - - #Save results in dictionary for return - response_content = {} - response_content[DATASET] = dataset - response_content[LABEL_DICT] = label_dict - response_content[VALUE_LABEL_DICT] = value_label_dict - - return True, response_content - - - -def recode(dataset, columns_to_encode): - - #Keep record of encoding - econding_used = {} - - for var in columns_to_encode: - - #For hashing, we will use hmac-sha1, then sort the hashed values and assign values 1-n. - # Make dictionary of old and new values. - #First there is a step between - unique_val_to_hmacsha1 = {} - hmacsha1_to_final_hash = {} - - for unique_val in dataset[var].dropna().unique(): - unique_val_to_hmacsha1[unique_val] = hash_generator.hmac_sha1('[SECRET KEY]', unique_val) - - #Get list of all hmac-sha1 hashes and sort them - sorted_hash = [v for k, v in sorted(unique_val_to_hmacsha1.items(), key=lambda item: item[1])] - - #Create dict that points from hmac-sha1 hashes to a 1-n value - hmacsha1_to_final_hash = {} - for index, hash in enumerate(sorted_hash): - hmacsha1_to_final_hash[hash]=index+1 - - #Join two dictionaries - unique_val_to_final_hash = {} - for k, v in unique_val_to_hmacsha1.items(): - unique_val_to_final_hash[k] = hmacsha1_to_final_hash[v] - - #Replace column with its hashes. First create list of all hashed values - hashed_column = [] - for value in dataset[var].tolist(): - if value is np.nan: - hashed_column.append(np.nan) - else: - hashed_column.append(unique_val_to_final_hash[value]) - dataset[var] = hashed_column - - print(var + ' has been successfully encoded.') - econding_used[var] = unique_val_to_final_hash - - return dataset, econding_used - -def find_piis_unstructured_text(dataset, label_dict, columns_still_to_check, language, country): - - #Filter columns to those that have sparse entries - columns_to_check = [] - for column_name in columns_still_to_check: - if column_has_sufficiently_sparse_strings(dataset, column_name): - columns_to_check.append(column_name) - - pii_candidates_unstructured_text = unstructured_text.find_piis(dataset, label_dict, columns_to_check, language, country) - - log_and_print(f'Piis found in columns {columns_to_check} with unstructured text: {pii_candidates_unstructured_text}') - - return pii_candidates_unstructured_text, columns_to_check - - - -def input_file_is_dta(dataset_path): - dataset_file_name_no_extension, dataset_type = os.path.splitext(dataset_path) - - if dataset_type == '.dta': - return True - else: - return False - -def export(dataset, dataset_path, variable_labels = None): - - dataset_complete_file_name = ntpath.basename(dataset_path) - dataset_file_name_no_extension, dataset_type = os.path.splitext(dataset_complete_file_name) - - if(dataset_type == '.csv'): - new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.csv') - delete_if_exists(new_file_path) - dataset.to_csv(new_file_path, index=False) - - elif(dataset_type == '.dta'): - new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.dta') - delete_if_exists(new_file_path) - try: - dataset.to_stata(new_file_path, variable_labels = variable_labels, write_index=False) - except: - dataset.to_stata(new_file_path, version = 118, variable_labels = variable_labels, write_index=False) - - elif(dataset_type == '.xlsx'): - new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.xlsx') - delete_if_exists(new_file_path) - dataset.to_excel(new_file_path, index=False) - - elif(dataset_type == '.xls'): - new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.xls') - delete_if_exists(new_file_path) - dataset.to_excel(new_file_path, index=False) - - else: - log_and_print("Data type not supported") - new_file_path = None - - return new_file_path - - -def internet_on(): - try: - urllib2.urlopen('http://google.com', timeout=2) - return True - except Exception as e: - log_and_print(e) - return False - -def get_directories_path_in_folder(folder_path): - only_directories = [join(folder_path, f) for f in listdir(folder_path) if isdir(join(folder_path, f))] - return only_directories - -def get_files_path_in_folder(folder_path): - only_files = [join(folder_path, f) for f in listdir(folder_path) if isfile(join(folder_path, f))] - return only_files - -def get_testing_tuple(folder_path): - only_files = get_files_path_in_folder(folder_path) - - data_source = None - excel_with_ground_truth_pii = None - country_file = None - - for file in only_files: - if file.split('.')[-1]=='dta': - data_source = file - continue - if file.split('-')[-1]=='true_piis.xlsx': - excel_with_ground_truth_pii = file - continue - if file.split('.')[-1]=='txt': - country_file = file - continue - if data_source and excel_with_ground_truth_pii and country_file: - return True, (data_source, excel_with_ground_truth_pii, country_file) - else: - return False, False - - -def get_test_files_tuples(): - - all_test_files_tuples = [] - - #Look for files in X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw\ - #For every folder inside, if folder has .dta and .xlsx ending with -piis.xlsx, add it to list - - folder_with_raw_data = 'X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw' - only_directories = get_directories_path_in_folder(folder_with_raw_data) - - for dir in only_directories: - #Check that dir has .dta and .xls - dir_has_testing_tuple, testing_tuple = get_testing_tuple(dir) - if dir_has_testing_tuple: - all_test_files_tuples.append((testing_tuple[0], testing_tuple[1], testing_tuple[2])) - - return all_test_files_tuples - -def get_country(country_file_path): - with open(country_file_path) as f: - lines = f.readlines() - return lines[0] - -def run_tests(): - - test_files_tuples = get_test_files_tuples() - - for test_files_tuple in test_files_tuples: - dataset_path, true_piis_path, country_file_path = test_files_tuple - country = get_country(country_file_path) - - print(f'RUNNING TEST FOR {dataset_path}.\nCountry {country}') - - #Import dataset - reading_status, reading_content = import_file(dataset_path) - - #Check if reading was succesful - if(reading_status is False): - return - - dataset = reading_content[DATASET] - label_dict = reading_content[LABEL_DICT] - value_label_dict = reading_content[VALUE_LABEL_DICT] - columns_still_to_check = [c for c in dataset.columns if c not in restricted_words_list.get_surveycto_restricted_vars()] - - #Search piis using all methods - all_piis_found = {} - - #Options - consider_locations_cols = 1 - search_pii_in_unstructured_text = 0 - - pii_candidates = find_piis_based_on_column_name(dataset, label_dict, value_label_dict, columns_still_to_check, consider_locations_cols) - all_piis_found.update(pii_candidates) - columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates] - log_and_print("Piis found using column names: "+",".join(pii_candidates.keys())) - - if(consider_locations_cols==0): - pii_candidates = find_piis_based_on_locations_population(dataset, label_dict, columns_still_to_check, country) - all_piis_found.update(pii_candidates) - columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates] - log_and_print("Piis found basen on locations with low population: "+",".join(pii_candidates.keys())) - - - pii_candidates = find_piis_based_on_column_format(dataset, label_dict, columns_still_to_check) - all_piis_found.update(pii_candidates) - columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates] - log_and_print("Piis found using column formats: "+",".join(pii_candidates.keys())) - - if search_pii_in_unstructured_text == 0: - pii_candidates_unstructured_text = None - column_with_unstructured_text = None - - pii_candidates = find_piis_based_on_sparse_entries(dataset, label_dict, columns_still_to_check) - all_piis_found.update(pii_candidates) - log_and_print("Piis based on sparse entries: "+",".join(pii_candidates.keys())) - - else: - pii_candidates_unstructured_text, column_with_unstructured_text = find_piis_unstructured_text(dataset, label_dict, columns_still_to_check, SPANISH, MEXICO) - - log_and_print("Piis found in unstructured text: "+",".join(pii_candidates_unstructured_text)) - log_and_print(len(pii_candidates_unstructured_text)) - - - #Create fake pii_candidate_to_action - pii_candidate_to_action = {} - for pii in pii_candidates: - pii_candidate_to_action[pii] = 'Drop' - - #Create deidentified dataset - create_anonymized_dataset(dataset, label_dict, dataset_path, pii_candidate_to_action, pii_candidates_unstructured_text, column_with_unstructured_text) - - #Now we check identified PIIs are the correct ones based on ground truth - reading_status, reading_content = import_file(true_piis_path) - if(reading_status is False): - return - true_piis_dataset = reading_content[DATASET] - true_piis = true_piis_dataset.iloc[:,0].to_list() - - #Announce wrongly detected ppis - print("THE FOLLOWING PIIS WERE WRONGLY DETECTED:") - wrongly_detected = [pii for pii in all_piis_found.keys() if pii not in true_piis] - print(wrongly_detected) - - #Announce missing piis - print("THE FOLLOWING PIIS WERE NOT DETECTED:") - not_detected = [pii for pii in true_piis if pii not in all_piis_found.keys()] - print(not_detected) - - - -if __name__ == "__main__": - run_tests() diff --git a/README.md b/README.md index 1656bac..db954b2 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,550 @@ -# PII Application +# PII Detector -### About -This application identifies likely PII (personally identifiable information) in a dataset. To use, download the .exe installer from the [latest release](https://github.com/PovertyAction/PII_detection/releases/latest) and follow the in-app directions. +A modern Python tool for identifying and handling personally identifiable information (PII) in datasets. -This tool is current listed as an alpha release because it is still being tested on IPA PII-containing field datasets. +## About -### How does it work? +This application identifies likely PII (personally identifiable information) in a dataset. To use: -There are a series of rules that are applied to a dataset's column to identify if a given column is a PII. Such rules are: +- **End users**: Download the .exe installer from the [latest release](https://github.com/PovertyAction/PII_detection/releases/latest) +- **Developers**: Use the modern Python package with `uv` for development -* If column name or label match with any word of the list of restricted words ( ex 'name', 'surname', 'ssn', etc; check restricted_words.py). The match could be strict or fuzzy. Check `find_piis_based_on_column_name()` in `PII_data_processory.py`. -* If entries in a given column have a specific format (at the moment checking phone number format and date format, we can expand to gps, national identifiers, etc). -Check `find_piis_based_on_column_format()` in `PII_data_processory.py`. -* If all entries in a given column are sufficiently sparse (almost all unique). Ideal to identify open ended questions. -Check `find_piis_based_on_sparse_entries()` in `PII_data_processory.py`. -* If columns with locations have any location with population under 20,000. Check `find_piis_based_on_locations_population()` in `PII_data_processory.py`. +This tool is currently in beta as it continues to be tested on IPA PII-containing field datasets. -Importantly, this is an arbitrary defined list of conditions, and for sure can be improved. Very open to feedback! +## Quick Start -Once the PIIs are identified, users have the opportunity to say what they would like to do with those columns. Options are: drop column, encode column or keep column. According to those instructions, a new de-identified dataset is created. Also, the system outputs a log .txt file and a .csv file that maps the new and encoded values. +### For End Users -### Finding PII in unstructured text +Download and run the latest installer from [GitHub Releases](https://github.com/PovertyAction/PII_detection/releases/latest). -The repo has code written to identify PII in text, and replace the PIIs for a 'xxxxxx' string. So, rather than flagging a whole column and dropping/encoding it, they user might prefer to replace the PII by this string and keep everything else. The code searches for PII based on classic common names of people and cities. This functionality is finished but super slow at the moment, so it is currently not enabled. +### For Developers -### Files included +```bash +# Clone the repository +git clone https://github.com/PovertyAction/PII_detection.git +cd PII_detection -#### Main files -* app_frontend.py: App GUI script using tkinter. -* PII_data_processor.py: App backend, it reads data files, identifies PIIs and creates new de-identified data files. -* find_piis_in_unstructed_text.py: Script used by PII_data_processor to particularly detect piis in unstructured text +# Set up development environment +just get-started -### Other utility files -* restricted_words.py: Script to get restricted words for PII identification -* constant_strings.py: Declares strings used across app. -* query_google_answer_boxes.py: Script to query locations and populations -* dist folder: Contains .exe file for execution -* hook-spacy.py: Dependency file needed when creating .exe +# Run the GUI application +just run-gui -### How to run +# Or use the CLI +just run-cli --help -`python app_frontend.py` +# For enhanced PII detection with Presidio (optional) +just install-presidio # Install with English small model +just install-presidio spanish md # Install with Spanish medium model +uv run python examples/presidio_demo.py # Test the installation -Remember to install dependencies mentioned in `requirements.txt`. +# For efficient batch processing of large datasets +just install-presidio-batch # Install with batch processing support +just run-batch-demo # Run batch processing demonstration +``` -### Distribution +## How it Works -#### To create executable app -`pyinstaller --windowed --icon=app_icon.ico --add-data="app_icon.ico;." --add-data="ipa_logo.jpg;." --add-data="anonymize_script_template_v2.do;." --additional-hooks-dir=. --hiddenimport srsly.msgpack.util --noconfirm app_frontend.py` +The PII detector uses multiple detection strategies to identify potential PII in dataset columns: -#### To create windows application installer -Compile `create_installer.iss` using Inno Setup Compiler -Reference: https://www.youtube.com/watch?v=RrpvNvklmFA https://www.youtube.com/watch?v=DTQ-atboQiI&t=135s +### Core Detection Methods -### Credit +1. **Column Name/Label Matching** - Matches column names against restricted word lists using strict or fuzzy matching + - Check `find_piis_based_on_column_name()` in `src/pii_detector/core/processor.py` + - Supports multiple languages (English, Spanish, Swahili) + - Includes domain-specific terms (SurveyCTO, medical, locations) -IPA's RT-DEG teams. +2. **Format Pattern Detection** - Identifies phone numbers, dates, and other formatted data + - Check `find_piis_based_on_column_format()` in `src/pii_detector/core/processor.py` + - Expandable to GPS coordinates, national identifiers, etc. -J-PAL: stata_PII_scan. 2020. https://github.com/J-PAL/stata_PII_scan +3. **Sparsity Analysis** - Flags columns where most values are unique (open-ended questions) + - Check `find_piis_based_on_sparse_entries()` in `src/pii_detector/core/processor.py` + - Ideal for identifying free-text name/address fields -J-PAL: PII-Scan. 2017. https://github.com/J-PAL/PII-Scan +4. **Location Population Analysis** - Identifies small locations (< 20,000 people) that may be PII + - Check `find_piis_based_on_locations_population()` in `src/pii_detector/core/processor.py` + - Uses external APIs for population lookups -### Licensing +### Enhanced Detection with Presidio (Optional) -The PII script is [MIT Licensed](https://github.com/PovertyAction/PII_detection/blob/master/LICENSE). +For improved accuracy, the tool integrates with Microsoft Presidio for ML-powered text analysis: + +5. **Advanced Text Content Analysis** - Uses machine learning models to detect PII within text content + - Check `src/pii_detector/core/presidio_engine.py` for Presidio integration + - Context-aware detection using spaCy NLP models + - Supports multiple languages with confidence scoring + - Detects names, emails, phone numbers, SSNs, addresses, and more within free text + +6. **Hybrid Detection** - Combines structural analysis with ML-based text analysis + - Check `src/pii_detector/core/unified_processor.py` for unified detection + - Confidence-weighted scoring from multiple detection methods + - Graceful degradation when Presidio is not available + +7. **Batch Processing** - Efficient processing for large datasets + - Check `src/pii_detector/core/batch_processor.py` for batch processing capabilities + - Chunked processing with parallel workers for improved performance + - Memory-efficient handling of large datasets + - Integration with presidio-structured for advanced tabular data processing + +### User Workflow + +1. Load your dataset (supports CSV, Excel, Stata formats) +2. Configure detection options (language, country, detection methods) +3. Review detected PII candidates +4. Choose actions for each column: **Drop**, **Encode**, or **Keep** +5. Export de-identified dataset, mapping files, and audit logs + +## Batch Processing Examples + +The tool includes efficient batch processing capabilities for large datasets. Here are practical examples using the included test data: + +### Basic Batch Processing + +```python +# Example 1: Analyze a single dataset with batch processing +import pandas as pd +from pii_detector.core.batch_processor import BatchPIIProcessor + +# Initialize batch processor +processor = BatchPIIProcessor( + chunk_size=1000, # Process 1000 rows at a time + max_workers=4 # Use 4 parallel workers +) + +# Load test data +dataset = pd.read_csv("tests/data/comprehensive_pii_data.csv") + +# Run batch detection +results = processor.detect_pii_batch(dataset) + +# View results +for column, result in results.items(): + print(f"{column}: {result.detection_method} (confidence: {result.confidence:.2f})") +``` + +### Complete Batch Workflow + +```python +# Example 2: Complete detection and anonymization workflow +from pii_detector.core.batch_processor import process_dataset_batch + +# Process dataset with progress tracking +def show_progress(percent, message): + print(f"Progress: {percent:.1f}% - {message}") + +dataset = pd.read_csv("tests/data/sample_pii_data.csv") + +# Run complete batch processing workflow +detection_results, anonymized_dataset, report = process_dataset_batch( + dataset, + language="en", + chunk_size=500, + max_workers=2, + progress_callback=show_progress +) + +print(f"Detected PII in {len(detection_results)} columns:") +for col, result in detection_results.items(): + print(f" - {col}: {result.detection_method}") + +print(f"\nAnonymization report:") +print(f" - Original shape: {report['original_shape']}") +print(f" - Final shape: {report['final_shape']}") +``` + +### DataFrame-Level Presidio Functions + +```python +# Example 3: Use DataFrame-level Presidio functions for text analysis +from pii_detector.core.presidio_engine import ( + presidio_analyze_dataframe_batch, + presidio_anonymize_dataframe_batch +) + +# Load dataset with rich text content +dataset = pd.read_csv("tests/data/comprehensive_pii_data.csv") + +# Analyze text columns for PII +analysis_results = presidio_analyze_dataframe_batch( + dataset, + text_columns=["full_name", "notes", "address"], + confidence_threshold=0.7, + sample_size=50 +) + +print("Presidio text analysis results:") +for col, result in analysis_results.items(): + entities = result.get('entities_found', {}) + print(f" {col}: {list(entities.keys())} ({result.get('total_detections', 0)} detections)") + +# Anonymize detected text columns +anonymized_df = presidio_anonymize_dataframe_batch( + dataset, + columns_to_anonymize=list(analysis_results.keys()) +) + +print("\nText anonymization complete!") +``` + +### Batch Processing Multiple Files + +```python +# Example 4: Process multiple test files in batch +import glob +from pathlib import Path + +# Process all CSV files in test data directory +csv_files = glob.glob("tests/data/*.csv") + +for file_path in csv_files: + print(f"\nProcessing: {Path(file_path).name}") + + try: + dataset = pd.read_csv(file_path) + + # Quick batch analysis + processor = BatchPIIProcessor(chunk_size=1000) + results = processor.detect_pii_batch(dataset) + + print(f" Dataset shape: {dataset.shape}") + print(f" PII columns found: {len(results)}") + + if results: + print(f" PII columns: {list(results.keys())}") + + except Exception as e: + print(f" Error: {e}") +``` + +### Performance Comparison + +```python +# Example 5: Compare processing strategies +from pii_detector.core.batch_processor import BatchPIIProcessor + +dataset = pd.read_csv("tests/data/comprehensive_pii_data.csv") + +# Create multiple copies to simulate larger dataset +large_dataset = pd.concat([dataset] * 100, ignore_index=True) +print(f"Large dataset shape: {large_dataset.shape}") + +processor = BatchPIIProcessor() + +# Get processing strategy recommendation +strategy = processor.get_processing_strategy(large_dataset) +print(f"Recommended strategy: {strategy}") + +# Get time estimates +estimates = processor.estimate_processing_time(large_dataset) +for strategy_name, estimate in estimates.items(): + print(f"{strategy_name}:") + print(f" Estimated time: {estimate['time_seconds']:.2f} seconds") + print(f" Memory usage: {estimate['memory_mb']:.1f} MB") + print(f" Recommended: {estimate['recommended']}") +``` + +### Test Data Files Description + +The `tests/data/` directory contains sample datasets for testing: + +- **`comprehensive_pii_data.csv`**: Rich dataset with multiple PII types (names, emails, SSNs, addresses, medical info, notes) +- **`sample_pii_data.csv`**: Basic PII dataset with standard identifiers +- **`clean_data.csv`**: Anonymized dataset with no PII (for testing clean data detection) +- **`qualitative_data.csv`**: Text-heavy data for testing Presidio text analysis +- **`test_data.csv`**: General test dataset + +### Command Line Usage (Future) + +```bash +# Once CLI is enhanced, these commands will work: + +# Analyze single file +pii-detector analyze tests/data/sample_pii_data.csv --presidio --output-format json + +# Batch process multiple files +pii-detector batch "tests/data/*.csv" --chunk-size 500 --workers 2 + +# Anonymize dataset +pii-detector anonymize tests/data/comprehensive_pii_data.csv --method presidio --output clean_data.csv +``` + +### Unstructured Text PII Detection + +The tool includes functionality to identify PII within text content and replace it with placeholder strings (e.g., 'XXXXXX'). This allows preserving most text content while removing personal identifiers. + +*Note: This feature is currently optimized for performance and may be disabled by default.* + +## Project Structure + +### Modern Python Package Layout + +```text +src/pii_detector/ +├── core/ # Core PII detection algorithms +│ ├── processor.py # Main data processing engine (legacy methods) +│ ├── text_analysis.py # Basic text PII detection +│ ├── presidio_engine.py # NEW: Microsoft Presidio ML-powered analysis +│ ├── unified_processor.py # NEW: Hybrid structural + ML detection +│ ├── hybrid_anonymizer.py # NEW: Advanced anonymization methods +│ ├── model_manager.py # NEW: Dynamic spaCy model management +│ ├── hash_utils.py # Basic hashing utilities +│ └── anonymization.py # Comprehensive anonymization techniques +├── data/ # Static data and configurations +│ ├── constants.py # Application constants +│ ├── restricted_words.py # Multi-language PII word lists +│ └── stopwords/ # Language-specific stopwords +├── gui/ # Graphical user interface +│ └── frontend.py # Modern tkinter application +├── cli/ # Command-line interface +│ └── main.py # CLI entry point +└── api/ # External API integrations + └── queries.py # Location/population lookup services +``` + +### Supporting Files + +- `assets/` - Application icons, logos, and PyInstaller hooks for spaCy/Presidio +- `examples/` - Demonstration scripts and usage examples +- `scripts/` - Utility scripts for model management and development +- `tests/` - Test suite with pytest +- `pyproject.toml` - Modern Python project configuration +- `Justfile` - Development workflow commands + +## Development + +### Requirements + +- Python 3.9+ +- [uv](https://docs.astral.sh/uv/) - Fast Python package manager +- [just](https://github.com/casey/just) - Command runner + +### Development Commands + +```bash +# Environment setup +just get-started # Complete development setup +just venv # Create virtual environment +just install-deps # Install dependencies + +# Running the application +just run-gui # Launch GUI interface +just run-gui-legacy # Launch Legacy (0.23.0) GUI interface built in TKinter +just run-cli # Launch CLI interface + +# Enhanced PII detection (optional) +just install-presidio # Install Presidio with English small model +just install-presidio spanish md # Install with Spanish medium model +just list-spacy-models # Show installed spaCy models +just manage-models list # Detailed model information +uv run python examples/presidio_demo.py # Test Presidio functionality + +# spaCy model management +just install-spacy-model en_core_web_md # Install specific model +just manage-models ensure en lg # Ensure English large model exists +just manage-models cleanup --keep en es # Remove unused models + +# Testing +just test # Run test suite (unit + integration) +uv run pytest tests/test_integration.py -v # Run integration tests only +uv run pytest tests/test_presidio_integration.py -v # Test Presidio integration +uv run pytest -m "slow" # Run slow tests (includes API calls) +uv run pytest -m "not slow" # Skip slow tests + +# Code quality +just fmt-all # Format and lint code +just pre-commit-run # Run all pre-commit hooks + +# Building and distribution +just build # Build Python package +just build-exe # Create Windows executable +just build-exe-presidio # Create executable with Presidio support +just create-installer # Generate Windows installer +``` + +### Test Data + +The project includes comprehensive test datasets for integration testing: + +- `tests/data/sample_pii_data.csv` - Dataset containing various PII types for testing detection algorithms +- `tests/data/clean_data.csv` - Clean dataset with minimal PII for testing false positive rates +- `tests/data/comprehensive_pii_data.csv` - Complex dataset with multiple PII types for anonymization testing +- `tests/data/qualitative_data.csv` - Text-based data for testing text anonymization techniques +- `tests/data/test_data.csv` - Simple dataset for basic functionality testing + +These datasets are used by the integration test suite to verify that PII detection and anonymization work correctly across different scenarios. + +### Anonymization Capabilities + +The system provides extensive anonymization techniques based on academic research and FSD guidelines: + +**Traditional Anonymization Methods:** + +- Variable removal and record suppression +- Hash-based and systematic pseudonymization +- Age, income, and geographic categorization +- Statistical noise addition and permutation +- K-anonymity enforcement +- Text pattern masking and redaction + +**Enhanced Anonymization with Presidio:** + +- Context-aware text anonymization using ML models +- Entity-specific replacement strategies +- Confidence-based anonymization decisions +- Multi-language text processing + +**Example Usage:** + +*Traditional Methods:* + +```python +from pii_detector.core.anonymization import AnonymizationTechniques + +anonymizer = AnonymizationTechniques() + +# Remove direct identifiers +clean_data = anonymizer.remove_variables(dataset, ['name', 'ssn', 'email']) + +# Categorize sensitive data +clean_data['age_group'] = anonymizer.age_categorization(dataset['age']) +clean_data['income_bracket'] = anonymizer.income_categorization(dataset['income']) + +# Apply k-anonymity +final_data = anonymizer.achieve_k_anonymity(clean_data, ['age_group', 'city'], k=3) +``` + +*Hybrid Anonymization with Presidio:* + +```python +from pii_detector.core.unified_processor import detect_pii_unified +from pii_detector.core.hybrid_anonymizer import anonymize_dataset_hybrid + +# Detect PII using hybrid methods +detection_results = detect_pii_unified(dataset, language="en") + +# Anonymize using both traditional and ML-based methods +anonymized_data, report = anonymize_dataset_hybrid(dataset, detection_results) +``` + +See `examples/anonymization_demo.py` and `examples/presidio_demo.py` for complete demonstrations. + +### spaCy Model Management + +The enhanced PII detection uses spaCy language models. The system automatically manages model installation: + +**Supported Languages:** + +- English (`en`): en_core_web_sm, en_core_web_md, en_core_web_lg +- Spanish (`es`): es_core_news_sm, es_core_news_md, es_core_news_lg +- German (`de`): de_core_news_sm, de_core_news_md, de_core_news_lg +- French (`fr`): fr_core_news_sm, fr_core_news_md, fr_core_news_lg +- And more... + +**Model Sizes:** + +- `sm` (small): ~15MB, fast, good accuracy +- `md` (medium): ~50MB, balanced speed/accuracy +- `lg` (large): ~750MB, best accuracy, slower + +**Management Commands:** + +```bash +# Check what's installed +just list-spacy-models + +# Install for specific language/size +just install-presidio german md + +# Advanced model management +just manage-models list # Detailed model info +just manage-models ensure spanish lg # Ensure model exists +just manage-models install en_core_web_lg # Install specific model +just manage-models cleanup --keep en es # Remove unused models +``` + +**Automatic Installation:** +The system automatically installs missing models when needed. No manual intervention required for basic usage. + +### Environment Variables + +For API integrations, set these optional environment variables: + +- `GEONAMES_USERNAME` - GeoNames API for location population lookups +- `FOREBEARS_API_KEY` - Forebears API for name validation +- `PII_HASH_SECRET_KEY` - Secret key for hashing (uses default if not set) + +## File Format Support + +The PII Detector supports reading and writing multiple file formats: + +- **CSV files** (`.csv`) - Universal comma-separated format +- **Excel files** (`.xlsx`, `.xls`) - Microsoft Excel formats +- **Stata files** (`.dta`) - Preserves variable labels and value labels, full round-trip support + +### Command Line Format Handling + +The CLI automatically detects input file formats and can preserve them in output: + +```bash +# Anonymize Stata file, output as Stata +pii-detector anonymize survey_data.dta --output clean_survey.dta + +# Batch process mixed formats, preserving original types +pii-detector batch "data/*" --output-dir results/ +# → .dta files → .dta output, .csv files → .csv output, etc. + +# Cross-format conversion supported +pii-detector anonymize data.dta --output data_clean.csv +``` + +## Distribution + +### For End Users (Windows Executable) + +```bash +# Create executable and installer +just build-exe +just create-installer + +# Output locations: +# - Executable: dist/ +# - Installer: compile create_installer.iss with Inno Setup +``` + +### For Python Package Distribution + +```bash +# Build package for PyPI +just build + +# Install locally in development mode +uv pip install -e . +``` + +## Contributing + +1. Fork the repository +2. Set up development environment: `just get-started` +3. Make your changes +4. Run tests and formatting: `just fmt-all && just test` +5. Submit a pull request + +## Credits + +**Development Team:** + +- IPA Global Research and Data Science Team + +**Inspiration:** + +- J-PAL: [stata_PII_scan](https://github.com/J-PAL/stata_PII_scan) (2020) +- J-PAL: [PII-Scan](https://github.com/J-PAL/PII-Scan) (2017) + +## License + +The PII Detector is [MIT Licensed](LICENSE). + +--- + +**Feedback Welcome!** Help us improve this tool by reporting issues or suggestions on [GitHub Issues](https://github.com/PovertyAction/PII_detection/issues). diff --git a/api_queries.py b/api_queries.py deleted file mode 100644 index fdbac7c..0000000 --- a/api_queries.py +++ /dev/null @@ -1,260 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.chrome.options import Options -import pandas as pd -from secret_keys import get_geonames_username, get_forebears_api_key -import requests -import json -from webdriver_manager.chrome import ChromeDriverManager - -from constant_strings import * - - -driver=None -def ask_google(query): - global driver - - if driver is None: - chrome_options = Options() - chrome_options.add_argument("--window-size=1024x768") - chrome_options.add_argument("--headless") - driver = webdriver.Chrome(ChromeDriverManager().install(),options=chrome_options) #executable_path=r'chromedriver.exe' - - # Search for query - query = query.replace(' ', '+') - - driver.get('http://www.google.com/search?q=' + query) - - # Get text from Google answer box - for different_answer_box_y_location in [230,350]: #Usually 230 is fine, but for searches that come with images (La Magdalena Contreras population for ex) 350 is better - answer = driver.execute_script("return document.elementFromPoint(arguments[0], arguments[1]);", - 350, different_answer_box_y_location).text - if answer != "": - return answer - - return False - -def get_country_iso_code(country_name): - - if country_name in COUNTRY_NAME_TO_ISO_CODE: - return COUNTRY_NAME_TO_ISO_CODE[country_name] - else: - return None - -def check_location_exists_and_population_size(location, country): - #https://www.geonames.org/export/geonames-search.html - - api_url = 'http://api.geonames.org/searchJSON?name='+location+'&name_equals='+location+'&maxRows=1&orderby=population&isNameRequired=true&username='+get_geonames_username() - country_iso = get_country_iso_code(country) - if country_iso: - api_url = api_url+'&country='+country_iso - - response = requests.get(api_url) - - if location == 'el.aire': - print(api_url) - print(response) - - response_json = json.loads(response.text) - - if 'totalResultsCount' in response_json and response_json['totalResultsCount'] > 0: - - if 'population' in response_json['geonames'][0] and response_json['geonames'][0]['population'] !=0: - # print("Location "+location+" exists and its population is "+str(response_json['geonames'][0]['population'])) - return True, response_json['geonames'][0]['population'] - else: - # print("Location "+location+" exists but we couldnt find population") - return True, False - else: - # print(location+" is NOT a location") - return False, False - -def get_population_from_google_query_result(query_result): - ''' - Get ready to receive populations in different formats, such as: - - 3,685\n2010 - 91,411 (2018) - 14,810,001 // New england - - - 17 million people - 1.655 million (2010) // Ecatepec de Morelos - ''' - - try: - clean_query_result = query_result - - #14,810,001 - clean_query_result = clean_query_result.replace(',','') - - #3685\n2010 - clean_query_result = clean_query_result.split("\n")[0] - - #1.655 million (2010) - if(" " in clean_query_result): - clean_query_result = " ".join(clean_query_result.split(" ")[:-1]) - - #1.655 million - #Replace '.' and million - if len(clean_query_result.split(" "))>1: - result = float(clean_query_result.split(" ")[0]) - multiplier = clean_query_result.split(" ")[1] - if multiplier == 'million': - result = result * 1000000 - - clean_query_result = result - - result = int(clean_query_result) - except Exception as e: - # print("problem paring query result to int") - # print(e) - # print(query_result) - return False - - return result - -def google_population(location): - #Query google - query_result = ask_google(location+" population") - - # print("Google query result: ") - # print(query_result) - - population = get_population_from_google_query_result(query_result) - if population: - # print("Googled population for "+location+" is "+str(population)) - return population - else: - # print("Could not google population for "+location) - return False - -def get_locations_with_low_population(locations, country, low_populations_threshold=20000, return_one=None, consider_low_population_if_unknown_population=False): - #Check which strings of locations correspond to locations whith low_populations - #If return_one is set to True, method returns first location with low population - #If consider_low_population_if_unknown_population is set to True, locations with unknown population will be labelled as low population (conservative approach) - - locations_with_low_population = [] - locations_with_unknown_population = [] - - # print("Locations to look at:") - # print(locations) - - for index, location in enumerate(locations): - if(index%50==0): - print(str(index)+'/'+str(len(locations))) - print(location) - - location_exists, population = check_location_exists_and_population_size(location, country) - if location_exists: - if not population: - population = google_population(location) - - if population: - print(f"Found a population for {location}") - if population < low_populations_threshold: - print(location+" is a location with LOW pop") - if return_one: - return location - else: - locations_with_low_population.append(location) - else: - #We know for sure now that we are indeed in a column with locations, given that for one of them we were able to get its population - - #We want to activate consider_low_population_if_unknown_population as long as we are sure that this column has locations (aka, we have already found at least one location and we were able to extract its population) - #We also add all locations found so far with unkwon population to the list of locations with low population - if consider_low_population_if_unknown_population is False: - locations_with_low_population.extend(locations_with_unknown_population) - consider_low_population_if_unknown_population = True - - - else: - #If the population is unknown, there are 2 possibilities. - #The first one is a conservative approach: a location with unkown population is considered to have low population - #The other is to discard them. This is useful for the case of columns that actually dont have locations, but some word might match a location - #For this second scenario, we will save all locations with unknown population, and if we happen to realize we are in the scenario of a column with locations, only then we will add them all to the list of location wit low populations. - if consider_low_population_if_unknown_population: - if return_one: - return location - else: - locations_with_low_population.append(location) - else: #We still dont know if we are in a column with locations - locations_with_unknown_population.append(location) - - - if return_one: - return False - else: - return locations_with_low_population - - - -#**************FOREBEARS API TO CHECK NAMES********* - -def generate_names_parameter_for_api(list_names, option): - #According to https://forebears.io/onograph/documentation/api/location/batch - - list_of_names_json=[] - for name in list_names: - list_of_names_json.append('{"name":"'+name+'","type":"'+option+'","limit":2}') - - names_parameter = '['+','.join(list_of_names_json)+']' - return names_parameter - -def get_names_from_json_response(response): - - names_found = [] - - json_response = json.loads(response) - - if "results" in json_response: - for result in json_response["results"]: - #Names that exist come with the field 'jurisdictions' - #We will also ask a minimum of 50 world incidences - if('jurisdictions' in result and len(result['jurisdictions'])>0): - try: - world_incidences = int(result['world']['incidence']) - - if world_incidences > 50: - names_found.append(result['name']) - except Exception as e: - print("error in get_names_from_json_response") - print(e) - print(result) - print(json_response["results"]) - else: - print("NO RESULTS IN RESPONSE") - print(json_response) - - return names_found - -def find_names_in_list_string(list_potential_names): - ''' - Uses https://forebears.io/onograph/documentation/api/location/batch to find names in list_potential_names - ''' - API_KEY = get_forebears_api_key() - - all_names_found = set() - - #Api calls must query at most 1,000 names. - n = 1000 - list_of_list_1000_potential_names = [list_potential_names[i:i + n] for i in range(0, len(list_potential_names), n)] - - for list_1000_potential_names in list_of_list_1000_potential_names: - #Need to 2 to API calls, one checking forenames and one checking surnames - for forename_or_surname in ['forename', 'surname']: - api_url = 'https://ono.4b.rs/v1/jurs?key='+API_KEY - - names_parameter = generate_names_parameter_for_api(list_1000_potential_names, forename_or_surname) - - - response = requests.post(api_url, data={'names':names_parameter}) - - - names_found = get_names_from_json_response(response.text) - for name in names_found: - all_names_found.add(name) - - #Opportunity of improvement: If i already found a name as a forename, dont query it as a surname - - return list(all_names_found) diff --git a/app_frontend.py b/app_frontend.py deleted file mode 100644 index 5dad19d..0000000 --- a/app_frontend.py +++ /dev/null @@ -1,767 +0,0 @@ -# Imports and Set-up -import sys -import tkinter as tk -from tkinter import ttk -from tkinter.filedialog import askopenfilename -from tkinter import messagebox -from PIL import ImageTk, Image -import webbrowser -import os -import requests - -import PII_data_processor - -from constant_strings import * - -intro_text = "This script is meant to assist in the detection of PII\ -(personally identifiable information) and subsequent removal from a dataset. \ -This is an alpha program, not fully tested yet." -intro_text_p2 = "You will first load a dataset that might contain PII variables. \ -The system will try to identify the PII candidates. \ -Please indicate if you would like to Drop, Encode or Keep them.\n\n\ -Once finished, you will be able to export a list of the PII detected, a do-file \ -to generate a deidentified dataset according to your options, and an already \ -deidentified dataset in case your input file is not a .dta\n\n\ -Please help improve the program by filling out the survey on your experience using it (Help -> Provide Feedback)." -version_number = "0.2.23" -app_title = "IPA's PII Detector - v"+version_number - -#Maps pii to action to do with them -pii_candidates_to_dropdown_element = {} - -#Dataset we are working with -dataset = None -dataset_path = None -new_file_path = None -label_dict = None - -find_piis_options={} - -window_width=None -window_height=None - -columns_where_to_replace_piis = None - -piis_in_text_box = None - -check_survey_cto_checkbutton_var = None -check_locations_pop_checkbutton_var = None -column_level_option_for_unstructured_text_checkbutton_var = None -keep_unstructured_text_option_checkbutton_var = None - -country_dropdown = None -language_dropdown = None - -piis_frame = None -anonymized_dataset_creation_frame = None -new_dataset_message_frame = None -do_file_message_frame = None - -pii_search_in_unstructured_text_enabled = False - -def display_title(title, frame_where_to_display): - label = ttk.Label(frame_where_to_display, text=title, wraplength=546, justify=tk.LEFT, font=("Calibri", 12, 'bold'), style='my.TLabel') - label.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - frame.update() - return label - -def display_message(the_message, frame_where_to_display): - label = ttk.Label(frame_where_to_display, text=the_message, wraplength=546, justify=tk.LEFT, font=("Calibri Italic", 11), style='my.TLabel') - label.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - frame.update() - return label - -def tkinter_display_title(title): - label = ttk.Label(frame, text=title, wraplength=546, justify=tk.LEFT, font=("Calibri", 12, 'bold'), style='my.TLabel') - label.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - frame.update() - return label - -def tkinter_display(the_message): - # the_message = datetime.now().strftime("%H:%M:%S") + ' ' + the_message - label = ttk.Label(frame, text=the_message, wraplength=546, justify=tk.LEFT, font=("Calibri Italic", 11), style='my.TLabel') - label.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - frame.update() - return label - -def display_pii_candidates(pii_candidates, label_dict, frame_where_to_display, default_dropdown_option="Drop"): - - #Automatic scroll up - canvas.yview_moveto( 0 ) - - #Create a frame for the pii labels and actions dropdown - #padx determines space between label and dropdown - pii_frame = tk.Frame(master=frame_where_to_display, bg="white") - pii_frame.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - #Add title to grid - ttk.Label(pii_frame, text='PII candidate', wraplength=546, justify=tk.LEFT, font=("Calibri", 11, 'bold'), style='my.TLabel').grid(row=0, column = 0, sticky = 'w', pady=(0,2)) - ttk.Label(pii_frame, text='Reason detected', wraplength=546, justify=tk.LEFT, font=("Calibri", 11, 'bold'), style='my.TLabel').grid(row=0, column = 1, sticky = 'w', pady=(0,2)) - ttk.Label(pii_frame, text='Desired action', wraplength=546, justify=tk.LEFT, font=("Calibri", 11, 'bold'), style='my.TLabel').grid(row=0, column = 2, sticky = 'w', padx=(5,0), pady=(0,2)) - - #Display a label for each pii candidate and save their action dropdown element in dictionary for future reference - for idx, (pii_candidate, reason_detected) in enumerate(pii_candidates.items()): - - #Given that in fist row of grid we have title of columns - idx=idx+1 - - #Add labels to pii candidates for better user understanding of column names - if label_dict and pii_candidate in label_dict and label_dict[pii_candidate]!="": - pii_candidate_label = pii_candidate + ": "+label_dict[pii_candidate]+"\t" - else: - pii_candidate_label = pii_candidate+"\t" - - ttk.Label(pii_frame, text=pii_candidate_label, wraplength=546, justify=tk.LEFT, font=("Calibri", 11), style='my.TLabel').grid(row=idx, column = 0, sticky = 'w', pady=(0,2)) - - ttk.Label(pii_frame, text=reason_detected+"\t", wraplength=546, justify=tk.LEFT, font=("Calibri", 11), style='my.TLabel').grid(row=idx, column = 1, sticky = 'w', pady=(0,2)) - - dropdown = tk.StringVar(pii_frame) - w = ttk.OptionMenu(pii_frame, dropdown, default_dropdown_option, "Drop", "Encode", "Keep", style='my.TMenubutton').grid(row=idx, column = 2, sticky = 'w', pady=(0,2)) - - pii_candidates_to_dropdown_element[pii_candidate] = dropdown - - frame.update() - - return pii_frame - -def do_file_created_message(creating_do_file_message): - creating_do_file_message.pack_forget() - - #Automatic scroll up - canvas.yview_moveto( 0 ) - - goodbye_frame = tk.Frame(master=frame, bg="white") - goodbye_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0)) - - do_file_message_frame = tk.Frame(master=anonymized_dataset_creation_frame, bg="white") - do_file_message_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0)) - - display_message("anonymize_script.do has been created and saved in the 'pii_detection_outputs' folder, in the same directory as the input file.\nYou will also find all_piis_identified.txt with a list of all the pii variables", do_file_message_frame) - display_goodby_message(do_file_message_frame) - -def display_goodby_message(goodbye_frame): - display_message("Do you want to work on a new file? Click File/Restart in the menu bar.", goodbye_frame) - - #Create a frame for the survey link - survey_frame = tk.Frame(master=goodbye_frame, bg="white") - survey_frame.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - survey_text = "Can you provide feedback to improve the app? Please click " - ttk.Label(survey_frame, text=survey_text, wraplength=546, justify=tk.LEFT, font=("Calibri Italic", 11), style='my.TLabel').grid(row=0, column = 0) - link = tk.Label(survey_frame, text="here", fg="blue", font=("Calibri Italic", 11), cursor="hand2", background='white') - link.grid(row = 0, column=1) - link.bind("", lambda e: open_survey()) - -def new_dataset_created_message(creating_dataset_message): - - creating_dataset_message.pack_forget() - - global new_dataset_message_frame - - new_dataset_message_frame = tk.Frame(master=anonymized_dataset_creation_frame, bg="white") - new_dataset_message_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0)) - - if(new_file_path): - display_message("The new dataset has been created and saved in the original file directory.\nYou will also find a log file describing the detection process.\nIf you encoded variables, you will find a .csv file that maps original to encoded values.\n", new_dataset_message_frame) - - #PENDING: ADD A BUTTOM TO FOLDER WITH OUTPUTS - - display_goodby_message(new_dataset_message_frame) - #Need this? - #frame.update() - -def remove_previous_dataset_do_file_message(): - global new_dataset_message_frame - global do_file_message_frame - - if new_dataset_message_frame is not None: - new_dataset_message_frame.pack_forget() - - if do_file_message_frame is not None: - do_file_message_frame.pack_forget() - -def create_do_file(): - remove_previous_dataset_do_file_message() - - creating_do_file_message = display_message("Creating .do file...", anonymized_dataset_creation_frame) - - #Create dictionary that maps pii_candidate_to_action based on value of dropdown elements - pii_candidates_to_action = create_pii_candidates_to_action() - - new_file_path = PII_data_processor.create_deidentifying_do_file(dataset_path, pii_candidates_to_action) - - do_file_created_message(creating_do_file_message) - -def create_anonymized_dataset_creation_frame(): - - #Scroll up - canvas.yview_moveto( 0 ) - - global anonymized_dataset_creation_frame - piis_frame.forget() - - anonymized_dataset_creation_frame = tk.Frame(master=frame, bg="white") - anonymized_dataset_creation_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0)) - - display_title('Decide how to export your deidentified dataset', anonymized_dataset_creation_frame) - - #If input is not .dta, users can either download deidentified dataset and download .do file for deidentificaiton. If its a .dta, only second option - if not PII_data_processor.input_file_is_dta(dataset_path): - display_message('You can either directly download a deidentified dataset, and/or download a .do file that creates the deidentified dataset', anonymized_dataset_creation_frame) - - create_dataset_button = ttk.Button(anonymized_dataset_creation_frame, text='Download deidentified dataset', command=create_anonymized_dataset, style='my.TButton') - create_dataset_button.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - create_do_file_button = ttk.Button(anonymized_dataset_creation_frame, text='Create .do file for deidentification', command=create_do_file, style='my.TButton') - create_do_file_button.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - frame.update() - - -def create_pii_candidates_to_action(): - - pii_candidates_to_action = {} - for pii, dropdown_elem in pii_candidates_to_dropdown_element.items(): - pii_candidates_to_action[pii] = dropdown_elem.get() - return pii_candidates_to_action - -def create_anonymized_dataset(): - - remove_previous_dataset_do_file_message() - - creating_dataset_message = display_message("Creating new dataset...", anonymized_dataset_creation_frame) - - #Automatic scroll down - canvas.yview_moveto( 1 ) - frame.update() - - global new_file_path - - #We create a new dictionary that maps pii_candidate_to_action based on value of dropdown elements - pii_candidates_to_action = create_pii_candidates_to_action() - - #Capture words to replace in unstructured text - if(pii_search_in_unstructured_text_enabled and keep_unstructured_text_option_checkbutton_var.get()==1): - piis_found_in_ustructured_text = [w.strip() for w in piis_in_text_box.get("1.0", "end").split(',')] - else: - piis_found_in_ustructured_text = None - - new_file_path = PII_data_processor.create_anonymized_dataset(dataset, label_dict, dataset_path, pii_candidates_to_action, columns_where_to_replace_piis, piis_found_in_ustructured_text) - - new_dataset_created_message(creating_dataset_message) - - - -def display_piis_found_in_ustructured_text(piis_found_in_ustructured_text, frame_where_to_display): - global piis_in_text_box - piis_in_text_box = tk.Text(frame_where_to_display, height=20, width=70) - piis_in_text_box.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - piis_in_text_box.insert(tk.END, ", ".join(piis_found_in_ustructured_text)) - return piis_in_text_box - - -def create_unstructured_piis_frame(next_search_method, next_search_method_button_text, piis_found_in_ustructured_text): - - piis_frame = tk.Frame(master=frame, bg="white") - piis_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0)) - - - display_title('PIIs found in unstructured text:', piis_frame) - display_message("These are the potential PIIs found in open ended questions and which will be replaced by 'XXXX' in the new de-identified dataset", piis_frame) - display_message("Feel free to remove from the list if you find wrongly identified PIIs, just keep words separated by commas.", piis_frame) - display_piis_found_in_ustructured_text(piis_found_in_ustructured_text, piis_frame) - - - #COPIED FROM create_piis_frame() - if(next_search_method is not None): - buttom_text = next_search_method_button_text - next_command = find_piis - else: - buttom_text = 'Create anonymized dataset and download .do files' - next_command = create_anonymized_dataset_creation_frame - - next_method_button = ttk.Button(piis_frame, text=buttom_text, command=next_command, style='my.TButton') - next_method_button.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - frame.update() - - return piis_frame - -def create_piis_frame(next_search_method, next_search_method_button_text, pii_candidates): - - global columns_still_to_check - - piis_frame = tk.Frame(master=frame, bg="white") - piis_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0)) - - - display_title('PII candidates found using '+search_method+':', piis_frame) - - if(len(pii_candidates)==0): - display_message('No PII candidates found.', piis_frame) - else: - #Create title, instructions, and display piis - display_message('For each PII candidate, select an action', piis_frame) - display_pii_candidates(pii_candidates, label_dict, piis_frame) - - #Update columns_still_to_check, removing pii candidates found - columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates] - - - if(next_search_method is not None): - buttom_text = next_search_method_button_text - next_command = find_piis - else: - buttom_text = 'Create anonymized dataset and download .do files' - next_command = create_anonymized_dataset_creation_frame - - next_method_button = ttk.Button(piis_frame, text=buttom_text, command=next_command, style='my.TButton') - next_method_button.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - frame.update() - - return piis_frame - -def find_piis(): - - global columns_still_to_check - global search_method - global next_search_method - global columns_where_to_replace_piis - global piis_frame - - #Update search method (considering find_piis() is recurrently called) - search_method = next_search_method - - #Add a 'Working on it...' message - if (search_method == COLUMNS_NAMES_SEARCH_METHOD): - display_message('Working on it...', first_view_frame) - else: - display_message('Working on it...', piis_frame) - #Scroll down - canvas.yview_moveto( 1 ) - frame.update() - - - #Figure out what method for finding pii to use - if (search_method == COLUMNS_NAMES_SEARCH_METHOD): - - #Check if surveyCTO vars should be considered - if(check_survey_cto_checkbutton_var.get()==0): - columns_still_to_check = [column for column in dataset.columns if column not in PII_data_processor.get_surveycto_restricted_vars()] - else: - columns_still_to_check = dataset.columns - - #Find piis basen on column names - #If we are not checking locations populations, then we do include locations column in the next search - consider_locations_col = 1 if check_locations_pop_checkbutton_var.get()==0 else 0 - - pii_candidates = PII_data_processor.find_piis_based_on_column_name(dataset, label_dict, value_label_dict, columns_still_to_check, consider_locations_col) - - #Indicate next search method - if(check_locations_pop_checkbutton_var.get()==1): - next_search_method_button_text = "Continue: Find columns with potential PIIs for columns with locations" - next_search_method = LOCATIONS_POPULATIONS_SEARCH_METHOD - else: - next_search_method_button_text = "Continue: Find columns with potential PIIs based on columns format" - next_search_method = COLUMNS_FORMAT_SEARCH_METHOD - - elif(search_method == LOCATIONS_POPULATIONS_SEARCH_METHOD): - pii_candidates = PII_data_processor.find_piis_based_on_locations_population(dataset, label_dict, columns_still_to_check, country_dropdown.get()) - next_search_method_button_text = "Continue: Find columns with potential PIIs based on columns format" - next_search_method = COLUMNS_FORMAT_SEARCH_METHOD - - elif(search_method == COLUMNS_FORMAT_SEARCH_METHOD): - pii_candidates = PII_data_processor.find_piis_based_on_column_format(dataset, label_dict, columns_still_to_check) - - if (not pii_search_in_unstructured_text_enabled or column_level_option_for_unstructured_text_checkbutton_var.get()==1): - next_search_method_button_text = "Continue: Find columns with potential PIIs based on sparse entries" - next_search_method = SPARSE_ENTRIES_SEARCH_METHOD - else: - next_search_method_button_text = "Continue: Find PIIs in open ended questions" - next_search_method = UNSTRUCTURED_TEXT_SEARCH_METHOD - - elif(search_method == SPARSE_ENTRIES_SEARCH_METHOD): - pii_candidates = PII_data_processor.find_piis_based_on_sparse_entries(dataset, label_dict, columns_still_to_check) - next_search_method_button_text = "Create anonymized dataset" - next_search_method = None - - elif(search_method == UNSTRUCTURED_TEXT_SEARCH_METHOD): - piis_found_in_ustructured_text, columns_where_to_replace_piis = PII_data_processor.find_piis_unstructured_text(dataset, label_dict, columns_still_to_check, language_dropdown.get(), country_dropdown.get()) - next_search_method_button_text = "Create anonymized dataset" - next_search_method = None - pii_candidates = None - - - #UPDATE VIEW - - #Remove previous view - if (search_method == COLUMNS_NAMES_SEARCH_METHOD): - first_view_frame.pack_forget() - else: - piis_frame.pack_forget() - - #Create new frame - if(search_method != UNSTRUCTURED_TEXT_SEARCH_METHOD): - piis_frame = create_piis_frame(pii_candidates=pii_candidates, next_search_method=next_search_method, next_search_method_button_text=next_search_method_button_text) - else: - piis_frame = create_unstructured_piis_frame(piis_found_in_ustructured_text=piis_found_in_ustructured_text, next_search_method=next_search_method, next_search_method_button_text=next_search_method_button_text) - -def restart_program(): - """Restarts the current program. - Note: this function does not return. Any cleanup action (like - saving data) must be done before calling this function.""" - python = tk.sys.executable - os.execl(python, python, * tk.sys.argv) - -def window_setup(master): - - global window_width - global window_height - - #Add window title - master.title(app_title) - - #Add window icon - if hasattr(sys, "_MEIPASS"): - icon_location = os.path.join(sys._MEIPASS, 'app_icon.ico') - else: - icon_location = 'app_icon.ico' - master.iconbitmap(icon_location) - - #Set window position and max size - window_width, window_height = master.winfo_screenwidth(), master.winfo_screenheight() - # master.geometry("%dx%d+0+0" % (window_width, window_height)) - master.state('zoomed') - - - #Make window reziable - master.resizable(True, True) - -def open_survey(): - webbrowser.open('https://docs.google.com/forms/d/e/1FAIpQLSfxB_pnReUd0EvFfQxPu5JI9oRGCpDgULWkTeDHYoqx8x7q-Q/viewform') - -def menubar_setup(root): - - def about(): - webbrowser.open('https://github.com/PovertyAction/PII_detection/blob/master/README.md#pii_detection') - - def contact(): - webbrowser.open('https://github.com/PovertyAction/PII_detection/issues') - - def article(): - webbrowser.open('https://povertyaction.force.com/support/s/article/IPAs-Personally-Identifiable-Information-Application') - - def comparison(): - webbrowser.open('https://ipastorage.box.com/s/35jbvflnt6e4ev868290c3hygubofz2r') - - def PII_field_names(): - webbrowser.open('https://github.com/PovertyAction/PII_detection/blob/fa1325094ecdd085864a58374d9f687181ac09fd/PII_data_processor.py#L115') - - - - menubar = tk.Menu(root) - - # Create file menu pulldown - filemenu = tk.Menu(menubar, tearoff=0) - menubar.add_cascade(label="File", menu=filemenu) - - # Add commands to filemenu menu - filemenu.add_command(label="Restart", command=restart_program) - filemenu.add_separator() - filemenu.add_command(label="Exit", command=root.quit) - - # Create help menu pulldown - helpmenu = tk.Menu(menubar, tearoff=0) - menubar.add_cascade(label="Help", menu=helpmenu) - - # Add commands to help menu - helpmenu.add_command(label="About", command=about) - # helpmenu.add_command(label="- Knowledge Article", command=article) - # helpmenu.add_command(label="- Comparison with Other Scripts", command=comparison) - #helpmenu.add_command(label="- PII Field Names", command=PII_field_names) - #helpmenu.add_command(label="- Data Security", command=PII_field_names) - helpmenu.add_separator() - helpmenu.add_command(label="File Issue on GitHub", command=contact) - # helpmenu.add_separator() - #helpmenu.add_command(label="Contribute", command=contact) - helpmenu.add_command(label="Provide Feedback", command=open_survey) - - # Add menu bar to window - root.configure(menu=menubar) - -def window_style_setup(root): - root.style = ttk.Style() - # # root.style.theme_use("clam") # ('winnative', 'clam', 'alt', 'default', 'classic', 'vista', 'xpnative') - root.style.configure('my.TButton', font=("Calibri", 11, 'bold'), background='white') - root.style.configure('my.TLabel', background='white') - root.style.configure('my.TCheckbutton', background='white') - root.style.configure('my.TMenubutton', background='white') - -def add_scrollbar(root, canvas, frame): - - #Configure frame to recognize scrollregion - def onFrameConfigure(canvas): - '''Reset the scroll region to encompass the inner frame''' - canvas.configure(scrollregion=canvas.bbox("all")) - - frame.bind("", lambda event, canvas=canvas: onFrameConfigure(canvas)) - - def onMouseWheel(canvas, event): - canvas.yview_scroll(int(-1*(event.delta/120)), "units") - - #Bind mousewheel to scrollbar - frame.bind_all("", lambda event, canvas=canvas: onMouseWheel(canvas, event)) - - - #Create scrollbar - vsb = tk.Scrollbar(root, orient="vertical", command=canvas.yview) - canvas.configure(yscrollcommand=vsb.set) - vsb.pack(side="right", fill="y") - - -def create_first_view_page(internet_connection): - - global check_survey_cto_checkbutton_var - global check_locations_pop_checkbutton_var - global column_level_option_for_unstructured_text_checkbutton_var - global keep_unstructured_text_option_checkbutton_var - - global country_dropdown - global language_dropdown - - first_view_frame = tk.Frame(master=frame, bg="white") - first_view_frame.pack(anchor='nw', padx=(0, 0), pady=(0, 0))#padx=(30, 30), pady=(0, 5)) - - #Add intro text - intro_text_1_label = ttk.Label(first_view_frame, text=intro_text, wraplength=746, justify=tk.LEFT, font=("Calibri", 11), style='my.TLabel') - intro_text_1_label.pack(anchor='nw', padx=(30, 30), pady=(0, 12)) - - intro_text_2_label = ttk.Label(first_view_frame, text=intro_text_p2, wraplength=746, justify=tk.LEFT, font=("Calibri", 11), style='my.TLabel') - intro_text_2_label.pack(anchor='nw', padx=(30, 30), pady=(0, 12)) - - #Labels and checkbox for settings - settings_label = ttk.Label(first_view_frame, text="Settings:", wraplength=546, justify=tk.LEFT, font=("Calibri", 12, 'bold'), style='my.TLabel') - settings_label.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - if pii_search_in_unstructured_text_enabled: - #Create a frame for the language selection - language_frame = tk.Frame(master=first_view_frame, bg="white") - language_frame.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - ttk.Label(language_frame, text='In which language are the answers in the dataset?', wraplength=546, justify=tk.LEFT, font=("Calibri", 10), style='my.TLabel').grid(row=0, column = 0, sticky = 'w', pady=(0,2)) - - language_dropdown = tk.StringVar(language_frame) - w = ttk.OptionMenu(language_frame, language_dropdown, SPANISH, ENGLISH, SPANISH, OTHER, style='my.TMenubutton').grid(row=0, column = 1, sticky = 'w', pady=(0,2)) - - #Create a frame for country selection - country_frame = tk.Frame(master=first_view_frame, bg="white") - country_frame.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - ttk.Label(country_frame, text='In which country was this survey run?', wraplength=546, justify=tk.LEFT, font=("Calibri", 10), style='my.TLabel').grid(row=0, column = 0, sticky = 'w', pady=(0,2)) - - country_dropdown = tk.StringVar(country_frame) - w = ttk.OptionMenu(country_frame, country_dropdown, MEXICO, *ALL_COUNTRIES, OTHER, style='my.TMenubutton').grid(row=0, column = 1, sticky = 'w', pady=(0,2)) - - #Labels and checkbox for options - options_label = ttk.Label(first_view_frame, text="Options:", wraplength=546, justify=tk.LEFT, font=("Calibri", 12, 'bold'), style='my.TLabel') - options_label.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - #SurveyCTO vars option - check_survey_cto_checkbutton_var = tk.IntVar() - check_survey_cto_checkbutton = tk.Checkbutton(first_view_frame, text="Consider surveyCTO variables for PII detection (ex: 'deviceid', 'subscriberid', 'simid', 'duration','starttime').", - bg="white", - activebackground="white", - variable=check_survey_cto_checkbutton_var, - onvalue=1, offvalue=0) - check_survey_cto_checkbutton.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - #Check locations population option - check_locations_pop_checkbutton_var = tk.IntVar() - check_locations_pop_checkbutton = tk.Checkbutton(first_view_frame, text="Flag locations columns (ex: Village) as PII only if population of a location is under 20,000 [Default is to flag all locations columns].", - bg="white", - activebackground="white", - variable=check_locations_pop_checkbutton_var, - onvalue=1, - offvalue=0) - - if internet_connection: - check_locations_pop_checkbutton.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - - if pii_search_in_unstructured_text_enabled: - - #Option related to unstructured text - unstructured_text_label = ttk.Label(first_view_frame, text="What would you like to do respect to searching PIIs in open ended questions (unstructured text)?", wraplength=546, justify=tk.LEFT, font=("Calibri Italic", 10), style='my.TLabel') - if internet_connection: - unstructured_text_label.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - def column_level_option_for_unstructured_text_checkbutton_command(): - - #If both are now off, reselect this one - if(column_level_option_for_unstructured_text_checkbutton_var.get()==0 and keep_unstructured_text_option_checkbutton_var.get()==0): - messagebox.showinfo("Error", "You must have one option selected") - column_level_option_for_unstructured_text_checkbutton_var.set(True) - - #If the other one is on, turn it off. - if(column_level_option_for_unstructured_text_checkbutton_var.get()==1 and keep_unstructured_text_option_checkbutton_var.get()==1): - keep_unstructured_text_option_checkbutton.deselect() - - - column_level_option_for_unstructured_text_checkbutton_var = tk.IntVar(value=1) - column_level_option_for_unstructured_text_checkbutton_text = "Identify open ended questions and choose what to do with them at the column level (either drop or keep the whole column)" - column_level_option_for_unstructured_text_checkbutton = tk.Checkbutton(first_view_frame, - text=column_level_option_for_unstructured_text_checkbutton_text, - bg="white", - activebackground="white", - variable=column_level_option_for_unstructured_text_checkbutton_var, - onvalue=1, - offvalue=0, - command = column_level_option_for_unstructured_text_checkbutton_command) - - if internet_connection: - column_level_option_for_unstructured_text_checkbutton.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - def keep_unstructured_text_option_checkbutton_command(): - - #If both are now off, reselect this one - if(column_level_option_for_unstructured_text_checkbutton_var.get()==0 and keep_unstructured_text_option_checkbutton_var.get()==0): - messagebox.showinfo("Error", "You must have one option selected") - keep_unstructured_text_option_checkbutton_var.set(True) - - else:#Disable other option - column_level_option_for_unstructured_text_checkbutton.deselect() - - - keep_unstructured_text_option_checkbutton_var = tk.IntVar(value=0) - keep_unstructured_text_option_checkbutton_text = "Keep columns with open ended questions, but replace any PIIs found on them with a 'XXXX' string [Slow process, use only if ryou really need to keep unstructured text]" - keep_unstructured_text_option_checkbutton = tk.Checkbutton(first_view_frame, - text=keep_unstructured_text_option_checkbutton_text, - bg="white", - activebackground="white", - variable=keep_unstructured_text_option_checkbutton_var, - onvalue=1, - offvalue=0, - command=keep_unstructured_text_option_checkbutton_command) - - if internet_connection: - keep_unstructured_text_option_checkbutton.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - - def import_file(): - - global dataset - global dataset_path - global label_dict - global value_label_dict - global next_search_method - global columns_still_to_check - - dataset_path = askopenfilename() - - #If no file was selected, do nothing - if not dataset_path: - return - - display_message("Importing file...", first_view_frame) - - #Scroll down - canvas.yview_moveto( 1 ) - frame.update() - - #Read file - reading_status, reading_content = PII_data_processor.import_file(dataset_path) - - if(reading_status is False): - display_message(reading_content[ERROR_MESSAGE], first_view_frame) - return - else: - display_message("Success reading file: "+dataset_path, first_view_frame) - dataset = reading_content[DATASET] - label_dict = reading_content[LABEL_DICT] - value_label_dict = reading_content[VALUE_LABEL_DICT] - columns_still_to_check = dataset.columns - - #Creat bottom to find piis based on columns names - next_search_method = COLUMNS_NAMES_SEARCH_METHOD - buttom_text = "Find PIIs!" - - find_piis_next_step_button = ttk.Button(first_view_frame, text=buttom_text, command=find_piis, style='my.TButton') - find_piis_next_step_button.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - #Scroll down - frame.update() - canvas.yview_moveto( 1 ) - - #Labels and buttoms to run app - start_application_label = ttk.Label(first_view_frame, text="Run application: ", wraplength=546, justify=tk.LEFT, font=("Calibri", 12, 'bold'), style='my.TLabel') - start_application_label.pack(anchor='nw', padx=(30, 30), pady=(0, 10)) - - select_dataset_button = ttk.Button(first_view_frame, text="Select Dataset", command=import_file, style='my.TButton') - select_dataset_button.pack(anchor='nw', padx=(30, 30), pady=(0, 5)) - - - print(f'Internet connection: {internet_connection}') - if(internet_connection is False): - messagebox.showinfo("Message", "No internet connection, some features are diabled") - - return first_view_frame - -def check_for_updates(): - if internet_connection: - #Get version of latest release - response = requests.get("https://api.github.com/repos/PovertyAction/PII_detection/releases/latest") - latest_version = response.json()["tag_name"] - - #Case it has a v before version number, remove it - latest_version = latest_version.replace("v","") - - #Check if this version_number is different to latest - if version_number != latest_version: - - messagebox.showinfo("Message", "Version "+latest_version+ " is available. You can uninstall this version from Control Panel and download latest from https://github.com/PovertyAction/PII_detection/releases/latest") - -if __name__ == '__main__': - - #Check internet connection - internet_connection = PII_data_processor.internet_on() - - # Create GUI window - root = tk.Tk() - - window_setup(root) - - menubar_setup(root) - - window_style_setup(root) - - # Create canvas where app will displayed - canvas = tk.Canvas(root, width=window_width, height=window_height, bg="white") - canvas.pack(side="left", fill="both", expand=True) - - # Create main frame inside canvas - frame = tk.Frame(canvas, width=window_width, height=window_height, bg="white") - frame.pack(side="left", fill="both", expand=True) - - #Add scrollbar - canvas.create_window(0,0, window=frame, anchor="nw") - add_scrollbar(root, canvas, frame) - - #Add logo - if hasattr(tk.sys, "_MEIPASS"): - logo_location = os.path.join(sys._MEIPASS, 'ipa_logo.jpg') - else: - logo_location = 'ipa_logo.jpg' - logo = ImageTk.PhotoImage(Image.open(logo_location).resize((147, 71), Image.ANTIALIAS)) # Source is 2940 x 1416 - tk.Label(frame, image=logo, borderwidth=0).pack(anchor="nw", padx=(30, 30), pady=(30, 0)) - - #Add app title - app_title_label = ttk.Label(frame, text=app_title, wraplength=536, justify=tk.LEFT, font=("Calibri", 13, 'bold'), style='my.TLabel') - app_title_label.pack(anchor='nw', padx=(30, 30), pady=(30, 10)) - - #Create first view page - first_view_frame = create_first_view_page(internet_connection) - - #Check for updates of this program - check_for_updates() - - # Constantly looping event listener - root.mainloop() diff --git a/app_icon.ico b/app_icon.ico deleted file mode 100644 index c7e2d9b..0000000 Binary files a/app_icon.ico and /dev/null differ diff --git a/anonymize_script_template_v2.do b/assets/anonymize_script_template_v2.do similarity index 98% rename from anonymize_script_template_v2.do rename to assets/anonymize_script_template_v2.do index 29638f3..f26b6a0 100644 --- a/anonymize_script_template_v2.do +++ b/assets/anonymize_script_template_v2.do @@ -3,7 +3,7 @@ ** ** PURPOSE: This do file anonymizes datasets based on users instruction ** -** NOTES: This scrpit was automatically generated by IPA's PII detector app. For details, check https://github.com/PovertyAction/PII_detection +** NOTES: This script was automatically generated by IPA's PII detector app. For details, check https://github.com/PovertyAction/PII_detection ** ** AUTHOR: PII Detector app ** @@ -145,7 +145,7 @@ mata: w[i] = leftrotate(w[i],1) } - // initalize hash + // initialize hash a = h0 b = h1 c = h2 @@ -387,7 +387,7 @@ end **D. Hash an ID /* - We hash the vairable using the hashfunction. It requires inputs as + We hash the variable using the hashfunction. It requires inputs as strings, so variables are treated differently based on storage format: (1) String - No change @@ -400,7 +400,7 @@ end *Save label loc varlab : variable label `var' - + * Convert format to string based on value label cap confirm string variable `var' // Ensure only string variables to hash if _rc & "`: value label `var''" != "" { // Decode labeled variables @@ -415,7 +415,7 @@ end label var `var' "`varlab'" } else if _rc & "`: value label `var''" == "" { // Encode labeled variables - tostring `var', replace usedisplayformat + tostring `var', replace usedisplayformat } *Create tempvar diff --git a/assets/app-icon.ico b/assets/app-icon.ico new file mode 100644 index 0000000..9947473 Binary files /dev/null and b/assets/app-icon.ico differ diff --git a/assets/datasure_logo.svg b/assets/datasure_logo.svg new file mode 100644 index 0000000..29a0ea0 --- /dev/null +++ b/assets/datasure_logo.svg @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/design/CLI_IMPLEMENTATION_PLAN.md b/assets/design/CLI_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..9425d67 --- /dev/null +++ b/assets/design/CLI_IMPLEMENTATION_PLAN.md @@ -0,0 +1,168 @@ +# CLI & TUI Implementation Plan + +## Phase 1: Fix Current CLI (1-2 weeks) + +### Immediate Issues to Fix + +1. **Default Behavior**: Remove auto-GUI launch when no args provided +2. **Add Missing Features**: Integrate batch processing, anonymization +3. **Better Argument Structure**: Subcommands for different operations +4. **Output Formats**: JSON, CSV, TSV options + +### Enhanced CLI Structure + +```bash +# Main commands +pii-detector analyze [FILE] [OPTIONS] # Detect PII +pii-detector anonymize [FILE] [OPTIONS] # Anonymize data +pii-detector batch [PATTERN] [OPTIONS] # Batch processing +pii-detector report [FILE] [OPTIONS] # Generate reports + +# Global options +--output-format {json,csv,table,quiet} # Output format +--config [CONFIG_FILE] # Configuration file +--verbose, -v # Verbose output +--quiet, -q # Minimal output + +# Analysis options +--presidio # Enable Presidio +--no-location-check # Disable location API +--confidence-threshold FLOAT # Minimum confidence +--sample-size INT # Sample size for text analysis + +# Batch options +--chunk-size INT # Batch chunk size +--workers INT # Parallel workers +--resume # Resume interrupted batch + +# Anonymization options +--method {hash,remove,categorize,presidio} # Anonymization method +--preserve-structure # Keep original structure +``` + +### Implementation Files + +- `src/pii_detector/cli/commands/analyze.py` - Analysis command +- `src/pii_detector/cli/commands/anonymize.py` - Anonymization command +- `src/pii_detector/cli/commands/batch.py` - Batch processing command +- `src/pii_detector/cli/config.py` - Configuration handling +- `src/pii_detector/cli/output.py` - Output formatting + +## Phase 2: Add TUI with Textual (2-3 weeks) + +### TUI Features + +1. **File Selection**: Browse and select files +2. **Data Preview**: Show dataset structure and sample data +3. **Configuration Forms**: Interactive settings +4. **Progress Tracking**: Real-time processing feedback +5. **Results Review**: Browse and filter results +6. **Export Options**: Save reports and anonymized data + +### TUI Components + +- `src/pii_detector/tui/app.py` - Main Textual application +- `src/pii_detector/tui/widgets/` - Custom widgets +- `src/pii_detector/tui/screens/` - Different screens (analysis, config, results) + +### Key TUI Screens + +1. **Welcome Screen**: File selection and recent files +2. **Configuration Screen**: Detection and anonymization settings +3. **Analysis Screen**: Real-time processing with progress +4. **Results Screen**: Tabular view of PII detections +5. **Export Screen**: Output options and file selection + +## Phase 3: Integration & Polish (1 week) + +### Hybrid Mode Logic + +```python +def determine_interface_mode(args): + """Intelligently choose CLI vs TUI mode.""" + if args.tui: + return "tui" + elif args.file or args.batch or not sys.stdin.isatty(): + return "cli" + elif args.gui: + return "gui" + else: + return "tui" # Default to TUI for interactive use +``` + +### Testing Strategy + +- Unit tests for CLI commands +- Integration tests for batch processing +- Manual testing for TUI interactions +- Cross-platform compatibility testing + +## Pros and Cons Analysis + +### Enhanced CLI Only + +**Pros:** + +- Zero new dependencies +- Excellent for automation +- Universal compatibility +- Fast development +- Pipe-friendly + +**Cons:** + +- Less user-friendly for complex tasks +- No interactive data preview +- Harder to configure visually + +### TUI Addition + +**Pros:** + +- Best user experience for interactive use +- Visual data preview and configuration +- Modern, attractive interface +- Guided workflows + +**Cons:** + +- Additional dependency (Textual ~2MB) +- More development time +- Terminal compatibility considerations +- Less scriptable + +### Hybrid Approach (Recommended) + +**Pros:** + +- ✅ Best of both worlds +- ✅ CLI for scripting, TUI for interactive use +- ✅ Intelligent mode detection +- ✅ Covers all use cases + +**Cons:** + +- ❌ More code to maintain +- ❌ Longer development time +- ❌ Need to keep both interfaces in sync + +## Recommendation: Hybrid Implementation + +1. **Start with Enhanced CLI** - Fix immediate issues, add missing features +2. **Add TUI Later** - Implement Textual interface for interactive use +3. **Intelligent Defaults** - Auto-detect when to use CLI vs TUI vs GUI + +This approach provides: + +- **Immediate value** with enhanced CLI +- **Future user experience** improvements with TUI +- **Flexibility** for all types of users (scriptable CLI, interactive TUI, visual GUI) + +### Development Priority + +1. Fix current CLI default behavior +2. Add batch processing and anonymization commands +3. Implement JSON/CSV output formats +4. Add configuration file support +5. Create TUI interface with Textual +6. Add intelligent mode detection diff --git a/assets/design/DESIGN_DOC.md b/assets/design/DESIGN_DOC.md new file mode 100644 index 0000000..c397116 --- /dev/null +++ b/assets/design/DESIGN_DOC.md @@ -0,0 +1,296 @@ +# PII Detector Desktop Application Design Brief + +**Version:** 2.0 +**Date:** September 2025 +**Target Platform:** Flet + Flutter Desktop Application + +## Product Vision + +Design a **professional desktop application** that enables researchers, data analysts, and compliance officers to **safely detect and anonymize PII in research datasets**. The application must feel trustworthy, efficient, and guide users through complex data privacy workflows with confidence. + +## Core User Problem + +Researchers have sensitive datasets containing personally identifiable information (PII) that must be anonymized before sharing, publication, or analysis. Current solutions are either too technical (command-line tools) or too basic (simple find-and-replace). Users need a **desktop application that intelligently detects PII and provides flexible, research-grade anonymization options**. + +## Target Users + +1. **Research Data Analysts** - Process survey data, need batch capabilities, value accuracy +2. **Graduate Students** - Clean thesis datasets, often work with Stata files, need guidance +3. **IRB Compliance Officers** - Audit data safety, require detailed reporting and audit trails + +--- + +## Technology Decision: Flet + Flutter (100% Python) + +**Why Flet + Flutter is the Right Choice:** + +- ✅ **Keep 100% Python codebase** - No JavaScript/TypeScript learning curve +- ✅ **Modern Flutter UI** - Beautiful Material Design widgets and animations +- ✅ **Native desktop performance** - Compiled Flutter engine, not web wrapper +- ✅ **Real-time reactive updates** - Built-in state management for progress tracking +- ✅ **Rich widget ecosystem** - Charts, data tables, progress indicators out-of-the-box +- ✅ **Simple deployment** - Single executable like current PyInstaller solution +- ✅ **Future-proof** - Can easily extend to web and mobile from same codebase + +--- + +## Application Design Requirements + +### 1. Core User Workflows + +**Primary Workflow - Single File Analysis:** + +1. **File Selection** → 2. **Detection Configuration** → 3. **Analysis Progress** → 4. **Results Review** → 5. **Export Options** + +**Secondary Workflow - Batch Processing:** + +1. **Multi-File Selection** → 2. **Batch Configuration** → 3. **Processing Monitor** → 4. **Results Dashboard** → 5. **Bulk Export** + +### 2. Key Screen Layouts + +#### Dashboard (Landing Page) + +```text +┌─────────────────────────────────────────┐ +│ PII Detector v3.0 [Settings] │ +├─────────────────────────────────────────┤ +│ Quick Actions │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ Single │ │ Batch │ │ Recent │ │ +│ │ Analysis │ │ Process │ │ Projects │ │ +│ │ [Icon] │ │ [Icon] │ │ [List] │ │ +│ └──────────┘ └──────────┘ └──────────┘ │ +├─────────────────────────────────────────┤ +│ System Status │ +│ • Detection Methods: ✅ Standard ✅ AI │ +│ • Last Processing: 3 files, 10 min ago │ +│ • Performance: All systems active │ +└─────────────────────────────────────────┘ +``` + +#### File Selection Component + +```text +┌─────────────────────────────────────────┐ +│ Select Dataset Files │ +│ ┌─────────────────────────────────────┐ │ +│ │ 📁 Drag files here │ │ +│ │ or click to browse │ │ +│ │ │ │ +│ │ Supports: .csv .xlsx .dta (100MB) │ │ +│ └─────────────────────────────────────┘ │ +│ │ +│ Selected Files: │ +│ ✓ survey_data.csv (2.1MB) │ +│ ✓ responses.dta (5.8MB) │ +│ [Clear All] [Add More] │ +└─────────────────────────────────────────┘ +``` + +#### Detection Configuration Panel + +```text +┌─────────────────────────────────────────┐ +│ Detection Configuration │ +│ │ +│ Methods: [Quick] [Balanced] [Thorough] │ +│ ☑️ Column Name Analysis │ +│ ☑️ Format Pattern Detection │ +│ ☑️ Sparsity Analysis │ +│ ☑️ AI Text Analysis (Presidio) │ +│ ☐ Population Lookup (slower) │ +│ │ +│ ▼ Advanced Settings │ +│ Language: English ▼ │ +│ Confidence: 0.7 ──●──── │ +│ Workers: 4 │ +│ │ +│ [Start Analysis] │ +└─────────────────────────────────────────┘ +``` + +#### Results Display with Actions + +```text +┌─────────────────────────────────────────┐ +│ PII Detection Results │ +│ │ +│ Summary: 5 PII columns found (of 12) │ +│ ● High confidence: 3 ● Medium: 1 ● Low: 1 │ +│ │ +│ ┌─────────────────────────────────────┐ │ +│ │Column │Method │Conf│Action │ │ +│ ├─────────────────────────────────────┤ │ +│ │email │Presidio│0.95│[🔒Anonymize] │ │ +│ │phone_num │Pattern │0.87│[🔒Anonymize] │ │ +│ │full_name │ML-Text │0.82│[❌Remove] │ │ +│ │survey_id │Sparsity│0.45│[✅Keep] │ │ +│ └─────────────────────────────────────┘ │ +│ │ +│ [Preview Data] [Generate Export] │ +└─────────────────────────────────────────┘ +``` + +#### Real-time Progress Tracking + +```text +┌─────────────────────────────────────────┐ +│ Processing: survey_responses.csv │ +│ │ +│ Overall: 73% ████████████▒▒▒▒ │ +│ ├ Loading data: ✅ Complete (1.2s) │ +│ ├ Column analysis: ✅ Complete (0.8s) │ +│ ├ AI detection: 🔄 Running... (45%) │ +│ └ Report generation: ⏳ Pending │ +│ │ +│ Time remaining: ~1m 23s │ +│ Processing 2 of 5 files │ +│ │ +│ [Pause] [Cancel] [Show Details] │ +└─────────────────────────────────────────┘ +``` + +### 3. Design System Guidelines + +#### Color Palette + +- **Primary:** #2563eb (blue) - main actions, progress bars +- **Success:** #059669 (green) - high confidence, completed states +- **Warning:** #d97706 (orange) - medium confidence, cautions +- **Error:** #dc2626 (red) - low confidence, critical PII +- **Neutral:** #6b7280 (gray) - secondary text, borders + +#### Typography + +- **Headers:** System font, semibold +- **Body:** System font, regular +- **Code/Data:** Monospace font for column names and values + +#### Interactive Elements + +- **Cards:** Rounded corners (8px), subtle shadows +- **Buttons:** Rounded (6px), hover states with slight elevation +- **Progress bars:** Smooth animations, gradient fills +- **Tables:** Alternating row colors, sortable headers + +--- + +## Implementation Roadmap + +### Phase 1: Foundation (Weeks 1-2) + +- [ ] Set up Flet development environment +- [ ] Create app structure with navigation +- [ ] Implement file selection with drag-and-drop +- [ ] Basic single-file analysis workflow + +### Phase 2: Core Features (Weeks 3-4) + +- [ ] Detection configuration panel +- [ ] Real-time progress tracking +- [ ] Results visualization with action buttons +- [ ] Export functionality + +### Phase 3: Batch Processing (Weeks 5-6) + +- [ ] Multi-file selection UI +- [ ] Batch progress monitoring +- [ ] Results dashboard for multiple files +- [ ] Bulk export options + +### Phase 4: Polish & Deploy (Weeks 7-8) + +- [ ] Error handling and recovery flows +- [ ] Help documentation integration +- [ ] Performance optimization +- [ ] Build executable and installer + +--- + +## Key Flet Implementation Notes + +### Application Structure + +```python +def main(page: ft.Page): + # Configure desktop app + page.title = "PII Detector v3.0" + page.window_width = 1200 + page.window_height = 800 + page.theme_mode = ft.ThemeMode.LIGHT + + # State management + app_state = AppState() + + # Main layout with navigation + page.add(create_main_layout(page, app_state)) +``` + +### Real-time Updates + +```python +async def run_analysis(page, files, config): + def update_progress(percent, message): + # Update UI components in real-time + progress_bar.value = percent + status_text.value = message + page.update() + + # Run analysis with progress callbacks + results = await analyze_files_async(files, config, update_progress) + display_results(results) +``` + +### Deployment Command + +```bash +# Development +uv run flet run src/pii_detector/gui/flet_main.py + +# Production build +uv run flet build windows +``` + +--- + +## Success Criteria + +**User Experience Goals:** + +- Users can analyze a file in under 3 clicks +- Batch processing handles 100+ files smoothly +- Real-time progress keeps users informed +- Results are immediately actionable (Keep/Anonymize/Remove) + +**Technical Goals:** + +- Single executable deployment (like current version) +- Handles files up to 100MB without performance issues +- Responsive UI during long operations +- Preserve all current detection capabilities + +**Business Goals:** + +- Maintain 100% Python codebase for easier maintenance +- Support all current file formats (.csv, .xlsx, .dta) +- Provide professional interface suitable for institutional use +- Create foundation for future web/mobile versions + +--- + +## Next Steps for Designer + +1. **Create wireframes** for the 5 core screens listed above +2. **Design interactive prototypes** showing the file → analyze → results flow +3. **Specify component behaviors** for progress tracking and real-time updates +4. **Create design system** with colors, typography, and component styles +5. **Test user flows** with target personas (researchers, compliance officers) + +**Deliverables:** + +- High-fidelity mockups +- Interactive prototype demonstrating core workflows +- Component library with Flet-compatible specifications +- User testing results and iteration recommendations + +This brief provides everything needed to create a modern, professional PII detection tool that researchers will trust and enjoy using. diff --git a/assets/design/DEVELOPMENT_STATUS.md b/assets/design/DEVELOPMENT_STATUS.md new file mode 100644 index 0000000..e51920c --- /dev/null +++ b/assets/design/DEVELOPMENT_STATUS.md @@ -0,0 +1,800 @@ +# Development Status Report: Flet GUI Implementation + +**Date:** 2025-10-28 +**Version:** IPA PII Detector v3.0 (Flet Edition) +**Overall Completion:** ~75% (Phase 3 of 4) + +## Executive Summary + +The Flet GUI is **substantially implemented** and appears to be **functionally complete** for core workflows. This is a **professional, near-production-ready** implementation that delivers on the core PII detection workflow with real backend integration. + +--- + +## ✅ What's Fully Implemented + +### 1. Application Foundation (100%) + +**Theme System** ([ui/themes/ipa_theme.py](../../src/pii_detector/gui/flet_app/ui/themes/ipa_theme.py)) +- ✅ Complete IPA brand color palette implementation + - Primary Green: `#49ac57` (actions, success) + - Dark Blue: `#2b4085` (headers, navigation) + - Red-Orange: `#f26529` (high-confidence alerts) + - Light Blue: `#84d0d4` (accents, hover) +- ✅ Typography system with proper font weights and sizes +- ✅ Confidence-based color coding (High/Medium/Low) +- ✅ 8px grid spacing system + +**State Management** ([ui/app.py](../../src/pii_detector/gui/flet_app/ui/app.py)) +- ✅ Robust `StateManager` with observer pattern +- ✅ Navigation with screen history and back button +- ✅ Centralized error/success messaging +- ✅ File, configuration, and results state tracking + +**Settings & Configuration** +- ✅ Working API key configuration for GeoNames +- ✅ Export location selection +- ✅ About dialog with version info +- ✅ Application reset functionality + +--- + +### 2. All 5 Core Screens (100%) + +#### **Dashboard Screen** ([ui/screens/dashboard.py](../../src/pii_detector/gui/flet_app/ui/screens/dashboard.py)) ✅ + +- ✅ Quick action cards (Single Analysis, Batch Process, Recent Projects) +- ✅ System status panel with real-time indicators +- ✅ Professional layout matching wireframe design +- ✅ Navigation to file selection workflow + +**Status:** Fully functional, matches design specification + +--- + +#### **File Selection Screen** ([ui/screens/file_selection.py](../../src/pii_detector/gui/flet_app/ui/screens/file_selection.py)) ✅ + +- ✅ Native file picker with multi-file support +- ✅ Support for `.csv`, `.xlsx`, `.xls`, `.dta` formats +- ✅ File validation (size limits, format checks) +- ✅ Demo data loading functionality +- ✅ Visual file list with individual remove capability +- ✅ Success/error messaging with auto-dismiss +- ✅ File metadata display (size, format, validation status) + +**Status:** Fully functional, production-ready + +--- + +#### **Configuration Screen** ([ui/screens/configuration.py](../../src/pii_detector/gui/flet_app/ui/screens/configuration.py)) ✅ + +**Detection Method Panels (5 total):** +1. ✅ Column Name/Label Analysis - with fuzzy matching settings +2. ✅ Format Pattern Detection - with pattern type selections +3. ✅ Sparsity Analysis - with threshold sliders +4. ✅ AI-Powered Presidio Engine - with language model selection +5. ✅ Location Population Checks - with GeoNames API integration + +**Features:** +- ✅ Preset modes (Quick/Balanced/Thorough) +- ✅ Expandable/collapsible method panels +- ✅ Method-specific controls (sliders, dropdowns, checkboxes) +- ✅ GeoNames API key configuration with **live testing** +- ✅ Smart defaults and validation +- ✅ Configuration state preservation + +**Status:** Fully functional, excellent UX + +--- + +#### **Progress Tracking Screen** ([ui/screens/progress.py](../../src/pii_detector/gui/flet_app/ui/screens/progress.py)) ✅ + +- ✅ Real-time progress bar and percentage display +- ✅ Detailed progress log with timestamps +- ✅ Copy log to clipboard functionality +- ✅ Cancel analysis capability +- ✅ Completion notifications +- ✅ Background threading for non-blocking analysis +- ✅ Real backend integration (not mocked) + +**Status:** Production-ready with robust error handling + +--- + +#### **Results Display Screen** ([ui/screens/results.py](../../src/pii_detector/gui/flet_app/ui/screens/results.py)) ✅ + +**Summary Metrics:** +- ✅ Total PII detected count +- ✅ High/Medium/Low confidence breakdowns +- ✅ Color-coded metric cards + +**Results Table:** +- ✅ Detected PII columns with confidence scores +- ✅ **Per-column anonymization method dropdowns** (major enhancement!) + - Unchanged (preserve original) + - Remove (delete column) + - Encode (hash/noise) + - Categorize (age groups, date ranges, etc.) + - Mask (pattern masking) +- ✅ Smart default methods based on confidence and column type +- ✅ Visual confidence indicators + +**Export Features:** +- ✅ Data preview with PII highlighting +- ✅ Export deidentified dataset with format preservation +- ✅ Generate comprehensive PII report +- ✅ Anonymization report with detailed change log +- ✅ Open exported files in system file browser + +**Status:** Exceeds original design specification with per-column anonymization + +--- + +### 3. Backend Integration (95%) + +#### **Adapter Layer** ([backend_adapter.py](../../src/pii_detector/gui/flet_app/backend_adapter.py)) + +**`PIIDetectionAdapter` Class:** +- ✅ Bridges GUI state ↔ Core detection engine +- ✅ Dataset loading for all formats using `processor.import_dataset()` +- ✅ Real PII detection using `detect_pii_unified()` +- ✅ Conversion between GUI and backend configuration formats +- ✅ Entity type mapping to human-readable PII types + +**Anonymization Capabilities:** +- ✅ Per-column anonymization with 5 methods +- ✅ Intelligent categorization based on column patterns + - Age categorization for age columns + - Date generalization for date/time columns + - Geographic generalization for location columns + - Income bracketing for financial columns +- ✅ Comprehensive anonymization report generation +- ✅ Change logging for audit trails + +**`BackgroundProcessor` Class:** +- ✅ Async analysis without blocking UI +- ✅ Progress callbacks with real-time updates +- ✅ Cancellation support +- ✅ Error handling and recovery + +**Core Integration Points:** +- ✅ `processor.import_dataset()` - file loading +- ✅ `detect_pii_unified()` - PII detection +- ✅ `AnonymizationTechniques` - all anonymization methods +- ✅ Environment variable handling for API keys +- ✅ File format preservation (CSV→CSV, Excel→Excel, Stata→Stata) + +**Status:** Production-ready, well-architected + +--- + +### 4. Advanced Features (85%) + +#### ✅ **Implemented** + +- **Smart Default Anonymization** + - High confidence (>0.8) → Remove + - Email/Phone/SSN patterns → Mask + - Date/Age columns → Categorize + - Location columns → Categorize + - Everything else → Encode + +- **Intelligent Categorization** + - Age groups (0-17, 18-34, 35-49, 50-64, 65+) + - Date generalization (year, month, quarter) + - Location generalization (state level) + - Income bracketing + - Top/bottom coding for continuous variables + +- **User Experience** + - Progress callbacks with real-time UI updates + - Auto-dismissing success/error messages (3 seconds) + - Timestamped export folders + - Cross-platform folder opening + - Selectable/copiable text in dialogs + +- **API Integration** + - GeoNames API key configuration + - Live API key testing with actual queries + - Error handling for API failures + +#### ⚠️ **Partially Implemented** + +- **Batch Processing** (disabled in UI) + - Button exists but is disabled on dashboard + - Backend batch processor exists in core modules + - UI workflow needs implementation + +- **Recent Projects** (placeholder) + - Shows placeholder dialog + - Would need project persistence/serialization + - No file storage implementation + +#### ❌ **Not Implemented** + +- **Python Script Export** (design spec feature) + - Not present in any screen + - Would generate reproducible `pii-detector` package code + - Useful for programmatic workflows + +- **Drag-and-Drop File Selection** + - Currently browse-only via native file picker + - Design spec mentions drag-and-drop support + - Would enhance UX significantly + +--- + +## 🔧 What's Missing/Incomplete + +### Minor Gaps (~10% of total scope) + +#### 1. **Configuration Value Binding** (High Priority) + +**Issue:** GUI collects detailed settings but doesn't pass them to backend +- Fuzzy match threshold slider (0.5-1.0) +- Pattern type checkboxes (Phone, Email, SSN, Dates) +- Uniqueness threshold for sparsity +- Minimum entries required +- Population threshold for locations +- Presidio confidence threshold + +**Current Behavior:** `_handle_start_analysis()` uses hardcoded defaults: +```python +config = DetectionConfig( + # ... method enabled flags work ... + sparsity_threshold=0.6, # Hardcoded, not from slider + population_threshold=15000, # Hardcoded, not from slider +) +``` + +**Fix Required:** Extract actual slider values and pass to `DetectionConfig` + +--- + +#### 2. **Batch Processing** (Medium Priority) + +**Current State:** +- Dashboard button exists but is disabled +- `BackendProcessor` supports batch operations +- Core `batch_processor.py` module exists + +**Missing:** +- Multi-file progress tracking UI +- Batch results aggregation screen +- Batch export workflow + +**Estimated Effort:** 1-2 days + +--- + +#### 3. **Recent Projects** (Low Priority) + +**Current State:** Placeholder dialog with "Coming soon" message + +**Missing:** +- Project state serialization (JSON/pickle) +- Project history management +- "Open Recent" functionality + +**Estimated Effort:** 1 day + +--- + +#### 4. **Python Script Export** (Medium Priority) + +**Design Spec Feature:** Generate reproducible Python code + +**Example Output:** +```python +from pii_detector.core.unified_processor import detect_pii_unified +import pandas as pd + +# Load dataset +df = pd.read_csv("data.csv") + +# Configure detection +config = { + "use_column_name_detection": True, + "use_format_pattern_detection": True, + "confidence_threshold": 0.7, + # ... all settings from GUI ... +} + +# Run detection +results = detect_pii_unified(df, config=config) +``` + +**Missing:** Script generation screen or export button + +**Estimated Effort:** 0.5 days + +--- + +#### 5. **Drag-and-Drop File Selection** (Medium Priority) + +**Current:** Browse-only via native file picker +**Design Spec:** Drag-and-drop zone with visual feedback + +**Flet Implementation Options:** +- `FilePicker.on_upload` event +- Custom drag event handlers +- Third-party Flet component + +**Estimated Effort:** 0.5-1 day + +--- + +#### 6. **Settings Persistence** (Low Priority) + +**Current State:** `AppSettings` class exists but empty: +```python +def save_settings(self): + """Save settings to file (implementation depends on requirements).""" + pass +``` + +**Missing:** +- Configuration file (JSON/TOML) +- Settings load/save on app startup/exit +- User preference persistence + +**Estimated Effort:** 0.5 days + +--- + +#### 7. **Error Handling Granularity** (Low Priority) + +**Current:** Many generic exception handlers +```python +except Exception as e: + # Generic error message +``` + +**Improvement:** Specific exception types +```python +except FileNotFoundError: + # File-specific message +except pd.errors.ParserError: + # Parse error guidance +except PresidioNotInstalledError: + # Installation instructions +``` + +**Estimated Effort:** 0.5 days (code review and refactor) + +--- + +#### 8. **Executable Packaging** (High Priority for Distribution) + +**Missing:** +- PyInstaller spec file for Flet app +- Briefcase configuration for cross-platform builds +- `just build-exe-flet` command in Justfile +- Installer creation (Inno Setup for Windows) + +**Current:** Only PyInstaller config for tkinter GUI exists + +**Estimated Effort:** 1-2 days (testing across platforms) + +--- + +## 📊 Implementation Quality Assessment + +### Strengths 💪 + +#### 1. **Architecture** +- ✅ Clean separation of concerns (UI / State / Backend) +- ✅ State management with observer pattern is well-implemented +- ✅ Backend adapter provides excellent abstraction layer +- ✅ Screens are self-contained and maintainable +- ✅ Proper use of dataclasses for configuration + +#### 2. **Code Quality** +- ✅ Consistent naming conventions throughout +- ✅ Good use of type hints (`Path`, `tuple[bool, str]`, dataclasses) +- ✅ Proper resource cleanup (file pickers in overlays) +- ✅ Thread-safe UI updates with try/except guards +- ✅ Docstrings for all major functions + +#### 3. **User Experience** +- ✅ Real-time feedback with progress callbacks +- ✅ Auto-dismissing success/error messages (3 seconds) +- ✅ Proper validation before proceeding to next screen +- ✅ Helpful tooltips and instructions throughout +- ✅ Accessible color contrast ratios +- ✅ Responsive layouts with scrolling + +#### 4. **Design Fidelity** +- ✅ Matches wireframe specifications closely +- ✅ IPA brand colors consistently applied +- ✅ Spacing and typography follow 8px grid design system +- ✅ Visual confidence indicators (color-coded badges) + +#### 5. **Production Readiness** +- ✅ Real backend integration (not mocked prototypes) +- ✅ Comprehensive error handling throughout +- ✅ Background threading for long operations +- ✅ Cancellation support for running analyses +- ✅ Audit trails via anonymization reports + +--- + +### Areas for Improvement 🔄 + +#### 1. **Configuration → Backend Binding** (High Impact) +- ⚠️ GUI collects detailed settings via sliders/dropdowns +- ⚠️ But `_handle_start_analysis()` uses hardcoded defaults +- ⚠️ Settings don't fully propagate to `DetectionConfig` +- 🎯 **Fix:** Extract actual control values before creating config + +#### 2. **Error Handling Specificity** (Medium Impact) +- ⚠️ Many generic `except Exception` blocks +- ⚠️ Error messages could be more actionable +- 🎯 **Fix:** Use specific exception types with targeted guidance + +#### 3. **Testing Evidence** (Low Impact) +- ⚠️ No visible unit tests for screen components +- ⚠️ Manual testing comments in code suggest iterative debugging +- 🎯 **Fix:** Add pytest tests for state management and validation logic + +#### 4. **Performance Optimization** (Low Impact) +- ⚠️ Progress log keeps 50 messages but UI shows 20 (minor memory overhead) +- ⚠️ Not tested with very large datasets (>100k rows) +- 🎯 **Fix:** Profile with large datasets, implement chunked processing if needed + +#### 5. **Documentation** (Low Impact) +- ⚠️ No inline code examples for complex flows +- ⚠️ Missing "How to Add a New Detection Method" guide +- 🎯 **Fix:** Add developer documentation for extensibility + +--- + +## 🎯 Phase Completion Status + +Based on the [design_specification.md](design_specification.md) 4-week implementation plan: + +| Phase | Target | Status | Completion | Notes | +|-------|--------|--------|-----------|-------| +| **Week 1: Foundation** | | ✅ Complete | 100% | All deliverables met | +| - Theme, constants, navigation | ✅ | ✅ | 100% | Full IPA theme implementation | +| - Dashboard with action cards | ✅ | ✅ | 100% | 3 action cards + status panel | +| - File selection (browse) | ✅ | ✅ | 100% | Multi-file support, validation | +| **Week 2: Core Flow** | | ✅ Complete | 100% | All deliverables met | +| - Configuration panels (all 5) | ✅ | ✅ | 100% | Expandable panels with settings | +| - Progress tracking | ✅ | ✅ | 100% | Real backend integration | +| - Results display | ✅ | ✅ | 100% | Table + metrics | +| **Week 3: Advanced Features** | | 🟡 Mostly | 85% | 4 of 5 features complete | +| - Drag-and-drop | ❌ | ⚠️ | 0% | Not implemented | +| - All detection settings | ✅ | ✅ | 100% | UI exists, binding incomplete | +| - Action buttons | ✅ | ✅ | 120% | Exceeded spec with per-column methods! | +| - Script export | ❌ | ❌ | 0% | Not implemented | +| **Week 4: Polish & Deploy** | | 🟡 Partial | 70% | 3 of 4 tasks complete | +| - Error handling | ✅ | ✅ | 90% | Good coverage, needs specificity | +| - Performance optimization | ⚠️ | 🟡 | 70% | Adequate for normal datasets | +| - Testing | ⚠️ | ⚠️ | 30% | Manual only, no unit tests | +| - Deployment/installer | ❌ | ❌ | 0% | No Flet packaging config | + +**Overall Progress:** 89% (56 of 63 total features) + +--- + +## 🚀 Recommended Next Steps + +### High Priority (Production Readiness) 🔴 + +#### 1. **Fix Configuration Value Binding** (4-6 hours) +**Problem:** Slider/dropdown values in Configuration screen aren't passed to backend +**Impact:** Users can't actually control detection sensitivity +**Fix:** +- Extract slider values in `_handle_start_analysis()` +- Store in `DetectionConfig` dataclass +- Pass through to backend adapter + +**Files to modify:** +- [ui/screens/configuration.py](../../src/pii_detector/gui/flet_app/ui/screens/configuration.py) (lines 743-806) +- [config/settings.py](../../src/pii_detector/gui/flet_app/config/settings.py) (lines 9-34) + +--- + +#### 2. **Create Flet Executable Build** (1-2 days) +**Problem:** No packaging configuration for distributing Flet app +**Impact:** Can't ship to end users +**Fix:** +- Add `flet build` configuration to pyproject.toml +- Create `just build-flet-exe` command +- Test on Windows/Mac/Linux +- Update installer scripts + +**Files to create/modify:** +- `Justfile` (add new commands) +- `pyproject.toml` (add Flet build config) +- `assets/` (Flet-specific icons/resources) + +--- + +#### 3. **Add Integration Tests** (1 day) +**Problem:** No automated testing of GUI flows +**Impact:** Regression risk during future changes +**Fix:** +- Add pytest tests for state management +- Test file validation logic +- Test configuration validation +- Test anonymization method selection + +**Files to create:** +- `tests/gui/test_state_manager.py` +- `tests/gui/test_file_validation.py` +- `tests/gui/test_backend_adapter.py` + +--- + +### Medium Priority (Feature Completeness) 🟡 + +#### 4. **Implement Script Export** (4 hours) +**Problem:** No way to reproduce analysis programmatically +**Impact:** Research reproducibility gap +**Fix:** +- Add "Export Python Script" button to Results screen +- Generate Python code with current configuration +- Include comments explaining each setting + +**Files to modify:** +- [ui/screens/results.py](../../src/pii_detector/gui/flet_app/ui/screens/results.py) (add new button and handler) + +--- + +#### 5. **Add Drag-and-Drop File Selection** (4-6 hours) +**Problem:** No drag-and-drop support (design spec feature) +**Impact:** Slightly less convenient file selection +**Fix:** +- Research Flet drag-and-drop capabilities +- Implement drop zone with visual feedback +- Handle multiple files dropped simultaneously + +**Files to modify:** +- [ui/screens/file_selection.py](../../src/pii_detector/gui/flet_app/ui/screens/file_selection.py) (enhance drop zone) + +--- + +#### 6. **Enable Batch Processing** (1-2 days) +**Problem:** Batch processing button disabled, no workflow +**Impact:** Can't process multiple datasets efficiently +**Fix:** +- Create batch mode flag in state +- Add batch results aggregation screen +- Enable dashboard batch button +- Wire to existing `batch_processor.py` + +**Files to modify:** +- [ui/screens/dashboard.py](../../src/pii_detector/gui/flet_app/ui/screens/dashboard.py) (enable button) +- Create new `ui/screens/batch_results.py` + +--- + +### Low Priority (Polish) 🟢 + +#### 7. **Implement Settings Persistence** (4 hours) +**Problem:** User preferences don't persist across sessions +**Impact:** Minor UX inconvenience +**Fix:** +- Create config file (~/.pii_detector/settings.json) +- Implement save/load in `AppSettings` +- Load on app startup, save on exit + +**Files to modify:** +- [config/settings.py](../../src/pii_detector/gui/flet_app/config/settings.py) (implement save/load) + +--- + +#### 8. **Add Recent Projects** (1 day) +**Problem:** No project history feature +**Impact:** Can't quickly reopen previous analyses +**Fix:** +- Implement project state serialization +- Store in ~/.pii_detector/projects/ +- Wire up Recent Projects button + +**Files to modify:** +- [ui/screens/dashboard.py](../../src/pii_detector/gui/flet_app/ui/screens/dashboard.py) (implement handler) +- Create `utils/project_manager.py` + +--- + +#### 9. **Improve Error Messages** (4 hours) +**Problem:** Generic exception handling +**Impact:** Users get vague error messages +**Fix:** +- Replace broad `except Exception` with specific types +- Add actionable guidance to error messages +- Log detailed errors for debugging + +**Files to modify:** +- Multiple files (code review and refactor) + +--- + +#### 10. **Performance Profiling** (4 hours) +**Problem:** Not tested with very large datasets +**Impact:** May be slow for 100k+ row datasets +**Fix:** +- Profile with 10k, 50k, 100k, 500k row datasets +- Identify bottlenecks +- Implement chunked processing if needed + +**Files to modify:** +- [backend_adapter.py](../../src/pii_detector/gui/flet_app/backend_adapter.py) (optimize if needed) + +--- + +## 💡 Key Observations + +### 1. **Per-Column Anonymization is a Major Win** 🏆 +The implementation **exceeds the original design specification** by allowing users to select different anonymization methods per column (Unchanged/Remove/Encode/Categorize/Mask), not just a global Drop/Encode/Keep action. This is a significant UX improvement over the tkinter GUI and design wireframes. + +**Example:** +- Column `email` → Mask (replace with ****@****.com) +- Column `age` → Categorize (convert to age groups) +- Column `name` → Remove (delete entirely) +- Column `city` → Categorize (generalize to state level) +- Column `survey_date` → Keep (not actually PII) + +This gives researchers fine-grained control over their anonymization strategy. + +--- + +### 2. **Real Backend Integration (Not a Prototype)** ✅ +Unlike a typical GUI prototype, this connects to the **actual PII detection core**: +- `detect_pii_unified()` from [unified_processor.py](../../src/pii_detector/core/unified_processor.py) +- `AnonymizationTechniques` from [anonymization.py](../../src/pii_detector/core/anonymization.py) +- `processor.import_dataset()` for file loading + +The analysis results are **real ML detections**, not mocked data. The confidence scores are computed by actual Presidio models or pattern matching algorithms. + +--- + +### 3. **Production-Quality Code** 🔧 +The error handling, threading, progress callbacks, and file I/O are all production-ready: +- Thread-safe UI updates with try/except guards +- Background processing without blocking UI +- Cancellation support mid-analysis +- Comprehensive anonymization reports with audit trails +- Cross-platform file operations + +This is **not a quick prototype**—it's well-architected for maintainability. + +--- + +### 4. **Missing Executable Packaging** ⚠️ +Despite being near production-ready, there's **no evidence of** PyInstaller, Briefcase, or `flet build` configuration for creating desktop executables. The [Justfile](../../Justfile) has commands for the tkinter GUI (`just build-exe`) but not for the Flet version. + +**Required for distribution:** +- Flet packaging configuration +- Cross-platform testing (Windows/Mac/Linux) +- Installer creation (Windows: Inno Setup, Mac: DMG, Linux: AppImage) + +--- + +### 5. **Configuration UI ↔ Backend Disconnect** ⚠️ +The Configuration screen collects detailed settings (fuzzy thresholds, pattern types, confidence sliders), but `_handle_start_analysis()` creates a `DetectionConfig` with **hardcoded defaults** instead of reading the actual UI values. + +**Impact:** Users think they're adjusting sensitivity, but the backend ignores their choices. + +**Quick Fix:** +```python +# Current (wrong): +config = DetectionConfig( + sparsity_threshold=0.6, # Hardcoded + population_threshold=15000, # Hardcoded +) + +# Should be: +config = DetectionConfig( + sparsity_threshold=self.uniqueness_slider.value, # From UI + population_threshold=int(self.population_slider.value), # From UI +) +``` + +--- + +### 6. **No Unit Tests** ⚠️ +There are no visible pytest tests for the Flet GUI components. Testing appears to be manual only, with debug print statements scattered throughout: +```python +# print("DEBUG: Settings button clicked!") +# print("DEBUG: About to navigate to file selection") +``` + +**Risk:** Future changes could break existing functionality without detection. + +--- + +### 7. **Settings Don't Persist** 🔹 +The `AppSettings` class exists in [config/settings.py](../../src/pii_detector/gui/flet_app/config/settings.py) but `save_settings()` and `load_settings()` are empty stubs. User preferences (theme, export location, API keys) don't persist across sessions. + +**User Impact:** Must reconfigure API keys every time they launch the app. + +--- + +## 📈 Comparison with Design Specification + +### Features Implemented Beyond Spec 🎉 + +1. **Per-Column Anonymization Methods** 🏆 + - **Spec:** "Action buttons for Drop/Encode/Keep" + - **Implemented:** Dropdown per column with 5 methods (Unchanged/Remove/Encode/Categorize/Mask) + - **Impact:** Major UX improvement + +2. **Smart Default Anonymization** 🧠 + - **Spec:** Not mentioned + - **Implemented:** Intelligently suggests methods based on column type and confidence + - **Impact:** Reduces user decision burden + +3. **Live API Key Testing** 🔍 + - **Spec:** "API key configuration" + - **Implemented:** Test button that validates GeoNames credentials in real-time + - **Impact:** Better user confidence + +4. **Copy Progress Log** 📋 + - **Spec:** Not mentioned + - **Implemented:** Clipboard button with timestamped log export + - **Impact:** Useful for support/debugging + +5. **Comprehensive Anonymization Reports** 📊 + - **Spec:** Basic report generation + - **Implemented:** Detailed reports with method descriptions, change logs, and audit trails + - **Impact:** Research compliance and reproducibility + +--- + +### Features in Spec But Not Implemented ❌ + +1. **Drag-and-Drop File Selection** + - **Spec:** "Drag and drop zone for file selection" + - **Status:** Browse-only via native file picker + - **Priority:** Medium (nice-to-have) + +2. **Python Script Export** + - **Spec:** "Generate reproducible Python script showing detection configuration" + - **Status:** Not implemented + - **Priority:** Medium (research reproducibility) + +3. **Batch Processing Workflow** + - **Spec:** "Batch process multiple datasets" + - **Status:** Button disabled, no UI workflow + - **Priority:** Medium (efficiency feature) + +4. **Recent Projects** + - **Spec:** "View and reopen previously analyzed datasets" + - **Status:** Placeholder dialog only + - **Priority:** Low (convenience feature) + +--- + +## 🏁 Bottom Line + +This is a **professional, near-production-ready Flet implementation** that delivers on the core PII detection workflow with several enhancements over the original design specification. + +### Critical Path to v3.0 Release + +**With 2-3 days of focused work** to address the high-priority items, this could ship as **IPA PII Detector v3.0 (Flet Edition)**: + +1. ✅ **Day 1 Morning:** Fix configuration value binding (4-6 hours) +2. ✅ **Day 1 Afternoon:** Add integration tests (4 hours) +3. ✅ **Day 2:** Create Flet executable build and test cross-platform (1-2 days) +4. ✅ **Day 3 Morning:** Implement script export (4 hours) +5. ✅ **Day 3 Afternoon:** Add drag-and-drop (4 hours) + +**Remaining items** (batch processing, recent projects, performance tuning) can be deferred to v3.1 or later releases. + +--- + +## 📚 Related Documents + +- [Design Specification](design_specification.md) - Full design doc with wireframes +- [Design Document (DESIGN_DOC.md)](DESIGN_DOC.md) - Presidio integration plan +- [CLI Implementation Plan](CLI_IMPLEMENTATION_PLAN.md) - CLI/TUI roadmap +- [Main README](../../README.md) - Project overview and quick start + +--- + +**Report Generated:** 2025-10-28 +**Reviewer:** Claude (Sonnet 4.5) +**Review Scope:** Complete codebase analysis of Flet GUI implementation diff --git a/assets/design/PRESIDIO_INTEGRATION_PLAN.md b/assets/design/PRESIDIO_INTEGRATION_PLAN.md new file mode 100644 index 0000000..1683ed3 --- /dev/null +++ b/assets/design/PRESIDIO_INTEGRATION_PLAN.md @@ -0,0 +1,272 @@ +# Presidio Integration Plan for PII Detector + +## Executive Summary + +This document outlines a comprehensive plan to integrate Microsoft Presidio into the existing PII detector system, creating a hybrid approach that combines the current system's strengths in structured data analysis with Presidio's advanced NLP capabilities for text-based PII detection. + +## Current System Analysis + +### Strengths + +- **Structured data focus**: Excel, CSV, Stata file handling with metadata preservation +- **Statistical analysis**: Sparsity detection, location population analysis +- **Comprehensive anonymization**: Academic research-based techniques (k-anonymity, differential privacy) +- **Domain-specific**: Tailored for survey/research data with SurveyCTO integration + +### Limitations + +- Basic regex-based text analysis +- Limited multi-language support +- No confidence scoring for detections +- Pattern-matching vs context-aware detection + +## Presidio Advantages + +### Core Capabilities + +- **Advanced NLP**: Context-aware detection using spaCy/Transformers vs basic regex +- **Multi-language support**: Built-in language models vs limited multi-language capability +- **Modular architecture**: Easy to extend with custom recognizers +- **Higher accuracy**: ML-based detection vs pattern matching +- **Confidence scores**: Quantified detection confidence vs binary detection + +### Detection Methods + +- Pattern-based recognition for structured data +- NLP-based recognition using spaCy, Stanza, and Transformers +- Context-aware enhancement to improve accuracy + +### Anonymization Operators + +- Replace: Substitutes PII with specified values +- Redact: Removes PII completely +- Mask: Replaces characters with specified character +- Hash: Converts PII to hash values +- Encrypt: Encrypts PII using cryptographic keys +- Custom: User-defined lambda functions + +## Integration Strategy + +### Hybrid Architecture Approach + +We recommend a **hybrid approach** that leverages both systems' strengths: + +#### 1. Enhanced Text Analysis Engine + +**File**: `src/pii_detector/core/presidio_engine.py` + +Replace the basic regex-based text analysis with Presidio-powered detection: + +```python +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine + +class PresidioTextAnalyzer: + """Presidio-powered text analysis for advanced PII detection.""" + + def __init__(self): + self.analyzer = AnalyzerEngine() + self.anonymizer = AnonymizerEngine() + + def analyze_column_text(self, column_data: pd.Series, confidence_threshold: float = 0.7) -> dict: + """Enhanced text analysis with confidence scores.""" + # Combines current word extraction with Presidio NLP + + def get_supported_entities(self) -> list: + """Returns all Presidio-supported PII entities.""" +``` + +#### 2. Unified Detection Framework + +**File**: `src/pii_detector/core/unified_processor.py` + +Combine existing structured data detection with Presidio text analysis: + +- **Structured Detection** (current): Column names, formats, sparsity, location populations +- **Text Content Detection** (new): Presidio-powered analysis of cell content +- **Hybrid Scoring**: Confidence-weighted combination of both approaches + +#### 3. Enhanced Anonymization Pipeline + +Extend current anonymization with Presidio operators while preserving existing techniques: + +```python +class HybridAnonymizer: + def __init__(self): + self.current_techniques = AnonymizationTechniques() + self.presidio_anonymizer = AnonymizerEngine() + + def anonymize_text_content(self, text: str, detected_entities: list) -> str: + """Use Presidio for text anonymization.""" + + def anonymize_structured_data(self, df: pd.DataFrame, pii_columns: list) -> pd.DataFrame: + """Use current techniques for structured anonymization.""" +``` + +## Implementation Timeline + +### Phase 1: Foundation + +1. **Add Presidio dependencies** to `pyproject.toml` + - `presidio-analyzer` + - `presidio-anonymizer` + - Required NLP models (spaCy) +2. **Create Presidio wrapper** (`presidio_engine.py`) with current interface patterns +3. **Unit tests** for Presidio integration +4. **Basic integration testing** + +### Phase 2: Enhanced Detection + +5. **Upgrade text analysis** in `text_analysis.py` to use Presidio +6. **Add confidence scoring** to detection results +7. **Create unified detection** that combines structural + text analysis +8. **Update GUI** to show confidence scores and entity types +9. **Preserve backward compatibility** with existing detection methods + +### Phase 3: Advanced Features + +10. **Custom recognizers** for survey-specific PII patterns +11. **Multi-language support** leveraging Presidio's capabilities +12. **Enhanced anonymization** options using Presidio operators +13. **Performance optimization** for large datasets +14. **Advanced configuration options** for detection sensitivity + +### Phase 4: Integration & Testing (1-2 weeks) + +15. **Comprehensive testing** across different data types and languages +16. **Performance benchmarking** against current system +17. **Documentation updates** and user guides +18. **Final backward compatibility verification** + +## Technical Implementation Details + +### Dependencies to Add + +```toml +[project] +dependencies = [ + # ... existing dependencies + "presidio-analyzer>=2.2.0", + "presidio-anonymizer>=2.2.0", + "spacy>=3.4.0", +] +``` + +### New File Structure + +``` +src/pii_detector/ +├── core/ +│ ├── presidio_engine.py # New: Presidio integration layer +│ ├── unified_processor.py # New: Hybrid detection engine +│ ├── hybrid_anonymizer.py # New: Combined anonymization +│ └── ... (existing files) +├── models/ # New: Custom Presidio recognizers +│ ├── survey_recognizers.py +│ └── custom_entities.py +``` + +### Integration Points + +1. **Text Analysis Enhancement** (`src/pii_detector/core/text_analysis.py`) + - Replace regex-based detection with Presidio analyzer + - Add confidence scoring + - Maintain existing interface for backward compatibility + +2. **Main Processor Integration** (`src/pii_detector/core/processor.py`) + - Add Presidio-based text content analysis + - Combine structural detection with text analysis results + - Implement confidence-weighted scoring + +3. **GUI Enhancements** (`src/pii_detector/gui/frontend.py`) + - Display confidence scores + - Show detected entity types + - Add configuration options for detection sensitivity + +## Key Benefits of Integration + +### Accuracy Improvements + +1. **Context-aware detection**: ML models understand semantic context +2. **Reduced false positives**: Better distinction between PII and non-PII text +3. **Multi-language capability**: Native support for multiple languages +4. **Confidence scoring**: Quantified uncertainty for better user decisions + +### Enhanced Functionality + +1. **Custom recognizers**: Easy development of domain-specific detectors +2. **Advanced anonymization**: More sophisticated transformation options +3. **Extensibility**: Modular architecture for future enhancements +4. **Performance optimization**: Efficient processing of large datasets + +### Maintained Strengths + +1. **Statistical analysis**: Keep sparsity and population analysis +2. **Structured data expertise**: Preserve Excel/CSV/Stata handling +3. **Research domain focus**: Maintain SurveyCTO and survey-specific features +4. **Comprehensive anonymization**: Retain academic research-based techniques + +## Risk Mitigation + +### Performance Concerns + +- **Solution**: Implement optional Presidio detection (user-configurable) +- **Fallback**: Maintain current regex-based methods as backup +- **Optimization**: Cache NLP models, batch processing for large datasets + +### Dependency Management + +- **Solution**: Optional installation of Presidio components +- **Graceful degradation**: System works without Presidio (reduced functionality) +- **Version pinning**: Specific version requirements to ensure compatibility + +### Backward Compatibility + +- **Solution**: Maintain existing APIs and interfaces +- **Migration path**: Gradual transition with user configuration options +- **Testing**: Comprehensive regression testing + +## Success Metrics + +### Quantitative Measures + +1. **Detection accuracy**: Precision/recall improvement vs current system +2. **Processing speed**: Performance benchmarks on various dataset sizes +3. **User adoption**: Usage statistics of new features +4. **Error reduction**: Decrease in false positives/negatives + +### Qualitative Measures + +1. **User feedback**: Satisfaction with enhanced detection capabilities +2. **Use case expansion**: New applications enabled by improved accuracy +3. **Development velocity**: Ease of adding custom recognizers +4. **System reliability**: Stability and error handling improvements + +## Recommended Next Steps + +### Immediate Actions (Next 1-2 weeks) + +1. **Pilot Implementation**: Create basic `presidio_engine.py` wrapper +2. **Dependency Setup**: Add Presidio to development environment +3. **Initial Testing**: Compare detection accuracy on sample datasets +4. **Architecture Review**: Validate integration approach with stakeholders + +### Short-term Goals (1-2 months) + +1. **Core Integration**: Implement unified detection framework +2. **GUI Enhancement**: Add confidence scoring display +3. **Performance Testing**: Benchmark against current system +4. **User Testing**: Gather feedback from pilot users + +### Long-term Vision (3-6 months) + +1. **Custom Recognizers**: Develop survey-specific PII detectors +2. **Multi-language Support**: Expand language coverage +3. **Advanced Features**: Implement sophisticated anonymization options +4. **Documentation**: Comprehensive user and developer guides + +## Conclusion + +The integration of Presidio into the existing PII detector system represents a significant evolution from a primarily pattern-based tool to a hybrid statistical-ML system. This approach will dramatically improve detection accuracy while preserving the system's domain expertise in survey data analysis. + +The phased implementation plan ensures manageable development cycles, maintains backward compatibility, and provides clear success metrics. The result will be a more accurate, extensible, and user-friendly PII detection system that serves both current users and opens opportunities for new applications. diff --git a/assets/design/design_specification.md b/assets/design/design_specification.md new file mode 100644 index 0000000..588388f --- /dev/null +++ b/assets/design/design_specification.md @@ -0,0 +1,912 @@ +# IPA PII Detector - Complete Design Specification + +## Python/Flet Implementation Guide + +### Table of Contents + +1. [Application Architecture Overview](#architecture) +2. [Design System & Visual Identity](#design-system) +3. [Component Library Specifications](#components) +4. [Screen-by-Screen Implementation Guide](#screens) +5. [State Management & Data Flow](#state-management) +6. [Implementation Priority Matrix](#implementation) +7. [Code Examples & Patterns](#code-examples) + +--- + +## 1. Application Architecture Overview {#architecture} + +### Core Framework Decision + +The application uses **Flet (Flutter for Python)** to achieve native desktop performance while maintaining a 100% Python codebase. Flet provides Material Design components out-of-the-box, which aligns perfectly with our design requirements. + +### Application Structure + +``` +src/ +├── main.py # Application entry point +├── config/ +│ ├── constants.py # Colors, sizes, text constants +│ └── settings.py # User preferences, detection configs +├── ui/ +│ ├── app.py # Main application controller +│ ├── screens/ +│ │ ├── dashboard.py # Landing page with quick actions +│ │ ├── file_selection.py # File picker and validation +│ │ ├── configuration.py # Detection method settings +│ │ ├── progress.py # Real-time processing feedback +│ │ └── results.py # Results display and actions +│ ├── components/ +│ │ ├── cards.py # Reusable card components +│ │ ├── buttons.py # Button styles and behaviors +│ │ ├── progress_bars.py # Progress indicators +│ │ └── method_panels.py # Expandable configuration panels +│ └── themes/ +│ └── ipa_theme.py # Complete IPA color theme +├── core/ +│ ├── detector.py # PII detection logic integration +│ ├── file_handler.py # File I/O operations +│ └── script_generator.py # Python code generation +└── assets/ + ├── icons/ # Material Design icons (SVG format) + └── fonts/ # System fonts fallback +``` + +This architecture separates concerns clearly, making the codebase maintainable and allowing the UI layer to focus purely on presentation while the core layer handles business logic. + +--- + +## 2. Design System & Visual Identity {#design-system} + +### Color Palette Implementation + +Create a dedicated theme file that centralizes all color definitions. This ensures consistency and makes future updates simple. + +**Primary Color Definitions:** + +```python +# config/constants.py +class IPAColors: + # Primary Brand Colors + IPA_GREEN = "#49ac57" # Primary actions, success states + DARK_GREEN = "#155240" # Sequential data, deep success + LIGHT_BLUE = "#84d0d4" # Secondary actions, hover states + DARK_BLUE = "#2b4085" # Headers, navigation, primary text + RED_ORANGE = "#f26529" # High-confidence alerts, critical actions + + # Neutral Palette + LIGHT_GREY = "#f1f2f2" # Background, card surfaces + DARK_GREY = "#c9c9c8" # Borders, secondary text + CHARCOAL = "#414042" # Primary text, icons + BLUE_ACCENT = "#ceecee" # Subtle highlights, table alternation + + # Confidence Level Indicators + HIGH_CONFIDENCE = RED_ORANGE # 0.8+ confidence scores + MED_CONFIDENCE = "#f5cb57" # 0.5-0.8 confidence scores + LOW_CONFIDENCE = DARK_GREY # <0.5 confidence scores + + # Interactive States + HOVER_COLOR = BLUE_ACCENT + ACTIVE_COLOR = IPA_GREEN + DISABLED_COLOR = DARK_GREY +``` + +### Typography Hierarchy + +```python +class IPATypography: + # Font families (system fonts with fallbacks) + PRIMARY_FONT = "Segoe UI, -apple-system, BlinkMacSystemFont, sans-serif" + MONOSPACE_FONT = "Consolas, Monaco, Courier New, monospace" + + # Font sizes (in pixels for Flet) + HEADER_1 = 32 # Main page titles + HEADER_2 = 24 # Section headers + HEADER_3 = 18 # Subsection titles + BODY_LARGE = 16 # Primary text, buttons + BODY_REGULAR = 14 # Secondary text, labels + BODY_SMALL = 12 # Captions, metadata + CODE_TEXT = 12 # Monospace content + + # Font weights + LIGHT = "300" + REGULAR = "400" + MEDIUM = "500" + SEMIBOLD = "600" + BOLD = "700" +``` + +### Spacing and Layout Constants + +```python +class IPASpacing: + # Base spacing unit (8px grid system) + UNIT = 8 + + # Common spacing values + XS = UNIT // 2 # 4px - tight spacing + SM = UNIT # 8px - compact spacing + MD = UNIT * 2 # 16px - standard spacing + LG = UNIT * 3 # 24px - generous spacing + XL = UNIT * 4 # 32px - section spacing + XXL = UNIT * 6 # 48px - major section breaks + + # Component-specific spacing + CARD_PADDING = MD + BUTTON_PADDING_H = MD + BUTTON_PADDING_V = SM + INPUT_PADDING = SM + + # Border radius values + RADIUS_SM = 4 # Small elements (checkboxes, small buttons) + RADIUS_MD = 8 # Cards, input fields + RADIUS_LG = 12 # Major containers, panels +``` + +--- + +## 3. Component Library Specifications {#components} + +Understanding that consistency is crucial for professional software, we need to establish reusable components that maintain visual harmony throughout the application. + +### Action Card Component + +The action card serves as the primary navigation element on the dashboard, guiding users toward their intended workflow. + +**Visual Specifications:** + +- **Dimensions:** Minimum 200px width, 180px height +- **Background:** LIGHT_GREY (#f1f2f2) default, BLUE_ACCENT on hover +- **Border:** 2px solid DARK_GREY, changes to IPA_GREEN on hover +- **Border Radius:** RADIUS_LG (12px) +- **Padding:** XL (32px) all sides +- **Icon:** 60px diameter circle, IPA_GREEN background +- **Typography:** HEADER_3 for title, BODY_REGULAR for description + +**Flet Implementation Pattern:** + +```python +def create_action_card(title: str, description: str, icon: str, on_click_handler): + return ft.Container( + content=ft.Column([ + ft.Container( # Icon container + content=ft.Icon(icon, size=24, color="white"), + width=60, + height=60, + bgcolor=IPAColors.IPA_GREEN, + border_radius=30, + alignment=ft.alignment.center, + ), + ft.Text( + title, + size=IPATypography.HEADER_3, + weight=IPATypography.SEMIBOLD, + color=IPAColors.CHARCOAL, + text_align=ft.TextAlign.CENTER, + ), + ft.Text( + description, + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + text_align=ft.TextAlign.CENTER, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=IPASpacing.MD, + ), + width=200, + height=180, + padding=IPASpacing.XL, + bgcolor=IPAColors.LIGHT_GREY, + border=ft.border.all(2, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_LG, + on_click=on_click_handler, + # Hover behavior will be handled through Flet's built-in hover events + ) +``` + +### Expandable Method Panel Component + +These panels house the detection method configurations and represent the most complex UI element in our application. The expandable nature allows us to provide detailed controls without overwhelming the interface. + +**Visual Specifications:** + +- **Header:** LIGHT_GREY background, 15px vertical padding, DARK_GREY bottom border +- **Content:** White background with 20px padding when expanded +- **Animation:** Smooth expand/collapse transition (300ms recommended) +- **Toggle Indicator:** Material Design expand_more icon, rotates 180° when expanded + +**State Management Considerations:** +Each panel needs to track: + +1. Expansion state (collapsed/expanded) +2. Method enabled state (checkbox in header) +3. Individual setting values within the panel +4. Validation state for required settings + +### Progress Bar Component Specifications + +Progress indicators need to feel responsive and provide meaningful feedback during potentially long-running operations. + +**Visual Requirements:** + +- **Height:** 12px for primary progress bars, 6px for mini progress indicators +- **Background:** DARK_GREY (#c9c9c8) +- **Fill:** Linear gradient from IPA_GREEN to LIGHT_BLUE +- **Border Radius:** Half of height value (6px for 12px bar) +- **Animation:** Smooth width transitions, 200ms duration + +**Implementation Note:** Flet's ProgressBar component supports these specifications naturally, but you'll need to override the default colors to match our IPA theme. + +--- + +## 4. Screen-by-Screen Implementation Guide {#screens} + +Let me walk you through each screen systematically, explaining not just what to build, but why certain decisions were made and how they support the user workflow. + +### Screen 1: Dashboard (Landing Page) + +**Purpose:** This screen serves as the application's front door, providing immediate access to core functions while establishing trust through professional presentation and system status information. + +**Layout Structure:** + +``` +┌─────────────────────────────────────────────────────┐ +│ Header Bar (60px height) │ +├─────────────────────────────────────────────────────┤ +│ Quick Actions Grid (3 columns, flexible height) │ +├─────────────────────────────────────────────────────┤ +│ System Status Panel (100px height, fixed) │ +└─────────────────────────────────────────────────────┘ +``` + +**Header Bar Specifications:** + +- **Background Color:** DARK_BLUE +- **Height:** 60px fixed +- **Left Content:** Application title "IPA PII Detector v3.0" with search icon +- **Right Content:** Settings button (IPA_GREEN background) +- **Typography:** BODY_LARGE, white color, SEMIBOLD weight + +**Quick Actions Grid:** + +- **Container:** 3 equal columns with 20px gaps +- **Padding:** 30px all sides +- **Card Specifications:** Use Action Card component (defined above) +- **Cards Required:** + 1. Single Analysis (icon: description, handler: navigate_to_file_selection) + 2. Batch Process (icon: bar_chart, handler: navigate_to_batch_selection) + 3. Recent Projects (icon: history, handler: navigate_to_recent_projects) + +**System Status Panel:** + +- **Background:** BLUE_ACCENT +- **Padding:** 20px all sides +- **Border Radius:** RADIUS_MD +- **Content:** Three status indicators with green/amber/red dot indicators +- **Typography:** BODY_REGULAR for labels, BODY_SMALL for values + +**Flet Screen Structure:** + +```python +def create_dashboard_screen(): + return ft.Column([ + create_header_bar(), + ft.Container( + content=ft.Row([ + create_action_card("Single Analysis", "Analyze one file...", ft.icons.DESCRIPTION, None), + create_action_card("Batch Process", "Process multiple files...", ft.icons.BAR_CHART, None), + create_action_card("Recent Projects", "View past analyses...", ft.icons.HISTORY, None), + ], + alignment=ft.MainAxisAlignment.SPACE_EVENLY), + padding=ft.padding.all(30), + ), + create_system_status_panel(), + ], + expand=True, + spacing=0, + ) +``` + +### Screen 2: File Selection Interface + +**Purpose:** Enable intuitive file selection with clear format support indicators and file size validation. This screen builds confidence by showing exactly what files are supported and providing immediate feedback. + +**Critical Implementation Details:** + +- **Drag-and-Drop Zone:** Use `ft.DragTarget` with visual feedback +- **File Validation:** Immediate validation on selection (format, size, readability) +- **Multiple Selection:** Support both individual and batch file selection +- **Visual Feedback:** Clear success/error states for each selected file + +**Drop Zone Specifications:** + +- **Dimensions:** Full width, 200px minimum height +- **Border:** 3px dashed DARK_GREY, becomes IPA_GREEN on hover/drag-over +- **Background:** LIGHT_GREY default, BLUE_ACCENT on interaction +- **Icon:** Material Design folder_open, 48px size +- **Typography:** HEADER_3 for main text, BODY_SMALL for supported formats + +**Selected Files List:** + +- **Container:** White background, DARK_GREY border, RADIUS_MD +- **File Items:** Each row shows checkmark, filename, size, with subtle separator lines +- **Action Buttons:** "Clear All", "Add More", "Next: Configure Analysis" + +### Screen 3: Detection Configuration Panel + +**Purpose:** This is the most complex screen, allowing granular control over detection methods. The design needs to balance power with usability through progressive disclosure. + +**Implementation Challenge:** Managing the state of 5 different expandable panels, each with multiple settings, while keeping the interface responsive and intuitive. + +**Panel Structure Pattern:** +Each detection method follows this consistent pattern: + +1. **Header Section:** Method name, enable/disable checkbox, expand/collapse toggle +2. **Description Section:** Brief explanation of what the method does +3. **Settings Section:** Method-specific configuration options +4. **Validation Feedback:** Real-time indication of valid/invalid settings + +**Critical State Management:** +You'll need to track: + +- Overall preset selection (Quick/Balanced/Thorough) +- Individual panel expansion states +- Method enable/disable states +- All individual setting values +- Setting validation states +- Interdependencies between methods + +**Preset Button Behavior:** +When users select a preset (Quick/Balanced/Thorough), the system should: + +1. Update all relevant method settings automatically +2. Provide visual feedback about what changed +3. Allow manual override of preset values +4. Remember that user has customized beyond preset + +### Screen 4: Real-time Progress Tracking + +**Purpose:** Keep users engaged during processing by showing detailed progress and maintaining control options. + +**Critical Implementation Requirements:** + +- **Real-time Updates:** Progress bars and status text must update smoothly +- **Granular Feedback:** Show progress for each processing stage +- **Time Estimation:** Calculate and display remaining time estimates +- **User Control:** Always provide pause/cancel options + +**Progress Tracking Levels:** + +1. **Overall Progress:** Main progress bar (0-100%) +2. **Stage Progress:** Individual task completion states +3. **File Progress:** When processing multiple files +4. **Time Estimates:** Based on historical performance data + +**Visual Hierarchy:** + +- **File Name:** Most prominent (20px, semibold) +- **Overall Progress:** Large progress bar with percentage +- **Stage Details:** Smaller text with status icons +- **Time Information:** Secondary information, smaller typography + +### Screen 5: Results Display with Actions + +**Purpose:** Present detection results clearly with immediate actionability. This screen determines whether users trust and adopt the tool. + +**Table Specifications:** + +- **Framework:** Use `ft.DataTable` for built-in sorting and interaction +- **Column Widths:** Column name (25%), Method (20%), Confidence (15%), PII Type (20%), Actions (20%) +- **Row Styling:** Alternating backgrounds using BLUE_ACCENT +- **Confidence Scores:** Color-coded badges (HIGH_CONFIDENCE, MED_CONFIDENCE, LOW_CONFIDENCE) + +**Action Button Specifications:** +Each row contains contextual action buttons: + +- **Anonymize:** IPA_GREEN background, lock icon +- **Remove:** RED_ORANGE background, delete icon +- **Keep:** DARK_GREY background, check icon + +**Summary Cards Implementation:** +Create four metric cards above the table: + +- Total PII columns found +- High confidence count (RED_ORANGE color) +- Medium confidence count (MED_CONFIDENCE color) +- Low confidence count (LOW_CONFIDENCE color) + +### Screen 6: Python Script Export Feature + +**Purpose:** Bridge the gap between GUI usability and programmatic reproducibility by generating executable Python code. + +**Implementation Requirements:** + +- **Code Generation:** Dynamic script creation based on user configurations +- **Syntax Highlighting:** Use a monospace font with basic color coding +- **Export Options:** File download, clipboard copy, email integration +- **Template System:** Maintainable code templates for different export scenarios + +--- + +## 5. State Management & Data Flow {#state-management} + +Understanding data flow is crucial for building a responsive application that maintains consistency across screens. + +### Application State Structure + +```python +@dataclass +class AppState: + # Navigation state + current_screen: str = "dashboard" + screen_history: List[str] = field(default_factory=list) + + # File management + selected_files: List[FileInfo] = field(default_factory=list) + file_validation_results: Dict[str, ValidationResult] = field(default_factory=dict) + + # Configuration state + detection_config: DetectionConfig = field(default_factory=DetectionConfig) + preset_mode: str = "balanced" # quick, balanced, thorough + + # Processing state + is_processing: bool = False + current_progress: float = 0.0 + processing_stage: str = "" + estimated_time_remaining: Optional[int] = None + + # Results state + detection_results: Optional[DetectionResults] = None + user_actions: Dict[str, str] = field(default_factory=dict) # column -> action mapping + + # UI state + panel_expansion_states: Dict[str, bool] = field(default_factory=dict) + error_messages: List[str] = field(default_factory=list) + success_messages: List[str] = field(default_factory=list) +``` + +### State Update Patterns + +All state changes should flow through a central update mechanism to ensure UI consistency: + +```python +class StateManager: + def __init__(self, page: ft.Page): + self.page = page + self.state = AppState() + + def update_state(self, **kwargs): + """Central state update method with UI refresh""" + for key, value in kwargs.items(): + if hasattr(self.state, key): + setattr(self.state, key, value) + + self.refresh_ui() + + def refresh_ui(self): + """Trigger UI updates after state changes""" + self.page.update() +``` + +### Critical Data Flow Patterns + +**File Selection Flow:** + +1. User selects files → Immediate validation → Update selected_files state +2. Validation results → Update file_validation_results → Refresh UI indicators +3. File removal → Update both states → Refresh file list display + +**Configuration Flow:** + +1. Preset selection → Update all method configurations → Refresh all panels +2. Individual setting change → Update specific config → Validate dependencies +3. Method enable/disable → Update config → Show/hide dependent settings + +**Processing Flow:** + +1. Start processing → Set is_processing=True → Show progress screen +2. Progress updates → Update current_progress, processing_stage → Refresh progress bars +3. Completion → Set results state → Navigate to results screen + +--- + +## 6. Implementation Priority Matrix {#implementation} + +To help you build efficiently, I've organized the implementation into logical phases that build upon each other. + +### Phase 1: Foundation (Week 1) + +**Priority:** Critical - Must be completed first + +**Deliverables:** + +1. **Project Structure Setup:** Create all directories and base files +2. **Theme System:** Implement IPAColors, IPATypography, IPASpacing classes +3. **Basic Navigation:** Screen switching mechanism and state management +4. **Dashboard Screen:** Complete implementation with action cards +5. **File Selection Screen:** Basic file picker functionality (no drag-and-drop yet) + +**Success Criteria:** Users can launch app, see professional dashboard, and select files + +### Phase 2: Core Detection Flow (Week 2) + +**Priority:** High - Enables basic functionality + +**Deliverables:** + +1. **Configuration Screen:** All five method panels with basic settings +2. **Integration Layer:** Connect GUI to existing PII detector backend +3. **Progress Screen:** Real-time progress tracking with pause/cancel +4. **Basic Results Display:** Simple table showing detection results + +**Success Criteria:** Complete end-to-end workflow from file selection to results + +### Phase 3: Advanced Features (Week 3) + +**Priority:** Medium - Enhances usability + +**Deliverables:** + +1. **Drag-and-Drop:** Enhanced file selection with visual feedback +2. **Advanced Configuration:** All granular settings for each method +3. **Results Actions:** Implement anonymize/remove/keep functionality +4. **Python Script Export:** Code generation and download capability + +**Success Criteria:** Professional-grade feature set matching wireframe specifications + +### Phase 4: Polish & Deployment (Week 4) + +**Priority:** Low - Final touches + +**Deliverables:** + +1. **Error Handling:** Comprehensive error states and recovery flows +2. **Performance Optimization:** Smooth animations and responsive interactions +3. **Batch Processing:** Multiple file handling capabilities +4. **Build System:** Executable generation and installer creation + +**Success Criteria:** Production-ready application with installer + +--- + +## 7. Code Examples & Patterns {#code-examples} + +These examples demonstrate the specific Flet patterns you'll need to implement our design specifications. + +### Theme Integration Pattern + +```python +# ui/themes/ipa_theme.py +import flet as ft +from config.constants import IPAColors, IPATypography + +def create_ipa_theme(): + """Create Flet theme with IPA color palette""" + return ft.Theme( + color_scheme=ft.ColorScheme( + primary=IPAColors.IPA_GREEN, + primary_container=IPAColors.LIGHT_BLUE, + secondary=IPAColors.DARK_BLUE, + secondary_container=IPAColors.BLUE_ACCENT, + surface=IPAColors.LIGHT_GREY, + surface_variant=IPAColors.BLUE_ACCENT, + error=IPAColors.RED_ORANGE, + on_primary=IPAColors.CHARCOAL, + on_surface=IPAColors.CHARCOAL, + ), + text_theme=ft.TextTheme( + body_large=ft.TextStyle( + size=IPATypography.BODY_LARGE, + color=IPAColors.CHARCOAL, + ), + body_medium=ft.TextStyle( + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + ), + headline_large=ft.TextStyle( + size=IPATypography.HEADER_1, + color=IPAColors.DARK_BLUE, + weight=ft.FontWeight.BOLD, + ), + ) + ) +``` + +### Expandable Panel Pattern + +```python +def create_method_panel(method_name: str, description: str, settings_content, state_manager): + """Create expandable detection method configuration panel""" + + # Panel expansion state key + panel_key = f"panel_{method_name.lower().replace(' ', '_')}" + is_expanded = state_manager.state.panel_expansion_states.get(panel_key, False) + + def toggle_expansion(e): + new_state = not state_manager.state.panel_expansion_states.get(panel_key, False) + state_manager.update_state( + panel_expansion_states={ + **state_manager.state.panel_expansion_states, + panel_key: new_state + } + ) + + return ft.Container( + content=ft.Column([ + # Header with checkbox and expand/collapse + ft.Container( + content=ft.Row([ + ft.Row([ + ft.Checkbox( + value=True, # Get from state + on_change=lambda e: handle_method_toggle(method_name, e), + ), + ft.Text( + method_name, + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ), + ]), + ft.IconButton( + icon=ft.icons.EXPAND_MORE if not is_expanded else ft.icons.EXPAND_LESS, + on_click=toggle_expansion, + icon_color=IPAColors.IPA_GREEN, + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN), + padding=ft.padding.all(IPASpacing.MD), + bgcolor=IPAColors.LIGHT_GREY, + border=ft.border.only(bottom=ft.BorderSide(1, IPAColors.DARK_GREY)), + on_click=toggle_expansion, + ), + + # Expandable content + ft.Container( + content=ft.Column([ + # Description + ft.Container( + content=ft.Text( + description, + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ), + padding=ft.padding.all(IPASpacing.SM), + bgcolor=IPAColors.BLUE_ACCENT, + border_radius=ft.border_radius.all(IPASpacing.RADIUS_SM), + margin=ft.margin.only(bottom=IPASpacing.MD), + ), + + # Settings content (passed in) + settings_content, + ]), + padding=ft.padding.all(IPASpacing.MD), + visible=is_expanded, + ), + ]), + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=ft.border_radius.all(IPASpacing.RADIUS_MD), + margin=ft.margin.only(bottom=IPASpacing.MD), + bgcolor="white", + ) +``` + +### Progress Tracking Pattern + +```python +class ProgressTracker: + def __init__(self, page: ft.Page, state_manager): + self.page = page + self.state_manager = state_manager + self.progress_bar = None + self.stage_indicators = {} + + def create_progress_display(self): + """Create the progress tracking UI elements""" + + # Overall progress bar + self.progress_bar = ft.ProgressBar( + width=500, + height=12, + bgcolor=IPAColors.DARK_GREY, + color=IPAColors.IPA_GREEN, + value=0, + ) + + # Stage indicators + stages = [ + ("loading", "Loading data"), + ("column_analysis", "Column analysis"), + ("ai_detection", "AI detection"), + ("report_generation", "Report generation"), + ] + + stage_widgets = [] + for stage_key, stage_label in stages: + icon = ft.Icon( + ft.icons.CHECK_CIRCLE, + color=IPAColors.DARK_GREY, + size=16, + ) + + self.stage_indicators[stage_key] = icon + + stage_widgets.append( + ft.Row([ + icon, + ft.Text( + stage_label, + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + ), + ]) + ) + + return ft.Column([ + ft.Text( + "Processing: survey_responses.csv", + size=IPATypography.HEADER_3, + weight=ft.FontWeight.W_600, + text_align=ft.TextAlign.CENTER, + ), + + ft.Container( + content=ft.Column([ + ft.Row([ + ft.Text("Overall Progress", size=IPATypography.BODY_REGULAR), + ft.Text("0%", size=IPATypography.BODY_REGULAR), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN), + + self.progress_bar, + ]), + width=500, + ), + + ft.Column(stage_widgets, spacing=IPASpacing.SM), + + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=IPASpacing.LG) + + def update_progress(self, overall_percent: float, current_stage: str, stage_percent: float): + """Update progress indicators""" + # Update overall progress bar + self.progress_bar.value = overall_percent / 100.0 + + # Update stage indicators + stage_colors = { + "complete": IPAColors.IPA_GREEN, + "running": IPAColors.MED_CONFIDENCE, + "pending": IPAColors.DARK_GREY, + } + + # Logic to determine stage states based on current_stage and stage_percent + # Update self.stage_indicators[stage_key].color accordingly + + # Refresh the page + self.page.update() +``` + +### Results Table Pattern + +```python +def create_results_table(detection_results, action_handlers): + """Create the PII detection results table""" + + # Create table columns + columns = [ + ft.DataColumn(ft.Text("Column", weight=ft.FontWeight.W_600)), + ft.DataColumn(ft.Text("Method", weight=ft.FontWeight.W_600)), + ft.DataColumn(ft.Text("Confidence", weight=ft.FontWeight.W_600)), + ft.DataColumn(ft.Text("PII Type", weight=ft.FontWeight.W_600)), + ft.DataColumn(ft.Text("Actions", weight=ft.FontWeight.W_600)), + ] + + # Create table rows + rows = [] + for result in detection_results: + # Confidence badge with color coding + confidence_color = IPAColors.HIGH_CONFIDENCE if result.confidence > 0.8 else \ + IPAColors.MED_CONFIDENCE if result.confidence > 0.5 else \ + IPAColors.LOW_CONFIDENCE + + confidence_badge = ft.Container( + content=ft.Text( + f"{result.confidence:.2f}", + color="white", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_600, + ), + bgcolor=confidence_color, + padding=ft.padding.symmetric(horizontal=8, vertical=4), + border_radius=ft.border_radius.all(IPASpacing.RADIUS_SM), + ) + + # Action buttons + action_buttons = ft.Row([ + ft.ElevatedButton( + "Anonymize", + icon=ft.icons.LOCK, + bgcolor=IPAColors.IPA_GREEN, + color="white", + on_click=lambda e, col=result.column: action_handlers['anonymize'](col), + ), + ft.ElevatedButton( + "Remove", + icon=ft.icons.DELETE, + bgcolor=IPAColors.RED_ORANGE, + color="white", + on_click=lambda e, col=result.column: action_handlers['remove'](col), + ), + ft.ElevatedButton( + "Keep", + icon=ft.icons.CHECK, + bgcolor=IPAColors.DARK_GREY, + color="white", + on_click=lambda e, col=result.column: action_handlers['keep'](col), + ), + ], spacing=IPASpacing.SM) + + rows.append(ft.DataRow( + cells=[ + ft.DataCell(ft.Text(result.column, weight=ft.FontWeight.W_600)), + ft.DataCell(ft.Text(result.method)), + ft.DataCell(confidence_badge), + ft.DataCell(ft.Text(result.pii_type)), + ft.DataCell(action_buttons), + ], + # Alternating row colors + color=IPAColors.BLUE_ACCENT if len(rows) % 2 == 0 else "white", + )) + + return ft.DataTable( + columns=columns, + rows=rows, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=ft.border_radius.all(IPASpacing.RADIUS_MD), + bgcolor="white", + ) +``` + +--- + +## Implementation Checklist + +**Before You Start:** + +- [ ] Review existing PII detector backend code structure +- [ ] Set up development environment with Flet installed +- [ ] Create project directory structure as specified +- [ ] Implement color constants and theme system first + +**Week 1 Deliverables:** + +- [ ] Dashboard screen with three action cards +- [ ] Basic file selection with format validation +- [ ] Navigation system between screens +- [ ] IPA theme fully implemented + +**Week 2 Deliverables:** + +- [ ] All five expandable configuration panels +- [ ] Progress tracking screen with real-time updates +- [ ] Basic results table with confidence color coding +- [ ] Backend integration working end-to-end + +**Week 3 Deliverables:** + +- [ ] Drag-and-drop file selection +- [ ] All granular settings in configuration panels +- [ ] Action buttons working (anonymize/remove/keep) +- [ ] Python script generation and export + +**Week 4 Deliverables:** + +- [ ] Error handling and validation throughout +- [ ] Smooth animations and transitions +- [ ] Batch processing capabilities +- [ ] Executable build system + +This specification provides everything needed to implement the IPA PII Detector exactly as designed. Each section builds upon the previous one, ensuring you have a clear path from setup through deployment. The code examples show specific Flet patterns that match our design requirements, and the implementation phases ensure you can deliver working software incrementally. diff --git a/assets/design/pii_detector_wireframes.html b/assets/design/pii_detector_wireframes.html new file mode 100644 index 0000000..743a14f --- /dev/null +++ b/assets/design/pii_detector_wireframes.html @@ -0,0 +1,1422 @@ + + + + + + + PII Detector - Desktop App Wireframes + + + + +
+

+ IPA PII Detector Desktop Application - Wireframes +

+ + +
+
+ 1. Dashboard / Landing Page +
+
+
+ Purpose: Main entry point providing quick access to core functions and system + status. Users can immediately start single analysis, batch processing, or access recent projects. +
+ +
+
+
🔍 IPA PII Detector v3.0
+ +
+ +
+
+
📄
+
Single Analysis
+
Analyze one file for PII detection
+
+ +
+
📊
+
Batch Process
+
Process multiple files at once
+
+ +
+
📋
+
Recent Projects
+
View and reopen past analyses
+
+
+ +
+
System Status
+
+
+ Detection Methods: ✅ Standard ✅ AI Ready +
+
+
+ Last Processing: 3 files, 10 minutes ago +
+
+
+ Performance: All systems active +
+
+
+
+
+ + +
+
+ 2. File Selection Interface +
+
+
+ Purpose: Intuitive file selection with drag-and-drop functionality. Supports + multiple file formats (.csv, .xlsx, .dta) with size validation and preview. +
+ +
+
Select Dataset Files
+ +
+
📁
+
Drag files here or click to browse
+
Supports: .csv, .xlsx, .dta (max 100MB per file)
+
+ +
+
Selected Files:
+ +
+
+
survey_data.csv
+
(2.1MB)
+
+ +
+
+
responses.dta
+
(5.8MB)
+
+ +
+
+
participant_info.xlsx
+
(1.3MB)
+
+ +
+ + + +
+
+
+
+
+ + +
+
+ 3. Detection Configuration Panel +
+
+
+ Purpose: Configure detection methods with granular control over each technique. + Users can expand sections to fine-tune parameters for Column Name Analysis, Format Patterns, + Sparsity thresholds, Location Population lookup, and AI-powered Presidio engine settings. +
+ +
+
Detection Configuration
+ +
+ + + +
+ + +
+
+
+ + Column Name/Label Analysis +
+ +
+
+
+ Analyzes column headers against restricted word lists for data collection variables, + location identifiers, personal identifiers, and sensitive account information. +
+
+
Strict matching (exact matches)
+
+ +
+
+
+
Fuzzy matching (substring matching)
+
+ +
+
+
+
Personal identifiers (names, addresses, IDs)
+
+ +
+
+
+
Location identifiers (district, village, coordinates)
+
+ +
+
+
+
Data collection variables (deviceid, caseid)
+
+ +
+
+
+
+ + +
+
+
+ + Format Pattern Detection +
+ +
+
+
+ Detects structured data patterns for phone numbers, emails, dates, and identification + numbers across various international formats. +
+
+
Phone number patterns (international formats)
+
+ +
+
+
+
Email address patterns
+
+ +
+
+
+
Date patterns (birthdate detection)
+
+ +
+
+
+
ID patterns (SSN, account numbers)
+
+ +
+
+
+
+ + +
+
+
+ + Sparsity Analysis +
+ +
+
+
+ Identifies columns with high uniqueness and open-ended responses that likely contain + personally identifiable information. +
+
+
Uniqueness threshold
+
+ + 60% +
+
+
+
Open-ended question detection
+
+ +
+
+
+
Free text response analysis
+
+ +
+
+
+
+ + +
+
+
+ + Location Population Analysis +
+ +
+
+
+ Uses geographic APIs to identify small locations where individuals could be + re-identified. Requires internet connection and may be slower. +
+
+
Population threshold
+
+ + 15K +
+
+
+
GeoNames API integration
+
+ +
+
+
+
Re-identification risk assessment
+
+ +
+
+
+
+ + +
+
+
+ + AI-Powered Text Analysis (Presidio) +
+ +
+
+
+ Uses advanced machine learning models for Named Entity Recognition to detect persons, + locations, organizations, and sensitive data patterns. +
+
+
Language model
+
+ +
+
+
+
Confidence threshold
+
+ + 0.7 +
+
+
+
Person names (PERSON entities)
+
+ +
+
+
+
Location names (LOCATION entities)
+
+ +
+
+
+
Organization names
+
+ +
+
+
+
Financial data (credit cards, SSN)
+
+ +
+
+
+
+ + +
+
+
+ + +
+
+ 4. Real-time Progress Tracking +
+
+
+ Purpose: Keep users informed during processing with detailed progress indicators, + estimated time remaining, and ability to pause/cancel operations. +
+ +
+
Processing: survey_responses.csv
+ +
+
+ Overall Progress + 73% +
+
+
+
+
+ +
+
+ + Loading data: Complete (1.2s) +
+
+ + Column analysis: Complete (0.8s) +
+
+ 🔄 + AI detection: Running... (45%) +
+
+ + Report generation: Pending +
+
+ +
+
Time remaining: ~1m 23s
+
Processing 2 of 5 files
+
+ +
+ + + +
+
+
+
+ + +
+
+ 5. Results Display with Actions +
+
+
+ Purpose: Present detection results clearly with actionable options. Users can + review confidence levels, preview data, download a Python script for reproducible processing, and + choose specific anonymization strategies for each column. +
+ +
+
+
PII Detection Results
+ +
+
+
5
+
PII Columns Found
+
+
+
3
+
High Confidence
+
+
+
1
+
Medium Confidence
+
+
+
1
+
Low Confidence
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ColumnDetection MethodConfidencePII TypeActions
emailPresidio0.95Email Address + + +
phone_numberPattern0.87Phone Number + + +
full_nameML-Text0.82Person Name + + +
birth_dateColumn Name0.65Date of Birth + + +
survey_idSparsity0.45Identifier + +
+ +
+ + + + +
+
+
+
+ + +
+
+ 6. Python Script Export Feature +
+
+
+ Purpose: Generate a reproducible Python script using the pii-detector package that + implements the exact same processing steps configured in the GUI. This enables users to automate + their workflow and integrate PII detection into existing data pipelines. +
+ +
+
+#!/usr/bin/env python3
+"""
+Generated PII Detection Script
+Created by: IPA PII Detector v3.0
+Date: 2025-09-25 14:30:22
+Dataset: survey_responses.csv
+"""
+
+import pandas as pd
+from pii_detector import PIIDetector
+
+# Initialize detector with your configuration
+detector = PIIDetector(
+    column_name_analysis=True,
+    format_pattern_detection=True,
+    sparsity_analysis=True,
+    sparsity_threshold=0.60,
+    location_population_analysis=False,
+    presidio_analysis=True,
+    presidio_language='en',
+    presidio_confidence_threshold=0.7
+)
+
+# Load your dataset
+df = pd.read_csv('survey_responses.csv')
+
+# Run PII detection
+results = detector.detect_pii(df)
+
+# Apply your anonymization choices:
+# - email: Anonymize (hash)
+df['email'] = detector.anonymize_column(df['email'], method='hash')
+
+# - phone_number: Anonymize (mask)
+df['phone_number'] = detector.anonymize_column(df['phone_number'], method='mask')
+
+# - full_name: Remove column
+df = df.drop(columns=['full_name'])
+
+# - birth_date: Anonymize (generalize to year)
+df['birth_date'] = detector.anonymize_column(df['birth_date'], method='generalize_date')
+
+# - survey_id: Keep as-is (low confidence)
+
+# Save cleaned dataset
+df.to_csv('survey_responses_cleaned.csv', index=False)
+
+print(f"✅ Cleaned dataset saved with {len(df)} rows and {len(df.columns)} columns")
+print(f"📊 Original PII detection results saved to: pii_detection_report.json")
+
+
+ +
+ + + +
+
+
+ + +
+ + + + + + +
+ + +
+
+ Implementation Notes & Design System +
+
+
+
+

Color Usage

+
+
+
+ IPA Green (#49ac57): Primary actions, success states, progress bars +
+
+
+
+ Dark Blue (#2b4085): Headers, navigation, table headers +
+
+
+
+ Red-Orange (#f26529): High-confidence PII alerts, remove actions +
+
+
+
+ Light Blue (#84d0d4): Accent colors, hover states, progress fills +
+
+
+
+ Yellow (#f5cb57): Medium confidence, warnings, processing states +
+
+ +
+

Key Components

+
    +
  • Cards: 12px border-radius, subtle shadows
  • +
  • Buttons: 6px border-radius, hover elevation
  • +
  • Progress bars: Gradient fills, smooth animations
  • +
  • Tables: Alternating row colors using blue-accent
  • +
  • Icons: System-appropriate sizes (24px, 48px)
  • +
  • Typography: System fonts, clear hierarchy
  • +
+ +

Material Design & Flet + Implementation

+
    +
  • Icons: Use Material Design icons instead of emojis
  • +
  • Cards: ft.Card with elevation and rounded corners
  • +
  • Expansion Panels: ft.ExpansionTile for method + configuration
  • +
  • Buttons: ft.ElevatedButton, ft.OutlinedButton +
  • +
  • Progress: ft.ProgressBar with smooth animations
  • +
  • Data Tables: ft.DataTable with sorting
  • +
  • File Picker: ft.FilePicker with drag-drop support
  • +
  • Sliders: ft.Slider for threshold controls
  • +
+
+
+
+
+
+ + + diff --git a/assets/hook-presidio.py b/assets/hook-presidio.py new file mode 100644 index 0000000..0cb7a13 --- /dev/null +++ b/assets/hook-presidio.py @@ -0,0 +1,53 @@ +# HOOK FILE FOR PRESIDIO AND RELATED DEPENDENCIES +# Required for PyInstaller to properly bundle Presidio components + +from PyInstaller.utils.hooks import collect_all, collect_data_files, collect_submodules + +# ----------------------------- PRESIDIO-ANALYZER ----------------------------- +data = collect_all("presidio_analyzer") +datas = data[0] +binaries = data[1] +hiddenimports = data[2] + +# Collect recognizer modules +hiddenimports += collect_submodules("presidio_analyzer.predefined_recognizers") + +# ----------------------------- PRESIDIO-ANONYMIZER ----------------------------- +data = collect_all("presidio_anonymizer") +datas += data[0] +binaries += data[1] +hiddenimports += data[2] + +# Collect anonymizer operators +hiddenimports += collect_submodules("presidio_anonymizer.operators") + +# ----------------------------- SPACY MODELS ----------------------------- +# Include spaCy models that Presidio uses +# Note: This assumes en_core_web_sm model - adjust based on your needs +try: + import en_core_web_sm # noqa: F401 + + datas += collect_data_files("en_core_web_sm") +except ImportError: + pass + +# ----------------------------- TRANSFORMERS (if using) ----------------------------- +# Presidio can use Transformers models for enhanced NER +try: + data = collect_all("transformers") + datas += data[0] + binaries += data[1] + hiddenimports += data[2] +except ImportError: + pass + +# ----------------------------- ADDITIONAL DEPENDENCIES ----------------------------- +# Other dependencies that Presidio might need +for module in ["regex", "phonenumbers", "python_dateutil"]: + try: + data = collect_all(module) + datas += data[0] + binaries += data[1] + hiddenimports += data[2] + except ImportError: + pass diff --git a/hook-spacy.py b/assets/hook-spacy.py similarity index 54% rename from hook-spacy.py rename to assets/hook-spacy.py index 69c4f58..e963fc5 100644 --- a/hook-spacy.py +++ b/assets/hook-spacy.py @@ -3,28 +3,28 @@ from PyInstaller.utils.hooks import collect_all # ----------------------------- SPACY ----------------------------- -data = collect_all('spacy') +data = collect_all("spacy") datas = data[0] binaries = data[1] hiddenimports = data[2] # ----------------------------- THINC ----------------------------- -data = collect_all('thinc') +data = collect_all("thinc") datas += data[0] binaries += data[1] hiddenimports += data[2] # ----------------------------- CYMEM ----------------------------- -data = collect_all('cymem') +data = collect_all("cymem") datas += data[0] binaries += data[1] hiddenimports += data[2] # ----------------------------- PRESHED ----------------------------- -data = collect_all('preshed') +data = collect_all("preshed") datas += data[0] binaries += data[1] @@ -32,9 +32,33 @@ # ----------------------------- BLIS ----------------------------- -data = collect_all('blis') +data = collect_all("blis") datas += data[0] binaries += data[1] hiddenimports += data[2] -# This hook file is a bit of a hack - really, all of the libraries should be in seperate hook files. (Eg hook-blis.py with the blis part of the hook) \ No newline at end of file +# This hook file is a bit of a hack - really, all of the libraries should be in separate hook files. (Eg hook-blis.py with the blis part of the hook) + +# ----------------------------- SPACY MODELS ----------------------------- +# Include spaCy language models if present +try: + import en_core_web_sm # noqa: F401 + from PyInstaller.utils.hooks import collect_data_files + + datas += collect_data_files("en_core_web_sm") +except ImportError: + pass + +try: + import en_core_web_md # noqa: F401 + + datas += collect_data_files("en_core_web_md") +except ImportError: + pass + +try: + import en_core_web_lg # noqa: F401 + + datas += collect_data_files("en_core_web_lg") +except ImportError: + pass diff --git a/assets/ipa-logo.jpg b/assets/ipa-logo.jpg new file mode 100644 index 0000000..596b1fb Binary files /dev/null and b/assets/ipa-logo.jpg differ diff --git a/constant_strings.py b/constant_strings.py deleted file mode 100644 index 2519782..0000000 --- a/constant_strings.py +++ /dev/null @@ -1,76 +0,0 @@ -CONSIDER_SURVEY_CTO_VARS = 'consider_surveyCTO_vars' -CHECK_LOCATIONS_POP = 'check_locations_pop' - -COLUMNS_NAMES_SEARCH_METHOD = 'columns names search method' -LOCATIONS_POPULATIONS_SEARCH_METHOD = 'locations populations search method' -COLUMNS_FORMAT_SEARCH_METHOD = 'column format search method' -SPARSE_ENTRIES_SEARCH_METHOD = 'sparse entries search method' -UNSTRUCTURED_TEXT_SEARCH_METHOD = 'unstructured text search method' - -STRICT = 'strict' -FUZZY = 'fuzzy' - -PHONE_NUMBER = 'phone number' -DATE = 'date' - -ENGLISH = 'English' -SPANISH = 'Spanish' -OTHER = 'Other' - -ERROR_MESSAGE = 'error_message' -PII_CANDIDATES = 'pii_candidates' -DATASET = 'dataset' -LABEL_DICT = 'label_dict' -VALUE_LABEL_DICT = 'value_label_dict' - -COLUMNS_STILL_TO_CHECK = 'COLUMNS_STILL_TO_CHECK' - - -#Countries list -BANGLADESH = 'Bangladesh' -MYANMAR = 'Myanmar' -PHILIPPINES = 'Philippines' -BOLIVIA = 'Bolivia' -COLOMBIA = 'Colombia' -DOMINICAN_REPUBLIC = 'Dominican Republic' -MEXICO = 'Mexico' -PARAGUAY = 'Paraguay' -PERU = 'Peru' -BURKINA_FASO = 'Burkina Faso' -COTE_DIVOIRE = 'Cote dIvoire' -GHANA = 'Ghana' -LIBERIA = 'Liberia' -MALI = 'Mali' -SIERRA_LEONE = 'Sierra Leone' -KENYA = 'Kenya' -MALAWI = 'Malawi' -RWANDA = 'Rwanda' -TANZANIA = 'Tanzania' -UGANDA = 'Uganda' -ZAMBIA = 'Zambia' - -ALL_COUNTRIES = [PHILIPPINES,BOLIVIA,COLOMBIA,DOMINICAN_REPUBLIC,MEXICO,PARAGUAY,PERU,BURKINA_FASO,COTE_DIVOIRE,GHANA,LIBERIA,MALI,SIERRA_LEONE,KENYA,MALAWI,RWANDA,TANZANIA,UGANDA,ZAMBIA, MYANMAR, BANGLADESH] - -COUNTRY_NAME_TO_ISO_CODE = { - MEXICO:'mx', - BANGLADESH: 'bd', - MYANMAR : 'mm', - PHILIPPINES : 'ph', - BOLIVIA : 'bo', - COLOMBIA : 'co', - DOMINICAN_REPUBLIC : 'do', - PARAGUAY : 'py', - PERU : 'pe', - BURKINA_FASO : 'bf', - COTE_DIVOIRE : 'ci', - GHANA : 'gh', - LIBERIA : 'lr', - MALI : 'ml', - SIERRA_LEONE : 'sl', - KENYA : 'ke', - MALAWI : 'mw', - RWANDA : 'rw', - TANZANIA : 'tz', - UGANDA : 'ug', - ZAMBIA : 'zm' -} diff --git a/create_installer.iss b/create_installer.iss deleted file mode 100644 index ea39c89..0000000 --- a/create_installer.iss +++ /dev/null @@ -1,120 +0,0 @@ -; Script generated by the Inno Setup Script Wizard. -; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES! - -[Setup] -; NOTE: The value of AppId uniquely identifies this application. Do not use the same AppId value in installers for other applications. -; (To generate a new GUID, click Tools | Generate GUID inside the IDE.) -AppId={{94D8F08C-01F1-46FF-89DF-EB240476F308} -AppName=PII Detector -AppVersion=0.2.23 -AppPublisher=IPA -AppPublisherURL=https://www.poverty-action.org/ -AppSupportURL=https://www.poverty-action.org/ -AppUpdatesURL=https://www.poverty-action.org/ -DefaultDirName={autopf}\PII Detector -DisableProgramGroupPage=yes -; Uncomment the following line to run in non administrative install mode (install for current user only.) -;PrivilegesRequired=lowest -OutputDir=C:\Users\felip\PII_detection\dist -OutputBaseFilename=installer -SetupIconFile=C:\Users\felip\PII_detection\dist\app_frontend\app_icon.ico -Compression=lzma -SolidCompression=yes -WizardStyle=modern -UninstallDisplayIcon=C:\Users\felip\PII_detection\dist\app_frontend\app_icon.ico - -[Languages] -Name: "english"; MessagesFile: "compiler:Default.isl" - -[Tasks] -Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: unchecked - -[Files] -Source: "C:\Users\felip\PII_detection\dist\app_frontend\app_frontend.exe"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_asyncio.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_bz2.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_cffi_backend.cp38-win_amd64.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_ctypes.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_decimal.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_elementtree.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_hashlib.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_lzma.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_multiprocessing.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_overlapped.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_queue.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_socket.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_sqlite3.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_ssl.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_testcapi.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_tkinter.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\_win32sysloader.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\app_frontend.exe.manifest"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\app_icon.ico"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\base_library.zip"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\ipa_logo.jpg"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\kiwisolver.cp38-win_amd64.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\lib_arpack-.2SFL42HHZ4PZ2ZKTBHULWLJDO6SPOBMX.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\lib_blas_su.SXX6OHFC3HLK4TC7SFHY7EWYWTE5NEP4.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\lib_dop-f2p.MT27WXU45SIAX6ASQY7CWYIOK7BE6F7J.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\lib_test_fo.JF5HTWMUPBXWGAYEBVEJU3OZAHTSVKCT.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libansari.R6EA3HQP5KZ6TAXU4Y4ZVTRPT7UVA53Z.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libbanded5x.UEB2FLNQUK6ENP6F3JWHGFRKAZK4HKSY.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libbispeu.7AH3PCQ2E2NGLC3AQD7FFAH73KGJTZCJ.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libblkdta00.37OSC2UPECTMIA7QHKFZHTBBNLYMZSNK.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libchkder.G7WSOGIYYQO3UWFVEZ3PPXCXR53ADVPA.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libcobyla2.25EVUSEBAW7VKISARB7LO3UGZPN2HXE3.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libcrypto-1_1.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libd_odr.7J2262VJOKVQJ7Z3VSIPRAHJ2HVZT6XS.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libdcsrch.I2AOPDCXAPDRFNPWY55H5UE7XZSU5CVN.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libdet.UUOGCNYYSRH3SKVQWBDSEP6DWIQWRKZZ.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libdfft.77WLEPRRLEPFIH4KZHFRYQ2UQCA5NONS.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libdfitpack.LMAPXDO5462XTHNWXJBZFJU252ZVABKI.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libdgamln.733CLDMODMP7N4V4VYG5MATCENMLG4I7.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libdqag.PQITPNDDR3HSJ44XDB4N3Z7BDX2UD3YH.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libffi-7.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libgetbreak.SA5VDN7OR4E3PWZFT4TPMX4W2XB5FYNN.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\liblbfgsb.NEXG7QLBFLBCFD42PO5V4IWCPCUGPOCB.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\liblsoda-f2.5XWPH7KY2GYX4D6G5T6U6R45H4XHBJNK.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libmvndst.IUWFZM2WSUQ3UTGQHFQ26ATH2A2TIUVI.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libnnls.4HUTGAJQTI623WTX372VAIIWXRLC62YU.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libopenblas.3HBPCJB5BPQGKWVZAVEBXNNJ2Q2G3TUP.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libslsqp_op.RGGN6ZOFD2K47X7YRNDYCM7JFP4AGLER.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libspecfun.LQCTHMCYNULEOOGKIO6AGREE6D6V37RU.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libssl-1_1.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libvode-f2p.RRWIMSVJVJPENFKJSLUK5L6Z2HONNEBQ.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libwrap_dum.FFMEUDAAWA4OWVO76EOZPAZXI2N7FSIX.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\libwrap_dum.TETSETQV7VQPKMY44CVVUBZUQEOXJF73.gfortran-win_amd64.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\mfc140u.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\msvcp140.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\pvectorc.cp38-win_amd64.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\pyexpat.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\python38.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\pythoncom38.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\pywintypes38.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\select.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\sqlite3.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\tcl86t.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\tk86t.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\unicodedata.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\VCRUNTIME140.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32api.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32clipboard.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32evtlog.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32pdh.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32security.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32trace.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32ui.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\win32wnet.pyd"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\winpty.dll"; DestDir: "{app}"; Flags: ignoreversion -Source: "C:\Users\felip\PII_detection\dist\app_frontend\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs createallsubdirs; Permissions: users-modify -; NOTE: Don't use "Flags: ignoreversion" on any shared system files - -[Icons] -Name: "{autoprograms}\PII Detector"; Filename: "{app}\app_frontend.exe" -Name: "{autodesktop}\PII Detector"; Filename: "{app}\app_frontend.exe"; Tasks: desktopicon -Name: "{autoprograms}\PII Detector"; Filename: "{app}\app_frontend.exe"; IconFilename: "{app}\app_icon.ico" -Name: "{autodesktop}\PII Detector"; Filename: "{app}\app_frontend.exe"; IconFilename: "{app}\app_icon.ico" - -[Run] -Filename: "{app}\app_frontend.exe"; Description: "{cm:LaunchProgram,PII Detector}"; Flags: nowait postinstall skipifsilent diff --git a/examples/anonymization_demo.ipynb b/examples/anonymization_demo.ipynb new file mode 100644 index 0000000..4639126 --- /dev/null +++ b/examples/anonymization_demo.ipynb @@ -0,0 +1,1701 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PII Anonymization Techniques Demo\n", + "\n", + "This notebook demonstrates various anonymization methods implemented based on FSD guidelines and academic research.\n", + "\n", + "The techniques covered include:\n", + "1. **Removal techniques** - Variable removal and record suppression\n", + "2. **Pseudonymization** - Hash-based and systematic pseudonymization\n", + "3. **Recoding/Categorization** - Age, income, and date generalization\n", + "4. **Randomization** - Statistical noise and permutation techniques\n", + "5. **Text anonymization** - Pattern masking and redaction\n", + "6. **Statistical disclosure control** - K-anonymity enforcement\n", + "7. **Comprehensive workflow** - Complete anonymization pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Import Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "import sys\nfrom pathlib import Path\n\nimport pandas as pd\n\n# Add the src directory to the path for imports\nsrc_path = Path().parent / \"src\"\nif str(src_path) not in sys.path:\n sys.path.insert(0, str(src_path))\n\n# Import after path setup to avoid import order issues\nfrom pii_detector.core.anonymization import AnonymizationTechniques # noqa: E402" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Sample Dataset\n", + "\n", + "First, let's create a sample dataset containing various types of PII that we'll use for demonstration:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original Data:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
participant_idnameemailageincomecityoccupationphonenotes
0P001John Doejohn@email.com2545000ChicagoEngineer555-1234Called about billing on March 3rd
1P002Jane Smithjane@company.org3475000New YorkTeacher555-5678Prefers email contact
2P003Alice Johnsonalice@uni.edu2952000ChicagoEngineer555-9012Works at Chicago Tech Corp
3P004Bob Wilsonbob@tech.com4595000Los AngelesManager555-3456Manager at LA Consulting
4P005Carol Daviscarol@health.net3868000ChicagoNurse555-7890Nurse at Memorial Hospital
\n", + "
" + ], + "text/plain": [ + " participant_id name email age income city \\\n", + "0 P001 John Doe john@email.com 25 45000 Chicago \n", + "1 P002 Jane Smith jane@company.org 34 75000 New York \n", + "2 P003 Alice Johnson alice@uni.edu 29 52000 Chicago \n", + "3 P004 Bob Wilson bob@tech.com 45 95000 Los Angeles \n", + "4 P005 Carol Davis carol@health.net 38 68000 Chicago \n", + "\n", + " occupation phone notes \n", + "0 Engineer 555-1234 Called about billing on March 3rd \n", + "1 Teacher 555-5678 Prefers email contact \n", + "2 Engineer 555-9012 Works at Chicago Tech Corp \n", + "3 Manager 555-3456 Manager at LA Consulting \n", + "4 Nurse 555-7890 Nurse at Memorial Hospital " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Dataset shape: (5, 9)\n" + ] + } + ], + "source": [ + "# Create sample data with various PII types\n", + "sample_data = pd.DataFrame(\n", + " {\n", + " \"participant_id\": [\"P001\", \"P002\", \"P003\", \"P004\", \"P005\"],\n", + " \"name\": [\n", + " \"John Doe\",\n", + " \"Jane Smith\",\n", + " \"Alice Johnson\",\n", + " \"Bob Wilson\",\n", + " \"Carol Davis\",\n", + " ],\n", + " \"email\": [\n", + " \"john@email.com\",\n", + " \"jane@company.org\",\n", + " \"alice@uni.edu\",\n", + " \"bob@tech.com\",\n", + " \"carol@health.net\",\n", + " ],\n", + " \"age\": [25, 34, 29, 45, 38],\n", + " \"income\": [45000, 75000, 52000, 95000, 68000],\n", + " \"city\": [\"Chicago\", \"New York\", \"Chicago\", \"Los Angeles\", \"Chicago\"],\n", + " \"occupation\": [\"Engineer\", \"Teacher\", \"Engineer\", \"Manager\", \"Nurse\"],\n", + " \"phone\": [\"555-1234\", \"555-5678\", \"555-9012\", \"555-3456\", \"555-7890\"],\n", + " \"notes\": [\n", + " \"Called about billing on March 3rd\",\n", + " \"Prefers email contact\",\n", + " \"Works at Chicago Tech Corp\",\n", + " \"Manager at LA Consulting\",\n", + " \"Nurse at Memorial Hospital\",\n", + " ],\n", + " }\n", + ")\n", + "\n", + "print(\"Original Data:\")\n", + "display(sample_data)\n", + "print(f\"\\nDataset shape: {sample_data.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Anonymization Tools" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anonymization tools initialized with random seed 42 for reproducible results.\n" + ] + } + ], + "source": [ + "# Initialize anonymization techniques with a fixed seed for reproducible results\n", + "anonymizer = AnonymizationTechniques(random_seed=42)\n", + "print(\"Anonymization tools initialized with random seed 42 for reproducible results.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Removal Techniques\n", + "\n", + "The simplest anonymization approach is to completely remove identifying variables or records." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After removing direct identifiers (name, email, phone):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
participant_idageincomecityoccupationnotes
0P0012545000ChicagoEngineerCalled about billing on March 3rd
1P0023475000New YorkTeacherPrefers email contact
2P0032952000ChicagoEngineerWorks at Chicago Tech Corp
3P0044595000Los AngelesManagerManager at LA Consulting
4P0053868000ChicagoNurseNurse at Memorial Hospital
\n", + "
" + ], + "text/plain": [ + " participant_id age income city occupation \\\n", + "0 P001 25 45000 Chicago Engineer \n", + "1 P002 34 75000 New York Teacher \n", + "2 P003 29 52000 Chicago Engineer \n", + "3 P004 45 95000 Los Angeles Manager \n", + "4 P005 38 68000 Chicago Nurse \n", + "\n", + " notes \n", + "0 Called about billing on March 3rd \n", + "1 Prefers email contact \n", + "2 Works at Chicago Tech Corp \n", + "3 Manager at LA Consulting \n", + "4 Nurse at Memorial Hospital " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Remove direct identifiers\n", + "step1 = anonymizer.remove_variables(sample_data, [\"name\", \"email\", \"phone\"])\n", + "print(\"After removing direct identifiers (name, email, phone):\")\n", + "display(step1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Records with unique city-occupation combinations removed: 3\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
participant_idnameemailageincomecityoccupationphonenotes
0P001John Doejohn@email.com2545000ChicagoEngineer555-1234Called about billing on March 3rd
1P003Alice Johnsonalice@uni.edu2952000ChicagoEngineer555-9012Works at Chicago Tech Corp
\n", + "
" + ], + "text/plain": [ + " participant_id name email age income city \\\n", + "0 P001 John Doe john@email.com 25 45000 Chicago \n", + "1 P003 Alice Johnson alice@uni.edu 29 52000 Chicago \n", + "\n", + " occupation phone notes \n", + "0 Engineer 555-1234 Called about billing on March 3rd \n", + "1 Engineer 555-9012 Works at Chicago Tech Corp " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Remove records with unique combinations\n", + "unique_removed = anonymizer.remove_records_with_unique_combinations(\n", + " sample_data, [\"city\", \"occupation\"], threshold=1\n", + ")\n", + "print(\n", + " f\"Records with unique city-occupation combinations removed: {len(sample_data) - len(unique_removed)}\"\n", + ")\n", + "display(unique_removed)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Pseudonymization Techniques\n", + "\n", + "Replace identifying values with consistent pseudonyms that preserve relationships while removing direct identification." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hash-based pseudonymization of participant IDs:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_idpseudonymized_id
0P001ANON_19e3112c
1P002ANON_59ed8689
2P003ANON_37902265
3P004ANON_ab3b01ed
4P005ANON_c852755e
\n", + "
" + ], + "text/plain": [ + " original_id pseudonymized_id\n", + "0 P001 ANON_19e3112c\n", + "1 P002 ANON_59ed8689\n", + "2 P003 ANON_37902265\n", + "3 P004 ANON_ab3b01ed\n", + "4 P005 ANON_c852755e" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "step2 = step1.copy()\n", + "\n", + "# Hash-based pseudonymization\n", + "step2[\"participant_id\"] = anonymizer.hash_pseudonymization(\n", + " step1[\"participant_id\"], prefix=\"ANON_\"\n", + ")\n", + "print(\"Hash-based pseudonymization of participant IDs:\")\n", + "comparison_df = pd.DataFrame(\n", + " {\n", + " \"original_id\": step1[\"participant_id\"],\n", + " \"pseudonymized_id\": step2[\"participant_id\"],\n", + " }\n", + ")\n", + "display(comparison_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Recoding/Categorization Techniques\n", + "\n", + "Transform continuous variables into categories to reduce precision while preserving analytical utility." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Age categorization:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_ageage_group
02518-29
13430-44
22918-29
34530-44
43830-44
\n", + "
" + ], + "text/plain": [ + " original_age age_group\n", + "0 25 18-29\n", + "1 34 30-44\n", + "2 29 18-29\n", + "3 45 30-44\n", + "4 38 30-44" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "step3 = step2.copy()\n", + "\n", + "# Age categorization\n", + "step3[\"age_group\"] = anonymizer.age_categorization(step2[\"age\"])\n", + "print(\"Age categorization:\")\n", + "age_comparison = pd.DataFrame(\n", + " {\"original_age\": step2[\"age\"], \"age_group\": step3[\"age_group\"]}\n", + ")\n", + "display(age_comparison)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Income categorization:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_incomeincome_bracket
045000Lower-Middle
175000Middle
252000Middle
395000Upper-Middle
468000Middle
\n", + "
" + ], + "text/plain": [ + " original_income income_bracket\n", + "0 45000 Lower-Middle\n", + "1 75000 Middle\n", + "2 52000 Middle\n", + "3 95000 Upper-Middle\n", + "4 68000 Middle" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Income categorization\n", + "step3[\"income_bracket\"] = anonymizer.income_categorization(step2[\"income\"])\n", + "print(\"Income categorization:\")\n", + "income_comparison = pd.DataFrame(\n", + " {\"original_income\": step2[\"income\"], \"income_bracket\": step3[\"income_bracket\"]}\n", + ")\n", + "display(income_comparison)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top/bottom coding of income (80th/20th percentiles):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
originalcoded
045000≤50600
17500075000
25200052000
395000≤50600
46800068000
\n", + "
" + ], + "text/plain": [ + " original coded\n", + "0 45000 ≤50600\n", + "1 75000 75000\n", + "2 52000 52000\n", + "3 95000 ≤50600\n", + "4 68000 68000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Top/bottom coding\n", + "step3[\"income_coded\"] = anonymizer.top_bottom_coding(\n", + " step2[\"income\"], top_percentile=80, bottom_percentile=20\n", + ")\n", + "print(\"Top/bottom coding of income (80th/20th percentiles):\")\n", + "coding_comparison = pd.DataFrame(\n", + " {\"original\": step2[\"income\"], \"coded\": step3[\"income_coded\"]}\n", + ")\n", + "display(coding_comparison)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Randomization Techniques\n", + "\n", + "Add controlled randomness to data while preserving statistical properties." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gaussian noise added to age:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
originalwith_noise
02525.39
13433.89
22929.50
34546.19
43837.82
\n", + "
" + ], + "text/plain": [ + " original with_noise\n", + "0 25 25.39\n", + "1 34 33.89\n", + "2 29 29.50\n", + "3 45 46.19\n", + "4 38 37.82" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "step4 = step3.copy()\n", + "\n", + "# Add noise to numeric data\n", + "step4[\"age_with_noise\"] = anonymizer.add_noise(\n", + " step3[\"age\"], noise_type=\"gaussian\", noise_level=0.1\n", + ")\n", + "print(\"Gaussian noise added to age:\")\n", + "noise_comparison = pd.DataFrame(\n", + " {\"original\": step3[\"age\"], \"with_noise\": step4[\"age_with_noise\"].round(2)}\n", + ")\n", + "display(noise_comparison)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After permutation swapping (age and income):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_ageswapped_ageoriginal_incomeswapped_income
025384500075000
134297500068000
229345200052000
345459500095000
438256800045000
\n", + "
" + ], + "text/plain": [ + " original_age swapped_age original_income swapped_income\n", + "0 25 38 45000 75000\n", + "1 34 29 75000 68000\n", + "2 29 34 52000 52000\n", + "3 45 45 95000 95000\n", + "4 38 25 68000 45000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Permutation swapping\n", + "swapped_data = anonymizer.permutation_swapping(\n", + " step4, [\"age\", \"income\"], swap_probability=0.4\n", + ")\n", + "print(\"After permutation swapping (age and income):\")\n", + "swap_comparison = pd.DataFrame(\n", + " {\n", + " \"original_age\": step4[\"age\"],\n", + " \"swapped_age\": swapped_data[\"age\"],\n", + " \"original_income\": step4[\"income\"],\n", + " \"swapped_income\": swapped_data[\"income\"],\n", + " }\n", + ")\n", + "display(swap_comparison)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Text Anonymization\n", + "\n", + "Identify and mask PII patterns within text content." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text masking example:\n", + "Original: John Doe called from 555-1234 about his account john@email.com\n", + "Masked: [NAME] called from [PHONE] about his account [EMAIL]\n" + ] + } + ], + "source": [ + "# Text masking demonstration\n", + "sample_text = \"John Doe called from 555-1234 about his account john@email.com\"\n", + "masked_text = anonymizer.text_masking(sample_text)\n", + "print(\"Text masking example:\")\n", + "print(f\"Original: {sample_text}\")\n", + "print(f\"Masked: {masked_text}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original vs Masked notes:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_notesmasked_notes
0Called about billing on March 3rdCalled about billing on March 3rd
1Prefers email contactPrefers email contact
2Works at Chicago Tech CorpWorks at [NAME] Corp
3Manager at LA ConsultingManager at LA Consulting
4Nurse at Memorial HospitalNurse at [NAME]
\n", + "
" + ], + "text/plain": [ + " original_notes masked_notes\n", + "0 Called about billing on March 3rd Called about billing on March 3rd\n", + "1 Prefers email contact Prefers email contact\n", + "2 Works at Chicago Tech Corp Works at [NAME] Corp\n", + "3 Manager at LA Consulting Manager at LA Consulting\n", + "4 Nurse at Memorial Hospital Nurse at [NAME]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Apply to notes column\n", + "masked_notes = sample_data[\"notes\"].apply(anonymizer.text_masking)\n", + "print(\"Original vs Masked notes:\")\n", + "notes_comparison = pd.DataFrame(\n", + " {\"original_notes\": sample_data[\"notes\"], \"masked_notes\": masked_notes}\n", + ")\n", + "display(notes_comparison)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. K-Anonymity Analysis\n", + "\n", + "Ensure that each combination of quasi-identifiers appears for at least k individuals." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data for k-anonymity:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
age_groupcityoccupationsalary
020-30ChicagoEngineer50000
120-30ChicagoTeacher45000
230-40NYCEngineer75000
330-40NYCTeacher65000
440-50LAManager95000
\n", + "
" + ], + "text/plain": [ + " age_group city occupation salary\n", + "0 20-30 Chicago Engineer 50000\n", + "1 20-30 Chicago Teacher 45000\n", + "2 30-40 NYC Engineer 75000\n", + "3 30-40 NYC Teacher 65000\n", + "4 40-50 LA Manager 95000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create test data for k-anonymity demonstration\n", + "test_data = pd.DataFrame(\n", + " {\n", + " \"age_group\": [\"20-30\", \"20-30\", \"30-40\", \"30-40\", \"40-50\"],\n", + " \"city\": [\"Chicago\", \"Chicago\", \"NYC\", \"NYC\", \"LA\"],\n", + " \"occupation\": [\"Engineer\", \"Teacher\", \"Engineer\", \"Teacher\", \"Manager\"],\n", + " \"salary\": [50000, 45000, 75000, 65000, 95000],\n", + " }\n", + ")\n", + "\n", + "print(\"Test data for k-anonymity:\")\n", + "display(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original data satisfies 2-anonymity: False\n", + "\n", + "Violations (combinations with < 2 records):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
age_groupcitycount
240-50LA1
\n", + "
" + ], + "text/plain": [ + " age_group city count\n", + "2 40-50 LA 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Check k-anonymity\n", + "is_anonymous, violations = anonymizer.k_anonymity_check(\n", + " test_data, [\"age_group\", \"city\"], k=2\n", + ")\n", + "print(f\"Original data satisfies 2-anonymity: {is_anonymous}\")\n", + "if not is_anonymous:\n", + " print(\"\\nViolations (combinations with < 2 records):\")\n", + " display(violations)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After applying k-anonymity: True\n", + "Rows removed: 1\n", + "\n", + "Final k-anonymous dataset:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
age_groupcityoccupationsalary
020-30ChicagoEngineer50000
120-30ChicagoTeacher45000
230-40NYCEngineer75000
330-40NYCTeacher65000
\n", + "
" + ], + "text/plain": [ + " age_group city occupation salary\n", + "0 20-30 Chicago Engineer 50000\n", + "1 20-30 Chicago Teacher 45000\n", + "2 30-40 NYC Engineer 75000\n", + "3 30-40 NYC Teacher 65000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Achieve k-anonymity\n", + "k_anonymous_data = anonymizer.achieve_k_anonymity(test_data, [\"age_group\", \"city\"], k=2)\n", + "is_now_anonymous, _ = anonymizer.k_anonymity_check(\n", + " k_anonymous_data, [\"age_group\", \"city\"], k=2\n", + ")\n", + "print(f\"After applying k-anonymity: {is_now_anonymous}\")\n", + "print(f\"Rows removed: {len(test_data) - len(k_anonymous_data)}\")\n", + "print(\"\\nFinal k-anonymous dataset:\")\n", + "display(k_anonymous_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Comprehensive Anonymization Workflow\n", + "\n", + "Apply multiple techniques in sequence for comprehensive anonymization." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Applying comprehensive anonymization workflow...\n", + "✓ Removed direct identifiers\n", + "✓ Pseudonymized participant IDs\n", + "✓ Categorized age and income, removed original values\n", + "✓ Anonymized text content\n", + "✓ Enforced k-anonymity (k=2)\n", + "\n", + "Final anonymized dataset:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
participant_idcityoccupationnotesage_groupincome_bracket
0SUBJ_19e3112cChicagoEngineerCalled about billing on March 3rd18-29Lower-Middle
1SUBJ_37902265ChicagoEngineerWorks at [NAME] Corp18-29Middle
\n", + "
" + ], + "text/plain": [ + " participant_id city occupation notes \\\n", + "0 SUBJ_19e3112c Chicago Engineer Called about billing on March 3rd \n", + "1 SUBJ_37902265 Chicago Engineer Works at [NAME] Corp \n", + "\n", + " age_group income_bracket \n", + "0 18-29 Lower-Middle \n", + "1 18-29 Middle " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Apply full workflow\n", + "final_data = sample_data.copy()\n", + "\n", + "print(\"Applying comprehensive anonymization workflow...\")\n", + "\n", + "# Step 1: Remove direct identifiers\n", + "final_data = anonymizer.remove_variables(final_data, [\"name\", \"email\", \"phone\"])\n", + "print(\"✓ Removed direct identifiers\")\n", + "\n", + "# Step 2: Pseudonymize IDs\n", + "final_data[\"participant_id\"] = anonymizer.hash_pseudonymization(\n", + " final_data[\"participant_id\"], prefix=\"SUBJ_\"\n", + ")\n", + "print(\"✓ Pseudonymized participant IDs\")\n", + "\n", + "# Step 3: Categorize continuous variables\n", + "final_data[\"age_group\"] = anonymizer.age_categorization(final_data[\"age\"])\n", + "final_data[\"income_bracket\"] = anonymizer.income_categorization(final_data[\"income\"])\n", + "final_data = final_data.drop([\"age\", \"income\"], axis=1)\n", + "print(\"✓ Categorized age and income, removed original values\")\n", + "\n", + "# Step 4: Anonymize text\n", + "final_data[\"notes\"] = final_data[\"notes\"].apply(anonymizer.text_masking)\n", + "print(\"✓ Anonymized text content\")\n", + "\n", + "# Step 5: Apply k-anonymity\n", + "final_data = anonymizer.achieve_k_anonymity(\n", + " final_data, [\"age_group\", \"city\", \"occupation\"], k=2\n", + ")\n", + "print(\"✓ Enforced k-anonymity (k=2)\")\n", + "\n", + "print(\"\\nFinal anonymized dataset:\")\n", + "display(final_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Anonymization Report\n", + "\n", + "Generate a comprehensive report comparing the original and anonymized datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== ANONYMIZATION REPORT ===\n", + "Original rows: 5\n", + "Anonymized rows: 2\n", + "Rows removed: 3 (60.0%)\n", + "\n", + "Column transformations:\n", + " participant_id: 5 → 2 unique values (60.0% reduction)\n", + " city: 3 → 1 unique values (66.7% reduction)\n", + " occupation: 4 → 1 unique values (75.0% reduction)\n", + " notes: 5 → 2 unique values (60.0% reduction)\n" + ] + } + ], + "source": [ + "# Generate anonymization report\n", + "report = anonymizer.anonymization_report(sample_data, final_data)\n", + "\n", + "print(\"=== ANONYMIZATION REPORT ===\")\n", + "print(f\"Original rows: {report['original_rows']}\")\n", + "print(f\"Anonymized rows: {report['anonymized_rows']}\")\n", + "print(f\"Rows removed: {report['rows_removed']} ({report['removal_percentage']:.1f}%)\")\n", + "\n", + "print(\"\\nColumn transformations:\")\n", + "for col, stats in report[\"columns_comparison\"].items():\n", + " if col in final_data.columns:\n", + " print(\n", + " f\" {col}: {stats['original_unique_values']} → {stats['anonymized_unique_values']} unique values \"\n", + " f\"({stats['uniqueness_reduction']:.1f}% reduction)\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated comprehensive anonymization techniques including:\n", + "\n", + "- **Variable removal** for direct identifiers\n", + "- **Hash-based pseudonymization** for consistent but anonymous IDs\n", + "- **Categorization** to reduce precision of continuous variables\n", + "- **Statistical noise** and **permutation** for randomization\n", + "- **Text masking** for PII within unstructured content\n", + "- **K-anonymity** enforcement for statistical disclosure control\n", + "- **Comprehensive reporting** for transparency and audit trails\n", + "\n", + "These techniques can be combined and customized based on specific anonymization requirements and privacy regulations." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pii-detector", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/anonymization_demo.py b/examples/anonymization_demo.py new file mode 100644 index 0000000..8cd17fd --- /dev/null +++ b/examples/anonymization_demo.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +"""Demonstration of comprehensive anonymization techniques. + +This script shows how to use various anonymization methods +implemented based on FSD guidelines and academic research. +""" + +import sys +from pathlib import Path + +import pandas as pd + +# Add the src directory to the path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from pii_detector.core.anonymization import AnonymizationTechniques + + +def main(): + """Demonstrate various anonymization techniques.""" + print("=== PII Anonymization Techniques Demo ===\n") + + # Create sample data + sample_data = pd.DataFrame( + { + "participant_id": ["P001", "P002", "P003", "P004", "P005"], + "name": [ + "John Doe", + "Jane Smith", + "Alice Johnson", + "Bob Wilson", + "Carol Davis", + ], + "email": [ + "john@email.com", + "jane@company.org", + "alice@uni.edu", + "bob@tech.com", + "carol@health.net", + ], + "age": [25, 34, 29, 45, 38], + "income": [45000, 75000, 52000, 95000, 68000], + "city": ["Chicago", "New York", "Chicago", "Los Angeles", "Chicago"], + "occupation": ["Engineer", "Teacher", "Engineer", "Manager", "Nurse"], + "phone": ["555-1234", "555-5678", "555-9012", "555-3456", "555-7890"], + "notes": [ + "Called about billing on March 3rd", + "Prefers email contact", + "Works at Chicago Tech Corp", + "Manager at LA Consulting", + "Nurse at Memorial Hospital", + ], + } + ) + + print("Original Data:") + print(sample_data) + print("\n" + "=" * 80 + "\n") + + # Initialize anonymization techniques + anonymizer = AnonymizationTechniques(random_seed=42) + + # 1. REMOVAL TECHNIQUES + print("1. REMOVAL TECHNIQUES") + print("-" * 20) + + # Remove direct identifiers + step1 = anonymizer.remove_variables(sample_data, ["name", "email", "phone"]) + print("After removing direct identifiers (name, email, phone):") + print(step1.head()) + print() + + # Remove records with unique combinations + unique_removed = anonymizer.remove_records_with_unique_combinations( + sample_data, ["city", "occupation"], threshold=1 + ) + print( + f"Records with unique city-occupation combinations removed: {len(sample_data) - len(unique_removed)}" + ) + print() + + # 2. PSEUDONYMIZATION TECHNIQUES + print("2. PSEUDONYMIZATION TECHNIQUES") + print("-" * 30) + + step2 = step1.copy() + + # Hash-based pseudonymization + step2["participant_id"] = anonymizer.hash_pseudonymization( + step1["participant_id"], prefix="ANON_" + ) + print("Hash-based pseudonymization of participant IDs:") + print(step2["participant_id"].head()) + print() + + # 3. RECODING/CATEGORIZATION + print("3. RECODING/CATEGORIZATION TECHNIQUES") + print("-" * 35) + + step3 = step2.copy() + + # Age categorization + step3["age_group"] = anonymizer.age_categorization(step2["age"]) + print("Age categorization:") + print(pd.DataFrame({"original_age": step2["age"], "age_group": step3["age_group"]})) + print() + + # Income categorization + step3["income_bracket"] = anonymizer.income_categorization(step2["income"]) + print("Income categorization:") + print( + pd.DataFrame( + { + "original_income": step2["income"], + "income_bracket": step3["income_bracket"], + } + ) + ) + print() + + # Top/bottom coding + step3["income_coded"] = anonymizer.top_bottom_coding( + step2["income"], top_percentile=80, bottom_percentile=20 + ) + print("Top/bottom coding of income (80th/20th percentiles):") + print(pd.DataFrame({"original": step2["income"], "coded": step3["income_coded"]})) + print() + + # 4. RANDOMIZATION TECHNIQUES + print("4. RANDOMIZATION TECHNIQUES") + print("-" * 25) + + step4 = step3.copy() + + # Add noise to numeric data + step4["age_with_noise"] = anonymizer.add_noise( + step3["age"], noise_type="gaussian", noise_level=0.1 + ) + print("Gaussian noise added to age:") + print( + pd.DataFrame({"original": step3["age"], "with_noise": step4["age_with_noise"]}) + ) + print() + + # Permutation swapping + swapped_data = anonymizer.permutation_swapping( + step4, ["age", "income"], swap_probability=0.4 + ) + print("After permutation swapping (age and income):") + print( + pd.DataFrame( + { + "original_age": step4["age"], + "swapped_age": swapped_data["age"], + "original_income": step4["income"], + "swapped_income": swapped_data["income"], + } + ) + ) + print() + + # 5. TEXT ANONYMIZATION + print("5. TEXT ANONYMIZATION") + print("-" * 20) + + # Text masking + sample_text = "John Doe called from 555-1234 about his account john@email.com" + masked_text = anonymizer.text_masking(sample_text) + print(f"Original text: {sample_text}") + print(f"Masked text: {masked_text}") + print() + + # Apply to notes column + masked_notes = sample_data["notes"].apply(anonymizer.text_masking) + print("Original vs Masked notes:") + for i, (orig, masked) in enumerate(zip(sample_data["notes"], masked_notes)): + print(f" {i + 1}. {orig}") + print(f" → {masked}") + print() + + # 6. K-ANONYMITY + print("6. K-ANONYMITY ANALYSIS") + print("-" * 20) + + # Check k-anonymity + test_data = pd.DataFrame( + { + "age_group": ["20-30", "20-30", "30-40", "30-40", "40-50"], + "city": ["Chicago", "Chicago", "NYC", "NYC", "LA"], + "occupation": ["Engineer", "Teacher", "Engineer", "Teacher", "Manager"], + "salary": [50000, 45000, 75000, 65000, 95000], + } + ) + + is_anonymous, violations = anonymizer.k_anonymity_check( + test_data, ["age_group", "city"], k=2 + ) + print(f"Original data satisfies 2-anonymity: {is_anonymous}") + if not is_anonymous: + print("Violations:") + print(violations) + + # Achieve k-anonymity + k_anonymous_data = anonymizer.achieve_k_anonymity( + test_data, ["age_group", "city"], k=2 + ) + is_now_anonymous, _ = anonymizer.k_anonymity_check( + k_anonymous_data, ["age_group", "city"], k=2 + ) + print(f"After applying k-anonymity: {is_now_anonymous}") + print(f"Rows removed: {len(test_data) - len(k_anonymous_data)}") + print() + + # 7. COMPREHENSIVE WORKFLOW + print("7. COMPREHENSIVE ANONYMIZATION WORKFLOW") + print("-" * 40) + + # Apply full workflow + final_data = sample_data.copy() + + # Step 1: Remove direct identifiers + final_data = anonymizer.remove_variables(final_data, ["name", "email", "phone"]) + + # Step 2: Pseudonymize IDs + final_data["participant_id"] = anonymizer.hash_pseudonymization( + final_data["participant_id"], prefix="SUBJ_" + ) + + # Step 3: Categorize continuous variables + final_data["age_group"] = anonymizer.age_categorization(final_data["age"]) + final_data["income_bracket"] = anonymizer.income_categorization( + final_data["income"] + ) + final_data = final_data.drop(["age", "income"], axis=1) + + # Step 4: Anonymize text + final_data["notes"] = final_data["notes"].apply(anonymizer.text_masking) + + # Step 5: Apply k-anonymity + final_data = anonymizer.achieve_k_anonymity( + final_data, ["age_group", "city", "occupation"], k=2 + ) + + print("Final anonymized dataset:") + print(final_data) + print() + + # 8. ANONYMIZATION REPORT + print("8. ANONYMIZATION REPORT") + print("-" * 20) + + report = anonymizer.anonymization_report(sample_data, final_data) + print(f"Original rows: {report['original_rows']}") + print(f"Anonymized rows: {report['anonymized_rows']}") + print( + f"Rows removed: {report['rows_removed']} ({report['removal_percentage']:.1f}%)" + ) + print("\nColumn transformations:") + for col, stats in report["columns_comparison"].items(): + if col in final_data.columns: + print( + f" {col}: {stats['original_unique_values']} → {stats['anonymized_unique_values']} unique values " + f"({stats['uniqueness_reduction']:.1f}% reduction)" + ) + + +if __name__ == "__main__": + main() diff --git a/examples/batch_processing_demo.py b/examples/batch_processing_demo.py new file mode 100644 index 0000000..ceaa691 --- /dev/null +++ b/examples/batch_processing_demo.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +"""Demonstration of efficient batch processing for PII detection and anonymization. + +This example shows how to use the new batch processing capabilities for handling +large datasets efficiently with Presidio integration. +""" + +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +# Add src to path for imports +sys.path.append(str(Path(__file__).parent.parent / "src")) + +from pii_detector.core.batch_processor import BatchPIIProcessor, process_dataset_batch +from pii_detector.core.presidio_engine import ( + presidio_analyze_dataframe_batch, + presidio_anonymize_dataframe_batch, +) + + +def create_sample_dataset(rows: int = 10000) -> pd.DataFrame: + """Create a synthetic dataset with various PII types for testing.""" + print(f"Creating synthetic dataset with {rows} rows...") + + np.random.seed(42) + + # Generate synthetic data + names = [ + "John Smith", + "Jane Doe", + "Michael Johnson", + "Sarah Wilson", + "David Brown", + "Lisa Davis", + "Robert Miller", + "Jennifer Garcia", + ] * (rows // 8 + 1) + + emails = [f"user{i}@example.com" for i in range(rows)] + phones = [ + f"555-{np.random.randint(100, 999):03d}-{np.random.randint(1000, 9999):04d}" + for _ in range(rows) + ] + + addresses = [ + f"{np.random.randint(100, 9999)} Main St, Springfield, IL", + f"{np.random.randint(100, 9999)} Oak Ave, Chicago, IL", + f"{np.random.randint(100, 9999)} First St, Peoria, IL", + ] * (rows // 3 + 1) + + ssns = [ + f"{np.random.randint(100, 999):03d}-{np.random.randint(10, 99):02d}-{np.random.randint(1000, 9999):04d}" + for _ in range(rows) + ] + + # Create DataFrame + data = { + "id": range(1, rows + 1), + "full_name": names[:rows], + "email_address": emails, + "phone_number": phones, + "home_address": addresses[:rows], + "ssn": ssns, + "age": np.random.randint(18, 80, rows), + "salary": np.random.randint(30000, 150000, rows), + "comments": [ + f"This is a comment from {names[i % len(names)]} with email {emails[i]}" + for i in range(rows) + ], + "survey_response": [ + f"I live at {addresses[i % len(addresses)]} and can be reached at {phones[i]}" + for i in range(rows) + ], + } + + return pd.DataFrame(data) + + +def demonstrate_batch_detection(): + """Demonstrate batch PII detection capabilities.""" + print("\n" + "=" * 60) + print("BATCH PII DETECTION DEMONSTRATION") + print("=" * 60) + + # Create test datasets of different sizes + datasets = { + "Small (1K rows)": create_sample_dataset(1000), + "Medium (5K rows)": create_sample_dataset(5000), + "Large (10K rows)": create_sample_dataset(10000), + } + + for name, dataset in datasets.items(): + print(f"\n--- Processing {name} ---") + print(f"Dataset shape: {dataset.shape}") + + # Initialize batch processor + processor = BatchPIIProcessor( + language="en", chunk_size=1000, max_workers=4, use_structured_engine=True + ) + + # Show processing strategy + strategy = processor.get_processing_strategy(dataset) + print(f"Processing strategy: {strategy}") + + # Estimate processing time + estimates = processor.estimate_processing_time(dataset) + print( + f"Time estimate: {estimates.get(strategy, {}).get('time_seconds', 0):.2f} seconds" + ) + + # Track progress + def progress_callback(percent, message): + print(f" Progress: {percent:.1f}% - {message}") + + # Perform detection + start_time = time.time() + results = processor.detect_pii_batch( + dataset, progress_callback=progress_callback + ) + detection_time = time.time() - start_time + + # Show results + print(f"Detection completed in {detection_time:.2f} seconds") + print(f"Found PII in {len(results)} columns:") + + for col, result in results.items(): + print( + f" - {col}: {result.detection_method} (confidence: {result.confidence:.2f})" + ) + if result.entity_types: + print(f" Entity types: {', '.join(result.entity_types)}") + + +def demonstrate_batch_anonymization(): + """Demonstrate batch anonymization capabilities.""" + print("\n" + "=" * 60) + print("BATCH ANONYMIZATION DEMONSTRATION") + print("=" * 60) + + # Create a medium-sized dataset + dataset = create_sample_dataset(5000) + print(f"Original dataset shape: {dataset.shape}") + + # Use the complete batch processing workflow + def progress_callback(percent, message): + print(f"Progress: {percent:.1f}% - {message}") + + print("\nRunning complete batch processing workflow...") + start_time = time.time() + + detection_results, anonymized_dataset, report = process_dataset_batch( + dataset, + language="en", + chunk_size=1000, + max_workers=4, + progress_callback=progress_callback, + ) + + total_time = time.time() - start_time + + print(f"\nBatch processing completed in {total_time:.2f} seconds") + print(f"Processed {len(detection_results)} PII columns") + print(f"Anonymized dataset shape: {anonymized_dataset.shape}") + + # Show before/after comparison for a few columns + print("\n--- Before/After Comparison ---") + pii_columns = list(detection_results.keys())[:3] # Show first 3 PII columns + + for col in pii_columns: + if col in dataset.columns: + print(f"\nColumn: {col}") + print("Original values (first 3):") + for val in dataset[col].head(3): + print(f" {val}") + print("Anonymized values (first 3):") + for val in anonymized_dataset[col].head(3): + print(f" {val}") + + +def demonstrate_presidio_dataframe_functions(): + """Demonstrate new DataFrame-level Presidio functions.""" + print("\n" + "=" * 60) + print("PRESIDIO DATAFRAME FUNCTIONS DEMONSTRATION") + print("=" * 60) + + # Create a smaller dataset for detailed analysis + dataset = create_sample_dataset(1000) + text_columns = ["full_name", "email_address", "comments", "survey_response"] + + print(f"Analyzing text columns: {text_columns}") + + # Batch analysis + print("\n--- Batch Analysis ---") + start_time = time.time() + analysis_results = presidio_analyze_dataframe_batch( + dataset, + text_columns=text_columns, + confidence_threshold=0.6, + sample_size=50, + batch_size=10, + ) + analysis_time = time.time() - start_time + + print(f"Analysis completed in {analysis_time:.2f} seconds") + print(f"Found PII in {len(analysis_results)} columns:") + + for col, result in analysis_results.items(): + entities = result.get("entities_found", {}) + total_detections = result.get("total_detections", 0) + avg_confidence = result.get("average_confidence", 0) + + print(f"\n {col}:") + print(f" Total detections: {total_detections}") + print(f" Average confidence: {avg_confidence:.2f}") + print(f" Entity types found: {list(entities.keys())}") + + # Batch anonymization + print("\n--- Batch Anonymization ---") + columns_to_anonymize = list(analysis_results.keys()) + + start_time = time.time() + anonymized_df = presidio_anonymize_dataframe_batch( + dataset, columns_to_anonymize=columns_to_anonymize + ) + anonymization_time = time.time() - start_time + + print(f"Anonymization completed in {anonymization_time:.2f} seconds") + + # Show examples + print("\n--- Anonymization Examples ---") + for col in columns_to_anonymize[:2]: # Show first 2 columns + print(f"\nColumn: {col}") + print("Original → Anonymized") + for orig, anon in zip(dataset[col].head(3), anonymized_df[col].head(3)): + print(f" {orig}") + print(f" → {anon}") + print() + + +def demonstrate_performance_comparison(): + """Compare performance between different processing approaches.""" + print("\n" + "=" * 60) + print("PERFORMANCE COMPARISON") + print("=" * 60) + + dataset = create_sample_dataset(2000) + + # Standard processing + print("\n--- Standard Processing ---") + start_time = time.time() + processor_standard = BatchPIIProcessor( + chunk_size=10000 + ) # Large chunk = no chunking + results_standard = processor_standard.detect_pii_batch(dataset) + time_standard = time.time() - start_time + + print(f"Standard processing: {time_standard:.2f} seconds") + print(f"Columns detected: {len(results_standard)}") + + # Chunked processing + print("\n--- Chunked Processing ---") + start_time = time.time() + processor_chunked = BatchPIIProcessor(chunk_size=500, max_workers=4) + results_chunked = processor_chunked.detect_pii_batch(dataset) + time_chunked = time.time() - start_time + + print(f"Chunked processing: {time_chunked:.2f} seconds") + print(f"Columns detected: {len(results_chunked)}") + + # Show efficiency + if time_standard > 0: + efficiency = ((time_standard - time_chunked) / time_standard) * 100 + print(f"\nEfficiency improvement: {efficiency:.1f}%") + + +def main(): + """Run main demonstration for PII detection batch processing.""" + print("Batch Processing Demo for PII Detection") + print("This demo showcases efficient processing of large datasets") + + try: + demonstrate_batch_detection() + demonstrate_batch_anonymization() + demonstrate_presidio_dataframe_functions() + demonstrate_performance_comparison() + + print("\n" + "=" * 60) + print("DEMO COMPLETED SUCCESSFULLY!") + print("=" * 60) + print("\nKey features demonstrated:") + print("✓ Batch PII detection with multiple strategies") + print("✓ Efficient chunked processing for large datasets") + print("✓ Parallel processing with configurable workers") + print("✓ Integrated Presidio text analysis") + print("✓ Complete anonymization workflow") + print("✓ Performance optimization techniques") + + except Exception as e: + print(f"\nError during demonstration: {e}") + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/examples/presidio_demo.py b/examples/presidio_demo.py new file mode 100644 index 0000000..4eebf5e --- /dev/null +++ b/examples/presidio_demo.py @@ -0,0 +1,291 @@ +"""Demonstration of Presidio integration with PII Detector. + +This script shows how to use the new Presidio-enhanced PII detection and anonymization +capabilities. It includes examples of both the basic functionality and the hybrid approach. +""" + +import logging + +import pandas as pd + +# Configure logging to see what's happening +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + +def create_sample_dataset(): + """Create a sample dataset with various types of PII for demonstration.""" + data = { + # Column name-based detection + "participant_name": [ + "John Smith", + "Maria Garcia", + "David Johnson", + "Sarah Williams", + "Michael Brown", + ], + # Format pattern detection (email) + "contact_email": [ + "john.smith@gmail.com", + "maria.garcia@yahoo.com", + "david.j@company.com", + "sarah.w@university.edu", + "m.brown@nonprofit.org", + ], + # Format pattern detection (phone) + "phone_number": [ + "555-123-4567", + "555-987-6543", + "555-456-7890", + "555-234-5678", + "555-876-5432", + ], + # Text content with embedded PII (Presidio's strength) + "survey_comments": [ + "Please contact me at john.smith@gmail.com if you need more info", + "My Social Security number is 123-45-6789 for verification", + "Call me at 555-123-4567 or email maria.garcia@yahoo.com", + "I live at 123 Main Street, Springfield, IL 62701", + "You can reach Michael Brown at his office phone 555-876-5432", + ], + # Sparsity detection (open-ended responses) + "detailed_feedback": [ + "The program helped me understand financial planning better", + "I learned about budgeting and saving through this initiative", + "This course on entrepreneurship opened new opportunities", + "The health education sessions were very informative", + "Training on digital literacy was exactly what I needed", + ], + # Non-PII columns + "age_category": ["25-34", "35-44", "25-34", "45-54", "35-44"], + "program_rating": [4, 5, 3, 4, 5], + "completion_status": [ + "completed", + "completed", + "partial", + "completed", + "completed", + ], + } + + return pd.DataFrame(data) + + +def demo_basic_presidio(): + """Demonstrate basic Presidio functionality.""" + print("\n" + "=" * 60) + print("BASIC PRESIDIO DEMONSTRATION") + print("=" * 60) + + from pii_detector.core.presidio_engine import get_presidio_analyzer + + analyzer = get_presidio_analyzer() + + print(f"Presidio Available: {analyzer.is_available()}") + + if analyzer.is_available(): + print( + f"Supported Entities: {analyzer.get_supported_entities()[:10]}..." + ) # Show first 10 + + # Test text analysis + test_text = "Contact John Smith at john.smith@email.com or call 555-123-4567. His SSN is 123-45-6789." + + print(f"\nAnalyzing text: '{test_text}'") + + if analyzer.is_available(): + entities = analyzer.analyze_text(test_text, confidence_threshold=0.7) + print("Detected entities:") + for entity in entities: + print( + f" - {entity['entity_type']}: '{entity['text']}' (confidence: {entity['score']:.2f})" + ) + + # Anonymize the text + anonymized = analyzer.anonymize_text(test_text) + print(f"\nAnonymized text: '{anonymized}'") + else: + print("Presidio not available - would use fallback methods") + + +def demo_unified_detection(): + """Demonstrate unified PII detection combining structural and text analysis.""" + print("\n" + "=" * 60) + print("UNIFIED DETECTION DEMONSTRATION") + print("=" * 60) + + from pii_detector.core.unified_processor import UnifiedPIIProcessor + + df = create_sample_dataset() + print(f"Sample dataset shape: {df.shape}") + print(f"Columns: {list(df.columns)}") + + # Initialize processor + processor = UnifiedPIIProcessor() + + # Custom configuration + config = { + "use_presidio_detection": True, + "presidio_confidence_threshold": 0.7, + "use_column_name_detection": True, + "use_format_detection": True, + "use_sparsity_detection": True, + } + + # Run detection + print("\nRunning comprehensive PII detection...") + detection_results = processor.detect_pii_comprehensive(df, detection_config=config) + + # Display results + print(f"\nDetected PII in {len(detection_results)} columns:") + for column, result in detection_results.items(): + print(f"\n Column: {column}") + print(f" Method: {result.detection_method}") + print(f" Confidence: {result.confidence:.2f}") + if result.entity_types: + print(f" Entity Types: {result.entity_types}") + + # Show some details if available + if "entities_found" in result.details: + entities_count = sum( + len(entities) for entities in result.details["entities_found"].values() + ) + if entities_count > 0: + print(f" Total Detections: {entities_count}") + + # Generate summary + summary = processor.get_detection_summary(detection_results) + print("\nDetection Summary:") + print(f" Total detections: {summary['total_detections']}") + print(f" Average confidence: {summary['average_confidence']:.2f}") + print(f" Methods used: {summary['methods_used']}") + if summary["entity_types_found"]: + print(f" Entity types found: {summary['entity_types_found']}") + + return detection_results + + +def demo_hybrid_anonymization(detection_results): + """Demonstrate hybrid anonymization using the detection results.""" + print("\n" + "=" * 60) + print("HYBRID ANONYMIZATION DEMONSTRATION") + print("=" * 60) + + from pii_detector.core.hybrid_anonymizer import HybridAnonymizer + + df = create_sample_dataset() + + # Initialize anonymizer + anonymizer = HybridAnonymizer() + + print("Available anonymization methods:") + methods = anonymizer.get_available_methods() + for method, info in methods.items(): + print(f" - {method}: {info['description']}") + + # Custom configuration for specific columns + anonymization_config = { + "participant_name": { + "method": "hash_pseudonymization", + "prefix": "PARTICIPANT_", + }, + "contact_email": {"method": "presidio_replace"}, + "survey_comments": {"method": "presidio_replace"}, + "phone_number": {"method": "text_masking"}, + } + + print(f"\nAnonymizing {len(detection_results)} PII columns...") + + # Run anonymization + anonymized_df, report = anonymizer.anonymize_dataset( + df, detection_results, anonymization_config + ) + + # Show before/after comparison + print("\nBEFORE vs AFTER comparison:") + for column in detection_results: + if column in df.columns: + print(f"\n {column}:") + print(f" Original sample: '{df[column].iloc[0]}'") + print(f" Anonymized: '{anonymized_df[column].iloc[0]}'") + + # Show anonymization report + print("\nAnonymization Report:") + print(f" Rows processed: {report['original_shape'][0]}") + print(f" Columns processed: {len(report['columns_processed'])}") + print(f" Methods applied: {report['methods_applied']}") + + if report.get("text_anonymization"): + print( + f" Text anonymization applied to: {list(report['text_anonymization'].keys())}" + ) + + return anonymized_df + + +def demo_end_to_end(): + """Demonstrate complete end-to-end workflow.""" + print("\n" + "=" * 60) + print("END-TO-END WORKFLOW DEMONSTRATION") + print("=" * 60) + + from pii_detector.core.hybrid_anonymizer import anonymize_dataset_hybrid + from pii_detector.core.unified_processor import detect_pii_unified + + # Create sample dataset + df = create_sample_dataset() + print(f"Starting with dataset: {df.shape}") + + # Step 1: Detect PII + print("\nStep 1: Detecting PII...") + pii_results = detect_pii_unified(df, language="en") + print(f"Found PII in {len(pii_results)} columns") + + # Step 2: Anonymize + print("\nStep 2: Anonymizing detected PII...") + anonymized_df, report = anonymize_dataset_hybrid(df, pii_results) + + # Step 3: Verify results + print("\nStep 3: Verification:") + print(f" Original dataset: {df.shape}") + print(f" Anonymized dataset: {anonymized_df.shape}") + print(f" Data integrity preserved: {df.shape == anonymized_df.shape}") + + # Show data utility metrics + if "uniqueness_reduction" in report: + print(f" Average uniqueness reduction: {report['uniqueness_reduction']:.1f}%") + + print("\nWorkflow completed successfully!") + return anonymized_df + + +def main(): + """Run all demonstrations.""" + print("PII Detector with Presidio Integration - Demonstration") + print("=" * 60) + + try: + # Demo 1: Basic Presidio functionality + demo_basic_presidio() + + # Demo 2: Unified detection + detection_results = demo_unified_detection() + + # Demo 3: Hybrid anonymization + if detection_results: + demo_hybrid_anonymization(detection_results) + + # Demo 4: End-to-end workflow + demo_end_to_end() + + print("\n" + "=" * 60) + print("ALL DEMONSTRATIONS COMPLETED SUCCESSFULLY!") + print("=" * 60) + + except Exception as e: + print(f"\nError during demonstration: {e}") + print("This might be due to Presidio dependencies not being installed.") + print("Try running: just install-presidio") + + +if __name__ == "__main__": + main() diff --git a/examples/run_batch_examples.py b/examples/run_batch_examples.py new file mode 100644 index 0000000..40b080f --- /dev/null +++ b/examples/run_batch_examples.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +"""Practical examples of batch processing with test data files. + +This script demonstrates how to use the batch processing functionality +with the included test data files in tests/data/. + +Run this script to see batch processing in action: + uv run python examples/run_batch_examples.py +""" + +import sys +import time +from pathlib import Path + +# Add src to path for imports +sys.path.append(str(Path(__file__).parent.parent / "src")) + +import pandas as pd + +from pii_detector.core.batch_processor import BatchPIIProcessor, process_dataset_batch +from pii_detector.core.presidio_engine import ( + presidio_analyze_dataframe_batch, + presidio_anonymize_dataframe_batch, +) + + +def example_1_basic_batch_processing(): + """Run example 1: Basic batch processing with comprehensive test data.""" + print("=" * 60) + print("EXAMPLE 1: Basic Batch Processing") + print("=" * 60) + + # Load comprehensive test data + data_file = Path(__file__).parent.parent / "tests/data/comprehensive_pii_data.csv" + + if not data_file.exists(): + print(f"Test data file not found: {data_file}") + print("Please ensure you're running from the project root directory.") + return + + print(f"Loading dataset: {data_file.name}") + dataset = pd.read_csv(data_file) + print(f"Dataset shape: {dataset.shape}") + print(f"Columns: {list(dataset.columns)}") + + # Initialize batch processor + processor = BatchPIIProcessor( + chunk_size=10, # Small chunks for demo + max_workers=2, # Limit workers for demo + ) + + print(f"\nProcessing strategy: {processor.get_processing_strategy(dataset)}") + + # Run batch detection + print("\nRunning batch PII detection...") + start_time = time.time() + + results = processor.detect_pii_batch(dataset) + + detection_time = time.time() - start_time + print(f"Detection completed in {detection_time:.2f} seconds") + + print(f"\nFound PII in {len(results)} columns:") + print("-" * 50) + for column, result in results.items(): + entity_info = ( + f" ({', '.join(result.entity_types)})" if result.entity_types else "" + ) + print( + f"{column:<20} | {result.detection_method:<25} | {result.confidence:.2f}{entity_info}" + ) + + print("\nHigh confidence detections (>0.8):") + high_conf = {col: res for col, res in results.items() if res.confidence > 0.8} + for col in high_conf: + print(f" - {col}") + + +def example_2_complete_workflow(): + """Run example 2: Complete detection and anonymization workflow.""" + print("\n" + "=" * 60) + print("EXAMPLE 2: Complete Batch Workflow") + print("=" * 60) + + data_file = Path(__file__).parent.parent / "tests/data/sample_pii_data.csv" + + if not data_file.exists(): + print(f"Test data file not found: {data_file}") + return + + print(f"Loading dataset: {data_file.name}") + dataset = pd.read_csv(data_file) + + # Progress tracking function + def show_progress(percent, message): + print(f" Progress: {percent:5.1f}% - {message}") + + print("\nRunning complete batch processing workflow...") + print("This includes both detection and anonymization phases:") + + start_time = time.time() + + # Run complete workflow + detection_results, anonymized_dataset, report = process_dataset_batch( + dataset, + language="en", + chunk_size=5, # Small chunks for demo + max_workers=2, + progress_callback=show_progress, + ) + + total_time = time.time() - start_time + + print(f"\nWorkflow completed in {total_time:.2f} seconds") + print(f"\nDetected PII in {len(detection_results)} columns:") + for col, result in detection_results.items(): + print( + f" - {col}: {result.detection_method} (confidence: {result.confidence:.2f})" + ) + + print("\nAnonymization report:") + print(f" - Original shape: {report.get('original_shape', 'N/A')}") + print(f" - Final shape: {report.get('final_shape', 'N/A')}") + print(f" - Columns processed: {len(report.get('columns_processed', []))}") + + # Show sample of anonymized data + print("\nSample of anonymized data (first 3 rows):") + print(anonymized_dataset.head(3).to_string(index=False)) + + +def example_3_presidio_dataframe_functions(): + """Run example 3: DataFrame-level Presidio functions.""" + print("\n" + "=" * 60) + print("EXAMPLE 3: Presidio DataFrame Functions") + print("=" * 60) + + data_file = Path(__file__).parent.parent / "tests/data/comprehensive_pii_data.csv" + + if not data_file.exists(): + print(f"Test data file not found: {data_file}") + return + + dataset = pd.read_csv(data_file) + print(f"Loaded dataset with {len(dataset)} rows") + + # Focus on text-rich columns + text_columns = ["full_name", "notes", "address"] + print(f"\nAnalyzing text columns: {text_columns}") + + # Batch Presidio analysis + print("\nRunning Presidio text analysis...") + try: + analysis_results = presidio_analyze_dataframe_batch( + dataset, + text_columns=text_columns, + confidence_threshold=0.6, + sample_size=len(dataset), # Analyze all rows + ) + + if analysis_results: + print("\nPresidio text analysis results:") + for col, result in analysis_results.items(): + entities = result.get("entities_found", {}) + detections = result.get("total_detections", 0) + confidence = result.get("average_confidence", 0) + print(f" {col}:") + print(f" Entities found: {list(entities.keys())}") + print(f" Total detections: {detections}") + print(f" Average confidence: {confidence:.2f}") + + # Batch anonymization + print(f"\nAnonymizing {len(analysis_results)} text columns...") + anonymized_df = presidio_anonymize_dataframe_batch( + dataset, columns_to_anonymize=list(analysis_results.keys()) + ) + + print("\nText anonymization examples:") + for col in list(analysis_results.keys())[:2]: # Show first 2 columns + print(f"\n{col}:") + print(" Original → Anonymized") + for i in range(min(3, len(dataset))): # Show first 3 rows + orig = str(dataset[col].iloc[i]) + anon = str(anonymized_df[col].iloc[i]) + if orig != anon: # Only show changed values + print(f" {orig}") + print(f" → {anon}") + break + else: + print("No PII detected by Presidio in text columns") + + except Exception as e: + print("Note: Presidio functionality requires 'just install-presidio' first") + print(f"Error: {e}") + + +def example_4_multiple_files(): + """Run example 4: Process multiple test files.""" + print("\n" + "=" * 60) + print("EXAMPLE 4: Batch Processing Multiple Files") + print("=" * 60) + + test_data_dir = Path(__file__).parent.parent / "tests/data" + csv_files = list(test_data_dir.glob("*.csv")) + + print(f"Found {len(csv_files)} CSV files in {test_data_dir}") + + processor = BatchPIIProcessor(chunk_size=100) + + for file_path in csv_files: + print(f"\nProcessing: {file_path.name}") + + try: + dataset = pd.read_csv(file_path) + results = processor.detect_pii_batch(dataset) + + print(f" Dataset shape: {dataset.shape}") + print(f" PII columns found: {len(results)}") + + if results: + pii_columns = list(results.keys()) + if len(pii_columns) <= 5: + print(f" PII columns: {pii_columns}") + else: + print( + f" PII columns: {pii_columns[:5]}... (and {len(pii_columns) - 5} more)" + ) + + # Show highest confidence detection + max_conf_col = max(results.items(), key=lambda x: x[1].confidence) + print( + f" Highest confidence: {max_conf_col[0]} ({max_conf_col[1].confidence:.2f})" + ) + else: + print(" No PII detected (clean dataset)") + + except Exception as e: + print(f" Error: {e}") + + +def example_5_performance_comparison(): + """Run example 5: Performance comparison between strategies.""" + print("\n" + "=" * 60) + print("EXAMPLE 5: Performance Comparison") + print("=" * 60) + + data_file = Path(__file__).parent.parent / "tests/data/comprehensive_pii_data.csv" + + if not data_file.exists(): + print(f"Test data file not found: {data_file}") + return + + dataset = pd.read_csv(data_file) + + # Create larger dataset by duplicating rows + print("Creating larger dataset for performance testing...") + large_dataset = pd.concat([dataset] * 50, ignore_index=True) # 50x larger + print(f"Large dataset shape: {large_dataset.shape}") + + processor = BatchPIIProcessor() + + # Get processing strategy + strategy = processor.get_processing_strategy(large_dataset) + print(f"\nRecommended processing strategy: {strategy}") + + # Get time estimates + estimates = processor.estimate_processing_time(large_dataset) + print("\nProcessing time estimates:") + print("-" * 40) + for strategy_name, estimate in estimates.items(): + recommended = "⭐ RECOMMENDED" if estimate["recommended"] else "" + print(f"{strategy_name}:") + print(f" Time: {estimate['time_seconds']:6.2f} seconds") + print(f" Memory: {estimate['memory_mb']:8.1f} MB") + print(f" {recommended}") + + # Actually test performance (smaller dataset for demo) + test_dataset = pd.concat([dataset] * 5, ignore_index=True) # 5x for actual test + print(f"\nActual performance test with {test_dataset.shape[0]} rows:") + + # Standard processing + start_time = time.time() + processor_standard = BatchPIIProcessor( + chunk_size=10000 + ) # Large chunk = no chunking + results_standard = processor_standard.detect_pii_batch(test_dataset) + time_standard = time.time() - start_time + + # Chunked processing + start_time = time.time() + processor_chunked = BatchPIIProcessor(chunk_size=50, max_workers=2) + results_chunked = processor_chunked.detect_pii_batch(test_dataset) + time_chunked = time.time() - start_time + + print( + f" Standard processing: {time_standard:.2f}s ({len(results_standard)} columns)" + ) + print( + f" Chunked processing: {time_chunked:.2f}s ({len(results_chunked)} columns)" + ) + + if time_standard > 0: + efficiency = ((time_standard - time_chunked) / time_standard) * 100 + print(f" Efficiency change: {efficiency:+.1f}%") + + +def main(): + """Run all batch processing examples.""" + print("Batch Processing Examples with Test Data") + print("This script demonstrates the batch processing capabilities") + print("using the test data files in tests/data/") + + try: + example_1_basic_batch_processing() + example_2_complete_workflow() + example_3_presidio_dataframe_functions() + example_4_multiple_files() + example_5_performance_comparison() + + print("\n" + "=" * 60) + print("ALL EXAMPLES COMPLETED SUCCESSFULLY!") + print("=" * 60) + print("\nNext steps:") + print("• Try modifying the examples with your own data") + print("• Experiment with different chunk sizes and worker counts") + print("• Install Presidio for enhanced text analysis: just install-presidio") + print("• Run the full batch demo: just run-batch-demo") + + except KeyboardInterrupt: + print("\n\nExample execution interrupted by user.") + + except Exception as e: + print(f"\nError during example execution: {e}") + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/find_piis_in_unstructured_text.py b/find_piis_in_unstructured_text.py deleted file mode 100644 index db7d6f1..0000000 --- a/find_piis_in_unstructured_text.py +++ /dev/null @@ -1,198 +0,0 @@ -from constant_strings import * -import restricted_words as restricted_words_list -import api_queries -import requests - -import json -from datetime import datetime -import spacy - -def get_stopwords(languages=None): - - from os import listdir - from os.path import isfile, join - - stopwords_path = './stopwords/' - - #If no language selected, get all stopwords - if(languages == None): - stopwords_files = [join(stopwords_path, f) for f in listdir(stopwords_path) if isfile(join(stopwords_path, f))] - else: #Select only stopwords files for given languages - stopwords_files = [join(stopwords_path, language) for language in languages if isfile(join(stopwords_path, language))] - - stopwords_list = [] - for file_path in stopwords_files: - with open(file_path, 'r', encoding="utf-8") as reader: - stopwords = reader.read().split('\n') - stopwords_list.extend(stopwords) - - return list(set(stopwords_list)) - -def remove_stopwords(strings_list, languages=['english','spanish']): - import stopwords - stop_words = get_stopwords(languages) - strings_list = [s for s in list(strings_list) if not s in stop_words] - return strings_list - -def find_phone_numbers_in_list_strings(list_strings): - - phone_n_regex_str = "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})" - import re - phone_n_regex = re.compile(phone_n_regex_str) - phone_numbers_found = list(filter(phone_n_regex.match, list_strings)) - - return phone_numbers_found - - - - -def filter_based_type_of_word(list_strings, language): - - # CHECK .ENT_TYPE_ - # if (token.ent_type_ == 'PERSON') - # print(token+" is a name") - - if language == SPANISH: - nlp = spacy.load("es_core_news_sm") - - else: - nlp = spacy.load("en_core_web_sm") - - #Accepted types of words - #Reference https://spacy.io/api/annotation#pos-tagging - accepted_types = ['PROPN', 'X','PER','LOC','ORG','MISC',''] - - filtered_list = [] - import datetime - - filtered_list = [] - doc = nlp(" ".join(list_strings)) - # print("b") - for token in doc: - if token.pos_ in accepted_types: - filtered_list.append(token.text) - - filtered_list = list(set(filtered_list)) - - return filtered_list - - - - -#REPEATED FUNCTION FROM PII_DATA_PROCESSOR -def remove_other_refuse_and_dont_know(column): - - filtered_column = column.loc[(column != '777') & (column != '888') & (column != '999') & (column != '-888')] - - return filtered_column - -#REPEATED FUNCTION FROM PII_DATA_PROCESSOR -def clean_column(column): - #Drop NaNs - column_filtered = column.dropna() - - #Remove empty entries - column_filtered = column_filtered[column_filtered!=''] - - #Remove other, refuses and dont knows - column_filtered = remove_other_refuse_and_dont_know(column_filtered) - - return column_filtered - -def get_list_unique_strings_in_dataset(dataset, columns_to_check): - #To make the list, we will go over all columns that have sparse strings - set_string_in_dataset = set() - - #For every column in the dataset - for column_name in columns_to_check: - - #Clean column - column = clean_column(dataset[column_name]) - - for row in column: - #If row contains more than one word, add each word - if (' ' in row): - #For every word in the row - for word in row.split(" "): - #Add word to strings to check - set_string_in_dataset.add(word) - #If row does not contain spaces, add whole row (its only one string) - else: - set_string_in_dataset.add(row) - - return list(set_string_in_dataset) - -def find_piis(dataset, label_dict, columns_to_check, language, country): - - print("columns_to_check") - print(columns_to_check) - - #Do not check surveyCTO columns - #columns_to_check = [column for column in dataset.columns if column not in restricted_words_list.get_surveycto_restricted_vars()] - - #First we will make a list of all strings that need to be checked - print("->Getting list of unique strings in dataset...") - strings_to_check = get_list_unique_strings_in_dataset(dataset, columns_to_check) - - #Remove string with less than 3 chars - piis should be longer than that - print("->Removing strings with less than 3 characters") - strings_to_check = [s for s in strings_to_check if len(s)>2] - - #Find all telephone numbers - print("-->Finding phone numbers") - phone_numbers_found = find_phone_numbers_in_list_strings(strings_to_check) - print(f'Found {len(phone_numbers_found)} phone numbers in open ended questions') - if len(phone_numbers_found)>0: - print(phone_numbers_found) - - #Update strings_to_check - strings_to_check = [s for s in strings_to_check if s not in phone_numbers_found] - - #Clean list of words, now that we have already found numbers - print("Length of list "+str(len(strings_to_check))) - print("->Removing stopwords") - strings_to_check = remove_stopwords(strings_to_check) - print("->Filtering based on word type") - strings_to_check = filter_based_type_of_word(strings_to_check, language) - print("Length of list "+str(len(strings_to_check))) - - #Find all names - print("->Finding names") - names_found = api_queries.find_names_in_list_string(strings_to_check) - print(f'Found {len(names_found)} names in open ended questions') - if len(names_found)>0: - print(names_found) - - - #Update strings_to_check - strings_to_check = [s for s in strings_to_check if s not in names_found] - - #Find all locations with pop less than 20,000 - print("-->Finding locations with low population") - locations_with_low_population_found = api_queries.get_locations_with_low_population(strings_to_check, country) - print(f'Found {len(locations_with_low_population_found)} locations with low populations') - if len(locations_with_low_population_found)>0: - print(locations_with_low_population_found) - - return list(set(phone_numbers_found + names_found + locations_with_low_population_found)) - -if __name__ == "__main__": - - # dataset_path = 'X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw\RECOVR_MEX_r1_Raw.dta' - - # reading_status, reading_content = import_file(dataset_path) - - # if(reading_status is False): - # print("Problem importing file") - - # dataset = reading_content[DATASET] - # label_dict = reading_content[LABEL_DICT] - - # columns_to_check = [c for c in dataset.columns if c not in restricted_words_list.get_surveycto_restricted_vars()] - - # find_piis(dataset, label_dict, columns_to_check) - - print(find_names_in_list_string(['Felipe','nombrequenoexiste', 'George', 'Felipe', 'Enriqueta', 'dededede'])) - - - diff --git a/hash_generator.py b/hash_generator.py deleted file mode 100644 index 85a1706..0000000 --- a/hash_generator.py +++ /dev/null @@ -1,22 +0,0 @@ -import hashlib -import hmac -import hmac_secret_key - -def sha1(message): - return hashlib.sha1(bytes(message, encoding='utf-8')).hexdigest() - -def hmac_sha1(secret_key, message): - - h = hmac.new(bytes(secret_key, encoding='utf-8'), msg=bytes(message, encoding='utf-8'), digestmod=hashlib.sha1) - return h.hexdigest() - -if __name__ == '__main__': - print(sha1(message="The Ore-Ida brand is a syllabic abbreviation of Oregon and Idaho")) - - - example = {} - for name in ['felipe', 'michael', 'lindsey']: - # example[name] = hmac_sha1(secret_key = 'a', message = name) - - secret_key = hmac_secret_key.get_secret_key() - example[name] = hmac_sha1(secret_key = secret_key, message = name) diff --git a/ipa_logo.jpg b/ipa_logo.jpg deleted file mode 100644 index 37163f8..0000000 Binary files a/ipa_logo.jpg and /dev/null differ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7e807e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,160 @@ +[build-system] +requires = ["uv_build>=0.8.22,<0.9.0"] +build-backend = "uv_build" + +[project] +name = "pii-detector" +version = "1.0.0" +description = "A tool to identify and handle personally identifiable information (PII) in datasets" +readme = "README.md" +requires-python = ">=3.9" +license = { text = "MIT" } +authors = [ + { name = "IPA Global Research and Data Science", email = "researchsupport@poverty-action.org" }, +] +keywords = ["pii", "data-privacy", "anonymization", "data-processing", "ipa"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: End Users/Desktop", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: Microsoft :: Windows", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Security", +] +dependencies = [ + "pandas>=2.0.0", + "requests>=2.25.0", + "selenium>=4.0.0", + "Pillow>=8.0.0", + "numpy>=1.20.0", + "openpyxl>=3.0.0", # For Excel file support + "flet[all]>=0.23.0", # For modern GUI + # Presidio dependencies (optional - graceful degradation if not available) + "presidio-analyzer>=2.2.0; extra == 'presidio'", + "presidio-anonymizer>=2.2.0; extra == 'presidio'", + "spacy>=3.4.0; extra == 'presidio'", + "en-core-web-sm", + "es-core-news-sm", +] + +[project.optional-dependencies] +presidio = [ + "presidio-analyzer>=2.2.0", + "presidio-anonymizer>=2.2.0", + "spacy>=3.4.0", + # Note: spaCy models are installed separately via spacy.cli.download or model_manager.py +] +presidio-structured = [ + "presidio-structured>=0.0.6", + "presidio-analyzer>=2.2.0", + "presidio-anonymizer>=2.2.0", + "spacy>=3.4.0", +] +batch = [ + "presidio-analyzer>=2.2.0", + "presidio-anonymizer>=2.2.0", + "presidio-structured>=0.0.6", + "spacy>=3.4.0", +] + +[dependency-groups] +dev = [ + "codespell>=2.4.1", + "pre-commit>=4.2.0", + "ruff>=0.7.4", + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "pyinstaller>=5.0.0", + "jupyterlab>=4.4.7", +] + +[project.urls] +Homepage = "https://github.com/PovertyAction/PII_detection" +Repository = "https://github.com/PovertyAction/PII_detection" +Issues = "https://github.com/PovertyAction/PII_detection/issues" +"Bug Reports" = "https://github.com/PovertyAction/PII_detection/issues" + +[project.scripts] +pii-detector = "pii_detector.gui.frontend:main" + +[project.gui-scripts] +pii-detector-gui = "pii_detector.gui.frontend:main" + +[tool.ruff] +line-length = 88 +fix = true +target-version = "py312" +src = ["src", "tests"] + +[tool.ruff.lint] +# docs: https://docs.astral.sh/ruff/rules/ +select = [ + "F", # Pyflakes + "E", # pycodestyle errors + "W", # pycodestyle warnings + "I", # isort + "D", # flake8-docstrings + "UP", # pyupgrade + "SIM", # flake8-simplify +] + +ignore = [ + # do not enable if formatting + # docs: https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules + "W191", # tab indentation + "E111", # indentation + "E114", # indentation + "E117", # over indented + "D206", # indent with spaces + "D300", # triple single quotes + "E501", # line length regulated by formatter + "D105", # missing docstring in magic method + "D100", # missing docstring in public module + "D104", # missing docstring in public package + "SIM110", # Use all instead of `for` loop + "TRY003", # Avoid specifying long messages outside the exception class + "D205", # 1 blank line required between summary line and description + "D203", + "D213", +] + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 88 + +[tool.ruff.lint.per-file-ignores] +"tests/*" = [ + "D", + "S101", + "PLR2004", +] # Allow missing docstrings and assert statements in tests + +[tool.codespell] +builtin = "clear,rare,informal,usage,code,names" +ignore-words-list = "pii,piis,thead,som,selv,alle,ned,vor,mange,thi,allo,contro,vill,nam,fo,direccion,informacion,mata,als,deine,deines,ist,oder,sie,unser,unter,ba,meu,te,que,sur,toi,bu,siz,doen,ons,wil,ro,sizin,teh,kake,vas,rade,od,sme,mis,mot,vart,datas,noen,noe,somme,vai,eles,meus,couldn,wasn,lama,maka,makin,meni" +skip = "src/pii_detector/data/stopwords/,*.ipynb,.github/workflows/ci.yml,tests/data/clean_data.csv" + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py", "*_test.py"] +python_functions = ["test_*"] +addopts = [ + "--cov=pii_detector", + "--cov-report=term-missing", + "--cov-report=html", + "--strict-markers", +] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "integration: marks tests as integration tests", +] + +[tool.uv.sources] +en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" } +es-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl" } diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9aeddb5..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -image -pandas -requests -selenium diff --git a/restricted_words.py b/restricted_words.py deleted file mode 100644 index 6dd043e..0000000 --- a/restricted_words.py +++ /dev/null @@ -1,45 +0,0 @@ -#Fuzzy = variables that if contained inside a column name/label, there will be a match -#Strict = variables that if are strictly equal to column name/label, there will be a match - -#SURVEY CTO VARIABLES -survey_cto_strict = ['deviceid', 'subscriberid', 'simid', 'formdef_version', 'devicephonenum', 'duration', 'bc_rand','key','starttime','endtime', 'audio_audit_cons_1', 'audio_audit_cons_2', 'audio_audit_cons_positivo', 'text_audit','text_audit_field', 'call_log','caseid','sstrm_pct_conversation','sstat_sound_level','sstrm_sound_level','audio_audit_survey','reschedule_format', 'reschedule_2_format'] - -#LOCATIONS VARIABLES -locations_strict = ['vill', 'lc'] - -locations_fuzzy = ['district', 'country', 'subcountry', 'parish', 'village', 'community', 'location', 'panchayat', 'compound', 'survey_location', 'county', 'subcounty', 'ciudad','distrito','villa','city', 'town', 'neighborhood','neighbourhood', 'barangay', 'brgy', 'municipio', 'colonia','alcaldia','alcaldía', 'upazila', 'tribe'] - -#STATA VARIABLES -stata_strict = ['nam','add','addr','addr1','addr2','dist','parish','loc','acc','plan','medic','insur','num','resid','home','spec','id','enum', 'info', 'data', 'comm', 'count', 'fo'] - -#IPA GUIDELINE DOCUMENT -other_strict = ['gps', 'lat', 'lon', 'coord', 'house', 'social', 'census', 'fax', 'ip', 'url', 'specify', 'enumerator', 'random', 'name', 'enum_name', 'rand','uid','hh', 'age', 'gps','id', 'ip','red','fono','url', 'web', 'number', 'encuestador', 'escuela', 'colegio','edad', 'insurance', 'school', 'birth'] - -other_fuzzy = ['name', '_name','fname', 'lname', 'first_name', 'last_name', 'birthday', 'bday','address', 'network','email','beneficiary','mother','wife','father','husband', 'enumerator ','enumerator_', 'child_age', 'latitude', 'longitude', 'coordinates', 'website', 'nickname', 'nick_name', 'firstname', 'lastname', 'sublocation', 'alternativecontact', 'division', 'resp_name', 'head_name', 'headname', 'respname', 'subvillage'] - -#OTHER LANGUAGES -spanish_fuzzy = ['apellido', 'apellidos', 'beneficiario', 'censo', 'comunidad', 'contar', 'coordenadas', 'direccion', 'edad_nino', 'email', 'esposa', 'esposo', 'fecha_nacimiento', 'identificador', 'identidad', 'informacion', 'latitud', 'latitude', 'locacion', 'longitud', 'madre', 'medico', 'nino', 'nombre', 'numero', 'padre', 'pag_web', 'pais', 'parroquia', 'primer_nombre', 'random', 'salud', 'seguro', 'ubicacion'] - -swahili_strict = ['jina', 'simu', 'mkoa', 'wilaya', 'kata', 'kijiji', 'kitongoji', 'vitongoji', 'nyumba', 'numba', 'namba', 'tarahe ya kuzaliwa', 'umri', 'jinsi', 'jinsia'] - -def get_locations_strict_restricted_words(): - return locations_strict - -def get_locations_fuzzy_restricted_words(): - return locations_fuzzy - -def get_surveycto_restricted_vars(): - return survey_cto_strict - -def get_strict_restricted_words(): - strict_restricted = stata_strict + other_strict + swahili_strict - return list(set(strict_restricted)) - -def get_fuzzy_restricted_words(): - fuzzy_restricted = other_fuzzy + spanish_fuzzy - return list(set(fuzzy_restricted)) - -#Check for repeated words in lists of strict and fuzzy -#strict = get_strict_restricted_words() -#fuzzy = get_fuzzy_restricted_words() -#print([word for word in strict if word in fuzzy]) diff --git a/scripts/manage_models.py b/scripts/manage_models.py new file mode 100644 index 0000000..31b07e3 --- /dev/null +++ b/scripts/manage_models.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""Utility script for managing spaCy models for PII detection. + +This script provides a command-line interface for installing, listing, and managing +spaCy models used by Presidio for enhanced PII detection. +""" + +import argparse +import sys + +try: + from pii_detector.core.model_manager import get_model_manager + + MANAGER_AVAILABLE = True +except ImportError as e: + print(f"Error: Could not import model manager: {e}") + print("Please install the presidio dependencies first: just install-presidio") + MANAGER_AVAILABLE = False + + +def list_models(): + """List installed and available spaCy models.""" + if not MANAGER_AVAILABLE: + return False + + manager = get_model_manager() + + print("=== spaCy Model Status ===") + print(f"spaCy Available: {manager.spacy_available}") + + if manager.spacy_available: + print(f"Installed Models: {manager.installed_models}") + print(f"Available Languages: {manager.get_available_languages()}") + + # Show details for installed models + if manager.installed_models: + print("\n=== Installed Model Details ===") + for model in manager.installed_models: + info = manager.get_model_info(model) + if info["status"] == "available": + print(f" {model}:") + print(f" Language: {info['language']}") + print(f" Size: {info['size']}") + print(f" Version: {info['version']}") + print(f" Components: {', '.join(info['components'])}") + else: + print(f" {model}: {info['status']}") + + return True + + +def install_model(model_name: str, force: bool = False): + """Install a specific spaCy model.""" + if not MANAGER_AVAILABLE: + return False + + manager = get_model_manager() + + print(f"Installing spaCy model: {model_name}") + if force: + print("(Force installation enabled)") + + success = manager.install_model(model_name, force=force) + + if success: + print(f"✓ Successfully installed {model_name}") + + # Show model info + info = manager.get_model_info(model_name) + if info["status"] == "available": + print(f" Language: {info['language']}") + print(f" Size: {info['size']}") + print(f" Version: {info['version']}") + else: + print(f"✗ Failed to install {model_name}") + + return success + + +def install_language_model(language: str, size: str = "sm"): + """Install the best model for a language.""" + if not MANAGER_AVAILABLE: + return False + + manager = get_model_manager() + + print(f"Installing {size} model for {language}...") + model_name = manager.install_default_model(language, size) + + if model_name: + print(f"✓ Successfully installed {model_name}") + + # Show model info + info = manager.get_model_info(model_name) + if info["status"] == "available": + print(f" Language: {info['language']}") + print(f" Size: {info['size']}") + print(f" Version: {info['version']}") + else: + print(f"✗ Failed to install model for {language}") + + return model_name is not None + + +def ensure_model(language: str, size: str = "sm"): + """Ensure a model is available for a language.""" + if not MANAGER_AVAILABLE: + return False + + manager = get_model_manager() + + print(f"Ensuring {size} model for {language} is available...") + model_name = manager.ensure_model_available(language, size) + + if model_name: + print(f"✓ Model available: {model_name}") + return True + else: + print(f"✗ Could not ensure model availability for {language}") + return False + + +def cleanup_models(keep_languages: list[str] | None = None): + """Remove unused spaCy models.""" + if not MANAGER_AVAILABLE: + return False + + manager = get_model_manager() + + if keep_languages: + print(f"Cleaning up models, keeping languages: {keep_languages}") + else: + print("Cleaning up all unused models...") + + manager.cleanup_unused_models(keep_languages) + print("✓ Cleanup complete") + + return True + + +def main(): + """Run main entry point.""" + if not MANAGER_AVAILABLE: + sys.exit(1) + + parser = argparse.ArgumentParser( + description="Manage spaCy models for PII detection", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s list # List all models + %(prog)s install en_core_web_sm # Install specific model + %(prog)s install-lang en md # Install medium English model + %(prog)s ensure en sm # Ensure small English model exists + %(prog)s cleanup --keep en es # Remove models except English/Spanish + """, + ) + + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # List command + subparsers.add_parser("list", help="List installed and available models") + + # Install specific model + install_parser = subparsers.add_parser( + "install", help="Install specific spaCy model" + ) + install_parser.add_argument("model_name", help="Name of the model to install") + install_parser.add_argument( + "--force", action="store_true", help="Force installation" + ) + + # Install language model + lang_parser = subparsers.add_parser( + "install-lang", help="Install model for language" + ) + lang_parser.add_argument("language", help="Language code (en, es, de, etc.)") + lang_parser.add_argument( + "size", nargs="?", default="sm", choices=["sm", "md", "lg"], help="Model size" + ) + + # Ensure model + ensure_parser = subparsers.add_parser("ensure", help="Ensure model is available") + ensure_parser.add_argument("language", help="Language code") + ensure_parser.add_argument( + "size", nargs="?", default="sm", choices=["sm", "md", "lg"], help="Model size" + ) + + # Cleanup + cleanup_parser = subparsers.add_parser("cleanup", help="Remove unused models") + cleanup_parser.add_argument( + "--keep", nargs="*", metavar="LANG", help="Languages to keep models for" + ) + + args = parser.parse_args() + + if not args.command: + parser.print_help() + return + + try: + if args.command == "list": + success = list_models() + elif args.command == "install": + success = install_model(args.model_name, args.force) + elif args.command == "install-lang": + success = install_language_model(args.language, args.size) + elif args.command == "ensure": + success = ensure_model(args.language, args.size) + elif args.command == "cleanup": + success = cleanup_models(args.keep) + else: + parser.print_help() + success = False + + if not success: + sys.exit(1) + + except KeyboardInterrupt: + print("\nOperation cancelled by user") + sys.exit(1) + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/pii_detector/__init__.py b/src/pii_detector/__init__.py new file mode 100644 index 0000000..f02e1be --- /dev/null +++ b/src/pii_detector/__init__.py @@ -0,0 +1,9 @@ +"""PII Detector - A tool for identifying and handling personally identifiable information in datasets.""" + +__version__ = "0.2.23" +__author__ = "IPA Global Research and Data Science Team" +__email__ = "researchsupport@poverty-action.org" + +from pii_detector.core import processor + +__all__ = ["processor"] diff --git a/src/pii_detector/api/__init__.py b/src/pii_detector/api/__init__.py new file mode 100644 index 0000000..4ffbfec --- /dev/null +++ b/src/pii_detector/api/__init__.py @@ -0,0 +1,5 @@ +"""External API integrations for location population lookups.""" + +from pii_detector.api.queries import query_location_population + +__all__ = ["query_location_population"] diff --git a/src/pii_detector/api/queries.py b/src/pii_detector/api/queries.py new file mode 100644 index 0000000..1adb53a --- /dev/null +++ b/src/pii_detector/api/queries.py @@ -0,0 +1,342 @@ +"""External API integrations for location population lookups and other queries.""" + +import json +import os + +import requests +from selenium import webdriver +from selenium.webdriver.chrome.options import Options + +from pii_detector.data.constants import COUNTRY_NAME_TO_ISO_CODE + +# Global driver instance for Google queries +_driver = None + + +def get_api_credentials() -> dict[str, str | None]: + """Get API credentials from environment variables.""" + return { + "geonames_username": os.environ.get("GEONAMES_USERNAME"), + "forebears_api_key": os.environ.get("FOREBEARS_API_KEY"), + } + + +def ask_google(query: str) -> str | bool: + """Query Google for population information.""" + global _driver + + if _driver is None: + chrome_options = Options() + chrome_options.add_argument("--window-size=1024x768") + chrome_options.add_argument("--headless") + try: + _driver = webdriver.Chrome(options=chrome_options) + except Exception as e: + print(f"Could not initialize Chrome driver: {e}") + return False + + try: + # Search for query + query = query.replace(" ", "+") + _driver.get("http://www.google.com/search?q=" + query) + + # Get text from Google answer box + for y_location in [230, 350]: + answer = _driver.execute_script( + "return document.elementFromPoint(arguments[0], arguments[1]);", + 350, + y_location, + ).text + if answer != "": + return answer + + return False + except Exception as e: + print(f"Error querying Google: {e}") + return False + + +def get_country_iso_code(country_name: str) -> str | None: + """Get ISO country code from country name.""" + return COUNTRY_NAME_TO_ISO_CODE.get(country_name) + + +def check_location_exists_and_population_size( + location: str, country: str +) -> tuple[bool, int | bool]: + """Check if a location exists and get its population using GeoNames API. + + Returns: + Tuple of (location_exists, population) + population can be int, False (if unknown), or bool False if location doesn't exist + + """ + credentials = get_api_credentials() + username = credentials.get("geonames_username") + + if not username: + print("Warning: GEONAMES_USERNAME not set in environment variables") + return False, False + + api_url = ( + f"http://api.geonames.org/searchJSON?name={location}&name_equals={location}" + f"&maxRows=1&orderby=population&isNameRequired=true&username={username}" + ) + + country_iso = get_country_iso_code(country) + if country_iso: + api_url += f"&country={country_iso}" + + try: + response = requests.get(api_url, timeout=10) + response_json = response.json() + + if ( + "totalResultsCount" in response_json + and response_json["totalResultsCount"] > 0 + ): + geoname = response_json["geonames"][0] + if "population" in geoname and geoname["population"] != 0: + return True, geoname["population"] + else: + return True, False + else: + return False, False + + except Exception as e: + print(f"Error querying GeoNames API: {e}") + return False, False + + +def get_population_from_google_query_result(query_result: str) -> int | bool: + r"""Parse population from Google query result. + + Handles formats like: + - 3,685\n2010 + - 91,411 (2018) + - 14,810,001 + - 17 million people + - 1.655 million (2010) + """ + try: + clean_query_result = query_result + + # Remove commas: 14,810,001 + clean_query_result = clean_query_result.replace(",", "") + + # Handle newlines: 3685\\n2010 + clean_query_result = clean_query_result.split("\\n")[0] + + # Handle parentheses and extra text + if " " in clean_query_result: + parts = clean_query_result.split(" ") + # Keep only the number and potential multiplier + if len(parts) > 1 and parts[1] in ["million", "thousand"]: + clean_query_result = f"{parts[0]} {parts[1]}" + else: + clean_query_result = parts[0] + + # Handle millions: 1.655 million + if " " in clean_query_result: + number_str, multiplier = clean_query_result.split(" ") + result = float(number_str) + if multiplier == "million": + result = result * 1000000 + elif multiplier == "thousand": + result = result * 1000 + clean_query_result = str(int(result)) + + return int(clean_query_result) + + except Exception as e: + print(f"Error parsing population from Google result: {e}") + return False + + +def google_population(location: str) -> int | bool: + """Get population of a location by querying Google.""" + query_result = ask_google(f"{location} population") + + if query_result: + population = get_population_from_google_query_result(query_result) + return population + else: + return False + + +def get_locations_with_low_population( + locations: list[str], + country: str, + low_population_threshold: int = 20000, + return_one: bool | None = None, + consider_low_population_if_unknown_population: bool = False, +) -> list[str] | str | bool: + """Check which locations have low population. + + Args: + locations: List of location names to check + country: Country name for context + low_population_threshold: Population threshold for "low population" + return_one: If True, return first location with low population + consider_low_population_if_unknown_population: If True, treat unknown as low + + Returns: + List of locations with low population, or single location if return_one=True, + or False if none found when return_one=True + + """ + locations_with_low_population = [] + locations_with_unknown_population = [] + + for index, location in enumerate(locations): + if index % 50 == 0: + print(f"{index}/{len(locations)}: {location}") + + location_exists, population = check_location_exists_and_population_size( + location, country + ) + + if location_exists: + if not population: + population = google_population(location) + + if population: + print(f"Found population for {location}: {population}") + if population < low_population_threshold: + print(f"{location} has LOW population") + if return_one: + return location + else: + locations_with_low_population.append(location) + else: + # Found a location with known population - now consider unknowns as low + if not consider_low_population_if_unknown_population: + locations_with_low_population.extend( + locations_with_unknown_population + ) + consider_low_population_if_unknown_population = True + else: + # Unknown population + if consider_low_population_if_unknown_population: + if return_one: + return location + else: + locations_with_low_population.append(location) + else: + locations_with_unknown_population.append(location) + + if return_one: + return False + else: + return locations_with_low_population + + +def find_names_in_list_string(list_potential_names: list[str]) -> list[str]: + """Find actual names from a list of potential names using Forebears API. + + Note: Requires FOREBEARS_API_KEY environment variable. + """ + credentials = get_api_credentials() + api_key = credentials.get("forebears_api_key") + + if not api_key: + print("Warning: FOREBEARS_API_KEY not set in environment variables") + return [] + + all_names_found = set() + + # API calls must query at most 1,000 names + n = 1000 + chunks = [ + list_potential_names[i : i + n] for i in range(0, len(list_potential_names), n) + ] + + for chunk in chunks: + for name_type in ["forename", "surname"]: + try: + api_url = f"https://ono.4b.rs/v1/jurs?key={api_key}" + names_parameter = _generate_names_parameter_for_api(chunk, name_type) + + response = requests.post( + api_url, data={"names": names_parameter}, timeout=30 + ) + + names_found = _get_names_from_json_response(response.text) + all_names_found.update(names_found) + + except Exception as e: + print(f"Error querying Forebears API: {e}") + + return list(all_names_found) + + +def _generate_names_parameter_for_api(list_names: list[str], option: str) -> str: + """Generate names parameter for Forebears API.""" + list_of_names_json = [] + for name in list_names: + list_of_names_json.append(f'{{"name":"{name}","type":"{option}","limit":2}}') + + return "[" + ",".join(list_of_names_json) + "]" + + +def _get_names_from_json_response(response: str) -> list[str]: + """Extract names from Forebears API JSON response.""" + names_found = [] + + try: + json_response = json.loads(response) + + if "results" in json_response: + for result in json_response["results"]: + # Names that exist come with the field 'jurisdictions' + # We will also ask a minimum of 50 world incidences + if "jurisdictions" in result and len(result["jurisdictions"]) > 0: + try: + world_incidences = int(result["world"]["incidence"]) + if world_incidences > 50: + names_found.append(result["name"]) + except Exception as e: + print(f"Error processing result: {e}") + else: + print("No results in response") + + except json.JSONDecodeError as e: + print(f"Error parsing JSON response: {e}") + + return names_found + + +def cleanup_driver(): + """Clean up the global webdriver instance.""" + global _driver + if _driver: + try: + _driver.quit() + _driver = None + except Exception as e: + print(f"Error closing driver: {e}") + + +def query_location_population(location: str, country: str) -> int | None: + """Query location population from external APIs. + + Args: + location: Location name + country: Country name + + Returns: + Population number or None if not found + + """ + location_exists, population = check_location_exists_and_population_size( + location, country + ) + + if location_exists and population: + return population + elif location_exists: + # Try Google as backup + google_pop = google_population(location) + return google_pop if google_pop else None + else: + return None diff --git a/src/pii_detector/cli/__init__.py b/src/pii_detector/cli/__init__.py new file mode 100644 index 0000000..41bd8d8 --- /dev/null +++ b/src/pii_detector/cli/__init__.py @@ -0,0 +1,3 @@ +"""Command-line interface for the PII detector.""" + +__all__ = [] diff --git a/src/pii_detector/cli/fixed_main.py b/src/pii_detector/cli/fixed_main.py new file mode 100644 index 0000000..f9c4545 --- /dev/null +++ b/src/pii_detector/cli/fixed_main.py @@ -0,0 +1,272 @@ +"""Fixed CLI that doesn't auto-launch GUI.""" + +import argparse +import sys +from pathlib import Path + +# Import batch processing functionality +from pii_detector.core.batch_processor import process_dataset_batch +from pii_detector.core.processor import import_dataset +from pii_detector.gui.frontend import main as gui_main + + +def main(): + """Run the CLI interface for PII detection.""" + parser = argparse.ArgumentParser( + description="PII Detector - Identify and handle PII in datasets", + epilog="Use --help with subcommands for more info", + ) + + # Global options + parser.add_argument( + "--version", "-v", action="version", version="PII Detector 0.2.23" + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--output-format", + choices=["table", "json", "csv"], + default="table", + help="Output format", + ) + + # Subcommands + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # GUI command + subparsers.add_parser("gui", help="Launch graphical interface") + + # Analyze command + analyze_parser = subparsers.add_parser("analyze", help="Analyze file for PII") + analyze_parser.add_argument("file", help="Path to dataset file") + analyze_parser.add_argument( + "--presidio", action="store_true", help="Enable Presidio ML detection" + ) + analyze_parser.add_argument( + "--no-location", action="store_true", help="Disable location population checks" + ) + analyze_parser.add_argument( + "--confidence", type=float, default=0.7, help="Confidence threshold (0.0-1.0)" + ) + + # Batch command + batch_parser = subparsers.add_parser("batch", help="Batch process multiple files") + batch_parser.add_argument("pattern", help="File pattern (e.g., '*.csv')") + batch_parser.add_argument( + "--chunk-size", type=int, default=1000, help="Processing chunk size" + ) + batch_parser.add_argument( + "--workers", type=int, default=4, help="Number of parallel workers" + ) + + # Anonymize command + anon_parser = subparsers.add_parser("anonymize", help="Anonymize dataset") + anon_parser.add_argument("file", help="Path to dataset file") + anon_parser.add_argument("--output", "-o", help="Output file path") + anon_parser.add_argument( + "--method", + choices=["hash", "remove", "presidio"], + default="hash", + help="Anonymization method", + ) + + args = parser.parse_args() + + # Route to appropriate handler + if args.command == "gui" or args.command is None: + # Only launch GUI if explicitly requested or no command given + if args.command is None: + print( + "No command specified. Available commands: analyze, batch, anonymize, gui" + ) + print( + "Use --help for more information, or 'gui' to launch the graphical interface." + ) + return 1 + gui_main() + + elif args.command == "analyze": + return handle_analyze_command(args) + + elif args.command == "batch": + return handle_batch_command(args) + + elif args.command == "anonymize": + return handle_anonymize_command(args) + + else: + parser.print_help() + return 1 + + return 0 + + +def handle_analyze_command(args): + """Handle the analyze command.""" + file_path = Path(args.file) + if not file_path.exists(): + print(f"Error: File '{file_path}' not found.") + return 1 + + print(f"Analyzing file: {file_path}") + + # Load dataset + success, result = import_dataset(str(file_path)) + if not success: + print(f"Error loading dataset: {result}") + return 1 + + dataset, dataset_path, label_dict, value_label_dict = result + print(f"Loaded dataset: {len(dataset)} rows, {len(dataset.columns)} columns") + + # Configure detection + detection_config = { + "use_presidio_detection": args.presidio, + "use_location_detection": not args.no_location, + "presidio_confidence_threshold": args.confidence, + } + + # Run detection (using basic unified processor for now) + from pii_detector.core.unified_processor import detect_pii_unified + + results = detect_pii_unified(dataset, label_dict, config=detection_config) + + # Output results + print_results(results, args.output_format) + return 0 + + +def handle_batch_command(args): + """Handle the batch processing command.""" + import glob + + files = glob.glob(args.pattern) + if not files: + print(f"No files found matching pattern: {args.pattern}") + return 1 + + print(f"Processing {len(files)} files with batch processing...") + + for file_path in files: + print(f"Processing: {file_path}") + + # Load and process each file + success, result = import_dataset(file_path) + if success: + dataset, _, label_dict, _ = result + + # Use batch processor + detection_results, anonymized_df, report = process_dataset_batch( + dataset, + label_dict=label_dict, + chunk_size=args.chunk_size, + max_workers=args.workers, + ) + + print(f" Found PII in {len(detection_results)} columns") + print(f" Processing completed: {report.get('batch_anonymization', 'No')}") + else: + print(f" Error: {result}") + + return 0 + + +def handle_anonymize_command(args): + """Handle the anonymize command.""" + file_path = Path(args.file) + if not file_path.exists(): + print(f"Error: File '{file_path}' not found.") + return 1 + + # Determine output path + if args.output: + output_path = Path(args.output) + else: + output_path = ( + file_path.parent / f"{file_path.stem}_anonymized{file_path.suffix}" + ) + + print(f"Anonymizing: {file_path} -> {output_path}") + + # Load dataset + success, result = import_dataset(str(file_path)) + if not success: + print(f"Error loading dataset: {result}") + return 1 + + dataset, _, label_dict, _ = result + + # Detect PII first + from pii_detector.core.unified_processor import detect_pii_unified + + detection_results = detect_pii_unified(dataset, label_dict) + + if not detection_results: + print("No PII detected - nothing to anonymize") + return 0 + + # Anonymize using hybrid anonymizer + from pii_detector.core.hybrid_anonymizer import anonymize_dataset_hybrid + + anonymization_config = {col: {"method": args.method} for col in detection_results} + + anonymized_df, report = anonymize_dataset_hybrid( + dataset, detection_results, anonymization_config + ) + + # Save results + if output_path.suffix.lower() == ".csv": + anonymized_df.to_csv(output_path, index=False) + elif output_path.suffix.lower() in [".xlsx", ".xls"]: + anonymized_df.to_excel(output_path, index=False) + else: + # Default to CSV + anonymized_df.to_csv(output_path, index=False) + + print(f"Anonymized dataset saved to: {output_path}") + print(f"Processed {len(report.get('columns_processed', []))} PII columns") + + return 0 + + +def print_results(results, output_format): + """Print detection results in specified format.""" + if output_format == "json": + import json + + # Convert results to JSON-serializable format + json_results = {} + for col, result in results.items(): + json_results[col] = { + "detection_method": result.detection_method, + "confidence": result.confidence, + "entity_types": result.entity_types, + } + print(json.dumps(json_results, indent=2)) + + elif output_format == "csv": + print("Column,Detection Method,Confidence,Entity Types") + for col, result in results.items(): + entity_types = ";".join(result.entity_types) if result.entity_types else "" + print( + f"{col},{result.detection_method},{result.confidence:.2f},{entity_types}" + ) + + else: # table format + if results: + print(f"\nFound PII in {len(results)} columns:") + print("-" * 60) + for col, result in results.items(): + entity_info = ( + f" ({', '.join(result.entity_types)})" + if result.entity_types + else "" + ) + print( + f"{col:25} | {result.detection_method:20} | {result.confidence:.2f}{entity_info}" + ) + else: + print("No PII detected in this dataset.") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/pii_detector/cli/main.py b/src/pii_detector/cli/main.py new file mode 100644 index 0000000..256b099 --- /dev/null +++ b/src/pii_detector/cli/main.py @@ -0,0 +1,577 @@ +"""Enhanced CLI that provides proper subcommands and batch processing.""" + +import argparse +import json +import sys +from pathlib import Path + +# Import batch processing functionality +from pii_detector.core.batch_processor import BatchPIIProcessor, process_dataset_batch +from pii_detector.core.processor import import_dataset +from pii_detector.gui.frontend import main as gui_main + + +def main(): + """Run the CLI interface for PII detection.""" + parser = argparse.ArgumentParser( + description="PII Detector - Identify and handle PII in datasets", + epilog="Use --help with subcommands for more info", + ) + + # Global options + parser.add_argument( + "--version", "-v", action="version", version="PII Detector 0.2.23" + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--output-format", + choices=["table", "json", "csv"], + default="table", + help="Output format", + ) + + # Subcommands + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # GUI command + subparsers.add_parser("gui", help="Launch graphical interface") + + # Analyze command + analyze_parser = subparsers.add_parser("analyze", help="Analyze file for PII") + analyze_parser.add_argument("file", help="Path to dataset file (.csv, .xlsx, .dta)") + analyze_parser.add_argument( + "--presidio", action="store_true", help="Enable Presidio ML detection" + ) + analyze_parser.add_argument( + "--no-location", action="store_true", help="Disable location population checks" + ) + analyze_parser.add_argument( + "--confidence", type=float, default=0.7, help="Confidence threshold (0.0-1.0)" + ) + analyze_parser.add_argument( + "--language", + choices=["en", "es", "other"], + default="en", + help="Dataset language", + ) + + # Batch command + batch_parser = subparsers.add_parser("batch", help="Batch process multiple files") + batch_parser.add_argument( + "pattern", help="File pattern (e.g., '*.csv', '*.dta') or directory" + ) + batch_parser.add_argument( + "--chunk-size", type=int, default=1000, help="Processing chunk size" + ) + batch_parser.add_argument( + "--workers", type=int, default=4, help="Number of parallel workers" + ) + batch_parser.add_argument( + "--presidio", action="store_true", help="Enable Presidio ML detection" + ) + batch_parser.add_argument("--output-dir", help="Directory to save results") + + # Anonymize command + anon_parser = subparsers.add_parser("anonymize", help="Anonymize dataset") + anon_parser.add_argument("file", help="Path to dataset file (.csv, .xlsx, .dta)") + anon_parser.add_argument( + "--output", "-o", help="Output file path (.csv, .xlsx, .dta)" + ) + anon_parser.add_argument( + "--method", + choices=["hash", "remove", "categorize", "presidio"], + default="hash", + help="Anonymization method", + ) + anon_parser.add_argument( + "--presidio", action="store_true", help="Enable Presidio ML detection" + ) + + # Report command + report_parser = subparsers.add_parser( + "report", help="Generate PII detection report" + ) + report_parser.add_argument("file", help="Path to dataset file (.csv, .xlsx, .dta)") + report_parser.add_argument("--output", "-o", help="Report output file") + report_parser.add_argument( + "--format", choices=["txt", "json", "html"], default="txt", help="Report format" + ) + + args = parser.parse_args() + + # Route to appropriate handler + if args.command == "gui": + gui_main() + elif args.command == "analyze": + return handle_analyze_command(args) + elif args.command == "batch": + return handle_batch_command(args) + elif args.command == "anonymize": + return handle_anonymize_command(args) + elif args.command == "report": + return handle_report_command(args) + elif args.command is None: + # No command specified - show help and suggest options + print("PII Detector - Identify and handle PII in datasets") + print("\nAvailable commands:") + print(" analyze Analyze a single file for PII") + print(" batch Process multiple files efficiently") + print(" anonymize Anonymize detected PII in a dataset") + print(" report Generate detailed PII detection reports") + print(" gui Launch graphical interface") + print("\nUse 'pii-detector --help' for command-specific help") + print("Use 'pii-detector gui' to launch the graphical interface") + return 0 + else: + parser.print_help() + return 1 + + return 0 + + +def handle_analyze_command(args): + """Handle the analyze command.""" + file_path = Path(args.file) + if not file_path.exists(): + print(f"Error: File '{file_path}' not found.") + return 1 + + if args.verbose: + print(f"Analyzing file: {file_path}") + + # Load dataset + success, result = import_dataset(str(file_path)) + if not success: + print(f"Error loading dataset: {result}") + return 1 + + dataset, dataset_path, label_dict, value_label_dict = result + if args.verbose: + print(f"Loaded dataset: {len(dataset)} rows, {len(dataset.columns)} columns") + + # Use batch processor for consistent results + processor = BatchPIIProcessor( + language=args.language, + chunk_size=len(dataset), # Process all at once for single file + max_workers=1, + ) + + if args.verbose: + print("Running PII detection...") + + try: + # Run detection + results = processor.detect_pii_batch(dataset, label_dict) + + # Output results + print_results(results, args.output_format, args.verbose) + return 0 + + except Exception as e: + print(f"Error during analysis: {e}") + if args.verbose: + import traceback + + traceback.print_exc() + return 1 + + +def handle_batch_command(args): + """Handle the batch processing command.""" + import glob + + # Handle pattern or directory + if Path(args.pattern).is_dir(): + files = list(Path(args.pattern).glob("*.csv")) + files.extend(list(Path(args.pattern).glob("*.xlsx"))) + files.extend(list(Path(args.pattern).glob("*.dta"))) + files = [str(f) for f in files] + else: + files = glob.glob(args.pattern) + + if not files: + print(f"No files found matching pattern: {args.pattern}") + return 1 + + print(f"Processing {len(files)} files with batch processing...") + + # Setup output directory + output_dir = ( + Path(args.output_dir) if args.output_dir else Path.cwd() / "batch_results" + ) + output_dir.mkdir(exist_ok=True) + + batch_results = {} + + for file_path in files: + print(f"\nProcessing: {file_path}") + + try: + # Load and process each file + success, result = import_dataset(file_path) + if success: + dataset, _, label_dict, _ = result + + # Use batch processor + detection_results, anonymized_df, report = process_dataset_batch( + dataset, + label_dict=label_dict, + chunk_size=args.chunk_size, + max_workers=args.workers, + language="en", + ) + + batch_results[file_path] = { + "pii_columns": len(detection_results), + "total_columns": len(dataset.columns), + "processing_time": report.get("processing_time_seconds", 0), + } + + print( + f" Found PII in {len(detection_results)} of {len(dataset.columns)} columns" + ) + + # Save results if output directory specified + if args.output_dir: + file_stem = Path(file_path).stem + + # Save detection results + results_file = output_dir / f"{file_stem}_pii_detection.json" + results_data = { + col: { + "detection_method": result.detection_method, + "confidence": result.confidence, + "entity_types": result.entity_types, + } + for col, result in detection_results.items() + } + with open(results_file, "w") as f: + json.dump(results_data, f, indent=2) + + # Save anonymized dataset in original format when possible + original_ext = Path(file_path).suffix.lower() + if original_ext == ".dta": + anon_file = output_dir / f"{file_stem}_anonymized.dta" + try: + anonymized_df.to_stata(anon_file, write_index=False) + except Exception as e: + print( + f" Warning: Could not save as .dta, using CSV: {e}" + ) + anon_file = output_dir / f"{file_stem}_anonymized.csv" + anonymized_df.to_csv(anon_file, index=False) + elif original_ext in [".xlsx", ".xls"]: + anon_file = output_dir / f"{file_stem}_anonymized.xlsx" + anonymized_df.to_excel(anon_file, index=False) + else: + anon_file = output_dir / f"{file_stem}_anonymized.csv" + anonymized_df.to_csv(anon_file, index=False) + + print(f" Results saved: {results_file}") + print(f" Anonymized data: {anon_file}") + + else: + print(f" Error: {result}") + batch_results[file_path] = {"error": str(result)} + + except Exception as e: + print(f" Error processing {file_path}: {e}") + batch_results[file_path] = {"error": str(e)} + + # Summary + print(f"\n{'=' * 60}") + print("BATCH PROCESSING SUMMARY") + print(f"{'=' * 60}") + + total_files = len(files) + successful_files = len([r for r in batch_results.values() if "error" not in r]) + total_pii_columns = sum(r.get("pii_columns", 0) for r in batch_results.values()) + + print(f"Files processed: {successful_files}/{total_files}") + print(f"Total PII columns detected: {total_pii_columns}") + + if args.output_dir: + print(f"Results saved to: {output_dir}") + + return 0 + + +def handle_anonymize_command(args): + """Handle the anonymize command.""" + file_path = Path(args.file) + if not file_path.exists(): + print(f"Error: File '{file_path}' not found.") + return 1 + + # Determine output path + if args.output: + output_path = Path(args.output) + else: + output_path = ( + file_path.parent / f"{file_path.stem}_anonymized{file_path.suffix}" + ) + + print(f"Anonymizing: {file_path} -> {output_path}") + + # Load dataset + success, result = import_dataset(str(file_path)) + if not success: + print(f"Error loading dataset: {result}") + return 1 + + dataset, _, label_dict, _ = result + + try: + # Use batch processor for detection and anonymization + # First detect PII + processor = BatchPIIProcessor(language="en") + detection_results = processor.detect_pii_batch(dataset, label_dict) + + if not detection_results: + print("No PII detected - nothing to anonymize") + return 0 + + # Then anonymize using the complete workflow + detection_results, anonymized_df, report = process_dataset_batch( + dataset, + label_dict=label_dict, + language="en", + anonymization_config={ + col: {"method": args.method} for col in detection_results + }, + ) + + # Save results + if output_path.suffix.lower() == ".csv": + anonymized_df.to_csv(output_path, index=False) + elif output_path.suffix.lower() in [".xlsx", ".xls"]: + anonymized_df.to_excel(output_path, index=False) + elif output_path.suffix.lower() == ".dta": + # Save as Stata format, preserving variable labels if available + try: + anonymized_df.to_stata(output_path, write_index=False) + except Exception as e: + print(f"Warning: Error saving as .dta format: {e}") + print("Falling back to CSV format") + csv_path = output_path.with_suffix(".csv") + anonymized_df.to_csv(csv_path, index=False) + print(f"Saved as CSV: {csv_path}") + return 0 + else: + # Default to CSV + anonymized_df.to_csv(output_path, index=False) + + print(f"Anonymized dataset saved to: {output_path}") + print(f"Processed {len(report.get('columns_processed', []))} PII columns") + + return 0 + + except Exception as e: + print(f"Error during anonymization: {e}") + return 1 + + +def handle_report_command(args): + """Handle the report generation command.""" + file_path = Path(args.file) + if not file_path.exists(): + print(f"Error: File '{file_path}' not found.") + return 1 + + # Determine output path + if args.output: + output_path = Path(args.output) + else: + extension = ".txt" if args.format == "txt" else f".{args.format}" + output_path = file_path.parent / f"{file_path.stem}_pii_report{extension}" + + print(f"Generating report: {file_path} -> {output_path}") + + # Load dataset + success, result = import_dataset(str(file_path)) + if not success: + print(f"Error loading dataset: {result}") + return 1 + + dataset, _, label_dict, _ = result + + try: + # Use batch processor for detection + processor = BatchPIIProcessor(language="en") + detection_results = processor.detect_pii_batch(dataset, label_dict) + + # Generate report + if args.format == "json": + generate_json_report(output_path, file_path, dataset, detection_results) + elif args.format == "html": + generate_html_report(output_path, file_path, dataset, detection_results) + else: # txt + generate_text_report(output_path, file_path, dataset, detection_results) + + print(f"Report saved to: {output_path}") + return 0 + + except Exception as e: + print(f"Error generating report: {e}") + return 1 + + +def print_results(results, output_format, verbose=False): + """Print detection results in specified format.""" + if output_format == "json": + # Convert results to JSON-serializable format + json_results = {} + for col, result in results.items(): + json_results[col] = { + "detection_method": result.detection_method, + "confidence": result.confidence, + "entity_types": result.entity_types, + } + print(json.dumps(json_results, indent=2)) + + elif output_format == "csv": + print("Column,Detection Method,Confidence,Entity Types") + for col, result in results.items(): + entity_types = ";".join(result.entity_types) if result.entity_types else "" + print( + f"{col},{result.detection_method},{result.confidence:.2f},{entity_types}" + ) + + else: # table format + if results: + print(f"\nFound PII in {len(results)} columns:") + print("-" * 70) + print( + f"{'Column':<25} | {'Method':<20} | {'Confidence':<10} | {'Entity Types'}" + ) + print("-" * 70) + for col, result in results.items(): + entity_info = ( + ", ".join(result.entity_types) if result.entity_types else "N/A" + ) + print( + f"{col:<25} | {result.detection_method:<20} | {result.confidence:<10.2f} | {entity_info}" + ) + else: + print("No PII detected in this dataset.") + + +def generate_text_report(output_path, file_path, dataset, detection_results): + """Generate a text format report.""" + with open(output_path, "w", encoding="utf-8") as f: + f.write("PII Detection Report\n") + f.write("=" * 60 + "\n\n") + f.write(f"Dataset: {file_path}\n") + f.write(f"Rows: {len(dataset)}\n") + f.write(f"Columns: {len(dataset.columns)}\n\n") + + f.write("Summary:\n") + f.write(f" Total columns analyzed: {len(dataset.columns)}\n") + f.write(f" Potential PII columns: {len(detection_results)}\n") + f.write( + f" Clean columns: {len(dataset.columns) - len(detection_results)}\n\n" + ) + + if detection_results: + f.write("Detected PII Columns:\n") + f.write("-" * 40 + "\n") + for i, (column, result) in enumerate(detection_results.items(), 1): + entity_types = ( + ", ".join(result.entity_types) if result.entity_types else "N/A" + ) + f.write(f"{i:2d}. {column:<25}\n") + f.write(f" Method: {result.detection_method}\n") + f.write(f" Confidence: {result.confidence:.2f}\n") + f.write(f" Entity Types: {entity_types}\n\n") + + f.write("=" * 60 + "\n") + f.write("Report generated by PII Detector CLI\n") + + +def generate_json_report(output_path, file_path, dataset, detection_results): + """Generate a JSON format report.""" + report_data = { + "dataset": str(file_path), + "rows": len(dataset), + "columns": len(dataset.columns), + "summary": { + "total_columns": len(dataset.columns), + "pii_columns": len(detection_results), + "clean_columns": len(dataset.columns) - len(detection_results), + }, + "pii_detections": {}, + } + + for col, result in detection_results.items(): + report_data["pii_detections"][col] = { + "detection_method": result.detection_method, + "confidence": result.confidence, + "entity_types": result.entity_types, + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(report_data, f, indent=2, ensure_ascii=False) + + +def generate_html_report(output_path, file_path, dataset, detection_results): + """Generate an HTML format report.""" + html_content = f""" + + + + PII Detection Report + + + +

PII Detection Report

+ +
+

Dataset Information

+

File: {file_path}

+

Rows: {len(dataset)}

+

Columns: {len(dataset.columns)}

+

PII Columns Found: {len(detection_results)}

+

Clean Columns: {len(dataset.columns) - len(detection_results)}

+
+ +

Detected PII Columns

+ + + + + + + + """ + + for column, result in detection_results.items(): + entity_types = ", ".join(result.entity_types) if result.entity_types else "N/A" + html_content += f""" + + + + + + + """ + + html_content += """ +
Column NameDetection MethodConfidenceEntity Types
{column}{result.detection_method}{result.confidence:.2f}{entity_types}
+

Report generated by PII Detector CLI

+ + + """ + + with open(output_path, "w", encoding="utf-8") as f: + f.write(html_content) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/pii_detector/core/__init__.py b/src/pii_detector/core/__init__.py new file mode 100644 index 0000000..190d04e --- /dev/null +++ b/src/pii_detector/core/__init__.py @@ -0,0 +1,17 @@ +"""Core PII detection algorithms and data processing logic.""" + +from pii_detector.core.processor import ( + find_piis_based_on_column_format, + find_piis_based_on_column_name, + find_piis_based_on_locations_population, + find_piis_based_on_sparse_entries, + import_dataset, +) + +__all__ = [ + "import_dataset", + "find_piis_based_on_column_name", + "find_piis_based_on_column_format", + "find_piis_based_on_sparse_entries", + "find_piis_based_on_locations_population", +] diff --git a/src/pii_detector/core/anonymization.py b/src/pii_detector/core/anonymization.py new file mode 100644 index 0000000..8fef2a3 --- /dev/null +++ b/src/pii_detector/core/anonymization.py @@ -0,0 +1,404 @@ +"""Comprehensive anonymization techniques for PII data. + +Based on research from FSD guidelines and academic literature on data anonymization. +Implements various statistical disclosure control methods. +""" + +import hashlib +import random +import re +from typing import Any + +import numpy as np +import pandas as pd + +from pii_detector.core.hash_utils import generate_hash + + +class AnonymizationTechniques: + """Collection of anonymization methods for different data types and use cases.""" + + def __init__(self, random_seed: int = 42): + """Initialize with optional random seed for reproducible results.""" + self.random_seed = random_seed + random.seed(random_seed) + np.random.seed(random_seed) + + # ==================== REMOVAL TECHNIQUES ==================== + + def remove_variables(self, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame: + """Remove entire columns containing PII.""" + return df.drop(columns=columns, errors="ignore") + + def remove_records_with_unique_combinations( + self, df: pd.DataFrame, columns: list[str], threshold: int = 1 + ) -> pd.DataFrame: + """Remove records that have unique combinations in specified columns.""" + # Count combinations + combination_counts = df.groupby(columns).size() + rare_combinations = combination_counts[combination_counts <= threshold].index + + # Remove records with rare combinations + mask = ~df.set_index(columns).index.isin(rare_combinations) + return df[mask].reset_index(drop=True) + + # ==================== PSEUDONYMIZATION TECHNIQUES ==================== + + def hash_pseudonymization( + self, series: pd.Series, consistent: bool = True, prefix: str = "" + ) -> pd.Series: + """Replace values with consistent hash-based pseudonyms.""" + if consistent: + # Use consistent hashing for same values + return series.apply( + lambda x: f"{prefix}{generate_hash(str(x))[:8]}" if pd.notna(x) else x + ) + else: + # Random pseudonyms (not consistent across same values) + unique_values = series.dropna().unique() + pseudonym_map = { + val: f"{prefix}{hashlib.md5(f'{val}_{random.random()}'.encode()).hexdigest()[:8]}" + for val in unique_values + } + return series.map(pseudonym_map).fillna(series) + + def name_pseudonymization( + self, series: pd.Series, name_type: str = "generic" + ) -> pd.Series: + """Replace names with consistent pseudonyms.""" + name_pools = { + "generic": ["Person_A", "Person_B", "Person_C", "Person_D", "Person_E"], + "coded": ["P001", "P002", "P003", "P004", "P005"], + "alphabetic": ["Alpha", "Beta", "Gamma", "Delta", "Epsilon"], + } + + pool = name_pools.get(name_type, name_pools["generic"]) + unique_names = series.dropna().unique() + + # Create consistent mapping + name_map = {} + for i, name in enumerate(unique_names): + if i < len(pool): + name_map[name] = pool[i] + else: + # Generate additional names if needed + name_map[name] = f"{name_type}_{i + 1}" + + return series.map(name_map).fillna(series) + + # ==================== RECODING/CATEGORIZATION TECHNIQUES ==================== + + def age_categorization( + self, + series: pd.Series, + bins: list[int] | None = None, + labels: list[str] | None = None, + ) -> pd.Series: + """Convert ages to categories.""" + if bins is None: + bins = [0, 18, 30, 45, 60, 100] + if labels is None: + labels = ["Under 18", "18-29", "30-44", "45-59", "60+"] + + # Handle missing values by converting to numeric first + numeric_series = pd.to_numeric(series, errors="coerce") + return pd.cut(numeric_series, bins=bins, labels=labels, include_lowest=True) + + def income_categorization( + self, + series: pd.Series, + bins: list[int] | None = None, + labels: list[str] | None = None, + ) -> pd.Series: + """Convert income to categories.""" + if bins is None: + bins = [0, 25000, 50000, 75000, 100000, float("inf")] + if labels is None: + labels = ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"] + + return pd.cut(series, bins=bins, labels=labels, include_lowest=True) + + def date_generalization( + self, series: pd.Series, precision: str = "month" + ) -> pd.Series: + """Generalize dates to reduce precision.""" + date_series = pd.to_datetime(series, errors="coerce") + + if precision == "year": + return date_series.dt.year + elif precision == "month": + return date_series.dt.to_period("M") + elif precision == "quarter": + return date_series.dt.to_period("Q") + else: + return date_series + + def geographic_generalization( + self, series: pd.Series, level: str = "region" + ) -> pd.Series: + """Generalize geographic information.""" + # Mock implementation - in practice would use geographic databases + geo_mappings = { + "region": { + # US States to regions example + "California": "West", + "Nevada": "West", + "Oregon": "West", + "Texas": "South", + "Florida": "South", + "Georgia": "South", + "New York": "Northeast", + "Massachusetts": "Northeast", + "Illinois": "Midwest", + "Ohio": "Midwest", + }, + "country": { + # Cities to countries + "New York": "USA", + "Los Angeles": "USA", + "Chicago": "USA", + "London": "UK", + "Manchester": "UK", + "Paris": "France", + "Berlin": "Germany", + }, + } + + mapping = geo_mappings.get(level, {}) + return series.map(mapping).fillna("Other") + + def top_bottom_coding( + self, + series: pd.Series, + top_percentile: float = 95, + bottom_percentile: float = 5, + ) -> pd.Series: + """Apply top and bottom coding to continuous variables.""" + if not pd.api.types.is_numeric_dtype(series): + return series + + top_threshold = series.quantile(top_percentile / 100) + bottom_threshold = series.quantile(bottom_percentile / 100) + + result = series.copy() + result = result.where(result <= top_threshold, f"≥{top_threshold:.0f}") + # Apply bottom coding only to numeric values + bottom_mask = pd.to_numeric(result, errors="coerce") >= bottom_threshold + result = result.where(bottom_mask.fillna(True), f"≤{bottom_threshold:.0f}") + + return result + + # ==================== RANDOMIZATION TECHNIQUES ==================== + + def add_noise( + self, series: pd.Series, noise_type: str = "gaussian", noise_level: float = 0.1 + ) -> pd.Series: + """Add random noise to numeric data.""" + if not pd.api.types.is_numeric_dtype(series): + return series + + if noise_type == "gaussian": + noise = np.random.normal(0, series.std() * noise_level, size=len(series)) + elif noise_type == "uniform": + noise_range = series.std() * noise_level + noise = np.random.uniform(-noise_range, noise_range, size=len(series)) + else: + return series + + return series + noise + + def permutation_swapping( + self, df: pd.DataFrame, columns: list[str], swap_probability: float = 0.1 + ) -> pd.DataFrame: + """Randomly swap values between records for specified columns.""" + result_df = df.copy() + + for col in columns: + if col in df.columns: + indices = df.index.tolist() + n_swaps = int(len(indices) * swap_probability) + + for _ in range(n_swaps): + # Pick two random indices to swap + idx1, idx2 = random.sample(indices, 2) + result_df.loc[idx1, col], result_df.loc[idx2, col] = ( + result_df.loc[idx2, col], + result_df.loc[idx1, col], + ) + + return result_df + + # ==================== STATISTICAL ANONYMIZATION ==================== + + def k_anonymity_check( + self, df: pd.DataFrame, quasi_identifiers: list[str], k: int = 5 + ) -> tuple[bool, pd.DataFrame]: + """Check if dataset satisfies k-anonymity and return violation groups.""" + if not all(col in df.columns for col in quasi_identifiers): + raise ValueError("Not all quasi-identifiers found in dataset") + + # Group by quasi-identifiers and count + groups = df.groupby(quasi_identifiers).size().reset_index(name="count") + violations = groups[groups["count"] < k] + + is_k_anonymous = len(violations) == 0 + return is_k_anonymous, violations + + def achieve_k_anonymity( + self, df: pd.DataFrame, quasi_identifiers: list[str], k: int = 5 + ) -> pd.DataFrame: + """Attempt to achieve k-anonymity through generalization and suppression.""" + result_df = df.copy() + + # Simple approach: remove records that cause violations + is_anonymous, violations = self.k_anonymity_check( + result_df, quasi_identifiers, k + ) + + if not is_anonymous: + # Create a mask for records to keep + violation_combinations = set() + for _, row in violations.iterrows(): + combo = tuple(row[col] for col in quasi_identifiers) + violation_combinations.add(combo) + + # Remove records with violating combinations + mask = ~result_df[quasi_identifiers].apply( + lambda row: tuple(row) in violation_combinations, axis=1 + ) + result_df = result_df[mask].reset_index(drop=True) + + return result_df + + # ==================== TEXT ANONYMIZATION ==================== + + def text_masking(self, text: str, patterns: dict[str, str] | None = None) -> str: + """Mask PII patterns in text content.""" + if pd.isna(text) or not isinstance(text, str): + return text + + if patterns is None: + patterns = { + r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b": "[EMAIL]", + r"\b\d{3}-\d{3}-\d{4}\b": "[PHONE]", + r"\b\d{3}[-.]?\d{4}\b": "[PHONE]", # Also catch shorter phone patterns + r"\b\d{3}[-.]?\d{2}[-.]?\d{4}\b": "[SSN]", + r"\b\d{1,5}\s+\w+\s+\w+": "[ADDRESS]", + r"\b[A-Z][a-z]+\s+[A-Z][a-z]+\b": "[NAME]", + } + + result = text + for pattern, replacement in patterns.items(): + result = re.sub(pattern, replacement, result) + + return result + + def selective_text_suppression( + self, text: str, suppress_types: list[str] = None + ) -> str: + """Suppress specific types of information from text.""" + if pd.isna(text) or not isinstance(text, str): + return text + + if suppress_types is None: + suppress_types = ["names", "locations", "numbers"] + + result = text + + if "names" in suppress_types: + # Remove proper names (simplified approach) + result = re.sub(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", "[REDACTED]", result) + + if "locations" in suppress_types: + # Remove location indicators + location_words = ["street", "avenue", "road", "city", "state", "zip"] + for word in location_words: + result = re.sub( + rf"\b\w*{word}\w*\b", "[LOCATION]", result, flags=re.IGNORECASE + ) + + if "numbers" in suppress_types: + # Remove number sequences + result = re.sub(r"\b\d{3,}\b", "[NUMBER]", result) + + return result + + # ==================== UTILITY METHODS ==================== + + def anonymization_report( + self, original_df: pd.DataFrame, anonymized_df: pd.DataFrame + ) -> dict[str, Any]: + """Generate a report comparing original and anonymized datasets.""" + report = { + "original_rows": len(original_df), + "anonymized_rows": len(anonymized_df), + "rows_removed": len(original_df) - len(anonymized_df), + "removal_percentage": ( + (len(original_df) - len(anonymized_df)) / len(original_df) + ) + * 100, + "columns_comparison": {}, + "data_utility_metrics": {}, + } + + # Compare columns + for col in original_df.columns: + if col in anonymized_df.columns: + orig_unique = original_df[col].nunique() + anon_unique = anonymized_df[col].nunique() + + report["columns_comparison"][col] = { + "original_unique_values": orig_unique, + "anonymized_unique_values": anon_unique, + "uniqueness_reduction": ((orig_unique - anon_unique) / orig_unique) + * 100 + if orig_unique > 0 + else 0, + } + + return report + + +# ==================== MOCK IMPLEMENTATIONS FOR FUTURE FEATURES ==================== + + +class AdvancedAnonymization: + """Mock implementations for advanced anonymization techniques not yet fully implemented.""" + + @staticmethod + def l_diversity_check( + df: pd.DataFrame, + quasi_identifiers: list[str], + sensitive_attribute: str, + diversity_l: int = 2, + ) -> bool: + """Mock: Check if dataset satisfies l-diversity.""" + # TODO: Implement l-diversity checking + return False + + @staticmethod + def t_closeness_check( + df: pd.DataFrame, + quasi_identifiers: list[str], + sensitive_attribute: str, + t: float = 0.2, + ) -> bool: + """Mock: Check if dataset satisfies t-closeness.""" + # TODO: Implement t-closeness checking + return False + + @staticmethod + def differential_privacy_noise( + series: pd.Series, epsilon: float = 1.0 + ) -> pd.Series: + """Mock: Apply differential privacy noise.""" + # TODO: Implement proper differential privacy + return series + + @staticmethod + def synthetic_data_generation( + df: pd.DataFrame, method: str = "gan" + ) -> pd.DataFrame: + """Mock: Generate synthetic data preserving statistical properties.""" + # TODO: Implement synthetic data generation + return df.copy() diff --git a/src/pii_detector/core/batch_processor.py b/src/pii_detector/core/batch_processor.py new file mode 100644 index 0000000..3876efc --- /dev/null +++ b/src/pii_detector/core/batch_processor.py @@ -0,0 +1,614 @@ +"""Efficient batch processing for PII detection and anonymization using Presidio. + +This module implements efficient batch processing techniques for structured data, +incorporating Presidio's BatchAnalyzerEngine and presidio-structured capabilities. +""" + +import logging +from collections import defaultdict +from collections.abc import Callable +from concurrent.futures import ThreadPoolExecutor +from typing import Any + +import pandas as pd + +from pii_detector.core.hybrid_anonymizer import HybridAnonymizer +from pii_detector.core.presidio_engine import get_presidio_analyzer +from pii_detector.core.unified_processor import PIIDetectionResult, UnifiedPIIProcessor + +logger = logging.getLogger(__name__) + +# Optional imports for advanced batch processing +try: + from presidio_structured import StructuredEngine + from presidio_structured.config import StructuredAnalysisConfig + + PRESIDIO_STRUCTURED_AVAILABLE = True + logger.info("presidio-structured available for advanced batch processing") +except ImportError: + PRESIDIO_STRUCTURED_AVAILABLE = False + logger.info("presidio-structured not available, using standard batch processing") + +try: + from presidio_analyzer import BatchAnalyzerEngine + + BATCH_ANALYZER_AVAILABLE = True +except ImportError: + BATCH_ANALYZER_AVAILABLE = False + + +class BatchPIIProcessor: + """Enhanced batch processor for efficient PII detection and anonymization.""" + + def __init__( + self, + language: str = "en", + chunk_size: int = 1000, + max_workers: int = 4, + use_structured_engine: bool = False, # Default to False for better compatibility + ): + """Initialize batch processor. + + Args: + language: Language code for text analysis + chunk_size: Number of rows to process per chunk + max_workers: Maximum number of parallel workers + use_structured_engine: Whether to use presidio-structured if available + + """ + self.language = language + self.chunk_size = chunk_size + self.max_workers = max_workers + self.use_structured_engine = ( + use_structured_engine and PRESIDIO_STRUCTURED_AVAILABLE + ) + + # Initialize processors + self.unified_processor = UnifiedPIIProcessor(language=language) + self.hybrid_anonymizer = HybridAnonymizer(language=language) + self.presidio_analyzer = get_presidio_analyzer(language=language) + + # Initialize structured engine if available + if self.use_structured_engine: + self._init_structured_engine() + + # Initialize batch analyzer if available + self.batch_analyzer = None + if BATCH_ANALYZER_AVAILABLE and self.presidio_analyzer.is_available(): + try: + self.batch_analyzer = BatchAnalyzerEngine( + analyzer_engine=self.presidio_analyzer.analyzer + ) + logger.info("BatchAnalyzerEngine initialized successfully") + except Exception as e: + logger.warning(f"Failed to initialize BatchAnalyzerEngine: {e}") + + def _safe_callback(self, callback: Callable | None, progress: float, message: str): + """Safely call progress callback, catching any exceptions. + + Args: + callback: Progress callback function + progress: Progress percentage (0-100) + message: Progress message + + """ + if callback: + try: + callback(progress, message) + except Exception as e: + logger.warning(f"Progress callback raised exception: {e}") + + def _init_structured_engine(self): + """Initialize the structured engine for advanced processing.""" + try: + if not PRESIDIO_STRUCTURED_AVAILABLE: + logger.info( + "presidio-structured not available, skipping structured engine initialization" + ) + self.use_structured_engine = False + return + + # Try to configure structured analysis with basic config + try: + structured_config = StructuredAnalysisConfig( + analyzer_config={ + "supported_languages": [self.language], + "default_score_threshold": 0.7, + } + ) + self.structured_engine = StructuredEngine(config=structured_config) + except (TypeError, AttributeError) as config_error: + # Try with simpler configuration if the above fails + logger.info( + f"Advanced config failed ({config_error}), trying basic config" + ) + self.structured_engine = StructuredEngine() + + logger.info("StructuredEngine initialized successfully") + except Exception as e: + logger.warning(f"Failed to initialize StructuredEngine: {e}") + self.use_structured_engine = False + + def detect_pii_batch( + self, + dataset: pd.DataFrame, + label_dict: dict[str, str] | None = None, + detection_config: dict[str, Any] | None = None, + progress_callback: Callable | None = None, + ) -> dict[str, PIIDetectionResult]: + """Perform batch PII detection with optimized processing. + + Args: + dataset: DataFrame to analyze + label_dict: Column labels mapping + detection_config: Detection configuration + progress_callback: Optional callback for progress reporting + + Returns: + Dictionary of PII detection results + + """ + logger.info( + f"Starting batch PII detection on dataset with shape {dataset.shape}" + ) + + if detection_config is None: + detection_config = self._get_optimized_detection_config() + + # Choose processing strategy based on dataset size and available tools + if self.use_structured_engine and len(dataset) > self.chunk_size: + return self._detect_with_structured_engine( + dataset, label_dict, detection_config, progress_callback + ) + elif len(dataset) > self.chunk_size * 2: + return self._detect_with_chunking( + dataset, label_dict, detection_config, progress_callback + ) + else: + return self._detect_standard( + dataset, label_dict, detection_config, progress_callback + ) + + def _detect_with_structured_engine( + self, + dataset: pd.DataFrame, + label_dict: dict[str, str] | None, + config: dict[str, Any], + progress_callback: Callable | None = None, + ) -> dict[str, PIIDetectionResult]: + """Use presidio-structured for efficient batch processing.""" + logger.info("Using presidio-structured for batch detection") + results = {} + + try: + # Analyze with structured engine + structured_results = self.structured_engine.analyze(dataset) + + # Convert structured results to our format + for column_name, analysis_result in structured_results.items(): + if ( + hasattr(analysis_result, "entity_types") + and analysis_result.entity_types + ): + results[column_name] = PIIDetectionResult( + column_name=column_name, + detection_method="presidio_structured", + confidence=getattr(analysis_result, "score", 0.8), + entity_types=list(analysis_result.entity_types), + details={ + "structured_analysis": True, + "detection_count": getattr( + analysis_result, "detection_count", 0 + ), + }, + ) + + # Combine with structural analysis for comprehensive results + structural_results = self.unified_processor._detect_structural_pii( + dataset, label_dict or {}, config + ) + + # Merge results with preference for structured analysis + for col, struct_result in structural_results.items(): + if col not in results: + results[col] = struct_result + else: + # Combine confidence scores + existing = results[col] + combined_confidence = ( + existing.confidence * 0.7 + struct_result.confidence * 0.3 + ) + results[col] = PIIDetectionResult( + column_name=col, + detection_method="hybrid_structured", + confidence=combined_confidence, + entity_types=list( + set(existing.entity_types + struct_result.entity_types) + ), + details={ + "presidio_structured": existing.details, + "structural_analysis": struct_result.details, + }, + ) + + except Exception as e: + logger.error(f"Error in structured engine processing: {e}") + return self._detect_standard(dataset, label_dict, config, progress_callback) + + self._safe_callback(progress_callback, 100, "Structured analysis complete") + + return results + + def _detect_with_chunking( + self, + dataset: pd.DataFrame, + label_dict: dict[str, str] | None, + config: dict[str, Any], + progress_callback: Callable | None = None, + ) -> dict[str, PIIDetectionResult]: + """Process large datasets in chunks with parallel processing.""" + logger.info(f"Processing dataset in chunks of {self.chunk_size} rows") + + # First, do structural analysis on the full dataset (doesn't depend on row content) + structural_results = self.unified_processor._detect_structural_pii( + dataset, label_dict or {}, config + ) + + # For text content analysis, process in chunks + text_results = defaultdict(list) + total_chunks = len(dataset) // self.chunk_size + ( + 1 if len(dataset) % self.chunk_size else 0 + ) + + # Process text columns in parallel chunks + text_columns = [ + col for col in dataset.columns if dataset[col].dtype == "object" + ] + + if text_columns and self.presidio_analyzer.is_available(): + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = [] + + for i, start_idx in enumerate(range(0, len(dataset), self.chunk_size)): + end_idx = min(start_idx + self.chunk_size, len(dataset)) + chunk = dataset.iloc[start_idx:end_idx] + + future = executor.submit( + self._analyze_chunk_text_content, chunk, text_columns, config + ) + futures.append((future, i)) + + # Collect results + for future, chunk_idx in futures: + try: + chunk_results = future.result() + for col, result in chunk_results.items(): + text_results[col].append(result) + + progress = ((chunk_idx + 1) / total_chunks) * 100 + self._safe_callback( + progress_callback, + progress, + f"Processed chunk {chunk_idx + 1}/{total_chunks}", + ) + + except Exception as e: + logger.error(f"Error processing chunk {chunk_idx}: {e}") + + # Aggregate text results + aggregated_text_results = self._aggregate_chunk_results(text_results, config) + + # Combine structural and text results + final_results = {} + all_columns = set(structural_results.keys()) | set( + aggregated_text_results.keys() + ) + + for col in all_columns: + structural = structural_results.get(col) + text = aggregated_text_results.get(col) + + combined = self.unified_processor._combine_detection_results( + col, structural, text, config + ) + if combined: + final_results[col] = combined + + return final_results + + def _analyze_chunk_text_content( + self, chunk: pd.DataFrame, text_columns: list[str], config: dict[str, Any] + ) -> dict[str, Any]: + """Analyze text content in a data chunk.""" + results = {} + + for col in text_columns: + if col not in chunk.columns: + continue + + try: + analysis = self.presidio_analyzer.analyze_column_text( + chunk[col], + confidence_threshold=config.get( + "presidio_confidence_threshold", 0.7 + ), + sample_size=config.get("presidio_sample_size", 100), + ) + + if analysis.get("total_detections", 0) > 0: + results[col] = analysis + + except Exception as e: + logger.error(f"Error analyzing chunk for column {col}: {e}") + + return results + + def _aggregate_chunk_results( + self, chunk_results: dict[str, list[dict[str, Any]]], config: dict[str, Any] + ) -> dict[str, PIIDetectionResult]: + """Aggregate results from multiple chunks.""" + aggregated = {} + + for col, results_list in chunk_results.items(): + if not results_list: + continue + + # Combine detection statistics + total_detections = sum(r.get("total_detections", 0) for r in results_list) + total_samples = sum(r.get("sample_analyzed", 0) for r in results_list) + all_scores = [] + all_entities = defaultdict(int) + + for result in results_list: + all_scores.extend(result.get("confidence_scores", [])) + for entity_type, entities in result.get("entities_found", {}).items(): + all_entities[entity_type] += len(entities) + + if total_detections > 0 and all_scores: + avg_confidence = sum(all_scores) / len(all_scores) + detection_rate = total_detections / max(total_samples, 1) + adjusted_confidence = min(avg_confidence * (1 + detection_rate), 1.0) + + confidence_threshold = config.get("presidio_confidence_threshold", 0.7) + if adjusted_confidence >= confidence_threshold: + aggregated[col] = PIIDetectionResult( + column_name=col, + detection_method="presidio_batch_text", + confidence=adjusted_confidence, + entity_types=list(all_entities.keys()), + details={ + "total_detections": total_detections, + "total_samples": total_samples, + "detection_rate": detection_rate, + "entities_found": dict(all_entities), + "batch_processed": True, + }, + ) + + return aggregated + + def _detect_standard( + self, + dataset: pd.DataFrame, + label_dict: dict[str, str] | None, + config: dict[str, Any], + progress_callback: Callable | None = None, + ) -> dict[str, PIIDetectionResult]: + """Run standard detection for smaller datasets.""" + results = self.unified_processor.detect_pii_comprehensive( + dataset, label_dict, config + ) + + self._safe_callback(progress_callback, 100, "Standard detection complete") + + return results + + def anonymize_batch( + self, + dataset: pd.DataFrame, + pii_results: dict[str, PIIDetectionResult], + anonymization_config: dict[str, Any] | None = None, + progress_callback: Callable | None = None, + ) -> tuple[pd.DataFrame, dict[str, Any]]: + """Perform batch anonymization with optimized processing.""" + logger.info(f"Starting batch anonymization of {len(pii_results)} PII columns") + + if len(dataset) > self.chunk_size * 2: + return self._anonymize_with_chunking( + dataset, pii_results, anonymization_config, progress_callback + ) + else: + return self.hybrid_anonymizer.anonymize_dataset( + dataset, pii_results, anonymization_config + ) + + def _anonymize_with_chunking( + self, + dataset: pd.DataFrame, + pii_results: dict[str, PIIDetectionResult], + config: dict[str, Any] | None, + progress_callback: Callable | None = None, + ) -> tuple[pd.DataFrame, dict[str, Any]]: + """Anonymize large datasets in chunks.""" + logger.info(f"Anonymizing dataset in chunks of {self.chunk_size} rows") + + anonymized_chunks = [] + total_chunks = len(dataset) // self.chunk_size + ( + 1 if len(dataset) % self.chunk_size else 0 + ) + + # Process chunks in parallel for columns that support it + text_pii_columns = { + col: result + for col, result in pii_results.items() + if result.detection_method + in ["presidio_text_analysis", "presidio_batch_text", "hybrid_detection"] + and dataset[col].dtype == "object" + } + + # Non-text columns can be processed normally + non_text_pii = { + col: result + for col, result in pii_results.items() + if col not in text_pii_columns + } + + for i, start_idx in enumerate(range(0, len(dataset), self.chunk_size)): + end_idx = min(start_idx + self.chunk_size, len(dataset)) + chunk = dataset.iloc[start_idx:end_idx].copy() + + # Anonymize text columns with Presidio + for col, detection_result in text_pii_columns.items(): + if col in chunk.columns: + anonymized_col = self._anonymize_column_presidio_batch( + chunk[col], detection_result, config + ) + chunk[col] = anonymized_col + + # Anonymize non-text columns with standard methods + if non_text_pii: + chunk, _ = self.hybrid_anonymizer.anonymize_dataset( + chunk, non_text_pii, config + ) + + anonymized_chunks.append(chunk) + + progress = ((i + 1) / total_chunks) * 100 + self._safe_callback( + progress_callback, progress, f"Anonymized chunk {i + 1}/{total_chunks}" + ) + + # Combine chunks + final_dataset = pd.concat(anonymized_chunks, ignore_index=True) + + # Generate report + report = { + "original_shape": dataset.shape, + "final_shape": final_dataset.shape, + "chunks_processed": total_chunks, + "batch_anonymization": True, + "pii_columns": list(pii_results.keys()), + } + + return final_dataset, report + + def _anonymize_column_presidio_batch( + self, + column_data: pd.Series, + detection_result: PIIDetectionResult, + config: dict[str, Any] | None, + ) -> pd.Series: + """Anonymize a column using Presidio with batch optimization.""" + if not self.presidio_analyzer.is_available(): + return column_data + + def anonymize_text_value(text_value): + if isinstance(text_value, str) and len(text_value.strip()) > 0: + return self.presidio_analyzer.anonymize_text(text_value) + return text_value + + return column_data.apply(anonymize_text_value) + + def _get_optimized_detection_config(self) -> dict[str, Any]: + """Get optimized configuration for batch processing.""" + config = self.unified_processor._get_default_config() + + # Optimize for batch processing + config.update( + { + "presidio_sample_size": min( + 200, self.chunk_size // 5 + ), # Sample more for larger chunks + "presidio_confidence_threshold": 0.6, # Slightly lower threshold for batch + "use_presidio_detection": self.presidio_analyzer.is_available(), + "batch_processing": True, + } + ) + + return config + + def get_processing_strategy(self, dataset: pd.DataFrame) -> str: + """Determine the best processing strategy for a dataset.""" + row_count = len(dataset) + + if self.use_structured_engine and row_count > 1000: + return "structured_engine" + elif row_count > self.chunk_size * 2: + return "chunked_processing" + else: + return "standard_processing" + + def estimate_processing_time(self, dataset: pd.DataFrame) -> dict[str, Any]: + """Estimate processing time for different strategies.""" + row_count = len(dataset) + col_count = len(dataset.columns) + text_cols = sum(1 for col in dataset.columns if dataset[col].dtype == "object") + + # Rough estimates based on typical performance + estimates = { + "standard_processing": { + "time_seconds": (row_count * text_cols * 0.001), + "memory_mb": (row_count * col_count * 0.0001), + "recommended": row_count < 10000, + }, + "chunked_processing": { + "time_seconds": (row_count * text_cols * 0.0008), + "memory_mb": (self.chunk_size * col_count * 0.0001), + "recommended": 10000 <= row_count < 100000, + }, + } + + if self.use_structured_engine: + estimates["structured_engine"] = { + "time_seconds": (row_count * text_cols * 0.0005), + "memory_mb": (row_count * col_count * 0.00008), + "recommended": row_count >= 1000, + } + + return estimates + + +# Convenience functions + + +def process_dataset_batch( + dataset: pd.DataFrame, + label_dict: dict[str, str] | None = None, + language: str = "en", + detection_config: dict[str, Any] | None = None, + anonymization_config: dict[str, Any] | None = None, + chunk_size: int = 1000, + max_workers: int = 4, + progress_callback: Callable | None = None, +) -> tuple[dict[str, PIIDetectionResult], pd.DataFrame, dict[str, Any]]: + """Complete batch processing workflow for PII detection and anonymization. + + Args: + dataset: Input dataset + label_dict: Column labels mapping + language: Language for text processing + detection_config: Detection configuration + anonymization_config: Anonymization configuration + chunk_size: Chunk size for processing + max_workers: Maximum parallel workers + progress_callback: Progress callback function + + Returns: + Tuple of (detection_results, anonymized_dataset, report) + + """ + processor = BatchPIIProcessor( + language=language, chunk_size=chunk_size, max_workers=max_workers + ) + + # Detection phase + detection_results = processor.detect_pii_batch( + dataset, label_dict, detection_config, progress_callback + ) + + # Anonymization phase + anonymized_dataset, anonymization_report = processor.anonymize_batch( + dataset, detection_results, anonymization_config, progress_callback + ) + + return detection_results, anonymized_dataset, anonymization_report diff --git a/src/pii_detector/core/hash_utils.py b/src/pii_detector/core/hash_utils.py new file mode 100644 index 0000000..b592f02 --- /dev/null +++ b/src/pii_detector/core/hash_utils.py @@ -0,0 +1,52 @@ +"""Hash utilities for anonymizing PII data.""" + +import hashlib +import hmac +import os + + +def sha1(message: str) -> str: + """Generate SHA1 hash of a message.""" + return hashlib.sha1(bytes(message, encoding="utf-8")).hexdigest() + + +def hmac_sha1(secret_key: str, message: str) -> str: + """Generate HMAC-SHA1 hash of a message with a secret key.""" + h = hmac.new( + bytes(secret_key, encoding="utf-8"), + msg=bytes(message, encoding="utf-8"), + digestmod=hashlib.sha1, + ) + return h.hexdigest() + + +def get_or_create_secret_key() -> str: + """Get or create a secret key for hashing operations.""" + # In a real implementation, this should be securely stored + # For now, generate a consistent key based on environment + key = os.environ.get("PII_HASH_SECRET_KEY") + if not key: + # Generate a default key (not secure for production) + key = "default_secret_key_change_me" + return key + + +def generate_hash(message: str, use_hmac: bool = True) -> str: + """Generate a hash for anonymizing PII data.""" + if use_hmac: + secret_key = get_or_create_secret_key() + return hmac_sha1(secret_key, message) + else: + return sha1(message) + + +if __name__ == "__main__": + # Example usage + test_message = "The Ore-Ida brand is a syllabic abbreviation of Oregon and Idaho" + print(f"SHA1: {sha1(test_message)}") + + secret_key = get_or_create_secret_key() + example = {} + for name in ["felipe", "michael", "lindsey"]: + example[name] = hmac_sha1(secret_key, name) + print(f"HMAC examples: {example}") diff --git a/src/pii_detector/core/hybrid_anonymizer.py b/src/pii_detector/core/hybrid_anonymizer.py new file mode 100644 index 0000000..38d34fc --- /dev/null +++ b/src/pii_detector/core/hybrid_anonymizer.py @@ -0,0 +1,512 @@ +"""Hybrid anonymization engine combining existing techniques with Presidio operators. + +This module extends the current anonymization capabilities by integrating Presidio's +advanced text anonymization while preserving all existing statistical anonymization methods. +""" + +import logging +from typing import Any + +import pandas as pd + +from pii_detector.core.anonymization import AnonymizationTechniques +from pii_detector.core.presidio_engine import get_presidio_analyzer +from pii_detector.core.unified_processor import PIIDetectionResult + +logger = logging.getLogger(__name__) + + +class HybridAnonymizer: + """Hybrid anonymizer combining statistical methods with Presidio text anonymization.""" + + def __init__(self, random_seed: int = 42, language: str = "en"): + """Initialize the hybrid anonymizer. + + Args: + random_seed: Random seed for reproducible results + language: Language code for text processing + + """ + self.current_techniques = AnonymizationTechniques(random_seed=random_seed) + self.presidio_analyzer = get_presidio_analyzer(language=language) + self.language = language + + def anonymize_dataset( + self, + dataset: pd.DataFrame, + pii_columns: list[str] | dict[str, PIIDetectionResult], + anonymization_config: dict[str, Any] | None = None, + ) -> tuple[pd.DataFrame, dict[str, Any]]: + """Anonymize a dataset using hybrid approach. + + Args: + dataset: Original dataset + pii_columns: Either list of column names or detection results + anonymization_config: Configuration for anonymization methods + + Returns: + Tuple of (anonymized_dataset, anonymization_report) + + """ + if anonymization_config is None: + anonymization_config = self._get_default_anonymization_config() + + logger.info(f"Starting hybrid anonymization of {len(dataset)} rows") + + # Convert pii_columns to consistent format + if isinstance(pii_columns, dict): + column_detection_map = pii_columns + column_names = list(pii_columns.keys()) + else: + column_names = pii_columns + column_detection_map = {} + + anonymized_dataset = dataset.copy() + anonymization_log = { + "original_shape": dataset.shape, + "columns_processed": [], + "methods_applied": {}, + "text_anonymization": {}, + "structural_anonymization": {}, + } + + # Process each PII column + for column_name in column_names: + if column_name not in dataset.columns: + logger.warning(f"Column {column_name} not found in dataset") + continue + + detection_result = column_detection_map.get(column_name) + column_config = anonymization_config.get(column_name, {}) + + # Determine anonymization method + method = self._determine_anonymization_method( + dataset[column_name], detection_result, column_config + ) + + logger.info(f"Anonymizing column {column_name} using method: {method}") + + try: + # Apply anonymization + anonymized_column, method_log = self._anonymize_column( + dataset[column_name], method, detection_result, column_config + ) + + anonymized_dataset[column_name] = anonymized_column + + # Log the results + anonymization_log["columns_processed"].append(column_name) + anonymization_log["methods_applied"][column_name] = method + + if method.startswith("presidio"): + anonymization_log["text_anonymization"][column_name] = method_log + else: + anonymization_log["structural_anonymization"][column_name] = ( + method_log + ) + + except Exception as e: + logger.error(f"Error anonymizing column {column_name}: {e}") + # Keep original column if anonymization fails + anonymization_log["methods_applied"][column_name] = "failed" + + # Generate comprehensive report + anonymization_report = self.current_techniques.anonymization_report( + dataset, anonymized_dataset + ) + anonymization_report.update(anonymization_log) + + logger.info( + f"Anonymization completed. Processed {len(anonymization_log['columns_processed'])} columns" + ) + return anonymized_dataset, anonymization_report + + def _determine_anonymization_method( + self, + column_data: pd.Series, + detection_result: PIIDetectionResult | None, + column_config: dict[str, Any], + ) -> str: + """Determine the best anonymization method for a column.""" + # Check if user specified a method + if "method" in column_config: + return column_config["method"] + + # If no detection result, use basic removal + if detection_result is None: + return "remove" + + # Decide based on detection method and entity types + detection_method = detection_result.detection_method + entity_types = detection_result.entity_types + + # Use Presidio for text-based detections with specific entity types + if ( + detection_method in ["presidio_text_analysis", "hybrid_detection"] + and entity_types + and self.presidio_analyzer.is_available() + ): + # Check if we have known entity types that Presidio handles well + presidio_entities = { + "PERSON", + "EMAIL_ADDRESS", + "PHONE_NUMBER", + "US_SSN", + "LOCATION", + } + if any(entity in presidio_entities for entity in entity_types): + return "presidio_replace" + + # Use structural methods for specific detection types + if detection_method == "format_patterns": + return "text_masking" + elif detection_method == "sparsity_analysis": + return "hash_pseudonymization" + elif detection_method == "column_name_matching": + # Choose based on likely column type + column_name_lower = detection_result.column_name.lower() + if any(word in column_name_lower for word in ["age", "birth"]): + return "age_categorization" + elif any( + word in column_name_lower for word in ["income", "salary", "wage"] + ): + return "income_categorization" + elif any( + word in column_name_lower for word in ["location", "address", "city"] + ): + return "geographic_generalization" + else: + return "hash_pseudonymization" + elif detection_method == "location_population": + return "geographic_generalization" + + # Default method + return "hash_pseudonymization" + + def _anonymize_column( + self, + column_data: pd.Series, + method: str, + detection_result: PIIDetectionResult | None, + column_config: dict[str, Any], + ) -> tuple[pd.Series, dict[str, Any]]: + """Anonymize a single column using the specified method.""" + method_log = {"method": method, "original_unique": column_data.nunique()} + + if method == "remove": + # Complete removal + anonymized_data = pd.Series( + [None] * len(column_data), name=column_data.name + ) + method_log["action"] = "column_removed" + + elif method == "presidio_replace": + # Use Presidio for text anonymization + anonymized_data = self._presidio_anonymize_column( + column_data, detection_result, column_config + ) + method_log["presidio_available"] = self.presidio_analyzer.is_available() + method_log["entity_types"] = ( + detection_result.entity_types if detection_result else [] + ) + + elif method == "text_masking": + # Use current text masking + anonymized_data = column_data.apply( + lambda x: self.current_techniques.text_masking(x) if pd.notna(x) else x + ) + method_log["action"] = "text_patterns_masked" + + elif method == "hash_pseudonymization": + # Use hash-based pseudonymization + anonymized_data = self.current_techniques.hash_pseudonymization( + column_data, + consistent=column_config.get("consistent_hashing", True), + prefix=column_config.get("prefix", "ID_"), + ) + method_log["action"] = "hash_pseudonymization" + + elif method == "age_categorization": + # Age categorization + anonymized_data = self.current_techniques.age_categorization( + column_data, + bins=column_config.get("age_bins"), + labels=column_config.get("age_labels"), + ) + method_log["action"] = "age_categorized" + + elif method == "income_categorization": + # Income categorization + anonymized_data = self.current_techniques.income_categorization( + column_data, + bins=column_config.get("income_bins"), + labels=column_config.get("income_labels"), + ) + method_log["action"] = "income_categorized" + + elif method == "geographic_generalization": + # Geographic generalization + anonymized_data = self.current_techniques.geographic_generalization( + column_data, level=column_config.get("geo_level", "region") + ) + method_log["action"] = "geography_generalized" + + elif method == "date_generalization": + # Date generalization + anonymized_data = self.current_techniques.date_generalization( + column_data, precision=column_config.get("date_precision", "month") + ) + method_log["action"] = "dates_generalized" + + elif method == "top_bottom_coding": + # Top/bottom coding for numeric data + anonymized_data = self.current_techniques.top_bottom_coding( + column_data, + top_percentile=column_config.get("top_percentile", 95), + bottom_percentile=column_config.get("bottom_percentile", 5), + ) + method_log["action"] = "top_bottom_coded" + + elif method == "add_noise": + # Add statistical noise + anonymized_data = self.current_techniques.add_noise( + column_data, + noise_type=column_config.get("noise_type", "gaussian"), + noise_level=column_config.get("noise_level", 0.1), + ) + method_log["action"] = "noise_added" + + else: + # Unknown method - default to hash pseudonymization + logger.warning( + f"Unknown anonymization method: {method}. Using hash pseudonymization." + ) + anonymized_data = self.current_techniques.hash_pseudonymization(column_data) + method_log["action"] = "default_hash_pseudonymization" + method_log["warning"] = f"Unknown method {method}" + + method_log["final_unique"] = anonymized_data.nunique() + method_log["uniqueness_reduction"] = ( + (method_log["original_unique"] - method_log["final_unique"]) + / method_log["original_unique"] + * 100 + if method_log["original_unique"] > 0 + else 0 + ) + + return anonymized_data, method_log + + def _presidio_anonymize_column( + self, + column_data: pd.Series, + detection_result: PIIDetectionResult | None, + column_config: dict[str, Any], + ) -> pd.Series: + """Anonymize column using Presidio text anonymization.""" + if not self.presidio_analyzer.is_available(): + logger.warning("Presidio not available, falling back to text masking") + return column_data.apply( + lambda x: self.current_techniques.text_masking(x) if pd.notna(x) else x + ) + + # Custom operators based on detected entities + operators = column_config.get("presidio_operators") + if operators is None and detection_result: + operators = self._create_operators_for_entities( + detection_result.entity_types + ) + + def anonymize_text_value(text_value): + if isinstance(text_value, str) and len(text_value.strip()) > 0: + return self.presidio_analyzer.anonymize_text( + text_value, operators=operators + ) + return text_value + + return column_data.apply(anonymize_text_value) + + def _create_operators_for_entities(self, entity_types: list[str]) -> dict[str, Any]: + """Create Presidio operators based on detected entity types.""" + if not self.presidio_analyzer.is_available(): + return {} + + try: + from presidio_anonymizer.entities import OperatorConfig + + operators = {} + for entity_type in entity_types: + if entity_type == "PERSON": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[PERSON]"} + ) + elif entity_type == "EMAIL_ADDRESS": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[EMAIL]"} + ) + elif entity_type == "PHONE_NUMBER": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[PHONE]"} + ) + elif entity_type == "US_SSN": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[SSN]"} + ) + elif entity_type == "LOCATION": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[LOCATION]"} + ) + elif entity_type == "DATE_TIME": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[DATE]"} + ) + elif entity_type == "CREDIT_CARD": + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[CARD]"} + ) + else: + # Default operator for unknown entity types + operators[entity_type] = OperatorConfig( + "replace", {"new_value": "[REDACTED]"} + ) + + return operators + + except ImportError: + logger.warning("Presidio anonymizer not available") + return {} + + def _get_default_anonymization_config(self) -> dict[str, Any]: + """Get default configuration for anonymization methods.""" + return { + # Global settings + "prefer_presidio_for_text": True, + "consistent_hashing": True, + # Method-specific defaults + "age_bins": [0, 18, 30, 45, 60, 100], + "age_labels": ["Under 18", "18-29", "30-44", "45-59", "60+"], + "income_bins": [0, 25000, 50000, 75000, 100000, float("inf")], + "income_labels": ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"], + "geo_level": "region", + "date_precision": "month", + "top_percentile": 95, + "bottom_percentile": 5, + "noise_type": "gaussian", + "noise_level": 0.1, + } + + def anonymize_text_content( + self, text: str, entities_to_anonymize: list[str] | None = None + ) -> str: + """Anonymize text content using Presidio. + + Args: + text: Text to anonymize + entities_to_anonymize: Specific entity types to target + + Returns: + Anonymized text + + """ + if self.presidio_analyzer.is_available(): + # Analyze first + analysis_results = self.presidio_analyzer.analyze_text(text) + + # Filter to specific entities if requested + if entities_to_anonymize: + analysis_results = [ + result + for result in analysis_results + if result["entity_type"] in entities_to_anonymize + ] + + # Anonymize + return self.presidio_analyzer.anonymize_text(text, analysis_results) + else: + # Fall back to basic text masking + return self.current_techniques.text_masking(text) + + def get_available_methods(self) -> dict[str, dict[str, Any]]: + """Get information about available anonymization methods.""" + methods = { + "remove": { + "description": "Complete removal of the column", + "suitable_for": ["any"], + "preserves_format": False, + }, + "hash_pseudonymization": { + "description": "Replace with consistent hash-based identifiers", + "suitable_for": ["identifiers", "names"], + "preserves_format": False, + }, + "age_categorization": { + "description": "Convert ages to categorical ranges", + "suitable_for": ["numeric", "age"], + "preserves_format": False, + }, + "income_categorization": { + "description": "Convert income to categorical ranges", + "suitable_for": ["numeric", "income"], + "preserves_format": False, + }, + "geographic_generalization": { + "description": "Generalize locations to broader regions", + "suitable_for": ["location", "geographic"], + "preserves_format": False, + }, + "date_generalization": { + "description": "Reduce date precision (year, month, quarter)", + "suitable_for": ["date", "temporal"], + "preserves_format": True, + }, + "text_masking": { + "description": "Mask PII patterns in text with placeholders", + "suitable_for": ["text", "mixed"], + "preserves_format": True, + }, + "add_noise": { + "description": "Add statistical noise to numeric values", + "suitable_for": ["numeric"], + "preserves_format": True, + }, + "top_bottom_coding": { + "description": "Cap extreme values in distributions", + "suitable_for": ["numeric"], + "preserves_format": True, + }, + } + + # Add Presidio methods if available + if self.presidio_analyzer.is_available(): + methods["presidio_replace"] = { + "description": "Advanced text anonymization using Presidio ML models", + "suitable_for": ["text", "mixed"], + "preserves_format": True, + "entity_types": self.presidio_analyzer.get_supported_entities(), + } + + return methods + + +# Convenience functions + + +def anonymize_dataset_hybrid( + dataset: pd.DataFrame, + pii_columns: list[str] | dict[str, PIIDetectionResult], + config: dict[str, Any] | None = None, + language: str = "en", +) -> tuple[pd.DataFrame, dict[str, Any]]: + """Perform hybrid dataset anonymization. + + Args: + dataset: Original dataset + pii_columns: PII columns to anonymize + config: Anonymization configuration + language: Language for text processing + + Returns: + Tuple of (anonymized_dataset, anonymization_report) + + """ + anonymizer = HybridAnonymizer(language=language) + return anonymizer.anonymize_dataset(dataset, pii_columns, config) diff --git a/src/pii_detector/core/model_manager.py b/src/pii_detector/core/model_manager.py new file mode 100644 index 0000000..ae8a1a3 --- /dev/null +++ b/src/pii_detector/core/model_manager.py @@ -0,0 +1,544 @@ +"""Dynamic spaCy model management for Presidio integration. + +This module handles automatic detection, installation, and management of spaCy models +without hardcoding versions or model sizes. It provides flexible model installation +and graceful degradation when models are not available. +""" + +import logging +import subprocess +import sys + +logger = logging.getLogger(__name__) + +# Default models by language (size preference: small -> medium -> large) +DEFAULT_MODELS = { + "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"], + "es": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], + "de": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], + "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], + "it": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"], + "pt": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"], + "nl": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "zh": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], + "ja": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"], +} + + +class SpacyModelManager: + """Manages spaCy model installation and detection.""" + + def __init__(self): + """Initialize the model manager.""" + self.spacy_available = False + self.installed_models = [] + self._check_spacy_availability() + + def _check_spacy_availability(self) -> None: + """Check if spaCy is available.""" + try: + import spacy # noqa: F401 + + self.spacy_available = True + self.installed_models = self._get_installed_models() + logger.info(f"spaCy available. Installed models: {self.installed_models}") + except ImportError: + logger.warning("spaCy not available") + + def _get_installed_models(self) -> list[str]: + """Get list of installed spaCy models.""" + if not self.spacy_available: + return [] + + try: + import spacy + + return list(spacy.util.get_installed_models()) + except Exception as e: + logger.error(f"Error getting installed models: {e}") + return [] + + def get_best_model( + self, language: str = "en", preferred_size: str = "sm" + ) -> str | None: + """Get the best available model for a language. + + Args: + language: Language code (e.g., 'en', 'es', 'de') + preferred_size: Preferred model size ('sm', 'md', 'lg') + + Returns: + Model name if available, None otherwise + + """ + if not self.spacy_available: + return None + + # Get possible models for the language + possible_models = DEFAULT_MODELS.get(language, []) + if not possible_models: + logger.warning(f"No known models for language: {language}") + return None + + # Reorder based on size preference + if preferred_size == "lg": + possible_models = ( + [m for m in possible_models if "_lg" in m] + + [m for m in possible_models if "_md" in m] + + [m for m in possible_models if "_sm" in m] + ) + elif preferred_size == "md": + possible_models = ( + [m for m in possible_models if "_md" in m] + + [m for m in possible_models if "_sm" in m] + + [m for m in possible_models if "_lg" in m] + ) + # Default: prefer small models + + # Find first available model + for model in possible_models: + if model in self.installed_models: + logger.info(f"Using installed model: {model}") + return model + + # No installed model found + logger.warning(f"No installed models found for language {language}") + return None + + def install_model(self, model_name: str, force: bool = False) -> bool: + """Install a spaCy model. + + Args: + model_name: Name of the model to install + force: Force installation even if already installed + + Returns: + True if installation successful, False otherwise + + """ + if not self.spacy_available: + logger.error("spaCy not available, cannot install models") + return False + + if not force and model_name in self.installed_models: + logger.info(f"Model {model_name} already installed") + return True + + try: + logger.info(f"Installing spaCy model: {model_name}") + + # Try multiple installation methods + success = False + + # Method 1: Try uv add with model URL (for uv environments) + if self._is_uv_environment(): + success = self._install_with_uv(model_name) + + # Method 2: Fall back to spacy download if uv fails + if not success: + success = self._install_with_spacy_download(model_name) + + # Method 3: Last resort - try pip if available + if not success: + success = self._install_with_pip(model_name) + + if success: + logger.info(f"Successfully installed model: {model_name}") + self.installed_models = self._get_installed_models() # Refresh list + return True + else: + logger.error(f"Failed to install model {model_name} using all methods") + return False + + except subprocess.TimeoutExpired: + logger.error(f"Timeout installing model {model_name}") + return False + except Exception as e: + logger.error(f"Error installing model {model_name}: {e}") + return False + + def _is_uv_environment(self) -> bool: + """Check if we're running in a uv environment.""" + # Check if uv is available and if we're in a uv project + try: + # Look for uv.lock file or .venv created by uv + import os + + current_dir = os.getcwd() + return os.path.exists( + os.path.join(current_dir, "uv.lock") + ) or os.path.exists(os.path.join(current_dir, "pyproject.toml")) + except Exception: + return False + + def _get_spacy_version(self) -> str: + """Get the installed spaCy version.""" + try: + import spacy + + return spacy.__version__ + except Exception: + return "3.8.0" # Default fallback version + + def _get_model_url(self, model_name: str) -> str | None: + """Get the GitHub URL for a spaCy model based on current spaCy version.""" + spacy_version = self._get_spacy_version() + + # Map of major spaCy versions to model versions + version_map = { + "3.8": "3.8.0", + "3.7": "3.7.1", + "3.6": "3.6.1", + "3.5": "3.5.0", + "3.4": "3.4.4", + } + + # Get major.minor version + major_minor = ".".join(spacy_version.split(".")[:2]) + model_version = version_map.get(major_minor, spacy_version) + + # Base URL pattern + base_url = "https://github.com/explosion/spacy-models/releases/download" + + # Common models with predictable naming + model_patterns = [ + "en_core_web_sm", + "en_core_web_md", + "en_core_web_lg", + "es_core_news_sm", + "es_core_news_md", + "es_core_news_lg", + "de_core_news_sm", + "de_core_news_md", + "de_core_news_lg", + "fr_core_news_sm", + "fr_core_news_md", + "fr_core_news_lg", + "it_core_news_sm", + "it_core_news_md", + "it_core_news_lg", + "pt_core_news_sm", + "pt_core_news_md", + "pt_core_news_lg", + "nl_core_news_sm", + "nl_core_news_md", + "nl_core_news_lg", + "zh_core_web_sm", + "zh_core_web_md", + "zh_core_web_lg", + "ja_core_news_sm", + "ja_core_news_md", + "ja_core_news_lg", + ] + + if model_name in model_patterns: + return f"{base_url}/{model_name}-{model_version}/{model_name}-{model_version}-py3-none-any.whl" + else: + logger.warning(f"Unknown model pattern: {model_name}") + return None + + def _install_with_uv(self, model_name: str) -> bool: + """Install model using uv add with direct URL.""" + try: + model_url = self._get_model_url(model_name) + if not model_url: + logger.warning(f"Cannot determine URL for model {model_name}") + return False + + # Use uv add to install from URL + cmd = ["uv", "add", f"{model_name}@{model_url}"] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + if result.returncode == 0: + logger.info(f"Successfully installed {model_name} with uv") + return True + else: + logger.warning( + f"uv installation failed for {model_name}: {result.stderr}" + ) + return False + + except Exception as e: + logger.warning(f"Error installing {model_name} with uv: {e}") + return False + + def _install_with_spacy_download(self, model_name: str) -> bool: + """Install model using spacy download command.""" + try: + cmd = [sys.executable, "-m", "spacy", "download", model_name] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + if result.returncode == 0: + logger.info(f"Successfully installed {model_name} with spacy download") + return True + else: + logger.warning( + f"spacy download failed for {model_name}: {result.stderr}" + ) + return False + + except Exception as e: + logger.warning(f"Error installing {model_name} with spacy download: {e}") + return False + + def _install_with_pip(self, model_name: str) -> bool: + """Install model using pip (fallback method).""" + try: + # First check if pip is available + pip_cmd = [sys.executable, "-m", "pip", "--version"] + pip_check = subprocess.run( + pip_cmd, capture_output=True, text=True, timeout=10 + ) + + if pip_check.returncode != 0: + logger.warning("pip not available, cannot install with pip") + return False + + # Get model URL + model_url = self._get_model_url(model_name) + if not model_url: + logger.warning(f"Cannot determine URL for model {model_name}") + return False + + # Try to install with pip + cmd = [sys.executable, "-m", "pip", "install", model_url] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + if result.returncode == 0: + logger.info(f"Successfully installed {model_name} with pip") + return True + else: + logger.warning( + f"pip installation failed for {model_name}: {result.stderr}" + ) + return False + + except Exception as e: + logger.warning(f"Error installing {model_name} with pip: {e}") + return False + + def install_default_model( + self, language: str = "en", preferred_size: str = "sm" + ) -> str | None: + """Install and return the best default model for a language. + + Args: + language: Language code + preferred_size: Preferred model size + + Returns: + Installed model name if successful, None otherwise + + """ + if not self.spacy_available: + return None + + # Check if we already have a good model + existing_model = self.get_best_model(language, preferred_size) + if existing_model: + return existing_model + + # Try to install models in preference order + possible_models = DEFAULT_MODELS.get(language, []) + if not possible_models: + return None + + # Reorder based on size preference + if preferred_size == "lg": + install_order = ( + [m for m in possible_models if "_lg" in m] + + [m for m in possible_models if "_md" in m] + + [m for m in possible_models if "_sm" in m] + ) + elif preferred_size == "md": + install_order = ( + [m for m in possible_models if "_md" in m] + + [m for m in possible_models if "_sm" in m] + + [m for m in possible_models if "_lg" in m] + ) + else: # Default: prefer small models + install_order = ( + [m for m in possible_models if "_sm" in m] + + [m for m in possible_models if "_md" in m] + + [m for m in possible_models if "_lg" in m] + ) + + # Try to install in order + for model in install_order: + if self.install_model(model): + return model + + logger.error(f"Failed to install any model for language {language}") + return None + + def ensure_model_available( + self, language: str = "en", preferred_size: str = "sm" + ) -> str | None: + """Ensure a model is available, installing if necessary. + + Args: + language: Language code + preferred_size: Preferred model size + + Returns: + Available model name if successful, None otherwise + + """ + # First check if we already have a good model + existing_model = self.get_best_model(language, preferred_size) + if existing_model: + return existing_model + + # Try to install a suitable model + logger.info(f"No suitable {language} model found, attempting installation...") + return self.install_default_model(language, preferred_size) + + def get_model_info(self, model_name: str) -> dict[str, str]: + """Get information about a model. + + Args: + model_name: Name of the model + + Returns: + Dictionary with model information + + """ + if not self.spacy_available or model_name not in self.installed_models: + return {"status": "not_available"} + + try: + import spacy + + nlp = spacy.load(model_name) + return { + "status": "available", + "name": model_name, + "language": nlp.lang, + "version": nlp.meta.get("version", "unknown"), + "size": "sm" + if "_sm" in model_name + else "md" + if "_md" in model_name + else "lg", + "components": list(nlp.pipe_names), + } + except Exception as e: + logger.error(f"Error getting info for model {model_name}: {e}") + return {"status": "error", "error": str(e)} + + def get_available_languages(self) -> list[str]: + """Get list of languages for which models are available. + + Returns: + List of language codes + + """ + return list(DEFAULT_MODELS.keys()) + + def cleanup_unused_models(self, keep_languages: list[str] | None = None) -> None: + """Remove unused spaCy models to save space. + + Args: + keep_languages: Languages to keep models for (None = keep all) + + """ + if not self.spacy_available or not keep_languages: + return + + models_to_remove = [] + for model in self.installed_models: + model_lang = model.split("_")[0] # Extract language from model name + if model_lang not in keep_languages: + models_to_remove.append(model) + + for model in models_to_remove: + try: + logger.info(f"Removing unused model: {model}") + + # Try uv remove first if in uv environment + success = False + if self._is_uv_environment(): + try: + cmd = ["uv", "remove", model] + result = subprocess.run( + cmd, capture_output=True, text=True, timeout=60 + ) + if result.returncode == 0: + success = True + logger.info(f"Successfully removed {model} with uv") + except Exception as e: + logger.warning(f"uv remove failed for {model}: {e}") + + # Fall back to pip if uv didn't work + if not success: + try: + cmd = [sys.executable, "-m", "pip", "uninstall", "-y", model] + subprocess.run(cmd, capture_output=True, text=True, timeout=60) + logger.info(f"Successfully removed {model} with pip") + except Exception as e: + logger.error(f"Error removing model {model}: {e}") + + except Exception as e: + logger.error(f"Error removing model {model}: {e}") + + # Refresh installed models list + self.installed_models = self._get_installed_models() + + +# Global instance +_model_manager = None + + +def get_model_manager() -> SpacyModelManager: + """Get or create the global model manager instance.""" + global _model_manager + if _model_manager is None: + _model_manager = SpacyModelManager() + return _model_manager + + +def ensure_spacy_model(language: str = "en", preferred_size: str = "sm") -> str | None: + """Ensure a spaCy model is available. + + Args: + language: Language code + preferred_size: Preferred model size + + Returns: + Available model name if successful, None otherwise + + """ + manager = get_model_manager() + return manager.ensure_model_available(language, preferred_size) + + +def get_best_spacy_model( + language: str = "en", preferred_size: str = "sm" +) -> str | None: + """Get the best available spaCy model. + + Args: + language: Language code + preferred_size: Preferred model size + + Returns: + Model name if available, None otherwise + + """ + manager = get_model_manager() + return manager.get_best_model(language, preferred_size) + + +def install_spacy_model(model_name: str, force: bool = False) -> bool: + """Install a spaCy model. + + Args: + model_name: Name of the model to install + force: Force installation even if already installed + + Returns: + True if installation successful, False otherwise + + """ + manager = get_model_manager() + return manager.install_model(model_name, force) diff --git a/src/pii_detector/core/presidio_engine.py b/src/pii_detector/core/presidio_engine.py new file mode 100644 index 0000000..e7a0633 --- /dev/null +++ b/src/pii_detector/core/presidio_engine.py @@ -0,0 +1,621 @@ +"""Presidio integration engine for advanced PII detection and anonymization. + +This module provides a wrapper around Microsoft Presidio's analyzer and anonymizer +engines, with graceful degradation if Presidio dependencies are not available. +""" + +import logging +from typing import Any + +import pandas as pd + +logger = logging.getLogger(__name__) + +# Reduce logging noise from Presidio +presidio_loggers = [ + "presidio-analyzer", + "presidio_analyzer", + "presidio_anonymizer", + "presidio-anonymizer", + "spacy", +] +for logger_name in presidio_loggers: + logging.getLogger(logger_name).setLevel(logging.WARNING) + +# Global flags for availability +PRESIDIO_AVAILABLE = False +PRESIDIO_ANALYZER = None +PRESIDIO_ANONYMIZER = None + +try: + from presidio_analyzer import AnalyzerEngine, RecognizerResult # noqa: F401 + from presidio_anonymizer import AnonymizerEngine + from presidio_anonymizer.entities import OperatorConfig + + PRESIDIO_AVAILABLE = True + logger.info("Presidio successfully imported") +except ImportError as e: + logger.warning(f"Presidio not available: {e}. Falling back to basic text analysis.") + +# Import model manager for dynamic spaCy model handling +try: + from pii_detector.core.model_manager import ensure_spacy_model + + MODEL_MANAGER_AVAILABLE = True +except ImportError: + MODEL_MANAGER_AVAILABLE = False + + +class PresidioTextAnalyzer: + """Presidio-powered text analysis for advanced PII detection.""" + + def __init__(self, language: str = "en", preferred_model_size: str = "sm"): + """Initialize the Presidio analyzer. + + Args: + language: Language code for analysis (default: 'en') + preferred_model_size: Preferred spaCy model size ('sm', 'md', 'lg') + + """ + self.language = language + self.preferred_model_size = preferred_model_size + self.analyzer = None + self.anonymizer = None + self.available = PRESIDIO_AVAILABLE + self.spacy_model = None + + if self.available: + try: + # Ensure spaCy model is available + if MODEL_MANAGER_AVAILABLE: + self.spacy_model = ensure_spacy_model( + language, preferred_model_size + ) + if self.spacy_model: + logger.info(f"Using spaCy model: {self.spacy_model}") + else: + logger.warning( + f"No spaCy model available for language {language}" + ) + + # Initialize Presidio engines + # If we have a specific model, configure the analyzer to use it + if self.spacy_model: + from presidio_analyzer import AnalyzerEngine, RecognizerRegistry + from presidio_analyzer.nlp_engine import NlpEngineProvider + + # Create NLP configuration with reduced warnings + nlp_configuration = { + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": language, "model_name": self.spacy_model} + ], + "ner_model_configuration": { + "labels_to_ignore": [ + "CARDINAL", + "FAC", + "EVENT", + "LAW", + "LANGUAGE", + "WORK_OF_ART", + "ORDINAL", + "QUANTITY", + "DATE", + ], + "model_to_presidio_entity_mapping": { + "PERSON": "PERSON", + "GPE": "LOCATION", + "ORG": "ORGANIZATION", + }, + "low_score_entity_names": [], + }, + } + nlp_engine = NlpEngineProvider( + nlp_configuration=nlp_configuration + ).create_engine() + + # Create registry and analyzer + registry = RecognizerRegistry() + registry.load_predefined_recognizers(nlp_engine=nlp_engine) + self.analyzer = AnalyzerEngine( + nlp_engine=nlp_engine, registry=registry + ) + else: + # Use default configuration with reduced warnings + self.analyzer = AnalyzerEngine() + + self.anonymizer = AnonymizerEngine() + logger.info("Presidio engines initialized successfully") + except Exception as e: + logger.error(f"Failed to initialize Presidio engines: {e}") + self.available = False + + def is_available(self) -> bool: + """Check if Presidio is available and properly initialized.""" + return self.available and self.analyzer is not None + + def analyze_text( + self, + text: str, + entities: list[str] | None = None, + confidence_threshold: float = 0.7, + ) -> list[dict[str, Any]]: + """Analyze text for PII entities using Presidio. + + Args: + text: Text to analyze + entities: List of entity types to look for (None = all) + confidence_threshold: Minimum confidence score + + Returns: + List of detected PII entities with metadata + + """ + if not self.is_available(): + logger.warning("Presidio not available, returning empty results") + return [] + + if not text or not isinstance(text, str): + return [] + + try: + # Use Presidio analyzer + results = self.analyzer.analyze( + text=text, entities=entities, language=self.language + ) + + # Filter by confidence and format results + formatted_results = [] + for result in results: + if result.score >= confidence_threshold: + formatted_results.append( + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "text": text[result.start : result.end], + } + ) + + return formatted_results + + except Exception as e: + logger.error(f"Error analyzing text with Presidio: {e}") + return [] + + def _analyze_text_batch( + self, text_batch: pd.Series, confidence_threshold: float + ) -> tuple[list[dict[str, Any]], list[float]]: + """Analyze a batch of text values efficiently. + + Args: + text_batch: Series of text values to analyze + confidence_threshold: Minimum confidence threshold + + Returns: + Tuple of (all_entities, all_scores) + + """ + all_entities = [] + all_scores = [] + + try: + # Try to use batch analyzer if available + if hasattr(self, "_batch_analyzer") and self._batch_analyzer: + # Convert to list and filter valid texts + texts = [ + str(text) + for text in text_batch + if isinstance(text, str) and len(str(text).strip()) > 0 + ] + + if texts: + batch_results = self._batch_analyzer.analyze_iterator( + texts, language=self.language + ) + + for result_list in batch_results: + for result in result_list: + if result.score >= confidence_threshold: + all_entities.append( + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "text": texts[0][ + result.start : result.end + ], # Approximate + } + ) + all_scores.append(result.score) + else: + # Fall back to individual processing + for text_value in text_batch: + if isinstance(text_value, str) and len(text_value.strip()) > 0: + entities = self.analyze_text( + text_value, confidence_threshold=confidence_threshold + ) + all_entities.extend(entities) + all_scores.extend([entity["score"] for entity in entities]) + + except Exception as e: + logger.warning(f"Batch processing failed, falling back to individual: {e}") + # Fall back to individual processing + for text_value in text_batch: + if isinstance(text_value, str) and len(text_value.strip()) > 0: + entities = self.analyze_text( + text_value, confidence_threshold=confidence_threshold + ) + all_entities.extend(entities) + all_scores.extend([entity["score"] for entity in entities]) + + return all_entities, all_scores + + def analyze_column_text( + self, + column_data: pd.Series, + confidence_threshold: float = 0.7, + sample_size: int = 100, + batch_size: int | None = None, + ) -> dict[str, Any]: + """Analyze text content in a pandas column with optional batch processing. + + Args: + column_data: Pandas Series containing text data + confidence_threshold: Minimum confidence for detection + sample_size: Maximum number of samples to analyze + batch_size: Optional batch size for processing large columns + + Returns: + Dictionary with analysis results and statistics + + """ + if not self.is_available(): + return { + "presidio_available": False, + "entities_found": [], + "total_detections": 0, + "confidence_scores": [], + "sample_analyzed": 0, + } + + # Clean and sample the data + clean_data = column_data.dropna() + clean_data = clean_data[clean_data.astype(str).str.strip() != ""] + + if len(clean_data) == 0: + return { + "presidio_available": True, + "entities_found": [], + "total_detections": 0, + "confidence_scores": [], + "sample_analyzed": 0, + } + + # Sample data for analysis + sample_data = clean_data.head(min(sample_size, len(clean_data))) + + all_entities = [] + all_scores = [] + + # Process in batches if batch_size is specified and we have many samples + if batch_size and len(sample_data) > batch_size: + for i in range(0, len(sample_data), batch_size): + batch = sample_data.iloc[i : i + batch_size] + batch_entities, batch_scores = self._analyze_text_batch( + batch, confidence_threshold + ) + all_entities.extend(batch_entities) + all_scores.extend(batch_scores) + else: + # Process individually + for text_value in sample_data: + if isinstance(text_value, str) and len(text_value.strip()) > 0: + entities = self.analyze_text( + text_value, confidence_threshold=confidence_threshold + ) + all_entities.extend(entities) + all_scores.extend([entity["score"] for entity in entities]) + + # Aggregate results + entity_types = {} + for entity in all_entities: + entity_type = entity["entity_type"] + if entity_type not in entity_types: + entity_types[entity_type] = [] + entity_types[entity_type].append(entity) + + return { + "presidio_available": True, + "entities_found": entity_types, + "total_detections": len(all_entities), + "confidence_scores": all_scores, + "sample_analyzed": len(sample_data), + "average_confidence": sum(all_scores) / len(all_scores) + if all_scores + else 0, + } + + def anonymize_text( + self, + text: str, + analyzer_results: list[dict[str, Any]] | None = None, + operators: dict[str, Any] | None = None, + ) -> str: + """Anonymize text using Presidio. + + Args: + text: Text to anonymize + analyzer_results: Pre-computed analysis results + operators: Custom operators for anonymization + + Returns: + Anonymized text + + """ + if not self.is_available(): + logger.warning("Presidio not available, returning original text") + return text + + if not text or not isinstance(text, str): + return text + + try: + # If no analysis results provided, analyze first + if analyzer_results is None: + presidio_results = self.analyzer.analyze( + text=text, language=self.language + ) + else: + # Convert our format back to Presidio format + presidio_results = [] + for result in analyzer_results: + presidio_results.append( + RecognizerResult( + entity_type=result["entity_type"], + start=result["start"], + end=result["end"], + score=result["score"], + ) + ) + + # Default operators + if operators is None: + operators = { + "PERSON": OperatorConfig("replace", {"new_value": "[PERSON]"}), + "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "[PHONE]"}), + "EMAIL_ADDRESS": OperatorConfig( + "replace", {"new_value": "[EMAIL]"} + ), + "LOCATION": OperatorConfig("replace", {"new_value": "[LOCATION]"}), + "DATE_TIME": OperatorConfig("replace", {"new_value": "[DATE]"}), + "US_SSN": OperatorConfig("replace", {"new_value": "[SSN]"}), + "CREDIT_CARD": OperatorConfig("replace", {"new_value": "[CARD]"}), + "US_DRIVER_LICENSE": OperatorConfig( + "replace", {"new_value": "[LICENSE]"} + ), + "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"}), + } + + # Anonymize the text + anonymized_result = self.anonymizer.anonymize( + text=text, analyzer_results=presidio_results, operators=operators + ) + + return anonymized_result.text + + except Exception as e: + logger.error(f"Error anonymizing text with Presidio: {e}") + return text + + def get_supported_entities(self) -> list[str]: + """Get list of supported entity types. + + Returns: + List of entity type names + + """ + if not self.is_available(): + return [] + + try: + # Get supported recognizers from Presidio + supported_entities = [] + for recognizer in self.analyzer.registry.recognizers: + supported_entities.extend(recognizer.supported_entities) + + return list(set(supported_entities)) + + except Exception as e: + logger.error(f"Error getting supported entities: {e}") + return [] + + def get_recognizer_info(self) -> dict[str, list[str]]: + """Get detailed information about available recognizers. + + Returns: + Dictionary mapping recognizer names to supported entities + + """ + if not self.is_available(): + return {} + + try: + recognizer_info = {} + for recognizer in self.analyzer.registry.recognizers: + recognizer_name = recognizer.__class__.__name__ + recognizer_info[recognizer_name] = recognizer.supported_entities + + return recognizer_info + + except Exception as e: + logger.error(f"Error getting recognizer info: {e}") + return {} + + +# Singleton instance for global use +_presidio_analyzer = None + + +def get_presidio_analyzer( + language: str = "en", preferred_model_size: str = "sm" +) -> PresidioTextAnalyzer: + """Get or create a Presidio analyzer instance. + + Args: + language: Language code for the analyzer + preferred_model_size: Preferred spaCy model size + + Returns: + PresidioTextAnalyzer instance + + """ + global _presidio_analyzer + if _presidio_analyzer is None or _presidio_analyzer.language != language: + _presidio_analyzer = PresidioTextAnalyzer( + language=language, preferred_model_size=preferred_model_size + ) + return _presidio_analyzer + + +def presidio_analyze_text_column( + column_data: pd.Series, + confidence_threshold: float = 0.7, + sample_size: int = 100, +) -> dict[str, Any]: + """Analyze text column with Presidio. + + Args: + column_data: Pandas Series with text data + confidence_threshold: Minimum confidence for detections + sample_size: Maximum samples to analyze + + Returns: + Analysis results dictionary + + """ + analyzer = get_presidio_analyzer() + return analyzer.analyze_column_text( + column_data, confidence_threshold=confidence_threshold, sample_size=sample_size + ) + + +def presidio_anonymize_text_column( + column_data: pd.Series, operators: dict[str, Any] | None = None +) -> pd.Series: + """Anonymize text column with Presidio. + + Args: + column_data: Pandas Series with text data + operators: Custom anonymization operators + + Returns: + Anonymized pandas Series + + """ + analyzer = get_presidio_analyzer() + + if not analyzer.is_available(): + logger.warning("Presidio not available, returning original column") + return column_data + + def anonymize_single_text(text): + if isinstance(text, str) and len(text.strip()) > 0: + return analyzer.anonymize_text(text, operators=operators) + return text + + return column_data.apply(anonymize_single_text) + + +def presidio_analyze_dataframe_batch( + dataframe: pd.DataFrame, + text_columns: list[str] | None = None, + confidence_threshold: float = 0.7, + sample_size: int = 100, + batch_size: int | None = None, +) -> dict[str, dict[str, Any]]: + """Analyze multiple columns in a DataFrame using batch processing. + + Args: + dataframe: DataFrame to analyze + text_columns: List of columns to analyze (None = auto-detect object columns) + confidence_threshold: Minimum confidence for detections + sample_size: Sample size per column + batch_size: Batch size for processing + + Returns: + Dictionary mapping column names to analysis results + + """ + analyzer = get_presidio_analyzer() + + if not analyzer.is_available(): + logger.warning("Presidio not available") + return {} + + # Auto-detect text columns if not specified + if text_columns is None: + text_columns = [ + col for col in dataframe.columns if dataframe[col].dtype == "object" + ] + + results = {} + for col in text_columns: + if col in dataframe.columns: + try: + result = analyzer.analyze_column_text( + dataframe[col], + confidence_threshold=confidence_threshold, + sample_size=sample_size, + batch_size=batch_size, + ) + if result.get("total_detections", 0) > 0: + results[col] = result + except Exception as e: + logger.error(f"Error analyzing column {col}: {e}") + + return results + + +def presidio_anonymize_dataframe_batch( + dataframe: pd.DataFrame, + columns_to_anonymize: list[str] | None = None, + operators: dict[str, dict[str, Any]] | None = None, +) -> pd.DataFrame: + """Anonymize multiple columns in a DataFrame. + + Args: + dataframe: DataFrame to anonymize + columns_to_anonymize: List of columns to anonymize + operators: Dictionary mapping column names to operator configs + + Returns: + DataFrame with anonymized columns + + """ + analyzer = get_presidio_analyzer() + + if not analyzer.is_available(): + logger.warning("Presidio not available, returning original DataFrame") + return dataframe + + anonymized_df = dataframe.copy() + + if columns_to_anonymize is None: + columns_to_anonymize = [ + col for col in dataframe.columns if dataframe[col].dtype == "object" + ] + + for col in columns_to_anonymize: + if col in anonymized_df.columns: + try: + col_operators = operators.get(col) if operators else None + anonymized_df[col] = presidio_anonymize_text_column( + anonymized_df[col], col_operators + ) + except Exception as e: + logger.error(f"Error anonymizing column {col}: {e}") + + return anonymized_df diff --git a/src/pii_detector/core/processor.py b/src/pii_detector/core/processor.py new file mode 100644 index 0000000..8a401e6 --- /dev/null +++ b/src/pii_detector/core/processor.py @@ -0,0 +1,393 @@ +"""Core PII detection and data processing functionality.""" + +import warnings + +import pandas as pd + +from pii_detector.api.queries import query_location_population +from pii_detector.data import constants +from pii_detector.data import restricted_words as restricted_words_list + +warnings.simplefilter(action="ignore", category=FutureWarning) + +# Global variables for output management +OUTPUTS_FOLDER = None +LOG_FILE_PATH = None + + +def get_surveycto_restricted_vars() -> list[str]: + """Get SurveyCTO restricted variables.""" + return restricted_words_list.get_surveycto_restricted_vars() + + +def import_dataset(dataset_path: str) -> tuple[bool, str | list]: + """Import a dataset from various file formats. + + Args: + dataset_path: Path to the dataset file + + Returns: + Tuple of (success, result) where result is either error message or + [dataset, dataset_path, label_dict, value_label_dict] + + """ + dataset, label_dict, value_label_dict = False, False, False + status_message = False + + # Check format + if not dataset_path.endswith(("xlsx", "xls", "csv", "dta")): + return (False, "Supported files are .csv, .dta, .xlsx, .xls") + + try: + if dataset_path.endswith(("xlsx", "xls")): + dataset = pd.read_excel(dataset_path) + elif dataset_path.endswith("csv"): + dataset = pd.read_csv(dataset_path) + elif dataset_path.endswith("dta"): + try: + dataset = pd.read_stata(dataset_path) + except ValueError: + dataset = pd.read_stata(dataset_path, convert_categoricals=False) + label_dict = pd.io.stata.StataReader(dataset_path).variable_labels() + try: + value_label_dict = pd.io.stata.StataReader(dataset_path).value_labels() + except AttributeError: + status_message = "No value labels detected." + elif dataset_path.endswith(("xpt", ".sas7bdat")): + dataset = pd.read_sas(dataset_path) + elif dataset_path.endswith("vc"): + status_message = ( + "**ERROR**: This folder appears to be encrypted using VeraCrypt." + ) + raise Exception + elif dataset_path.endswith("bc"): + status_message = "**ERROR**: This file appears to be encrypted using Boxcryptor. Sign in to Boxcryptor and then select the file in your X: drive." + raise Exception + else: + raise Exception + + except (FileNotFoundError, Exception): + if status_message is False: + status_message = "**ERROR**: This path appears to be invalid. If your folders or filename contain colons or commas, try renaming them or moving the file to a different location." + raise + + if status_message: + log_and_print("There was an error") + log_and_print(status_message) + return (False, status_message) + + log_and_print("The dataset has been read successfully.\n") + dataset_read_return = [dataset, dataset_path, label_dict, value_label_dict] + return (True, dataset_read_return) + + +def word_match( + column_name: str, restricted_word: str, type_of_matching: str = constants.STRICT +) -> bool: + """Check if a column name matches a restricted word.""" + if type_of_matching == constants.STRICT: + return column_name.lower() == restricted_word.lower() + else: # type_of_matching == FUZZY + return restricted_word.lower() in column_name.lower() + + +def remove_other_refuse_and_dont_know(column: pd.Series) -> pd.Series: + """Remove standard survey response values like 999, -999, etc.""" + # List of values to remove. All numbers with 3 digits where all digits are the same + values_to_remove = [str(111 * i) for i in range(-9, 10) if i != 0] + filtered_column = column[~column.isin(values_to_remove)] + return filtered_column + + +def clean_column(column: pd.Series) -> pd.Series: + """Clean a column by removing NaNs, empty entries, and standard survey codes.""" + # Drop NaNs + column_filtered = column.dropna() + + # Remove empty entries + column_filtered = column_filtered[column_filtered != ""] + + # Remove other, refuses and don't knows + if len(column_filtered) != 0: + column_filtered = remove_other_refuse_and_dont_know(column_filtered) + + return column_filtered + + +def column_is_sparse( + dataset: pd.DataFrame, column_name: str, sparse_threshold: float +) -> bool: + """Check if a column is sparse (has high ratio of unique values).""" + column_filtered = clean_column(dataset[column_name]) + + # Check sparsity + n_entries = len(column_filtered) + n_unique_entries = column_filtered.nunique() + + return n_entries != 0 and n_unique_entries / n_entries > sparse_threshold + + +def column_has_sufficiently_sparse_strings( + dataset: pd.DataFrame, column_name: str, sparse_threshold: float = 0.2 +) -> bool: + """Check if 'valid' column entries are sparse. + + Only considers string columns and excludes NaN, empty, and survey codes. + """ + # Check if column type is string + if dataset[column_name].dtypes == "object": + return column_is_sparse(dataset, column_name, sparse_threshold) + else: + return False + + +def column_has_sparse_value_label_dicts( + column_name: str, value_label_dict: dict, sparse_threshold: int = 10 +) -> bool: + """Check if a column has sufficiently sparse value label dictionary.""" + return ( + column_name in value_label_dict + and value_label_dict[column_name] != "" + and len(value_label_dict[column_name]) > sparse_threshold + ) + + +def log_and_print(message: str) -> None: + """Log a message to file and print to console.""" + print(message) + if LOG_FILE_PATH: + with open(LOG_FILE_PATH, "a", encoding="utf-8") as f: + f.write(f"{message}\n") + + +def find_piis_based_on_column_name( + dataset: pd.DataFrame, + label_dict: dict, + language: str, + country: str, + matching_type: str = constants.STRICT, +) -> list[str]: + """Find PIIs based on column name/label matching against restricted word lists. + + Args: + dataset: The pandas DataFrame to analyze + label_dict: Dictionary mapping column names to their labels + language: Language for word matching + country: Country for location-specific matching + matching_type: Type of matching (strict or fuzzy) + + Returns: + List of column names identified as potential PII + + """ + pii_columns = [] + + for column_name in dataset.columns: + # Get column label if available + column_label = ( + label_dict.get(column_name, column_name) if label_dict else column_name + ) + + # Check against various restricted word lists + if _matches_restricted_words( + column_name, column_label, language, country, matching_type + ): + pii_columns.append(column_name) + log_and_print(f"PII detected (column name): {column_name}") + + return pii_columns + + +def find_piis_based_on_column_format(dataset: pd.DataFrame) -> list[str]: + """Find PIIs based on column format patterns (phone numbers, dates, etc.). + + Args: + dataset: The pandas DataFrame to analyze + + Returns: + List of column names identified as potential PII based on format + + """ + pii_columns = [] + + for column_name in dataset.columns: + column_data = clean_column(dataset[column_name]) + + if _contains_phone_numbers(column_data): + pii_columns.append(column_name) + log_and_print(f"PII detected (phone format): {column_name}") + elif _contains_date_patterns(column_data): + pii_columns.append(column_name) + log_and_print(f"PII detected (date format): {column_name}") + elif _contains_email_patterns(column_data): + pii_columns.append(column_name) + log_and_print(f"PII detected (email format): {column_name}") + + return pii_columns + + +def find_piis_based_on_sparse_entries( + dataset: pd.DataFrame, sparse_threshold: float = 0.8 +) -> list[str]: + """Find PIIs based on sparsity analysis (columns with mostly unique values). + + Args: + dataset: The pandas DataFrame to analyze + sparse_threshold: Minimum sparsity ratio to flag as PII + + Returns: + List of column names identified as potential PII based on sparsity + + """ + pii_columns = [] + + for column_name in dataset.columns: + if column_is_sparse(dataset, column_name, sparse_threshold): + pii_columns.append(column_name) + log_and_print(f"PII detected (sparse): {column_name}") + elif column_has_sufficiently_sparse_strings( + dataset, column_name, sparse_threshold + ): + pii_columns.append(column_name) + log_and_print(f"PII detected (sparse strings): {column_name}") + + return pii_columns + + +def find_piis_based_on_locations_population( + dataset: pd.DataFrame, population_threshold: int = 20000, country: str = "US" +) -> list[str]: + """Find PIIs based on location population analysis (small locations may be PII). + + Args: + dataset: The pandas DataFrame to analyze + population_threshold: Maximum population size to consider as PII + country: Country code for location lookups (default: 'US') + + Returns: + List of column names identified as potential PII based on location population + + """ + pii_columns = [] + + for column_name in dataset.columns: + if _contains_small_locations( + dataset[column_name], population_threshold, country + ): + pii_columns.append(column_name) + log_and_print(f"PII detected (small location): {column_name}") + + return pii_columns + + +def _matches_restricted_words( + column_name: str, column_label: str, language: str, country: str, matching_type: str +) -> bool: + """Check if column name/label matches any restricted words.""" + # Get appropriate restricted word lists + if matching_type == constants.STRICT: + word_lists = [ + restricted_words_list.get_strict_restricted_words(), + restricted_words_list.get_locations_strict_restricted_words(), + restricted_words_list.get_surveycto_restricted_vars(), + ] + else: # FUZZY matching + word_lists = [ + restricted_words_list.get_fuzzy_restricted_words(), + restricted_words_list.get_locations_fuzzy_restricted_words(), + ] + + for word_list in word_lists: + for restricted_word in word_list: + if word_match(column_name, restricted_word, matching_type): + return True + if column_label != column_name and word_match( + column_label, restricted_word, matching_type + ): + return True + + return False + + +def _contains_phone_numbers(column_data: pd.Series) -> bool: + """Check if column contains phone number patterns.""" + import re + + phone_pattern = re.compile(r"[\+]?[\d\s\-\(\)]{10,}") + + sample_size = min(100, len(column_data)) + sample_data = column_data.dropna().head(sample_size) + + phone_count = 0 + for value in sample_data: + if isinstance(value, str) and phone_pattern.search(value): + phone_count += 1 + + return phone_count / len(sample_data) > 0.5 if len(sample_data) > 0 else False + + +def _contains_date_patterns(column_data: pd.Series) -> bool: + """Check if column contains date patterns.""" + import re + + date_patterns = [ + re.compile(r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}"), # MM/DD/YYYY or DD/MM/YYYY + re.compile(r"\d{4}[/-]\d{1,2}[/-]\d{1,2}"), # YYYY/MM/DD + re.compile( + r"\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{2,4}", + re.IGNORECASE, + ), + ] + + sample_size = min(100, len(column_data)) + sample_data = column_data.dropna().head(sample_size) + + date_count = 0 + for value in sample_data: + if isinstance(value, str): + for pattern in date_patterns: + if pattern.search(value): + date_count += 1 + break + + return date_count / len(sample_data) > 0.5 if len(sample_data) > 0 else False + + +def _contains_email_patterns(column_data: pd.Series) -> bool: + """Check if column contains email patterns.""" + import re + + email_pattern = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b") + + sample_size = min(100, len(column_data)) + sample_data = column_data.dropna().head(sample_size) + + email_count = 0 + for value in sample_data: + if isinstance(value, str) and email_pattern.search(value): + email_count += 1 + + return email_count / len(sample_data) > 0.5 if len(sample_data) > 0 else False + + +def _contains_small_locations( + column_data: pd.Series, population_threshold: int, country: str = "US" +) -> bool: + """Check if column contains locations with small populations.""" + sample_size = min(50, len(column_data)) # Limit API calls + sample_data = column_data.dropna().unique()[:sample_size] + + small_location_count = 0 + for location in sample_data: + if isinstance(location, str) and len(location.strip()) > 2: + try: + population = query_location_population(location.strip(), country) + if population and population < population_threshold: + small_location_count += 1 + except Exception as e: + log_and_print(f"Error querying location {location}: {e}") + continue + + return ( + small_location_count / len(sample_data) > 0.3 if len(sample_data) > 0 else False + ) diff --git a/src/pii_detector/core/text_analysis.py b/src/pii_detector/core/text_analysis.py new file mode 100644 index 0000000..580ea64 --- /dev/null +++ b/src/pii_detector/core/text_analysis.py @@ -0,0 +1,267 @@ +"""Text analysis for finding PII in unstructured text data.""" + +import re +from pathlib import Path + +from pii_detector.api.queries import find_names_in_list_string +from pii_detector.data import restricted_words as restricted_words_list +from pii_detector.data.constants import ENGLISH, SPANISH + + +def get_stopwords(languages: list[str] | None = None) -> list[str]: + """Load stopwords from data directory. + + Args: + languages: List of language names to load, or None for all languages + + Returns: + List of stopwords + + """ + # Get path to stopwords directory in the package + stopwords_path = Path(__file__).parent.parent / "data" / "stopwords" + + # If no language selected, get all stopwords + if languages is None: + stopwords_files = [f for f in stopwords_path.iterdir() if f.is_file()] + else: + # Select only stopwords files for given languages + stopwords_files = [] + for language in languages: + file_path = stopwords_path / language + if file_path.is_file(): + stopwords_files.append(file_path) + + stopwords_list = [] + for file_path in stopwords_files: + try: + with open(file_path, encoding="utf-8") as reader: + stopwords = reader.read().split("\n") + stopwords_list.extend(stopwords) + except UnicodeDecodeError: + # Try with different encoding if UTF-8 fails + with open(file_path, encoding="latin-1") as reader: + stopwords = reader.read().split("\n") + stopwords_list.extend(stopwords) + + return list(set(stopwords_list)) + + +def remove_stopwords( + strings_list: list[str], languages: list[str] = ["english", "spanish"] +) -> list[str]: + """Remove stopwords from a list of strings. + + Args: + strings_list: List of strings to filter + languages: Languages to use for stopword filtering + + Returns: + Filtered list of strings + + """ + stop_words = get_stopwords(languages) + return [s for s in strings_list if s not in stop_words] + + +def find_phone_numbers_in_list_strings(list_strings: list[str]) -> list[str]: + """Find phone numbers in a list of strings using regex patterns. + + Args: + list_strings: List of strings to search + + Returns: + List of strings that match phone number patterns + + """ + phone_n_regex_str = r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})" + phone_n_regex = re.compile(phone_n_regex_str) + phone_numbers_found = [s for s in list_strings if phone_n_regex.match(s)] + + return phone_numbers_found + + +def filter_based_type_of_word(list_strings: list[str], language: str) -> list[str]: + """Filter strings based on word type analysis. + + Args: + list_strings: List of strings to analyze + language: Language for analysis context + + Returns: + Filtered list of strings + + """ + # This is a placeholder for more sophisticated NLP analysis + # In the original, this used spacy for linguistic analysis + # For now, we'll do basic filtering + + # Remove single characters and very short strings + filtered = [s for s in list_strings if len(s) > 2] + + # Remove strings that are mostly numbers + filtered = [ + s + for s in filtered + if not s.replace(" ", "").replace("-", "").replace(".", "").isdigit() + ] + + return filtered + + +def extract_words_from_text(text: str, language: str = ENGLISH) -> list[str]: + """Extract individual words from text for PII analysis. + + Args: + text: Text to analyze + language: Language context for processing + + Returns: + List of extracted words + + """ + # Basic word extraction - split on common delimiters + words = re.split(r"[\s,;.!?\-_]+", text) + + # Clean up words + words = [word.strip() for word in words if word.strip()] + + # Remove stopwords + language_map = {ENGLISH: "english", SPANISH: "spanish"} + lang_code = language_map.get(language, "english") + words = remove_stopwords(words, [lang_code]) + + return words + + +def find_potential_names_in_text(text: str, language: str = ENGLISH) -> list[str]: + """Find potential person names in text. + + Args: + text: Text to analyze + language: Language context + + Returns: + List of potential names found + + """ + words = extract_words_from_text(text, language) + + # Filter by word type + potential_names = filter_based_type_of_word(words, language) + + # Use external API to validate names (if available) + try: + validated_names = find_names_in_list_string(potential_names) + return validated_names + except Exception as e: + print(f"Could not validate names via API: {e}") + # Fall back to basic heuristics + return [name for name in potential_names if name.istitle()] + + +def find_locations_in_text(text: str) -> list[str]: + """Find potential location names in text. + + Args: + text: Text to analyze + + Returns: + List of potential locations found + + """ + words = extract_words_from_text(text) + + # Get location-related restricted words + location_words = ( + restricted_words_list.get_locations_strict_restricted_words() + + restricted_words_list.get_locations_fuzzy_restricted_words() + ) + + # Find words that match location patterns + potential_locations = [] + for word in words: + for location_word in location_words: + if ( + location_word.lower() in word.lower() + or word.lower() in location_word.lower() + ): + potential_locations.append(word) + + return list(set(potential_locations)) + + +def replace_piis_in_text( + text: str, replacement: str = "XXXXXX", language: str = ENGLISH +) -> str: + """Replace detected PIIs in text with a placeholder. + + Args: + text: Original text + replacement: String to replace PIIs with + language: Language context for processing + + Returns: + Text with PIIs replaced + + """ + modified_text = text + + # Find and replace names + names = find_potential_names_in_text(text, language) + for name in names: + modified_text = re.sub( + rf"\b{re.escape(name)}\b", replacement, modified_text, flags=re.IGNORECASE + ) + + # Find and replace phone numbers + phone_numbers = find_phone_numbers_in_list_strings([text]) + for phone in phone_numbers: + modified_text = modified_text.replace(phone, replacement) + + # Find and replace locations (if they seem to be small/specific) + locations = find_locations_in_text(text) + for location in locations: + modified_text = re.sub( + rf"\b{re.escape(location)}\b", + replacement, + modified_text, + flags=re.IGNORECASE, + ) + + return modified_text + + +def find_piis_in_unstructured_text(text_series, language: str = ENGLISH) -> set[str]: + """Find PIIs in unstructured text data. + + Args: + text_series: Pandas series or list of text strings + language: Language context for processing + + Returns: + Set of detected PIIs + + """ + all_piis = set() + + # Convert to list if it's a pandas series + if hasattr(text_series, "tolist"): + texts = text_series.tolist() + else: + texts = list(text_series) + + for text in texts: + if not isinstance(text, str): + continue + + # Find different types of PIIs + names = find_potential_names_in_text(text, language) + locations = find_locations_in_text(text) + phone_numbers = find_phone_numbers_in_list_strings([text]) + + all_piis.update(names) + all_piis.update(locations) + all_piis.update(phone_numbers) + + return all_piis diff --git a/src/pii_detector/core/unified_processor.py b/src/pii_detector/core/unified_processor.py new file mode 100644 index 0000000..c140fe0 --- /dev/null +++ b/src/pii_detector/core/unified_processor.py @@ -0,0 +1,432 @@ +"""Unified PII detection processor combining structural analysis with Presidio text analysis. + +This module provides a hybrid approach that combines: +1. Existing structural detection methods (column names, formats, sparsity, locations) +2. Advanced Presidio-powered text content analysis +3. Confidence-weighted scoring and decision making +""" + +import logging +from typing import Any + +import pandas as pd + +from pii_detector.core import processor +from pii_detector.core.presidio_engine import get_presidio_analyzer +from pii_detector.data import constants + +logger = logging.getLogger(__name__) + + +class PIIDetectionResult: + """Represents a PII detection result with confidence scoring.""" + + def __init__( + self, + column_name: str, + detection_method: str, + confidence: float, + entity_types: list[str] | None = None, + details: dict[str, Any] | None = None, + ): + """Initialize PII detection result. + + Args: + column_name: Name of the column + detection_method: Method used for detection + confidence: Confidence score (0.0 to 1.0) + entity_types: List of detected entity types + details: Additional detection details + + """ + self.column_name = column_name + self.detection_method = detection_method + self.confidence = confidence + self.entity_types = entity_types or [] + self.details = details or {} + + def __repr__(self) -> str: + return f"PIIDetectionResult(column='{self.column_name}', method='{self.detection_method}', confidence={self.confidence:.2f})" + + +class UnifiedPIIProcessor: + """Unified processor that combines structural and text-based PII detection.""" + + def __init__(self, language: str = "en"): + """Initialize the unified processor. + + Args: + language: Language code for text analysis + + """ + self.language = language + self.presidio_analyzer = get_presidio_analyzer(language) + + def detect_pii_comprehensive( + self, + dataset: pd.DataFrame, + label_dict: dict[str, str] | None = None, + detection_config: dict[str, Any] | None = None, + ) -> dict[str, PIIDetectionResult]: + """Perform comprehensive PII detection using all available methods. + + Args: + dataset: The pandas DataFrame to analyze + label_dict: Dictionary mapping column names to their labels + detection_config: Configuration options for detection + + Returns: + Dictionary mapping column names to detection results + + """ + if detection_config is None: + detection_config = self._get_default_config() + + logger.info( + f"Starting comprehensive PII detection on {len(dataset.columns)} columns" + ) + + results = {} + + # Run all detection methods + structural_results = self._detect_structural_pii( + dataset, label_dict, detection_config + ) + text_content_results = self._detect_text_content_pii(dataset, detection_config) + + # Combine and score results + all_detected_columns = set(structural_results.keys()) | set( + text_content_results.keys() + ) + + for column_name in all_detected_columns: + structural_result = structural_results.get(column_name) + text_result = text_content_results.get(column_name) + + # Combine results with confidence weighting + combined_result = self._combine_detection_results( + column_name, structural_result, text_result, detection_config + ) + + if combined_result: + results[column_name] = combined_result + + logger.info( + f"PII detection completed. Found {len(results)} potentially sensitive columns" + ) + return results + + def _detect_structural_pii( + self, + dataset: pd.DataFrame, + label_dict: dict[str, str] | None, + config: dict[str, Any], + ) -> dict[str, PIIDetectionResult]: + """Detect PII using structural analysis methods.""" + results = {} + + # Column name/label matching + if config.get("use_column_name_detection", True): + column_name_piis = processor.find_piis_based_on_column_name( + dataset, + label_dict or {}, + self.language, + config.get("country", "US"), + config.get("matching_type", constants.STRICT), + ) + + for column in column_name_piis: + results[column] = PIIDetectionResult( + column_name=column, + detection_method="column_name_matching", + confidence=config.get("column_name_confidence", 0.8), + details={ + "matching_type": config.get("matching_type", constants.STRICT) + }, + ) + + # Format pattern detection + if config.get("use_format_detection", True): + format_piis = processor.find_piis_based_on_column_format(dataset) + + for column in format_piis: + if column not in results: + results[column] = PIIDetectionResult( + column_name=column, + detection_method="format_patterns", + confidence=config.get("format_pattern_confidence", 0.9), + ) + + # Sparsity analysis + if config.get("use_sparsity_detection", True): + sparsity_piis = processor.find_piis_based_on_sparse_entries( + dataset, config.get("sparse_threshold", 0.8) + ) + + for column in sparsity_piis: + if column not in results: + results[column] = PIIDetectionResult( + column_name=column, + detection_method="sparsity_analysis", + confidence=config.get("sparsity_confidence", 0.6), + details={ + "sparse_threshold": config.get("sparse_threshold", 0.8) + }, + ) + + # Location population analysis + if config.get("use_location_detection", True): + location_piis = processor.find_piis_based_on_locations_population( + dataset, + config.get("population_threshold", 20000), + config.get("country", "US"), + ) + + for column in location_piis: + if column not in results: + results[column] = PIIDetectionResult( + column_name=column, + detection_method="location_population", + confidence=config.get("location_confidence", 0.7), + details={ + "population_threshold": config.get( + "population_threshold", 20000 + ) + }, + ) + + return results + + def _detect_text_content_pii( + self, dataset: pd.DataFrame, config: dict[str, Any] + ) -> dict[str, PIIDetectionResult]: + """Detect PII using Presidio text content analysis.""" + results = {} + + if not config.get("use_presidio_detection", True): + return results + + if not self.presidio_analyzer.is_available(): + logger.warning("Presidio not available, skipping text content analysis") + return results + + confidence_threshold = config.get("presidio_confidence_threshold", 0.7) + sample_size = config.get("presidio_sample_size", 100) + + for column_name in dataset.columns: + # Only analyze text-like columns + if dataset[column_name].dtype == "object": + try: + analysis_result = self.presidio_analyzer.analyze_column_text( + dataset[column_name], + confidence_threshold=confidence_threshold, + sample_size=sample_size, + ) + + if ( + analysis_result.get("presidio_available", False) + and analysis_result.get("total_detections", 0) > 0 + ): + # Calculate detection confidence based on results + avg_confidence = analysis_result.get("average_confidence", 0) + detection_rate = analysis_result.get( + "total_detections", 0 + ) / analysis_result.get("sample_analyzed", 1) + + # Adjust confidence based on detection rate + adjusted_confidence = min( + avg_confidence * (1 + detection_rate), 1.0 + ) + + if adjusted_confidence >= confidence_threshold: + entity_types = list( + analysis_result.get("entities_found", {}).keys() + ) + + results[column_name] = PIIDetectionResult( + column_name=column_name, + detection_method="presidio_text_analysis", + confidence=adjusted_confidence, + entity_types=entity_types, + details={ + "total_detections": analysis_result.get( + "total_detections", 0 + ), + "average_confidence": avg_confidence, + "detection_rate": detection_rate, + "entities_found": analysis_result.get( + "entities_found", {} + ), + }, + ) + + except Exception as e: + logger.error( + f"Error analyzing column {column_name} with Presidio: {e}" + ) + + return results + + def _combine_detection_results( + self, + column_name: str, + structural_result: PIIDetectionResult | None, + text_result: PIIDetectionResult | None, + config: dict[str, Any], + ) -> PIIDetectionResult | None: + """Combine structural and text detection results with confidence weighting.""" + if structural_result is None and text_result is None: + return None + + if structural_result is None: + return text_result + + if text_result is None: + return structural_result + + # Both results exist - combine them + structural_weight = config.get("structural_weight", 0.6) + text_weight = config.get("text_weight", 0.4) + + combined_confidence = ( + structural_result.confidence * structural_weight + + text_result.confidence * text_weight + ) + + # Combine entity types + combined_entity_types = list( + set(structural_result.entity_types + text_result.entity_types) + ) + + # Combine details + combined_details = { + "structural_detection": { + "method": structural_result.detection_method, + "confidence": structural_result.confidence, + "details": structural_result.details, + }, + "text_detection": { + "method": text_result.detection_method, + "confidence": text_result.confidence, + "entity_types": text_result.entity_types, + "details": text_result.details, + }, + "combined_weights": {"structural": structural_weight, "text": text_weight}, + } + + return PIIDetectionResult( + column_name=column_name, + detection_method="hybrid_detection", + confidence=combined_confidence, + entity_types=combined_entity_types, + details=combined_details, + ) + + def _get_default_config(self) -> dict[str, Any]: + """Get default configuration for PII detection.""" + return { + # Structural detection options + "use_column_name_detection": True, + "use_format_detection": True, + "use_sparsity_detection": True, + "use_location_detection": True, + # Presidio options + "use_presidio_detection": True, + "presidio_confidence_threshold": 0.7, + "presidio_sample_size": 100, + # Confidence scores for structural methods + "column_name_confidence": 0.8, + "format_pattern_confidence": 0.9, + "sparsity_confidence": 0.6, + "location_confidence": 0.7, + # Combination weights + "structural_weight": 0.6, + "text_weight": 0.4, + # Other parameters + "matching_type": constants.STRICT, + "sparse_threshold": 0.8, + "population_threshold": 20000, + "country": "US", + } + + def get_high_confidence_detections( + self, results: dict[str, PIIDetectionResult], threshold: float = 0.8 + ) -> dict[str, PIIDetectionResult]: + """Filter results to only high-confidence detections.""" + return { + col: result + for col, result in results.items() + if result.confidence >= threshold + } + + def get_detection_summary( + self, results: dict[str, PIIDetectionResult] + ) -> dict[str, Any]: + """Generate a summary of detection results.""" + if not results: + return {"total_columns": 0, "total_detections": 0} + + methods = {} + entity_types = {} + confidence_scores = [] + + for result in results.values(): + # Count methods + method = result.detection_method + methods[method] = methods.get(method, 0) + 1 + + # Count entity types + for entity_type in result.entity_types: + entity_types[entity_type] = entity_types.get(entity_type, 0) + 1 + + confidence_scores.append(result.confidence) + + return { + "total_detections": len(results), + "methods_used": methods, + "entity_types_found": entity_types, + "average_confidence": sum(confidence_scores) / len(confidence_scores), + "confidence_distribution": { + "high": len([c for c in confidence_scores if c >= 0.8]), + "medium": len([c for c in confidence_scores if 0.6 <= c < 0.8]), + "low": len([c for c in confidence_scores if c < 0.6]), + }, + } + + +# Convenience functions for backward compatibility + + +def detect_pii_unified( + dataset: pd.DataFrame, + label_dict: dict[str, str] | None = None, + language: str = "en", + config: dict[str, Any] | None = None, +) -> dict[str, PIIDetectionResult]: + """Perform unified PII detection. + + Args: + dataset: The pandas DataFrame to analyze + label_dict: Dictionary mapping column names to their labels + language: Language code for text analysis + config: Detection configuration options + + Returns: + Dictionary mapping column names to detection results + + """ + processor_instance = UnifiedPIIProcessor(language=language) + return processor_instance.detect_pii_comprehensive(dataset, label_dict, config) + + +def get_pii_column_list(results: dict[str, PIIDetectionResult]) -> list[str]: + """Extract list of column names from detection results. + + Args: + results: Detection results from unified processor + + Returns: + List of column names identified as containing PII + + """ + return list(results.keys()) diff --git a/src/pii_detector/data/__init__.py b/src/pii_detector/data/__init__.py new file mode 100644 index 0000000..e1452ed --- /dev/null +++ b/src/pii_detector/data/__init__.py @@ -0,0 +1,55 @@ +"""Data, constants, and configuration for PII detection.""" + +from pii_detector.data.constants import ( + CHECK_LOCATIONS_POP, + COLUMNS_FORMAT_SEARCH_METHOD, + COLUMNS_NAMES_SEARCH_METHOD, + CONSIDER_SURVEY_CTO_VARS, + DATASET, + DATE, + ENGLISH, + ERROR_MESSAGE, + FUZZY, + LOCATIONS_POPULATIONS_SEARCH_METHOD, + OTHER, + PHONE_NUMBER, + PII_CANDIDATES, + SPANISH, + SPARSE_ENTRIES_SEARCH_METHOD, + STRICT, + UNSTRUCTURED_TEXT_SEARCH_METHOD, +) +from pii_detector.data.restricted_words import ( + get_fuzzy_restricted_words, + get_locations_fuzzy_restricted_words, + get_locations_strict_restricted_words, + get_strict_restricted_words, + get_surveycto_restricted_vars, +) + +__all__ = [ + # Constants + "CHECK_LOCATIONS_POP", + "COLUMNS_FORMAT_SEARCH_METHOD", + "COLUMNS_NAMES_SEARCH_METHOD", + "CONSIDER_SURVEY_CTO_VARS", + "DATASET", + "DATE", + "ENGLISH", + "ERROR_MESSAGE", + "FUZZY", + "LOCATIONS_POPULATIONS_SEARCH_METHOD", + "OTHER", + "PHONE_NUMBER", + "PII_CANDIDATES", + "SPARSE_ENTRIES_SEARCH_METHOD", + "SPANISH", + "STRICT", + "UNSTRUCTURED_TEXT_SEARCH_METHOD", + # Functions + "get_fuzzy_restricted_words", + "get_locations_fuzzy_restricted_words", + "get_locations_strict_restricted_words", + "get_strict_restricted_words", + "get_surveycto_restricted_vars", +] diff --git a/src/pii_detector/data/constants.py b/src/pii_detector/data/constants.py new file mode 100644 index 0000000..e823c51 --- /dev/null +++ b/src/pii_detector/data/constants.py @@ -0,0 +1,106 @@ +"""Constants and configuration values for PII detection.""" + +# Configuration options +CONSIDER_SURVEY_CTO_VARS = "consider_surveyCTO_vars" +CHECK_LOCATIONS_POP = "check_locations_pop" + +# Search method identifiers +COLUMNS_NAMES_SEARCH_METHOD = "columns names search method" +LOCATIONS_POPULATIONS_SEARCH_METHOD = "locations populations search method" +COLUMNS_FORMAT_SEARCH_METHOD = "column format search method" +SPARSE_ENTRIES_SEARCH_METHOD = "sparse entries search method" +UNSTRUCTURED_TEXT_SEARCH_METHOD = "unstructured text search method" + +# Matching types +STRICT = "strict" +FUZZY = "fuzzy" + +# Data format types +PHONE_NUMBER = "phone number" +DATE = "date" + +# Language options +ENGLISH = "English" +SPANISH = "Spanish" +FRENCH = "French" +OTHER = "Other" + +# Return value keys +ERROR_MESSAGE = "error_message" +PII_CANDIDATES = "pii_candidates" +DATASET = "dataset" +LABEL_DICT = "label_dict" +VALUE_LABEL_DICT = "value_label_dict" + +COLUMNS_STILL_TO_CHECK = "COLUMNS_STILL_TO_CHECK" + +# Country definitions +BANGLADESH = "Bangladesh" +MYANMAR = "Myanmar" +PHILIPPINES = "Philippines" +BOLIVIA = "Bolivia" +COLOMBIA = "Colombia" +DOMINICAN_REPUBLIC = "Dominican Republic" +MEXICO = "Mexico" +PARAGUAY = "Paraguay" +PERU = "Peru" +BURKINA_FASO = "Burkina Faso" +COTE_DIVOIRE = "Cote dIvoire" +GHANA = "Ghana" +LIBERIA = "Liberia" +MALI = "Mali" +SIERRA_LEONE = "Sierra Leone" +KENYA = "Kenya" +MALAWI = "Malawi" +RWANDA = "Rwanda" +TANZANIA = "Tanzania" +UGANDA = "Uganda" +ZAMBIA = "Zambia" + +ALL_COUNTRIES = [ + PHILIPPINES, + BOLIVIA, + COLOMBIA, + DOMINICAN_REPUBLIC, + MEXICO, + PARAGUAY, + PERU, + BURKINA_FASO, + COTE_DIVOIRE, + GHANA, + LIBERIA, + MALI, + SIERRA_LEONE, + KENYA, + MALAWI, + RWANDA, + TANZANIA, + UGANDA, + ZAMBIA, + MYANMAR, + BANGLADESH, +] + +COUNTRY_NAME_TO_ISO_CODE = { + MEXICO: "mx", + BANGLADESH: "bd", + MYANMAR: "mm", + PHILIPPINES: "ph", + BOLIVIA: "bo", + COLOMBIA: "co", + DOMINICAN_REPUBLIC: "do", + PARAGUAY: "py", + PERU: "pe", + BURKINA_FASO: "bf", + COTE_DIVOIRE: "ci", + GHANA: "gh", + LIBERIA: "lr", + MALI: "ml", + SIERRA_LEONE: "sl", + KENYA: "ke", + MALAWI: "mw", + RWANDA: "rw", + TANZANIA: "tz", + UGANDA: "ug", + ZAMBIA: "zm", +} diff --git a/src/pii_detector/data/demo_data.csv b/src/pii_detector/data/demo_data.csv new file mode 100644 index 0000000..a0d9cd6 --- /dev/null +++ b/src/pii_detector/data/demo_data.csv @@ -0,0 +1,6 @@ +name,email,phone,age,city,survey_response +John Smith,john.smith@email.com,555-123-4567,34,New York,Very satisfied with service +Sarah Johnson,sarah.j@company.org,555-987-6543,28,Los Angeles,Needs improvement +Mike Davis,m.davis@university.edu,555-456-7890,45,Chicago,Excellent experience +Lisa Brown,lisa.brown@hospital.net,555-321-0987,39,Houston,Good overall rating +David Wilson,d.wilson@startup.com,555-654-3210,52,Phoenix,Could be better diff --git a/src/pii_detector/data/restricted_words.py b/src/pii_detector/data/restricted_words.py new file mode 100644 index 0000000..5bc959a --- /dev/null +++ b/src/pii_detector/data/restricted_words.py @@ -0,0 +1,251 @@ +"""Restricted word lists for PII detection across multiple languages and domains. + +Fuzzy = variables that if contained inside a column name/label, there will be a match +Strict = variables that if are strictly equal to column name/label, there will be a match +""" + +# SURVEYCTO VARIABLES +survey_cto_strict = [ + "deviceid", + "subscriberid", + "simid", + "formdef_version", + "devicephonenum", + "duration", + "bc_rand", + "key", + "starttime", + "endtime", + "audio_audit_cons_1", + "audio_audit_cons_2", + "audio_audit_cons_positivo", + "text_audit", + "text_audit_field", + "call_log", + "caseid", + "sstrm_pct_conversation", + "sstat_sound_level", + "sstrm_sound_level", + "audio_audit_survey", + "reschedule_format", + "reschedule_2_format", +] + +# LOCATIONS VARIABLES +locations_strict = ["vill", "lc"] + +locations_fuzzy = [ + "district", + "country", + "subcountry", + "parish", + "village", + "community", + "location", + "panchayat", + "compound", + "survey_location", + "county", + "subcounty", + "ciudad", + "distrito", + "villa", + "city", + "town", + "neighborhood", + "neighbourhood", + "barangay", + "brgy", + "municipio", + "colonia", + "alcaldia", + "alcaldía", + "upazila", + "tribe", +] + +# STATA VARIABLES +stata_strict = [ + "nam", + "add", + "addr", + "addr1", + "addr2", + "dist", + "parish", + "loc", + "acc", + "plan", + "medic", + "insur", + "num", + "resid", + "home", + "spec", + "id", + "enum", + "info", + "data", + "comm", + "count", + "fo", +] + +# IPA GUIDELINE DOCUMENT +other_strict = [ + "gps", + "lat", + "lon", + "coord", + "house", + "social", + "census", + "fax", + "ip", + "url", + "specify", + "enumerator", + "random", + "name", + "enum_name", + "rand", + "uid", + "hh", + "age", + "gps", + "id", + "ip", + "red", + "fono", + "url", + "web", + "number", + "encuestador", + "escuela", + "colegio", + "edad", + "insurance", + "school", + "birth", +] + +other_fuzzy = [ + "name", + "_name", + "fname", + "lname", + "first_name", + "last_name", + "birthday", + "bday", + "address", + "network", + "email", + "beneficiary", + "mother", + "wife", + "father", + "husband", + "enumerator ", + "enumerator_", + "child_age", + "latitude", + "longitude", + "coordinates", + "website", + "nickname", + "nick_name", + "firstname", + "lastname", + "sublocation", + "alternativecontact", + "division", + "resp_name", + "head_name", + "headname", + "respname", + "subvillage", +] + +# OTHER LANGUAGES +spanish_fuzzy = [ + "apellido", + "apellidos", + "beneficiario", + "censo", + "comunidad", + "contar", + "coordenadas", + "direccion", + "edad_nino", + "email", + "esposa", + "esposo", + "fecha_nacimiento", + "identificador", + "identidad", + "informacion", + "latitud", + "latitude", + "locacion", + "longitud", + "madre", + "medico", + "nino", + "nombre", + "numero", + "padre", + "pag_web", + "pais", + "parroquia", + "primer_nombre", + "random", + "salud", + "seguro", + "ubicacion", +] + +swahili_strict = [ + "jina", + "simu", + "mkoa", + "wilaya", + "kata", + "kijiji", + "kitongoji", + "vitongoji", + "nyumba", + "numba", + "namba", + "tarahe ya kuzaliwa", + "umri", + "jinsi", + "jinsia", +] + + +def get_locations_strict_restricted_words() -> list[str]: + """Get strict location-related restricted words.""" + return locations_strict + + +def get_locations_fuzzy_restricted_words() -> list[str]: + """Get fuzzy location-related restricted words.""" + return locations_fuzzy + + +def get_surveycto_restricted_vars() -> list[str]: + """Get SurveyCTO-specific restricted variables.""" + return survey_cto_strict + + +def get_strict_restricted_words() -> list[str]: + """Get all strict matching restricted words.""" + strict_restricted = stata_strict + other_strict + swahili_strict + return list(set(strict_restricted)) + + +def get_fuzzy_restricted_words() -> list[str]: + """Get all fuzzy matching restricted words.""" + fuzzy_restricted = other_fuzzy + spanish_fuzzy + return list(set(fuzzy_restricted)) diff --git a/stopwords/README b/src/pii_detector/data/stopwords/README similarity index 100% rename from stopwords/README rename to src/pii_detector/data/stopwords/README diff --git a/stopwords/arabic b/src/pii_detector/data/stopwords/arabic similarity index 100% rename from stopwords/arabic rename to src/pii_detector/data/stopwords/arabic diff --git a/stopwords/azerbaijani b/src/pii_detector/data/stopwords/azerbaijani similarity index 98% rename from stopwords/azerbaijani rename to src/pii_detector/data/stopwords/azerbaijani index 27bf294..e868d57 100644 --- a/stopwords/azerbaijani +++ b/src/pii_detector/data/stopwords/azerbaijani @@ -118,7 +118,7 @@ ona ondan onlar onlardan -onların +onların onsuzda onu onun @@ -162,4 +162,4 @@ yox yoxdur yoxsa yüz -zaman \ No newline at end of file +zaman diff --git a/stopwords/danish b/src/pii_detector/data/stopwords/danish similarity index 100% rename from stopwords/danish rename to src/pii_detector/data/stopwords/danish diff --git a/stopwords/dutch b/src/pii_detector/data/stopwords/dutch similarity index 100% rename from stopwords/dutch rename to src/pii_detector/data/stopwords/dutch diff --git a/stopwords/english b/src/pii_detector/data/stopwords/english similarity index 100% rename from stopwords/english rename to src/pii_detector/data/stopwords/english diff --git a/stopwords/finnish b/src/pii_detector/data/stopwords/finnish similarity index 100% rename from stopwords/finnish rename to src/pii_detector/data/stopwords/finnish diff --git a/stopwords/french b/src/pii_detector/data/stopwords/french similarity index 100% rename from stopwords/french rename to src/pii_detector/data/stopwords/french diff --git a/stopwords/german b/src/pii_detector/data/stopwords/german similarity index 100% rename from stopwords/german rename to src/pii_detector/data/stopwords/german diff --git a/stopwords/greek b/src/pii_detector/data/stopwords/greek similarity index 100% rename from stopwords/greek rename to src/pii_detector/data/stopwords/greek diff --git a/stopwords/hungarian b/src/pii_detector/data/stopwords/hungarian similarity index 100% rename from stopwords/hungarian rename to src/pii_detector/data/stopwords/hungarian diff --git a/stopwords/indonesian b/src/pii_detector/data/stopwords/indonesian similarity index 99% rename from stopwords/indonesian rename to src/pii_detector/data/stopwords/indonesian index bf88a45..aba6f67 100644 --- a/stopwords/indonesian +++ b/src/pii_detector/data/stopwords/indonesian @@ -755,4 +755,4 @@ wong yaitu yakin yakni -yang \ No newline at end of file +yang diff --git a/stopwords/italian b/src/pii_detector/data/stopwords/italian similarity index 100% rename from stopwords/italian rename to src/pii_detector/data/stopwords/italian diff --git a/stopwords/kazakh b/src/pii_detector/data/stopwords/kazakh similarity index 100% rename from stopwords/kazakh rename to src/pii_detector/data/stopwords/kazakh diff --git a/stopwords/nepali b/src/pii_detector/data/stopwords/nepali similarity index 99% rename from stopwords/nepali rename to src/pii_detector/data/stopwords/nepali index b2e4d34..f63d4dc 100644 --- a/stopwords/nepali +++ b/src/pii_detector/data/stopwords/nepali @@ -252,4 +252,4 @@ सोही स्पष्ट हरे -हरेक \ No newline at end of file +हरेक diff --git a/stopwords/norwegian b/src/pii_detector/data/stopwords/norwegian similarity index 100% rename from stopwords/norwegian rename to src/pii_detector/data/stopwords/norwegian diff --git a/stopwords/portuguese b/src/pii_detector/data/stopwords/portuguese similarity index 100% rename from stopwords/portuguese rename to src/pii_detector/data/stopwords/portuguese diff --git a/stopwords/romanian b/src/pii_detector/data/stopwords/romanian similarity index 99% rename from stopwords/romanian rename to src/pii_detector/data/stopwords/romanian index 45651c9..e98615e 100644 --- a/stopwords/romanian +++ b/src/pii_detector/data/stopwords/romanian @@ -353,4 +353,4 @@ zice ăştia şi ţi -ţie \ No newline at end of file +ţie diff --git a/stopwords/russian b/src/pii_detector/data/stopwords/russian similarity index 100% rename from stopwords/russian rename to src/pii_detector/data/stopwords/russian diff --git a/stopwords/slovene b/src/pii_detector/data/stopwords/slovene similarity index 100% rename from stopwords/slovene rename to src/pii_detector/data/stopwords/slovene diff --git a/stopwords/spanish b/src/pii_detector/data/stopwords/spanish similarity index 100% rename from stopwords/spanish rename to src/pii_detector/data/stopwords/spanish diff --git a/stopwords/swedish b/src/pii_detector/data/stopwords/swedish similarity index 100% rename from stopwords/swedish rename to src/pii_detector/data/stopwords/swedish diff --git a/stopwords/tajik b/src/pii_detector/data/stopwords/tajik similarity index 82% rename from stopwords/tajik rename to src/pii_detector/data/stopwords/tajik index 898614a..d04d346 100644 --- a/stopwords/tajik +++ b/src/pii_detector/data/stopwords/tajik @@ -9,7 +9,7 @@ пеши назди рӯйи -болои +болои паси ғайри ҳамон @@ -22,9 +22,9 @@ қабл дида сар карда -агар +агар агар ки -валекин +валекин ки лекин аммо @@ -41,40 +41,40 @@ бо нияти он ки лекин ва ҳол он ки ё -ё ин ки -бе он ки +ё ин ки +бе он ки дар ҳолате ки -то даме ки +то даме ки баъд аз он ки даме ки -ба тразе ки +ба тразе ки аз баҳри он ки -гар +гар ар ба шарте -азбаски +азбаски модоме ки агар чи -гарчанде ки +гарчанде ки бо вуҷуди он ки гӯё -аз-баски +аз-баски чун-ки агар-чанд -агар-чи +агар-чи гар-чи то ки чунон ки то даме ки ҳар қадар ки -магар +магар оё наход -ҳатто -ҳам -бале -оре -хуб +ҳатто +ҳам +бале +оре +хуб хуш хайр не @@ -83,7 +83,7 @@ э фақат танҳо -кошки +кошки мабодо ҳтимол ана ҳамин @@ -112,7 +112,7 @@ нм оббо ӯббо -ҳой-ҳой +ҳой-ҳой вой-вой ту-ту ҳмм @@ -123,12 +123,12 @@ ало аё ой -ӯим +ӯим ором хом?ш -ҳай-ҳай +ҳай-ҳай бай-бай -аз +аз он баъд азбаски @@ -159,5 +159,5 @@ шояд ки охир аз рӯи -аз рӯйи -рӯ \ No newline at end of file +аз рӯйи +рӯ diff --git a/stopwords/turkish b/src/pii_detector/data/stopwords/turkish similarity index 100% rename from stopwords/turkish rename to src/pii_detector/data/stopwords/turkish diff --git a/src/pii_detector/gui/__init__.py b/src/pii_detector/gui/__init__.py new file mode 100644 index 0000000..7d2c18b --- /dev/null +++ b/src/pii_detector/gui/__init__.py @@ -0,0 +1,3 @@ +"""GUI components for the PII detector application.""" + +__all__ = [] diff --git a/src/pii_detector/gui/flet_app/__init__.py b/src/pii_detector/gui/flet_app/__init__.py new file mode 100644 index 0000000..4cd31da --- /dev/null +++ b/src/pii_detector/gui/flet_app/__init__.py @@ -0,0 +1 @@ +"""Flet-based PII Detector application package.""" diff --git a/src/pii_detector/gui/flet_app/backend_adapter.py b/src/pii_detector/gui/flet_app/backend_adapter.py new file mode 100644 index 0000000..a7614ae --- /dev/null +++ b/src/pii_detector/gui/flet_app/backend_adapter.py @@ -0,0 +1,942 @@ +"""Backend integration adapter for Flet GUI. + +This module provides a bridge between the Flet GUI and the existing PII detection +core modules, handling the conversion between GUI state and backend processing. +""" + +import os +import threading +from collections.abc import Callable +from pathlib import Path +from typing import Any + +import pandas as pd + +from pii_detector.core import processor +from pii_detector.core.anonymization import AnonymizationTechniques +from pii_detector.core.unified_processor import detect_pii_unified +from pii_detector.gui.flet_app.config.settings import ( + DetectionConfig, + DetectionResult, +) + + +class PIIDetectionAdapter: + """Adapter class to connect Flet GUI with PII detection backend.""" + + def __init__(self): + """Initialize the PII detection adapter.""" + self.anonymizer = AnonymizationTechniques() + self.current_dataset = None + self.current_label_dict = None + self.detection_results = None + + def convert_gui_config_to_backend_config( + self, detection_config: DetectionConfig, api_key: str | None = None + ) -> dict[str, Any]: + """Convert GUI detection configuration to backend configuration format. + + Args: + detection_config: GUI detection configuration + api_key: Optional GeoNames API key for location checks + + Returns: + Dictionary with backend configuration parameters + + """ + config = { + # Method enable/disable flags + "use_column_name_detection": detection_config.column_name_enabled, + "use_format_pattern_detection": detection_config.format_pattern_enabled, + "use_sparsity_detection": detection_config.sparsity_enabled, + "use_text_analysis": detection_config.ai_text_enabled, + "use_location_detection": detection_config.location_population_enabled, + # Method-specific parameters + "confidence_threshold": detection_config.confidence_threshold, + "language": detection_config.language, + "sample_size": detection_config.sample_size, + "chunk_size": detection_config.chunk_size, + "max_workers": detection_config.max_workers, + # Sparsity analysis settings + "sparsity_threshold": detection_config.sparsity_threshold, + # Location population settings + "population_threshold": detection_config.population_threshold, + # Text analysis settings + "text_analysis_mode": detection_config.text_analysis_mode, + # API credentials + "geonames_api_key": api_key, + } + + return config + + def load_dataset(self, file_path: str) -> tuple[bool, str]: + """Load a dataset file using the existing backend processor. + + Args: + file_path: Path to the dataset file + + Returns: + Tuple of (success, message) + + """ + try: + success, result = processor.import_dataset(file_path) + + if success: + self.current_dataset, _, self.current_label_dict, _ = result + return ( + True, + f"Successfully loaded dataset with {len(self.current_dataset)} rows and {len(self.current_dataset.columns)} columns", + ) + else: + return False, f"Failed to load dataset: {result}" + + except Exception as e: + return False, f"Error loading dataset: {str(e)}" + + def run_pii_detection( + self, + detection_config: DetectionConfig, + api_key: str | None = None, + progress_callback: Callable[[float, str], None] | None = None, + ) -> tuple[bool, list[DetectionResult]]: + """Run PII detection on the loaded dataset. + + Args: + detection_config: GUI detection configuration + api_key: Optional GeoNames API key + progress_callback: Optional callback for progress updates + + Returns: + Tuple of (success, detection_results) + + """ + if self.current_dataset is None: + return False, [] + + try: + # Convert GUI config to backend config + backend_config = self.convert_gui_config_to_backend_config( + detection_config, api_key + ) + + if progress_callback: + progress_callback(0.1, "Preparing analysis configuration") + + # Set up environment variables for API access + if api_key: + os.environ["GEONAMES_USERNAME"] = api_key + + if progress_callback: + progress_callback(0.2, "Running PII detection analysis") + + # Run the unified PII detection + pii_results = detect_pii_unified( + dataset=self.current_dataset, + label_dict=self.current_label_dict, + language=detection_config.language, + config=backend_config, + ) + + if progress_callback: + progress_callback(0.8, "Processing detection results") + + # Convert backend results to GUI format + gui_results = [] + for column_name, pii_result in pii_results.items(): + # Map PII entity types to human-readable format + pii_type = self._map_entity_types_to_pii_type(pii_result.entity_types) + + detection_result = DetectionResult( + column=column_name, + method=pii_result.detection_method, + confidence=pii_result.confidence, + pii_type=pii_type, + entity_types=pii_result.entity_types, + details=pii_result.details, + ) + gui_results.append(detection_result) + + self.detection_results = gui_results + + if progress_callback: + progress_callback(1.0, "Analysis completed successfully") + + return True, gui_results + + except Exception as e: + if progress_callback: + progress_callback(0.0, f"Analysis failed: {str(e)}") + return False, [] + + def _map_entity_types_to_pii_type(self, entity_types: list[str]) -> str: + """Map detected entity types to human-readable PII type. + + Args: + entity_types: List of detected entity types + + Returns: + Human-readable PII type description + + """ + if not entity_types: + return "Personal Information" + + # Priority mapping for common entity types + type_mapping = { + "PERSON": "Person Name", + "EMAIL_ADDRESS": "Email Address", + "PHONE_NUMBER": "Phone Number", + "SSN": "Social Security Number", + "CREDIT_CARD": "Credit Card Number", + "LOCATION": "Geographic Location", + "DATE_TIME": "Date/Time Information", + "IP_ADDRESS": "IP Address", + "ORGANIZATION": "Organization Name", + "IDENTIFIER": "Unique Identifier", + } + + # Return the first recognized type or a generic description + for entity_type in entity_types: + if entity_type in type_mapping: + return type_mapping[entity_type] + + return "Personal Information" + + def generate_anonymized_dataset( + self, + user_actions: dict[str, str], + progress_callback: Callable[[float, str], None] | None = None, + ) -> tuple[bool, pd.DataFrame | None]: + """Generate anonymized dataset based on user actions. + + Args: + user_actions: Dictionary mapping column names to actions (Drop/Encode/Keep) + progress_callback: Optional callback for progress updates + + Returns: + Tuple of (success, anonymized_dataframe) + + """ + if self.current_dataset is None: + return False, None + + try: + if progress_callback: + progress_callback(0.1, "Preparing anonymization") + + anonymized_df = self.current_dataset.copy() + + total_actions = len(user_actions) + for i, (column, action) in enumerate(user_actions.items()): + if column not in anonymized_df.columns: + continue + + progress = 0.2 + (0.7 * i / total_actions) + if progress_callback: + progress_callback(progress, f"Processing column: {column}") + + if action == "Drop": + # Remove the column entirely + anonymized_df = self.anonymizer.remove_variables( + anonymized_df, [column] + ) + + elif action == "Encode": + # Apply pseudonymization/encoding + if anonymized_df[column].dtype == "object": + # Text-based columns: hash pseudonymization + anonymized_df[column] = self.anonymizer.hash_pseudonymization( + anonymized_df[column], + consistent=True, + prefix=f"ANON_{column.upper()}_", + ) + else: + # Numeric columns: add noise + anonymized_df[column] = self.anonymizer.add_noise( + anonymized_df[column], + noise_type="gaussian", + noise_level=0.1, + ) + + # "Keep" action: no changes needed + + if progress_callback: + progress_callback(1.0, "Anonymization completed successfully") + + return True, anonymized_df + + except Exception as e: + if progress_callback: + progress_callback(0.0, f"Anonymization failed: {str(e)}") + return False, None + + def generate_automatic_anonymized_dataset( + self, + anonymization_method: str, + detection_results: list, + progress_callback: Callable[[float, str], None] | None = None, + ) -> tuple[bool, pd.DataFrame | None, str]: + """Generate anonymized dataset automatically using selected method. + + Args: + anonymization_method: Method to use (remove, encode, categorize, mask) + detection_results: List of DetectionResult objects + progress_callback: Optional callback for progress updates + + Returns: + Tuple of (success, anonymized_dataframe, report_text) + + """ + if self.current_dataset is None: + return False, None, "No dataset loaded" + + try: + if progress_callback: + progress_callback(0.1, "Preparing automatic anonymization") + + anonymized_df = self.current_dataset.copy() + pii_columns = [result.column for result in detection_results] + changes_log = [] + + if progress_callback: + progress_callback( + 0.2, + f"Applying '{anonymization_method}' method to {len(pii_columns)} PII columns", + ) + + if anonymization_method == "remove": + # Remove all PII columns + if progress_callback: + progress_callback(0.3, "Removing PII columns") + + anonymized_df = self.anonymizer.remove_variables( + anonymized_df, pii_columns + ) + changes_log.append( + f"Removed {len(pii_columns)} PII columns: {', '.join(pii_columns)}" + ) + + elif anonymization_method == "encode": + # Hash/pseudonymize all PII columns + if progress_callback: + progress_callback(0.3, "Encoding PII columns") + + for i, column in enumerate(pii_columns): + if column not in anonymized_df.columns: + continue + + progress = 0.3 + (0.5 * i / len(pii_columns)) + if progress_callback: + progress_callback(progress, f"Encoding column: {column}") + + if anonymized_df[column].dtype == "object": + # Text-based columns: hash pseudonymization + anonymized_df[column] = self.anonymizer.hash_pseudonymization( + anonymized_df[column], + consistent=True, + prefix=f"ANON_{column.upper()}_", + ) + changes_log.append(f"Hashed text column: {column}") + else: + # Numeric columns: add noise + anonymized_df[column] = self.anonymizer.add_noise( + anonymized_df[column], + noise_type="gaussian", + noise_level=0.1, + ) + changes_log.append(f"Added noise to numeric column: {column}") + + elif anonymization_method == "categorize": + # Intelligently categorize based on column type + if progress_callback: + progress_callback(0.3, "Categorizing PII columns") + + for i, column in enumerate(pii_columns): + if column not in anonymized_df.columns: + continue + + progress = 0.3 + (0.5 * i / len(pii_columns)) + if progress_callback: + progress_callback(progress, f"Categorizing column: {column}") + + # Try to intelligently categorize based on column name and type + column_lower = column.lower() + + if "age" in column_lower and pd.api.types.is_numeric_dtype( + anonymized_df[column] + ): + # Age categorization + anonymized_df[column] = self.anonymizer.age_categorization( + anonymized_df[column] + ) + changes_log.append(f"Categorized age column: {column}") + + elif "date" in column_lower or "time" in column_lower: + # Date generalization + try: + anonymized_df[column] = self.anonymizer.date_generalization( + pd.to_datetime(anonymized_df[column]), + granularity="month", + ) + changes_log.append( + f"Generalized date column to month: {column}" + ) + except Exception: + # If date conversion fails, just hash it + anonymized_df[column] = ( + self.anonymizer.hash_pseudonymization( + anonymized_df[column], + consistent=True, + prefix=f"ANON_{column.upper()}_", + ) + ) + changes_log.append( + f"Hashed column (date conversion failed): {column}" + ) + + elif ( + "location" in column_lower + or "address" in column_lower + or "city" in column_lower + or "state" in column_lower + ): + # Geographic generalization + anonymized_df[column] = ( + self.anonymizer.geographic_generalization( + anonymized_df[column], level="state" + ) + ) + changes_log.append( + f"Generalized location to state level: {column}" + ) + + else: + # Default: hash for text, add noise for numeric + if anonymized_df[column].dtype == "object": + anonymized_df[column] = ( + self.anonymizer.hash_pseudonymization( + anonymized_df[column], + consistent=True, + prefix=f"ANON_{column.upper()}_", + ) + ) + changes_log.append(f"Hashed column: {column}") + else: + anonymized_df[column] = self.anonymizer.add_noise( + anonymized_df[column], + noise_type="gaussian", + noise_level=0.1, + ) + changes_log.append(f"Added noise to column: {column}") + + elif anonymization_method == "mask": + # Pattern-based masking + if progress_callback: + progress_callback(0.3, "Masking PII columns") + + for i, column in enumerate(pii_columns): + if column not in anonymized_df.columns: + continue + + progress = 0.3 + (0.5 * i / len(pii_columns)) + if progress_callback: + progress_callback(progress, f"Masking column: {column}") + + if anonymized_df[column].dtype == "object": + # Text masking with pattern detection + anonymized_df[column] = self.anonymizer.text_masking( + anonymized_df[column] + ) + changes_log.append(f"Masked text patterns in column: {column}") + else: + # For numeric, convert to string and mask + anonymized_df[column] = anonymized_df[column].apply( + lambda x: "***" if pd.notna(x) else x + ) + changes_log.append(f"Masked numeric column: {column}") + + if progress_callback: + progress_callback(0.9, "Generating anonymization report") + + # Generate report + report = self._generate_anonymization_report( + anonymization_method, + pii_columns, + changes_log, + len(anonymized_df), + len(anonymized_df.columns), + ) + + if progress_callback: + progress_callback(1.0, "Anonymization completed successfully") + + return True, anonymized_df, report + + except Exception as e: + if progress_callback: + progress_callback(0.0, f"Anonymization failed: {str(e)}") + return False, None, f"Anonymization failed: {str(e)}" + + def _generate_anonymization_report( + self, + method: str, + pii_columns: list[str], + changes_log: list[str], + num_rows: int, + num_columns: int, + ) -> str: + """Generate a report documenting the anonymization process.""" + report_lines = [ + "=" * 70, + "AUTOMATIC ANONYMIZATION REPORT", + "=" * 70, + "", + f"Anonymization Method: {method.upper()}", + f"Timestamp: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + "DATASET INFORMATION", + "-" * 70, + f"Total rows: {num_rows:,}", + f"Total columns: {num_columns}", + f"PII columns processed: {len(pii_columns)}", + "", + "CHANGES APPLIED", + "-" * 70, + ] + + for change in changes_log: + report_lines.append(f"• {change}") + + report_lines.extend( + [ + "", + "AFFECTED COLUMNS", + "-" * 70, + ", ".join(pii_columns), + "", + "=" * 70, + "End of Report", + "=" * 70, + ] + ) + + return "\n".join(report_lines) + + def save_results( + self, anonymized_df: pd.DataFrame, output_path: str + ) -> tuple[bool, str]: + """Save the anonymized dataset and analysis report. + + Args: + anonymized_df: The anonymized dataset + output_path: Directory to save results + + Returns: + Tuple of (success, message) + + """ + try: + output_dir = Path(output_path) + output_dir.mkdir(parents=True, exist_ok=True) + + # Save anonymized dataset + dataset_path = output_dir / "anonymized_dataset.csv" + anonymized_df.to_csv(dataset_path, index=False) + + # Generate and save analysis report + report_path = output_dir / "pii_detection_report.txt" + self._generate_analysis_report(report_path) + + return True, f"Results saved successfully to {output_dir}" + + except Exception as e: + return False, f"Failed to save results: {str(e)}" + + def _generate_analysis_report(self, report_path: Path): + """Generate a text report of the PII detection analysis.""" + with open(report_path, "w") as f: + f.write("PII Detection Analysis Report\n") + f.write("=" * 50 + "\n\n") + + if self.current_dataset is not None: + f.write("Dataset Information:\n") + f.write(f" - Total rows: {len(self.current_dataset):,}\n") + f.write(f" - Total columns: {len(self.current_dataset.columns):,}\n\n") + + if self.detection_results: + f.write("PII Detection Results:\n") + f.write( + f" - Columns flagged as PII: {len(self.detection_results)}\n\n" + ) + + for result in self.detection_results: + f.write(f"Column: {result.column}\n") + f.write(f" - Detection Method: {result.method}\n") + f.write(f" - Confidence: {result.confidence:.2%}\n") + f.write(f" - PII Type: {result.pii_type}\n") + if result.entity_types: + f.write(f" - Entity Types: {', '.join(result.entity_types)}\n") + f.write("\n") + + f.write("Analysis completed successfully.\n") + + def generate_per_column_anonymized_dataset( + self, + column_methods: dict[str, str], + detection_results: list, + progress_callback: Callable[[float, str], None] | None = None, + ) -> tuple[bool, pd.DataFrame | None, str]: + """Generate anonymized dataset with per-column anonymization methods. + + Args: + column_methods: Dict mapping column_name -> method (remove/encode/categorize/mask/unchanged) + detection_results: List of DetectionResult objects + progress_callback: Optional callback for progress updates + + Returns: + Tuple of (success, anonymized_dataframe, report_text) + + """ + if self.current_dataset is None: + return False, None, "No dataset loaded" + + try: + if progress_callback: + progress_callback(0.1, "Preparing per-column anonymization") + + anonymized_df = self.current_dataset.copy() + changes_log = [] + pii_columns = [result.column for result in detection_results] + + # Group columns by method for efficient processing + method_groups = {} + for column in pii_columns: + method = column_methods.get(column, "remove") + if method not in method_groups: + method_groups[method] = [] + method_groups[method].append(column) + + total_columns = len(pii_columns) + processed = 0 + + # Process each method group + for method, columns in method_groups.items(): + if progress_callback: + progress_callback( + 0.2 + (0.7 * processed / total_columns), + f"Applying '{method}' to {len(columns)} columns", + ) + + if method == "unchanged": + # Skip these columns - leave them as-is + changes_log.append( + f"Unchanged (preserved original): {', '.join(columns)}" + ) + + elif method == "remove": + anonymized_df = self.anonymizer.remove_variables( + anonymized_df, columns + ) + changes_log.append(f"Removed columns: {', '.join(columns)}") + + elif method == "encode": + for column in columns: + if column not in anonymized_df.columns: + continue + if anonymized_df[column].dtype == "object": + anonymized_df[column] = ( + self.anonymizer.hash_pseudonymization( + anonymized_df[column], + consistent=True, + prefix=f"ANON_{column.upper()}_", + ) + ) + changes_log.append(f"Hashed: {column}") + else: + anonymized_df[column] = self.anonymizer.add_noise( + anonymized_df[column], + noise_type="gaussian", + noise_level=0.1, + ) + changes_log.append(f"Added noise: {column}") + + elif method == "categorize": + for column in columns: + if column not in anonymized_df.columns: + continue + column_lower = column.lower() + + # Intelligent categorization based on column patterns + if "age" in column_lower: + anonymized_df[column] = self.anonymizer.age_categorization( + anonymized_df[column] + ) + changes_log.append(f"Age categorization: {column}") + elif ( + "date" in column_lower + or "time" in column_lower + or "dob" in column_lower + ): + try: + anonymized_df[column] = ( + self.anonymizer.date_generalization( + pd.to_datetime( + anonymized_df[column], errors="coerce" + ), + granularity="month", + ) + ) + changes_log.append(f"Date generalization: {column}") + except Exception: + # Fallback to top/bottom coding + anonymized_df[column] = ( + self.anonymizer.top_bottom_coding( + anonymized_df[column] + ) + ) + changes_log.append( + f"Top/bottom coding (date conversion failed): {column}" + ) + elif any( + keyword in column_lower + for keyword in [ + "location", + "address", + "city", + "state", + "zip", + ] + ): + anonymized_df[column] = ( + self.anonymizer.geographic_generalization( + anonymized_df[column], level="state" + ) + ) + changes_log.append(f"Geographic generalization: {column}") + elif any( + keyword in column_lower + for keyword in ["income", "salary", "wage", "earnings"] + ): + anonymized_df[column] = self.anonymizer.income_bracketing( + anonymized_df[column] + ) + changes_log.append(f"Income bracketing: {column}") + else: + # Default to top/bottom coding for numeric, or generic categorization + if pd.api.types.is_numeric_dtype(anonymized_df[column]): + anonymized_df[column] = ( + self.anonymizer.top_bottom_coding( + anonymized_df[column] + ) + ) + changes_log.append(f"Top/bottom coding: {column}") + else: + # For non-numeric, hash it + anonymized_df[column] = ( + self.anonymizer.hash_pseudonymization( + anonymized_df[column], + consistent=True, + prefix=f"CAT_{column.upper()}_", + ) + ) + changes_log.append( + f"Pseudonymization (non-numeric): {column}" + ) + + elif method == "mask": + for column in columns: + if column not in anonymized_df.columns: + continue + if anonymized_df[column].dtype == "object": + # Apply text masking to each value in the column + anonymized_df[column] = anonymized_df[column].apply( + lambda x: self.anonymizer.text_masking(x) + if pd.notna(x) + else x + ) + changes_log.append(f"Text masking: {column}") + else: + # Mask numeric values with placeholder + anonymized_df[column] = anonymized_df[column].apply( + lambda x: "***" if pd.notna(x) else x + ) + changes_log.append(f"Value masking: {column}") + + processed += len(columns) + + if progress_callback: + progress_callback(0.9, "Generating anonymization report") + + # Generate report + report = self._generate_per_column_anonymization_report( + column_methods, + pii_columns, + changes_log, + len(anonymized_df), + len(anonymized_df.columns), + ) + + if progress_callback: + progress_callback(1.0, "Anonymization completed successfully") + + return True, anonymized_df, report + + except Exception as e: + error_msg = f"Anonymization failed: {str(e)}" + if progress_callback: + progress_callback(0.0, error_msg) + return False, None, error_msg + + def _generate_per_column_anonymization_report( + self, + column_methods: dict[str, str], + pii_columns: list[str], + changes_log: list[str], + num_rows: int, + num_columns: int, + ) -> str: + """Generate a detailed report documenting per-column anonymization. + + Args: + column_methods: Dict mapping column names to anonymization methods + pii_columns: List of PII column names processed + changes_log: List of change descriptions + num_rows: Number of rows in dataset + num_columns: Total number of columns in dataset + + Returns: + Formatted report text + + """ + import pandas as pd + + # Group by method for summary + method_counts = {} + for method in column_methods.values(): + method_counts[method] = method_counts.get(method, 0) + 1 + + report_lines = [ + "=" * 70, + "PER-COLUMN ANONYMIZATION REPORT", + "=" * 70, + "", + f"Timestamp: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + "DATASET INFORMATION", + "-" * 70, + f"Total rows: {num_rows:,}", + f"Total columns: {num_columns}", + f"PII columns processed: {len(pii_columns)}", + "", + "METHOD SUMMARY", + "-" * 70, + ] + + for method, count in sorted(method_counts.items()): + report_lines.append(f" {method.upper()}: {count} columns") + + report_lines.extend( + [ + "", + "PER-COLUMN METHODS", + "-" * 70, + ] + ) + + for column in sorted(pii_columns): + method = column_methods.get(column, "unknown") + report_lines.append(f" {column}: {method}") + + report_lines.extend( + [ + "", + "DETAILED CHANGES", + "-" * 70, + ] + ) + + for change in changes_log: + report_lines.append(f"• {change}") + + report_lines.extend( + [ + "", + "METHOD DESCRIPTIONS", + "-" * 70, + "• UNCHANGED: Preserves original column values (user overrode PII detection)", + "• REMOVE: Completely deletes PII columns from the dataset", + "• ENCODE: Applies hashing (text) or noise addition (numeric) to obfuscate values", + "• CATEGORIZE: Groups values into ranges or categories (age groups, income brackets, etc.)", + "• MASK: Replaces values with placeholder characters while preserving format", + "", + "=" * 70, + "End of Report", + "=" * 70, + ] + ) + + return "\n".join(report_lines) + + +class BackgroundProcessor: + """Background processor for running PII detection without blocking the GUI.""" + + def __init__(self, adapter: PIIDetectionAdapter): + """Initialize the background processor. + + Args: + adapter: PIIDetectionAdapter instance to use for detection + + """ + self.adapter = adapter + self.is_running = False + self.is_cancelled = False + + def run_analysis_async( + self, + detection_config: DetectionConfig, + api_key: str | None, + progress_callback: Callable[[float, str], None] | None = None, + completion_callback: Callable[[bool, list[DetectionResult]], None] + | None = None, + ): + """Run PII detection analysis in a background thread. + + Args: + detection_config: GUI detection configuration + api_key: Optional GeoNames API key + progress_callback: Optional callback for progress updates + completion_callback: Optional callback when analysis completes + + """ + if self.is_running: + return + + def analysis_thread(): + self.is_running = True + self.is_cancelled = False + + try: + success, results = self.adapter.run_pii_detection( + detection_config, api_key, progress_callback + ) + + if not self.is_cancelled and completion_callback: + completion_callback(success, results) + + except Exception as e: + if progress_callback: + progress_callback(0.0, f"Analysis failed: {str(e)}") + if completion_callback and not self.is_cancelled: + completion_callback(False, []) + finally: + self.is_running = False + + thread = threading.Thread(target=analysis_thread, daemon=True) + thread.start() + + def cancel_analysis(self): + """Cancel the running analysis.""" + self.is_cancelled = True + self.is_running = False diff --git a/src/pii_detector/gui/flet_app/config/__init__.py b/src/pii_detector/gui/flet_app/config/__init__.py new file mode 100644 index 0000000..7173ad8 --- /dev/null +++ b/src/pii_detector/gui/flet_app/config/__init__.py @@ -0,0 +1 @@ +"""Configuration package for the PII Detector Flet application.""" diff --git a/src/pii_detector/gui/flet_app/config/constants.py b/src/pii_detector/gui/flet_app/config/constants.py new file mode 100644 index 0000000..f5fb634 --- /dev/null +++ b/src/pii_detector/gui/flet_app/config/constants.py @@ -0,0 +1,114 @@ +"""Constants for the PII Detector Flet application. + +This module contains all color, typography, and spacing constants +following the IPA design system specifications. +""" + + +class IPAColors: + """IPA brand color palette.""" + + # Primary Brand Colors + IPA_GREEN = "#49ac57" # Primary actions, success states + DARK_GREEN = "#155240" # Sequential data, deep success + LIGHT_BLUE = "#84d0d4" # Secondary actions, hover states + DARK_BLUE = "#2b4085" # Headers, navigation, primary text + RED_ORANGE = "#f26529" # High-confidence alerts, critical actions + + # Neutral Palette + LIGHT_GREY = "#f1f2f2" # Background, card surfaces + DARK_GREY = "#c9c9c8" # Borders, secondary text + CHARCOAL = "#414042" # Primary text, icons + BLUE_ACCENT = "#ceecee" # Subtle highlights, table alternation + + # Confidence Level Indicators + HIGH_CONFIDENCE = RED_ORANGE # 0.8+ confidence scores + MED_CONFIDENCE = "#f5cb57" # 0.5-0.8 confidence scores + LOW_CONFIDENCE = DARK_GREY # <0.5 confidence scores + + # Interactive States + HOVER_COLOR = BLUE_ACCENT + ACTIVE_COLOR = IPA_GREEN + DISABLED_COLOR = DARK_GREY + + # Additional semantic colors + WHITE = "#ffffff" + SUCCESS = IPA_GREEN + WARNING = MED_CONFIDENCE + ERROR = RED_ORANGE + INFO = LIGHT_BLUE + + +class IPATypography: + """Typography system for consistent text styling.""" + + # Font families (system fonts with fallbacks) + PRIMARY_FONT = "Segoe UI, -apple-system, BlinkMacSystemFont, sans-serif" + MONOSPACE_FONT = "Consolas, Monaco, Courier New, monospace" + + # Font sizes (in pixels for Flet) + HEADER_1 = 32 # Main page titles + HEADER_2 = 24 # Section headers + HEADER_3 = 18 # Subsection titles + BODY_LARGE = 16 # Primary text, buttons + BODY_REGULAR = 14 # Secondary text, labels + BODY_SMALL = 12 # Captions, metadata + CODE_TEXT = 12 # Monospace content + + # Font weights (Flet FontWeight enum values) + LIGHT = "w300" + REGULAR = "w400" + MEDIUM = "w500" + SEMIBOLD = "w600" + BOLD = "w700" + + +class IPASpacing: + """Spacing system based on 8px grid.""" + + # Base spacing unit (8px grid system) + UNIT = 8 + + # Common spacing values + XS = UNIT // 2 # 4px - tight spacing + SM = UNIT # 8px - compact spacing + MD = UNIT * 2 # 16px - standard spacing + LG = UNIT * 3 # 24px - generous spacing + XL = UNIT * 4 # 32px - section spacing + XXL = UNIT * 6 # 48px - major section breaks + + # Component-specific spacing + CARD_PADDING = MD + BUTTON_PADDING_H = MD + BUTTON_PADDING_V = SM + INPUT_PADDING = SM + + # Border radius values + RADIUS_SM = 4 # Small elements (checkboxes, small buttons) + RADIUS_MD = 8 # Cards, input fields + RADIUS_LG = 12 # Major containers, panels + + +class AppConstants: + """Application-specific constants.""" + + # File size limits + MAX_FILE_SIZE_MB = 100 + + # Supported file formats + SUPPORTED_FORMATS = [".csv", ".xlsx", ".xls", ".dta"] + + # Screen names + SCREEN_DASHBOARD = "dashboard" + SCREEN_FILE_SELECTION = "file_selection" + SCREEN_CONFIGURATION = "configuration" + SCREEN_PROGRESS = "progress" + SCREEN_RESULTS = "results" + SCREEN_SETTINGS = "settings" + + # Detection method names + METHOD_COLUMN_NAME = "Column Name Analysis" + METHOD_FORMAT_PATTERN = "Format Pattern Detection" + METHOD_SPARSITY = "Sparsity Analysis" + METHOD_AI_TEXT = "AI Text Analysis (Presidio)" + METHOD_LOCATION_POPULATION = "Location Population Check" diff --git a/src/pii_detector/gui/flet_app/config/settings.py b/src/pii_detector/gui/flet_app/config/settings.py new file mode 100644 index 0000000..376d67e --- /dev/null +++ b/src/pii_detector/gui/flet_app/config/settings.py @@ -0,0 +1,145 @@ +"""Application settings and configuration management.""" + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +@dataclass +class DetectionConfig: + """Configuration for PII detection methods.""" + + # Method enable/disable states + column_name_enabled: bool = True + format_pattern_enabled: bool = True + sparsity_enabled: bool = True + ai_text_enabled: bool = True + location_population_enabled: bool = False + + # Column Name Detection settings + fuzzy_match_threshold: float = 0.8 + matching_type: str = "fuzzy" # strict, fuzzy, or both + + # Format Pattern Detection settings + format_confidence_threshold: float = 0.7 + detect_phone: bool = True + detect_email: bool = True + detect_ssn: bool = True + detect_dates: bool = True + + # Sparsity analysis settings + sparsity_threshold: float = 0.8 + min_entries_required: int = 10 + + # Location population settings + population_threshold: int = 50000 + + # Presidio (AI Text) settings + presidio_confidence_threshold: float = 0.8 + presidio_language_model: str = "en_core_web_sm" + presidio_detect_person: bool = True + presidio_detect_org: bool = True + + # General settings + confidence_threshold: float = 0.7 + language: str = "en" + sample_size: int = 100 + chunk_size: int = 1000 + max_workers: int = 4 + text_analysis_mode: str = "comprehensive" # quick, balanced, comprehensive + + +@dataclass +class FileInfo: + """Information about a selected file.""" + + path: Path + name: str + size_mb: float + format: str + is_valid: bool = True + validation_message: str = "" + + +@dataclass +class ValidationResult: + """Result of file validation.""" + + is_valid: bool + message: str + details: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class DetectionResult: + """Result of PII detection for a single column.""" + + column: str + method: str + confidence: float + pii_type: str + entity_types: list[str] = field(default_factory=list) + details: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class AppState: + """Central application state management.""" + + # Navigation state + current_screen: str = "dashboard" + screen_history: list[str] = field(default_factory=list) + + # File management + selected_files: list[FileInfo] = field(default_factory=list) + file_validation_results: dict[str, ValidationResult] = field(default_factory=dict) + + # Configuration state + detection_config: DetectionConfig = field(default_factory=DetectionConfig) + preset_mode: str = "balanced" # quick, balanced, thorough + + # Processing state + is_processing: bool = False + current_progress: float = 0.0 + processing_stage: str = "" + estimated_time_remaining: int | None = None + current_file: str = "" + + # Results state + detection_results: list[DetectionResult] = field(default_factory=list) + user_actions: dict[str, str] = field( + default_factory=dict + ) # column -> action mapping + + # Anonymization configuration - per-column methods + column_anonymization_methods: dict[str, str] = field( + default_factory=dict + ) # column_name -> method (remove, encode, categorize, mask) + + # UI state + panel_expansion_states: dict[str, bool] = field(default_factory=dict) + error_messages: list[str] = field(default_factory=list) + success_messages: list[str] = field(default_factory=list) + + # API keys and external service configuration + geonames_api_key: str | None = None + + +class AppSettings: + """Application settings and preferences.""" + + def __init__(self): + """Initialize application settings with default values.""" + self.window_width = 1200 + self.window_height = 800 + self.theme_mode = "light" + self.default_export_path = Path.home() / "Downloads" + self.remember_settings = True + + def save_settings(self): + """Save settings to file (implementation depends on requirements).""" + pass + + def load_settings(self): + """Load settings from file (implementation depends on requirements).""" + pass diff --git a/src/pii_detector/gui/flet_app/ui/__init__.py b/src/pii_detector/gui/flet_app/ui/__init__.py new file mode 100644 index 0000000..b7211ec --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/__init__.py @@ -0,0 +1 @@ +"""UI package for the PII Detector Flet application.""" diff --git a/src/pii_detector/gui/flet_app/ui/app.py b/src/pii_detector/gui/flet_app/ui/app.py new file mode 100644 index 0000000..ff242a7 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/app.py @@ -0,0 +1,748 @@ +"""Main application controller and state manager.""" + +import contextlib +from typing import Any + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import AppConstants, IPAColors +from pii_detector.gui.flet_app.config.settings import AppState +from pii_detector.gui.flet_app.ui.screens.configuration import ConfigurationScreen +from pii_detector.gui.flet_app.ui.screens.dashboard import DashboardScreen +from pii_detector.gui.flet_app.ui.screens.file_selection import FileSelectionScreen +from pii_detector.gui.flet_app.ui.screens.progress import ProgressScreen +from pii_detector.gui.flet_app.ui.screens.results import ResultsScreen + + +class StateManager: + """Central state management for the application.""" + + def __init__(self, page: ft.Page): + """Initialize the state manager. + + Args: + page: Flet page instance + + """ + self.page = page + self.state = AppState() + self._observers = [] + + def update_state(self, **kwargs): + """Central state update method with UI refresh. + + Args: + **kwargs: State attributes to update + + """ + for key, value in kwargs.items(): + if hasattr(self.state, key): + setattr(self.state, key, value) + + # Notify observers + self.notify_observers() + + def add_observer(self, observer): + """Add state change observer.""" + self._observers.append(observer) + + def notify_observers(self): + """Notify all observers of state changes.""" + for observer in self._observers: + if hasattr(observer, "on_state_changed"): + observer.on_state_changed(self.state) + + def navigate_to(self, screen_name: str): + """Navigate to a specific screen. + + Args: + screen_name: Name of the screen to navigate to + + """ + # Add current screen to history + if self.state.current_screen != screen_name: + self.state.screen_history.append(self.state.current_screen) + + self.update_state(current_screen=screen_name) + + def go_back(self): + """Navigate back to the previous screen.""" + if self.state.screen_history: + previous_screen = self.state.screen_history.pop() + self.update_state(current_screen=previous_screen) + + def add_error_message(self, message: str): + """Add an error message to the state.""" + errors = self.state.error_messages.copy() + errors.append(message) + self.update_state(error_messages=errors) + + def add_success_message(self, message: str): + """Add a success message to the state.""" + messages = self.state.success_messages.copy() + messages.append(message) + self.update_state(success_messages=messages) + + def clear_messages(self): + """Clear all messages.""" + self.update_state(error_messages=[], success_messages=[]) + + def add_file(self, file_info): + """Add a file to the selected files list. + + Args: + file_info: FileInfo object containing file details + + """ + files = self.state.selected_files.copy() + files.append(file_info) + self.update_state(selected_files=files) + + def remove_file(self, file_path): + """Remove a file from the selected files list. + + Args: + file_path: Path of the file to remove + + """ + files = [f for f in self.state.selected_files if f.path != file_path] + self.update_state(selected_files=files) + + def clear_files(self): + """Clear all selected files.""" + self.update_state(selected_files=[]) + + def add_detection_result(self, result): + """Add a detection result to the state. + + Args: + result: DetectionResult object + + """ + results = self.state.detection_results.copy() + results.append(result) + self.update_state(detection_results=results) + + def set_user_action(self, column_name: str, action: str): + """Set user action for a column. + + Args: + column_name: Name of the column + action: Action to perform (remove, encode, mask, keep, etc.) + + """ + actions = self.state.user_actions.copy() + actions[column_name] = action + self.update_state(user_actions=actions) + + def update_progress( + self, + progress: float = None, + stage: str = None, + current_file: str = None, + estimated_time_remaining: int = None, + ): + """Update processing progress. + + Args: + progress: Progress value (0.0 to 1.0) + stage: Current processing stage description + current_file: Name of file currently being processed + estimated_time_remaining: Estimated seconds remaining + + """ + updates = {} + if progress is not None: + updates["current_progress"] = progress + if stage is not None: + updates["processing_stage"] = stage + if current_file is not None: + updates["current_file"] = current_file + if estimated_time_remaining is not None: + updates["estimated_time_remaining"] = estimated_time_remaining + + self.update_state(**updates) + + def set_processing(self, is_processing: bool): + """Set processing state. + + Args: + is_processing: True if processing is active, False otherwise + + """ + self.update_state(is_processing=is_processing) + + def set_api_key(self, api_key: str | None): + """Set GeoNames API key. + + Args: + api_key: API key string or None to clear + + """ + self.update_state(geonames_api_key=api_key) + + def reset_state(self): + """Reset state to initial defaults.""" + self.state = AppState() + self.notify_observers() + + +class PIIDetectorApp: + """Main application class.""" + + def __init__(self, page: ft.Page): + """Initialize the main application. + + Args: + page: Flet page instance + + """ + self.page = page + self.state_manager = StateManager(page) + self.screens: dict[str, Any] = {} + self.current_screen_widget = None + + # Initialize screens + self._initialize_screens() + + def _initialize_screens(self): + """Initialize all application screens.""" + self.screens = { + AppConstants.SCREEN_DASHBOARD: DashboardScreen( + self.page, self.state_manager + ), + AppConstants.SCREEN_FILE_SELECTION: FileSelectionScreen( + self.page, self.state_manager + ), + AppConstants.SCREEN_CONFIGURATION: ConfigurationScreen( + self.page, self.state_manager + ), + AppConstants.SCREEN_PROGRESS: ProgressScreen(self.page, self.state_manager), + AppConstants.SCREEN_RESULTS: ResultsScreen(self.page, self.state_manager), + } + + # Add state manager as observer for each screen + for screen in self.screens.values(): + self.state_manager.add_observer(screen) + + def initialize(self): + """Initialize the application and show the first screen.""" + # Add this app as an observer to the state manager + self.state_manager.add_observer(self) + + # Create header bar + self.header_bar = self._create_header_bar() + + # Create main content area + self.main_content = ft.Container( + expand=True, + padding=0, + ) + + # Create main layout + self.page.add( + ft.Column( + [ + self.header_bar, + self.main_content, + ], + spacing=0, + expand=True, + ) + ) + + # Show initial screen + self._show_screen(AppConstants.SCREEN_DASHBOARD) + + # Update the page + self.page.update() + + def _create_header_bar(self) -> ft.Container: + """Create the application header bar.""" + return ft.Container( + content=ft.Row( + [ + # Left side - App title and icon + ft.Row( + [ + ft.Icon( + ft.Icons.SHIELD, + size=24, + color=IPAColors.WHITE, + ), + ft.Text( + "IPA PII Detector v3.0", + size=16, + weight=ft.FontWeight.W_600, + color=IPAColors.WHITE, + ), + ], + spacing=8, + ), + # Right side - Settings and navigation + ft.Row( + [ + # Back button (only show if there's history) + ft.IconButton( + icon=ft.Icons.ARROW_BACK, + tooltip="Go Back", + icon_color=IPAColors.WHITE, + on_click=lambda e: self.state_manager.go_back(), + visible=len(self.state_manager.state.screen_history) + > 0, + ), + # Settings button + ft.IconButton( + icon=ft.Icons.SETTINGS, + tooltip="Settings", + icon_color=IPAColors.WHITE, + on_click=self._handle_settings_click, + bgcolor=IPAColors.IPA_GREEN, + style=ft.ButtonStyle( + shape=ft.RoundedRectangleBorder(radius=6), + ), + ), + ], + spacing=8, + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + height=60, + bgcolor=IPAColors.DARK_BLUE, + padding=ft.padding.symmetric(horizontal=24, vertical=8), + ) + + def _handle_settings_click(self, e): + """Handle settings button click.""" + with contextlib.suppress(Exception): + # Silently handle any errors + self._show_settings() + + def _show_settings(self): + """Show settings dialog with actual functionality.""" + # # print("DEBUG: Settings button clicked!") # Debug output + + def close_settings(e): + self.page.close(settings_dialog) + # # print("DEBUG: Settings dialog closed") + + def reset_app(e): + # print("DEBUG: Reset Application clicked!") + # Reset application state + self.state_manager.update_state( + selected_files=[], + file_validation_results={}, + detection_results=[], + user_actions={}, + error_messages=[], + success_messages=[], + ) + self.state_manager.navigate_to(AppConstants.SCREEN_DASHBOARD) + self.state_manager.add_success_message("Application reset successfully") + # print("DEBUG: Application state reset completed") + close_settings(e) + + def show_about(e): + close_settings(e) + self._show_about_dialog() + + def show_export_location(e): + # print("DEBUG: Export Location clicked!") + close_settings(e) + self._show_export_location_dialog() + + settings_content = ft.Column( + [ + ft.Text("Application Settings", size=16, weight=ft.FontWeight.W_600), + ft.Divider(), + ft.ListTile( + leading=ft.Icon(ft.Icons.INFO), + title=ft.Text("About PII Detector"), + subtitle=ft.Text("Version information and credits"), + on_click=show_about, + ), + ft.ListTile( + leading=ft.Icon(ft.Icons.REFRESH), + title=ft.Text("Reset Application"), + subtitle=ft.Text("Clear all data and return to dashboard"), + on_click=reset_app, + ), + ft.ListTile( + leading=ft.Icon(ft.Icons.FOLDER), + title=ft.Text("Export Location"), + subtitle=ft.Text("Default: Downloads folder"), + trailing=ft.Icon(ft.Icons.CHEVRON_RIGHT), + on_click=show_export_location, + ), + ] + ) + + settings_dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Settings"), + content=ft.Container( + content=settings_content, + width=400, + height=300, + ), + actions=[ + ft.TextButton("Close", on_click=close_settings), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + # Try the standard Flet dialog method + self.page.open(settings_dialog) + # print("DEBUG: Dialog opened with page.open() method") + + def _show_about_dialog(self): + """Show about dialog.""" + + def close_about(e): + self.page.close(about_dialog) + # print("DEBUG: About dialog closed") + + about_content = ft.Column( + [ + ft.Icon(ft.Icons.SHIELD, size=48, color=IPAColors.IPA_GREEN), + ft.Text("PII Detector", size=20, weight=ft.FontWeight.BOLD), + ft.Text( + "Version 3.0 (Flet Edition)", size=14, color=IPAColors.DARK_GREY + ), + ft.Divider(), + ft.Text( + "A professional tool for identifying and anonymizing personally identifiable information (PII) in research datasets.", + text_align=ft.TextAlign.CENTER, + ), + ft.Text( + "Built by IPA Global Research and Data Science", + size=12, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=8, + ) + + about_dialog = ft.AlertDialog( + modal=True, + title=ft.Text("About PII Detector"), + content=ft.Container( + content=about_content, + width=350, + height=200, + ), + actions=[ + ft.TextButton("Close", on_click=close_about), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(about_dialog) + # print("DEBUG: About dialog opened") + + def _show_export_location_dialog(self): + """Show export location selection dialog.""" + import os + + def close_export_dialog(e): + self.page.close(export_dialog) + # print("DEBUG: Export location dialog closed") + + def handle_folder_result(e: ft.FilePickerResultEvent): + if e.path: + # Update the display with the new path + location_display.content = ft.Text( + e.path, + size=12, + color=IPAColors.DARK_GREY, + ) + location_display.update() # Force update the container + + # Show success message using snack_bar + self.page.snack_bar = ft.SnackBar( + content=ft.Text(f"Export location updated to: {e.path}"), + action="OK", + ) + self.page.snack_bar.open = True + self.page.update() + + def browse_folder(e): + # print("DEBUG: Browse folder clicked - opening folder picker") + folder_picker.get_directory_path(dialog_title="Select Export Folder") + + # Create folder picker + folder_picker = ft.FilePicker(on_result=handle_folder_result) + + # Add folder picker to page overlay if not already added + if folder_picker not in self.page.overlay: + self.page.overlay.append(folder_picker) + + current_location = os.path.join(os.path.expanduser("~"), "Downloads") + + # Create the location display container that can be updated + location_display = ft.Container( + content=ft.Text( + current_location, + size=12, + color=IPAColors.DARK_GREY, + ), + padding=ft.padding.all(10), + bgcolor=IPAColors.LIGHT_GREY, + border_radius=5, + ) + + export_content = ft.Column( + [ + ft.Text( + "Export Location Settings", size=16, weight=ft.FontWeight.W_600 + ), + ft.Divider(), + ft.Text("Current export location:", size=14), + location_display, + ft.Text("All processed files and reports will be saved here.", size=12), + ft.ElevatedButton( + text="Browse Folder", + icon=ft.Icons.FOLDER_OPEN, + on_click=browse_folder, + style=ft.ButtonStyle( + bgcolor=IPAColors.IPA_GREEN, + color=IPAColors.WHITE, + ), + ), + ], + spacing=10, + ) + + export_dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Export Location"), + content=ft.Container( + content=export_content, + width=400, + height=250, + ), + actions=[ + ft.TextButton("Close", on_click=close_export_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(export_dialog) + # print("DEBUG: Export location dialog opened") + + # Create the text field + geonames_api_field = ft.TextField( + label="GeoNames API Key", + hint_text="Enter your GeoNames username", + width=350, + value="", + ) + + def close_custom_modal(e=None): + # Remove the modal from overlay + with contextlib.suppress(Exception): + self.page.overlay.remove(modal_overlay) + self.page.update() + # print("DEBUG: Custom modal closed") + + def save_api_keys(e): + geonames_key = geonames_api_field.value + + if geonames_key and geonames_key.strip(): + close_custom_modal() + + # Show success message + self.page.snack_bar = ft.SnackBar( + content=ft.Text("API key saved successfully!"), action="OK" + ) + self.page.snack_bar.open = True + self.page.update() + else: + self.page.snack_bar = ft.SnackBar( + content=ft.Text("Please enter a valid API key"), action="OK" + ) + self.page.snack_bar.open = True + self.page.update() + + def test_api_key(e): + geonames_key = geonames_api_field.value + + if not geonames_key or not geonames_key.strip(): + self.page.snack_bar = ft.SnackBar( + content=ft.Text("Please enter an API key first"), action="OK" + ) + self.page.snack_bar.open = True + self.page.update() + return + + self.page.snack_bar = ft.SnackBar( + content=ft.Text("API key test - functionality coming soon"), action="OK" + ) + self.page.snack_bar.open = True + self.page.update() + + # Create custom modal overlay + modal_content = ft.Container( + content=ft.Column( + [ + # Title + ft.Text( + "API Keys Configuration", + size=18, + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ), + ft.Divider(), + # Content + ft.Text( + "GeoNames API Configuration", + size=14, + weight=ft.FontWeight.W_500, + ), + ft.Text( + "Required for Location Population Checks", + size=12, + color=IPAColors.DARK_GREY, + ), + # Text field + geonames_api_field, + # Instructions + ft.Container( + content=ft.Column( + [ + ft.Text( + "How to get your API key:", + size=12, + weight=ft.FontWeight.W_500, + ), + ft.Text("1. Register at: geonames.org/login", size=11), + ft.Text("2. Your username is your API key", size=11), + ft.Text( + "3. Free accounts: 1,000 requests/hour", size=11 + ), + ] + ), + padding=10, + bgcolor=IPAColors.LIGHT_GREY, + border_radius=5, + ), + # Add some spacing before buttons + ft.Container(height=20), + # Buttons + ft.Row( + [ + ft.TextButton("Close", on_click=close_custom_modal), + ft.ElevatedButton("Test", on_click=test_api_key), + ft.ElevatedButton( + "Save", + on_click=save_api_keys, + style=ft.ButtonStyle( + bgcolor=IPAColors.IPA_GREEN, + color=IPAColors.WHITE, + ), + ), + ], + alignment=ft.MainAxisAlignment.END, + ), + ], + spacing=15, + ), + padding=25, + width=500, + height=500, + bgcolor=IPAColors.WHITE, + border_radius=10, + border=ft.border.all(2, IPAColors.DARK_GREY), + shadow=ft.BoxShadow( + spread_radius=1, blur_radius=15, color=ft.colors.BLACK54 + ), + ) + + # Create semi-transparent background + modal_overlay = ft.Container( + content=ft.Stack( + [ + # Background overlay (semi-transparent) + ft.Container( + width=self.page.window_width or 1200, + height=self.page.window_height or 800, + bgcolor=ft.colors.BLACK54, + on_click=close_custom_modal, # Click outside to close + ), + # Centered modal content + ft.Container( + content=modal_content, + alignment=ft.alignment.center, + width=self.page.window_width or 1200, + height=self.page.window_height or 800, + ), + ] + ), + expand=True, + ) + + # Add to overlay + self.page.overlay.append(modal_overlay) + self.page.update() + + # Try to focus the text field + def focus_field(): + try: + geonames_api_field.focus() + self.page.update() + pass + except Exception: + pass + + import threading + + threading.Timer(0.2, focus_field).start() + + # print("DEBUG: Custom modal overlay created and added") + + def _show_screen(self, screen_name: str): + """Show a specific screen. + + Args: + screen_name: Name of the screen to show + + """ + if screen_name in self.screens: + # Update back button visibility + back_button = self.header_bar.content.controls[1].controls[0] + back_button.visible = len(self.state_manager.state.screen_history) > 0 + + # Get the screen widget + screen = self.screens[screen_name] + screen_widget = screen.build() + + # Update main content + self.main_content.content = screen_widget + self.current_screen_widget = screen_widget + + # Update state without triggering observer notifications (to avoid loops) + self.state_manager.state.current_screen = screen_name + + # Update the page + self.page.update() + + def on_state_changed(self, state: AppState): + """Handle state changes from the state manager. + + Args: + state: The updated application state + + """ + # Check if we need to navigate to a different screen + current_screen_name = getattr(self.current_screen_widget, "_screen_name", None) + if state.current_screen != current_screen_name: + self._show_screen(state.current_screen) + + # Update back button visibility + if hasattr(self, "header_bar") and self.header_bar: + back_button = self.header_bar.content.controls[1].controls[0] + back_button.visible = len(state.screen_history) > 0 + self.page.update() diff --git a/src/pii_detector/gui/flet_app/ui/components/__init__.py b/src/pii_detector/gui/flet_app/ui/components/__init__.py new file mode 100644 index 0000000..cbd9024 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/components/__init__.py @@ -0,0 +1 @@ +"""Components package for reusable UI elements.""" diff --git a/src/pii_detector/gui/flet_app/ui/components/buttons.py b/src/pii_detector/gui/flet_app/ui/components/buttons.py new file mode 100644 index 0000000..4ec6633 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/components/buttons.py @@ -0,0 +1,243 @@ +"""Custom button components following the IPA design system.""" + +from collections.abc import Callable + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import ( + IPAColors, + IPASpacing, + IPATypography, +) + + +def create_primary_button( + text: str, + on_click: Callable | None = None, + icon: str | None = None, + width: float | None = None, + disabled: bool = False, +) -> ft.ElevatedButton: + """Create a primary action button. + + Args: + text: Button text + on_click: Click handler function + icon: Optional icon name + width: Optional button width + disabled: Whether button is disabled + + Returns: + Flet ElevatedButton with primary styling + + """ + return ft.ElevatedButton( + text=text, + icon=icon, + on_click=on_click, + width=width, + height=44, # Standard button height + disabled=disabled, + style=ft.ButtonStyle( + bgcolor=IPAColors.IPA_GREEN if not disabled else IPAColors.DISABLED_COLOR, + color=IPAColors.WHITE, + overlay_color=IPAColors.DARK_GREEN, + elevation=2, + text_style=ft.TextStyle( + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_500, + ), + padding=ft.padding.symmetric( + horizontal=IPASpacing.BUTTON_PADDING_H, + vertical=IPASpacing.BUTTON_PADDING_V, + ), + shape=ft.RoundedRectangleBorder(radius=IPASpacing.RADIUS_SM), + ), + ) + + +def create_secondary_button( + text: str, + on_click: Callable | None = None, + icon: str | None = None, + width: float | None = None, + disabled: bool = False, +) -> ft.OutlinedButton: + """Create a secondary action button. + + Args: + text: Button text + on_click: Click handler function + icon: Optional icon name + width: Optional button width + disabled: Whether button is disabled + + Returns: + Flet OutlinedButton with secondary styling + + """ + return ft.OutlinedButton( + text=text, + icon=icon, + on_click=on_click, + width=width, + height=44, + disabled=disabled, + style=ft.ButtonStyle( + color=IPAColors.IPA_GREEN if not disabled else IPAColors.DISABLED_COLOR, + bgcolor=IPAColors.WHITE, + overlay_color=IPAColors.BLUE_ACCENT, + side=ft.BorderSide( + color=IPAColors.IPA_GREEN if not disabled else IPAColors.DISABLED_COLOR, + width=2, + ), + text_style=ft.TextStyle( + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_500, + ), + padding=ft.padding.symmetric( + horizontal=IPASpacing.BUTTON_PADDING_H, + vertical=IPASpacing.BUTTON_PADDING_V, + ), + shape=ft.RoundedRectangleBorder(radius=IPASpacing.RADIUS_SM), + ), + ) + + +def create_danger_button( + text: str, + on_click: Callable | None = None, + icon: str | None = None, + width: float | None = None, + disabled: bool = False, +) -> ft.ElevatedButton: + """Create a danger/warning action button. + + Args: + text: Button text + on_click: Click handler function + icon: Optional icon name + width: Optional button width + disabled: Whether button is disabled + + Returns: + Flet ElevatedButton with danger styling + + """ + return ft.ElevatedButton( + text=text, + icon=icon, + on_click=on_click, + width=width, + height=44, + disabled=disabled, + style=ft.ButtonStyle( + bgcolor=IPAColors.RED_ORANGE if not disabled else IPAColors.DISABLED_COLOR, + color=IPAColors.WHITE, + overlay_color=IPAColors.RED_ORANGE + "CC", # Darker overlay + elevation=2, + text_style=ft.TextStyle( + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_500, + ), + padding=ft.padding.symmetric( + horizontal=IPASpacing.BUTTON_PADDING_H, + vertical=IPASpacing.BUTTON_PADDING_V, + ), + shape=ft.RoundedRectangleBorder(radius=IPASpacing.RADIUS_SM), + ), + ) + + +def create_icon_button( + icon: str, + tooltip: str, + on_click: Callable | None = None, + color: str = IPAColors.CHARCOAL, + disabled: bool = False, +) -> ft.IconButton: + """Create an icon-only button. + + Args: + icon: Material Design icon name + tooltip: Button tooltip text + on_click: Click handler function + color: Icon color + disabled: Whether button is disabled + + Returns: + Flet IconButton with styling + + """ + return ft.IconButton( + icon=icon, + tooltip=tooltip, + on_click=on_click, + icon_color=color if not disabled else IPAColors.DISABLED_COLOR, + icon_size=20, + disabled=disabled, + ) + + +def create_action_button_group(column_name: str, action_handlers: dict) -> ft.Row: + """Create a group of action buttons for PII column actions. + + Args: + column_name: Name of the column + action_handlers: Dictionary of action handlers + + Returns: + Flet Row with action buttons + + """ + return ft.Row( + [ + ft.ElevatedButton( + "Keep", + icon=ft.Icons.CHECK_CIRCLE, + on_click=lambda e: action_handlers.get("keep", lambda x: None)( + column_name + ), + style=ft.ButtonStyle( + bgcolor=IPAColors.DARK_GREY, + color=IPAColors.WHITE, + text_style=ft.TextStyle(size=IPATypography.BODY_SMALL), + padding=ft.padding.symmetric(horizontal=8, vertical=4), + shape=ft.RoundedRectangleBorder(radius=IPASpacing.RADIUS_SM), + ), + height=32, + ), + ft.ElevatedButton( + "Anonymize", + icon=ft.Icons.LOCK, + on_click=lambda e: action_handlers.get("anonymize", lambda x: None)( + column_name + ), + style=ft.ButtonStyle( + bgcolor=IPAColors.IPA_GREEN, + color=IPAColors.WHITE, + text_style=ft.TextStyle(size=IPATypography.BODY_SMALL), + padding=ft.padding.symmetric(horizontal=8, vertical=4), + shape=ft.RoundedRectangleBorder(radius=IPASpacing.RADIUS_SM), + ), + height=32, + ), + ft.ElevatedButton( + "Remove", + icon=ft.Icons.DELETE, + on_click=lambda e: action_handlers.get("remove", lambda x: None)( + column_name + ), + style=ft.ButtonStyle( + bgcolor=IPAColors.RED_ORANGE, + color=IPAColors.WHITE, + text_style=ft.TextStyle(size=IPATypography.BODY_SMALL), + padding=ft.padding.symmetric(horizontal=8, vertical=4), + shape=ft.RoundedRectangleBorder(radius=IPASpacing.RADIUS_SM), + ), + height=32, + ), + ], + spacing=IPASpacing.XS, + tight=True, + ) diff --git a/src/pii_detector/gui/flet_app/ui/components/cards.py b/src/pii_detector/gui/flet_app/ui/components/cards.py new file mode 100644 index 0000000..428bce9 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/components/cards.py @@ -0,0 +1,219 @@ +"""Reusable card components following the IPA design system.""" + +from collections.abc import Callable + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import ( + IPAColors, + IPASpacing, + IPATypography, +) + + +def create_action_card( + title: str, + description: str, + icon: str, + on_click_handler: Callable | None = None, + enabled: bool = True, +) -> ft.Container: + """Create an action card for the dashboard. + + Args: + title: Card title text + description: Card description text + icon: Material Design icon name + on_click_handler: Optional click handler function + enabled: Whether the card is interactive + + Returns: + Flet Container with action card styling + + """ + # Determine colors based on enabled state + bg_color = IPAColors.LIGHT_GREY if enabled else IPAColors.DISABLED_COLOR + border_color = IPAColors.DARK_GREY if enabled else IPAColors.DISABLED_COLOR + text_color = IPAColors.CHARCOAL if enabled else IPAColors.DARK_GREY + + def handle_hover(e): + if enabled: + if e.data == "true": # Mouse enter + card.bgcolor = IPAColors.BLUE_ACCENT + card.border = ft.border.all(2, IPAColors.IPA_GREEN) + else: # Mouse leave + card.bgcolor = IPAColors.LIGHT_GREY + card.border = ft.border.all(2, IPAColors.DARK_GREY) + card.update() + + card = ft.Container( + content=ft.Column( + [ + # Icon container + ft.Container( + content=ft.Icon(icon, size=24, color=IPAColors.WHITE), + width=60, + height=60, + bgcolor=IPAColors.IPA_GREEN if enabled else IPAColors.DARK_GREY, + border_radius=30, + alignment=ft.alignment.center, + ), + # Title text + ft.Text( + title, + size=IPATypography.HEADER_3, + weight=ft.FontWeight.W_600, + color=text_color, + text_align=ft.TextAlign.CENTER, + ), + # Description text + ft.Text( + description, + size=IPATypography.BODY_SMALL, + color=text_color, + text_align=ft.TextAlign.CENTER, + max_lines=4, + overflow=ft.TextOverflow.ELLIPSIS, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=IPASpacing.MD, + ), + width=200, + height=220, + padding=IPASpacing.XL, + bgcolor=bg_color, + border=ft.border.all(2, border_color), + border_radius=IPASpacing.RADIUS_LG, + on_click=on_click_handler if enabled else None, + on_hover=handle_hover if enabled else None, + tooltip=title if enabled else "Feature disabled", + ) + + return card + + +def create_metric_card( + title: str, + value: str, + subtitle: str | None = None, + color: str = IPAColors.IPA_GREEN, + icon: str | None = None, +) -> ft.Container: + """Create a metric display card. + + Args: + title: Metric title + value: Metric value (number or text) + subtitle: Optional subtitle text + color: Color theme for the card + icon: Optional icon name + + Returns: + Flet Container with metric card styling + + """ + content_items = [] + + # Add icon if provided + if icon: + content_items.append( + ft.Icon( + icon, + size=32, + color=color, + ) + ) + + # Value text (large) + content_items.append( + ft.Text( + value, + size=IPATypography.HEADER_2, + weight=ft.FontWeight.BOLD, + color=color, + ) + ) + + # Title text + content_items.append( + ft.Text( + title, + size=IPATypography.BODY_REGULAR, + weight=ft.FontWeight.W_500, + color=IPAColors.CHARCOAL, + ) + ) + + # Subtitle if provided + if subtitle: + content_items.append( + ft.Text( + subtitle, + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ) + ) + + return ft.Container( + content=ft.Column( + content_items, + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=IPASpacing.SM, + ), + padding=IPASpacing.MD, + bgcolor=IPAColors.WHITE, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_MD, + alignment=ft.alignment.center, + ) + + +def create_status_card( + title: str, status: str, details: str, status_color: str = IPAColors.IPA_GREEN +) -> ft.Container: + """Create a system status card. + + Args: + title: Status category title + status: Status text (e.g., "Active", "Warning") + details: Additional status details + status_color: Color for the status indicator + + Returns: + Flet Container with status card styling + + """ + return ft.Container( + content=ft.Row( + [ + # Status indicator dot + ft.Container( + width=12, + height=12, + bgcolor=status_color, + border_radius=6, + ), + # Status content + ft.Column( + [ + ft.Text( + title, + size=IPATypography.BODY_REGULAR, + weight=ft.FontWeight.W_500, + color=IPAColors.CHARCOAL, + ), + ft.Text( + f"{status} - {details}", + size=IPATypography.BODY_SMALL, + color=IPAColors.CHARCOAL, + ), + ], + spacing=2, + ), + ], + spacing=IPASpacing.SM, + alignment=ft.MainAxisAlignment.START, + ), + padding=IPASpacing.SM, + ) diff --git a/src/pii_detector/gui/flet_app/ui/screens/__init__.py b/src/pii_detector/gui/flet_app/ui/screens/__init__.py new file mode 100644 index 0000000..02d647c --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/screens/__init__.py @@ -0,0 +1 @@ +"""Screens package for application screens.""" diff --git a/src/pii_detector/gui/flet_app/ui/screens/configuration.py b/src/pii_detector/gui/flet_app/ui/screens/configuration.py new file mode 100644 index 0000000..c99c353 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/screens/configuration.py @@ -0,0 +1,1048 @@ +"""Configuration screen for PII detection settings.""" + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import ( + AppConstants, + IPAColors, + IPASpacing, + IPATypography, +) +from pii_detector.gui.flet_app.config.settings import AppState, DetectionConfig +from pii_detector.gui.flet_app.ui.components.buttons import ( + create_primary_button, + create_secondary_button, +) + + +class ConfigurationScreen: + """Configuration screen implementation.""" + + def __init__(self, page: ft.Page, state_manager): + """Initialize the configuration screen. + + Args: + page: Flet page instance + state_manager: Application state manager + + """ + self.page = page + self.state_manager = state_manager + self._screen_name = AppConstants.SCREEN_CONFIGURATION + + # Preset buttons for detection configuration + self.preset_quick = ft.ElevatedButton( + text="Quick", + style=ft.ButtonStyle( + bgcolor=IPAColors.LIGHT_GREY, color=IPAColors.CHARCOAL + ), + on_click=lambda e: self._set_preset("quick"), + ) + self.preset_balanced = ft.ElevatedButton( + text="Balanced", + style=ft.ButtonStyle(bgcolor=IPAColors.IPA_GREEN, color=IPAColors.WHITE), + on_click=lambda e: self._set_preset("balanced"), + ) + self.preset_thorough = ft.ElevatedButton( + text="Thorough", + style=ft.ButtonStyle( + bgcolor=IPAColors.LIGHT_GREY, color=IPAColors.CHARCOAL + ), + on_click=lambda e: self._set_preset("thorough"), + ) + + # Detection method expandable sections + self.column_name_expanded = False + self.format_pattern_expanded = False + self.sparsity_expanded = False + self.location_expanded = False + self.presidio_expanded = False + + # Detection method checkboxes (now part of expandable sections) + self.column_name_check = ft.Checkbox(value=True) + self.format_pattern_check = ft.Checkbox(value=True) + self.sparsity_check = ft.Checkbox(value=True) + self.location_check = ft.Checkbox(value=False) + self.presidio_check = ft.Checkbox(value=False) + + # Column Name Detection controls + self.fuzzy_threshold = None + self.matching_dropdown = None + self.fuzzy_value_text = None + + # Format Pattern Detection controls + self.format_confidence_slider = None + self.format_confidence_value_text = None + self.phone_checkbox = None + self.email_checkbox = None + self.ssn_checkbox = None + self.date_checkbox = None + + # Sparsity Analysis controls + self.uniqueness_slider = None + self.min_entries_slider = None + self.uniqueness_value_text = None + self.min_entries_value_text = None + + # Location Population controls + self.population_slider = None + self.population_value_text = None + + # Presidio controls + self.presidio_confidence_slider = None + self.presidio_confidence_value_text = None + self.presidio_language_dropdown = None + self.presidio_person_checkbox = None + self.presidio_org_checkbox = None + + def build(self) -> ft.Container: + """Build the configuration screen.""" + # Create the detection methods container + self.detection_methods_container = ft.Column(spacing=IPASpacing.SM) + self._build_all_detection_methods() + + return ft.Container( + content=ft.Column( + [ + # Title + ft.Text( + "Detection Configuration", + size=IPATypography.HEADER_2, + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ), + ft.Text( + "Configure PII detection methods for your analysis. You'll select anonymization methods for each detected column on the Results screen.", + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + ), + # Detection Methods Section + ft.Container( + content=ft.Column( + [ + ft.Text( + "Detection Configuration", + size=IPATypography.HEADER_3, + weight=ft.FontWeight.W_500, + color=IPAColors.CHARCOAL, + ), + # Preset buttons + ft.Row( + [ + self.preset_quick, + self.preset_balanced, + self.preset_thorough, + ], + alignment=ft.MainAxisAlignment.CENTER, + spacing=IPASpacing.SM, + ), + # Expandable detection method sections + self.detection_methods_container, + ], + spacing=IPASpacing.SM, + ), + padding=IPASpacing.MD, + bgcolor=IPAColors.WHITE, + border=ft.border.all(1, IPAColors.LIGHT_GREY), + border_radius=IPASpacing.RADIUS_MD, + ), + # Action buttons + ft.Row( + [ + create_secondary_button( + text="Back to Files", + on_click=lambda e: self.state_manager.navigate_to( + AppConstants.SCREEN_FILE_SELECTION + ), + icon=ft.Icons.ARROW_BACK, + ), + create_primary_button( + text="Start Analysis", + on_click=self._handle_start_analysis, + icon=ft.Icons.PLAY_ARROW, + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + ], + spacing=IPASpacing.LG, + scroll=ft.ScrollMode.AUTO, + ), + padding=IPASpacing.XL, + expand=True, + ) + + def _build_detection_method_section( + self, + method_id: str, + title: str, + description: str, + checkbox: ft.Checkbox, + is_expanded: bool, + ): + """Build an expandable detection method section.""" + + def toggle_expand(e): + # Toggle the expansion state + if method_id == "column_name": + self.column_name_expanded = not self.column_name_expanded + elif method_id == "format_pattern": + self.format_pattern_expanded = not self.format_pattern_expanded + elif method_id == "sparsity": + self.sparsity_expanded = not self.sparsity_expanded + elif method_id == "location": + self.location_expanded = not self.location_expanded + elif method_id == "presidio": + self.presidio_expanded = not self.presidio_expanded + + # Rebuild the detection methods container + self._build_all_detection_methods() + self.page.update() + + # Header with checkbox and toggle + header = ft.Container( + content=ft.Row( + [ + ft.Row( + [ + checkbox, + ft.Text( + title, + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_500, + color=IPAColors.CHARCOAL, + ), + ], + spacing=IPASpacing.XS, + ), + ft.IconButton( + icon=ft.Icons.EXPAND_MORE + if not is_expanded + else ft.Icons.EXPAND_LESS, + icon_color=IPAColors.DARK_GREY, + on_click=toggle_expand, + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.LIGHT_GREY, + border_radius=IPASpacing.RADIUS_SM, + ) + + # Expanded content with detailed settings + expanded_content = None + if is_expanded: + expanded_content = ft.Container( + content=ft.Column( + [ + ft.Text( + description, + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ), + self._get_method_settings(method_id), + ], + spacing=IPASpacing.SM, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.WHITE, + border=ft.border.all(1, IPAColors.LIGHT_GREY), + border_radius=IPASpacing.RADIUS_SM, + ) + + # Return the complete section + section_content = [header] + if expanded_content: + section_content.append(expanded_content) + + return ft.Column(section_content, spacing=IPASpacing.XS) + + def _build_all_detection_methods(self): + """Build all detection method sections and update the container.""" + methods = [ + ( + "column_name", + "Column Name/Label Analysis", + "Analyzes column headers against restricted word lists for data collection variables, location identifiers, personal identifiers, and sensitive account information.", + self.column_name_check, + self.column_name_expanded, + ), + ( + "format_pattern", + "Format Pattern Detection", + "Identifies structured data patterns like phone numbers, emails, dates, and social security numbers using regex patterns.", + self.format_pattern_check, + self.format_pattern_expanded, + ), + ( + "sparsity", + "Sparsity Analysis", + "Flags columns where most values are unique, indicating potential open-ended responses or identifiers.", + self.sparsity_check, + self.sparsity_expanded, + ), + ( + "presidio", + "AI-Powered Presidio Engine", + "Uses Microsoft's Presidio ML models for advanced entity recognition and context-aware PII detection.", + self.presidio_check, + self.presidio_expanded, + ), + ( + "location", + "Location Population Checks (GeoNames API Required)", + "Cross-references location names against population databases to identify small communities (requires API access).", + self.location_check, + self.location_expanded, + ), + ] + + # Clear existing controls + self.detection_methods_container.controls.clear() + + # Add all detection method sections + for method_id, title, description, checkbox, is_expanded in methods: + section = self._build_detection_method_section( + method_id, title, description, checkbox, is_expanded + ) + self.detection_methods_container.controls.append(section) + + def _get_method_settings(self, method_id: str): + """Get detailed settings for each detection method.""" + if method_id == "column_name": + # Create value display text + self.fuzzy_value_text = ft.Text( + "0.8 (80%)", size=IPATypography.BODY_SMALL, color=IPAColors.CHARCOAL + ) + + # Create fuzzy threshold slider (initially enabled based on default "fuzzy" value) + self.fuzzy_threshold = ft.Slider( + label="Fuzzy Match Threshold", + value=0.8, + min=0.5, + max=1.0, + divisions=10, + width=200, + disabled=False, # Enabled by default since "fuzzy" is selected + ) + + def on_fuzzy_threshold_change(e): + value = e.control.value + self.fuzzy_value_text.value = f"{value:.1f} ({value * 100:.0f}%)" + self.page.update() + + self.fuzzy_threshold.on_change = on_fuzzy_threshold_change + + def on_matching_type_change(e): + # Enable/disable fuzzy threshold based on matching type + matching_type = e.control.value + self.fuzzy_threshold.disabled = matching_type == "strict" + if matching_type == "strict": + self.fuzzy_value_text.value = "N/A (Strict mode)" + else: + self.fuzzy_value_text.value = f"{self.fuzzy_threshold.value:.1f} ({self.fuzzy_threshold.value * 100:.0f}%)" + self.page.update() + + self.matching_dropdown = ft.Dropdown( + label="Matching Type", + value="fuzzy", + options=[ + ft.dropdown.Option("strict", "Strict (Exact matches only)"), + ft.dropdown.Option("fuzzy", "Fuzzy (Allow similar terms)"), + ft.dropdown.Option("both", "Both (Strict + Fuzzy)"), + ], + width=250, + on_change=on_matching_type_change, + ) + + return ft.Column( + [ + ft.Text( + "Matching Settings:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + self.matching_dropdown, + ft.Row( + [ + self.fuzzy_threshold, + self.fuzzy_value_text, + ], + alignment=ft.MainAxisAlignment.START, + spacing=IPASpacing.SM, + ), + ], + spacing=IPASpacing.XS, + ) + + elif method_id == "format_pattern": + self.format_confidence_value_text = ft.Text( + "0.7 (70%)", size=IPATypography.BODY_SMALL, color=IPAColors.CHARCOAL + ) + + self.format_confidence_slider = ft.Slider( + label="Detection Confidence", + value=0.7, + min=0.5, + max=1.0, + divisions=10, + width=200, + ) + + def on_confidence_change(e): + value = e.control.value + self.format_confidence_value_text.value = ( + f"{value:.1f} ({value * 100:.0f}%)" + ) + self.page.update() + + self.format_confidence_slider.on_change = on_confidence_change + + # Store checkboxes as instance variables + self.phone_checkbox = ft.Checkbox(label="Phone Numbers", value=True) + self.email_checkbox = ft.Checkbox(label="Email Addresses", value=True) + self.ssn_checkbox = ft.Checkbox(label="Social Security Numbers", value=True) + self.date_checkbox = ft.Checkbox(label="Date Formats", value=True) + + return ft.Column( + [ + ft.Text( + "Pattern Types:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + ft.Row( + [ + self.phone_checkbox, + self.email_checkbox, + ], + spacing=IPASpacing.SM, + ), + ft.Row( + [ + self.ssn_checkbox, + self.date_checkbox, + ], + spacing=IPASpacing.SM, + ), + ft.Row( + [ + self.format_confidence_slider, + self.format_confidence_value_text, + ], + alignment=ft.MainAxisAlignment.START, + spacing=IPASpacing.SM, + ), + ], + spacing=IPASpacing.XS, + ) + + elif method_id == "sparsity": + self.uniqueness_value_text = ft.Text( + "0.8 (80%)", size=IPATypography.BODY_SMALL, color=IPAColors.CHARCOAL + ) + self.min_entries_value_text = ft.Text( + "10 entries", size=IPATypography.BODY_SMALL, color=IPAColors.CHARCOAL + ) + + self.uniqueness_slider = ft.Slider( + label="Uniqueness Threshold", + value=0.8, + min=0.5, + max=1.0, + divisions=10, + width=200, + ) + + self.min_entries_slider = ft.Slider( + label="Minimum Entries Required", + value=10, + min=5, + max=100, + divisions=19, + width=200, + ) + + def on_uniqueness_change(e): + value = e.control.value + self.uniqueness_value_text.value = f"{value:.1f} ({value * 100:.0f}%)" + self.page.update() + + def on_min_entries_change(e): + value = int(e.control.value) + self.min_entries_value_text.value = f"{value} entries" + self.page.update() + + self.uniqueness_slider.on_change = on_uniqueness_change + self.min_entries_slider.on_change = on_min_entries_change + + return ft.Column( + [ + ft.Text( + "Sparsity Thresholds:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + ft.Row( + [ + self.uniqueness_slider, + self.uniqueness_value_text, + ], + alignment=ft.MainAxisAlignment.START, + spacing=IPASpacing.SM, + ), + ft.Row( + [ + self.min_entries_slider, + self.min_entries_value_text, + ], + alignment=ft.MainAxisAlignment.START, + spacing=IPASpacing.SM, + ), + ], + spacing=IPASpacing.XS, + ) + + elif method_id == "location": + self.population_value_text = ft.Text( + "50,000 people", size=IPATypography.BODY_SMALL, color=IPAColors.CHARCOAL + ) + + self.population_slider = ft.Slider( + label="Small Population Threshold", + value=50000, + min=1000, + max=100000, + divisions=99, + width=200, + ) + + def on_population_change(e): + value = int(e.control.value) + self.population_value_text.value = f"{value:,} people" + self.page.update() + + self.population_slider.on_change = on_population_change + + # API Key status display - check if API key is configured + has_api_key = bool(self.state_manager.state.geonames_api_key) + + if has_api_key: + api_status = ft.Container( + content=ft.Row( + [ + ft.Icon( + ft.Icons.CHECK_CIRCLE, color=IPAColors.SUCCESS, size=16 + ), + ft.Text( + "API Key configured", + size=IPATypography.BODY_SMALL, + color=IPAColors.SUCCESS, + ), + ], + spacing=IPASpacing.XS, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.SUCCESS + "20", # 20% opacity + border_radius=IPASpacing.RADIUS_SM, + ) + else: + api_status = ft.Container( + content=ft.Row( + [ + ft.Icon( + ft.Icons.WARNING, color=IPAColors.RED_ORANGE, size=16 + ), + ft.Text( + "No API Key configured", + size=IPATypography.BODY_SMALL, + color=IPAColors.RED_ORANGE, + ), + ], + spacing=IPASpacing.XS, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.RED_ORANGE + "20", # 20% opacity + border_radius=IPASpacing.RADIUS_SM, + ) + + # Information about getting API key with conditional button + if has_api_key: + # Show update option when API key exists + api_info = ft.Container( + content=ft.Column( + [ + ft.Text( + "API Key Information:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + ft.Text( + "✅ GeoNames API key is configured", + size=IPATypography.BODY_SMALL, + ), + ft.Text( + "• Free accounts: 1,000 requests/hour", + size=IPATypography.BODY_SMALL, + ), + # Update API Key button + ft.Container( + content=ft.ElevatedButton( + text="Update API Key", + icon=ft.Icons.EDIT, + on_click=self._handle_add_api_key, + style=ft.ButtonStyle( + bgcolor=IPAColors.DARK_BLUE, + color=IPAColors.WHITE, + ), + ), + margin=ft.margin.only(top=IPASpacing.SM), + ), + ], + spacing=IPASpacing.XS, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.LIGHT_GREY, + border_radius=IPASpacing.RADIUS_SM, + ) + else: + # Show setup option when no API key + api_info = ft.Container( + content=ft.Column( + [ + ft.Text( + "API Key Information:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + ft.Text( + "• Register at: https://www.geonames.org/login", + size=IPATypography.BODY_SMALL, + ), + ft.Text( + "• Free account allows 1,000 requests/hour", + size=IPATypography.BODY_SMALL, + ), + ft.Text( + "• Your username is your API key", + size=IPATypography.BODY_SMALL, + ), + # Add API Key button + ft.Container( + content=ft.ElevatedButton( + text="Add API Key", + icon=ft.Icons.KEY, + on_click=self._handle_add_api_key, + style=ft.ButtonStyle( + bgcolor=IPAColors.IPA_GREEN, + color=IPAColors.WHITE, + ), + ), + margin=ft.margin.only(top=IPASpacing.SM), + ), + ], + spacing=IPASpacing.XS, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.LIGHT_GREY, + border_radius=IPASpacing.RADIUS_SM, + ) + + return ft.Column( + [ + ft.Text( + "Population Lookup:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + api_status, + api_info, + ft.Row( + [ + self.population_slider, + self.population_value_text, + ], + alignment=ft.MainAxisAlignment.START, + spacing=IPASpacing.SM, + ), + ], + spacing=IPASpacing.XS, + ) + + elif method_id == "presidio": + self.presidio_confidence_value_text = ft.Text( + "0.8 (80%)", size=IPATypography.BODY_SMALL, color=IPAColors.CHARCOAL + ) + + self.presidio_confidence_slider = ft.Slider( + label="Confidence Threshold", + value=0.8, + min=0.5, + max=1.0, + divisions=10, + width=200, + ) + + def on_presidio_confidence_change(e): + value = e.control.value + self.presidio_confidence_value_text.value = ( + f"{value:.1f} ({value * 100:.0f}%)" + ) + self.page.update() + + self.presidio_confidence_slider.on_change = on_presidio_confidence_change + + # Store language dropdown as instance variable + self.presidio_language_dropdown = ft.Dropdown( + label="Language Model", + value="en_core_web_sm", + options=[ + ft.dropdown.Option("en_core_web_sm", "English (Small)"), + ft.dropdown.Option("en_core_web_md", "English (Medium)"), + ft.dropdown.Option("en_core_web_lg", "English (Large)"), + ], + width=250, + ) + + # Store Presidio entity checkboxes + self.presidio_person_checkbox = ft.Checkbox( + label="Person Names", value=True + ) + self.presidio_org_checkbox = ft.Checkbox(label="Organizations", value=True) + + return ft.Column( + [ + ft.Text( + "Presidio Settings:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + self.presidio_language_dropdown, + ft.Row( + [ + self.presidio_confidence_slider, + self.presidio_confidence_value_text, + ], + alignment=ft.MainAxisAlignment.START, + spacing=IPASpacing.SM, + ), + ft.Row( + [ + self.presidio_person_checkbox, + self.presidio_org_checkbox, + ], + spacing=IPASpacing.SM, + ), + ], + spacing=IPASpacing.XS, + ) + + return ft.Text( + "No additional settings", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ) + + def _set_preset(self, preset_type: str): + """Set detection method presets.""" + # Update preset button styles + if preset_type == "quick": + self.preset_quick.style.bgcolor = IPAColors.IPA_GREEN + self.preset_quick.style.color = IPAColors.WHITE + self.preset_balanced.style.bgcolor = IPAColors.LIGHT_GREY + self.preset_balanced.style.color = IPAColors.CHARCOAL + self.preset_thorough.style.bgcolor = IPAColors.LIGHT_GREY + self.preset_thorough.style.color = IPAColors.CHARCOAL + + # Quick preset: Enable only basic methods + self.column_name_check.value = True + self.format_pattern_check.value = True + self.sparsity_check.value = False + self.location_check.value = False + self.presidio_check.value = False + + elif preset_type == "balanced": + self.preset_balanced.style.bgcolor = IPAColors.IPA_GREEN + self.preset_balanced.style.color = IPAColors.WHITE + self.preset_quick.style.bgcolor = IPAColors.LIGHT_GREY + self.preset_quick.style.color = IPAColors.CHARCOAL + self.preset_thorough.style.bgcolor = IPAColors.LIGHT_GREY + self.preset_thorough.style.color = IPAColors.CHARCOAL + + # Balanced preset: Enable most methods + self.column_name_check.value = True + self.format_pattern_check.value = True + self.sparsity_check.value = True + self.location_check.value = False + self.presidio_check.value = False + + elif preset_type == "thorough": + self.preset_thorough.style.bgcolor = IPAColors.IPA_GREEN + self.preset_thorough.style.color = IPAColors.WHITE + self.preset_quick.style.bgcolor = IPAColors.LIGHT_GREY + self.preset_quick.style.color = IPAColors.CHARCOAL + self.preset_balanced.style.bgcolor = IPAColors.LIGHT_GREY + self.preset_balanced.style.color = IPAColors.CHARCOAL + + # Thorough preset: Enable all methods + self.column_name_check.value = True + self.format_pattern_check.value = True + self.sparsity_check.value = True + self.location_check.value = True + self.presidio_check.value = True + + self.page.update() + + def _handle_start_analysis(self, e): + """Handle start analysis button click.""" + # Validate that at least one detection method is selected + detection_methods_enabled = [ + self.column_name_check.value, + self.format_pattern_check.value, + self.sparsity_check.value, + self.location_check.value, + self.presidio_check.value, + ] + + if not any(detection_methods_enabled): + # Show error dialog + def close_dialog(e): + dialog.open = False + self.page.update() + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("No Detection Methods Selected", color=IPAColors.ERROR), + content=ft.Column( + [ + ft.Icon(ft.Icons.WARNING, color=IPAColors.ERROR, size=48), + ft.Text( + "Please select at least one PII detection method before starting the analysis.", + text_align=ft.TextAlign.CENTER, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + tight=True, + ), + actions=[ + ft.TextButton("OK", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(dialog) + return + + # Collect configuration and save to state + config = DetectionConfig( + # Detection method enabled/disabled states + column_name_enabled=self.column_name_check.value, + format_pattern_enabled=self.format_pattern_check.value, + sparsity_enabled=self.sparsity_check.value, + location_population_enabled=self.location_check.value, + ai_text_enabled=self.presidio_check.value, + # Column Name Detection settings + fuzzy_match_threshold=self.fuzzy_threshold.value + if self.fuzzy_threshold + else 0.8, + matching_type=self.matching_dropdown.value + if self.matching_dropdown + else "fuzzy", + # Format Pattern Detection settings + format_confidence_threshold=self.format_confidence_slider.value + if self.format_confidence_slider + else 0.7, + detect_phone=self.phone_checkbox.value if self.phone_checkbox else True, + detect_email=self.email_checkbox.value if self.email_checkbox else True, + detect_ssn=self.ssn_checkbox.value if self.ssn_checkbox else True, + detect_dates=self.date_checkbox.value if self.date_checkbox else True, + # Sparsity Analysis settings + sparsity_threshold=self.uniqueness_slider.value + if self.uniqueness_slider + else 0.8, + min_entries_required=int(self.min_entries_slider.value) + if self.min_entries_slider + else 10, + # Location Population settings + population_threshold=int(self.population_slider.value) + if self.population_slider + else 50000, + # Presidio settings + presidio_confidence_threshold=self.presidio_confidence_slider.value + if self.presidio_confidence_slider + else 0.8, + presidio_language_model=self.presidio_language_dropdown.value + if self.presidio_language_dropdown + else "en_core_web_sm", + presidio_detect_person=self.presidio_person_checkbox.value + if self.presidio_person_checkbox + else True, + presidio_detect_org=self.presidio_org_checkbox.value + if self.presidio_org_checkbox + else True, + ) + + # Save configuration to state + self.state_manager.update_state( + detection_config=config, + ) + + # Show success message and navigate + self.state_manager.add_success_message( + "Configuration saved. Starting analysis..." + ) + self.state_manager.navigate_to(AppConstants.SCREEN_PROGRESS) + + def _handle_add_api_key(self, e): + """Handle Add API Key button click in Location Population section.""" + # Show API key input dialog + api_key_field = ft.TextField( + label="GeoNames API Key", + hint_text="Enter your GeoNames username", + width=350, + value="", + ) + + def close_api_dialog(e=None): + dialog.open = False + self.page.update() + + def save_api_key(e): + api_key = api_key_field.value.strip() + if api_key: + # Save API key securely (don't log the actual key!) + # In real app, this would be stored securely in keychain/credential store + self.state_manager.update_state(geonames_api_key=api_key) + + # Show success message + self.state_manager.add_success_message("API key saved successfully!") + close_api_dialog() + + # Rebuild the detection methods to show updated status + self._build_all_detection_methods() + self.page.update() + else: + self.state_manager.add_error_message("Please enter a valid API key") + + def test_api_key(e): + api_key = api_key_field.value.strip() + if not api_key: + self.state_manager.add_error_message("Please enter an API key first") + return + + # Test the API key with a simple GeoNames request + import threading + + import requests + + def test_api(): + try: + # Simple test query to GeoNames API + test_url = f"http://api.geonames.org/searchJSON?q=London&maxRows=1&username={api_key}" + response = requests.get(test_url, timeout=10) + + if response.status_code == 200: + data = response.json() + if "geonames" in data and len(data["geonames"]) > 0: + # API key works + self.state_manager.add_success_message( + "✅ API key is valid and working!" + ) + else: + self.state_manager.add_error_message( + "API key may be invalid - no results returned" + ) + else: + self.state_manager.add_error_message( + f"API test failed - HTTP {response.status_code}" + ) + + except requests.exceptions.Timeout: + self.state_manager.add_error_message( + "API test timeout - please check internet connection" + ) + except requests.exceptions.RequestException: + self.state_manager.add_error_message( + "API test failed - network error" + ) + except Exception: + self.state_manager.add_error_message( + "API test failed - unexpected error" + ) + + # Show testing message and run test + self.state_manager.add_success_message("Testing API key...") + threading.Thread(target=test_api, daemon=True).start() + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Configure GeoNames API Key", color=IPAColors.DARK_BLUE), + content=ft.Container( + content=ft.Column( + [ + ft.Text( + "Location Population Check Configuration", + size=IPATypography.BODY_REGULAR, + weight=ft.FontWeight.W_500, + ), + ft.Text( + "Required for identifying small communities by population size", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ), + api_key_field, + ft.Container( + content=ft.Column( + [ + ft.Text( + "How to get your API key:", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_500, + ), + ft.Text( + "1. Register at: https://www.geonames.org/login", + size=IPATypography.BODY_SMALL, + ), + ft.Text( + "2. Your username is your API key", + size=IPATypography.BODY_SMALL, + ), + ft.Text( + "3. Free accounts: 1,000 requests/hour", + size=IPATypography.BODY_SMALL, + ), + ], + spacing=4, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.LIGHT_GREY, + border_radius=IPASpacing.RADIUS_SM, + margin=ft.margin.symmetric(vertical=IPASpacing.SM), + ), + ], + spacing=IPASpacing.SM, + ), + width=450, + height=300, + ), + actions=[ + ft.TextButton("Cancel", on_click=close_api_dialog), + ft.TextButton("Test", on_click=test_api_key), + ft.ElevatedButton( + "Save", + on_click=save_api_key, + style=ft.ButtonStyle( + bgcolor=IPAColors.IPA_GREEN, + color=IPAColors.WHITE, + ), + ), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(dialog) + + def on_state_changed(self, state: AppState): + """Handle state changes.""" + pass diff --git a/src/pii_detector/gui/flet_app/ui/screens/dashboard.py b/src/pii_detector/gui/flet_app/ui/screens/dashboard.py new file mode 100644 index 0000000..73cac89 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/screens/dashboard.py @@ -0,0 +1,189 @@ +"""Dashboard screen - main landing page with quick actions.""" + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import ( + AppConstants, + IPAColors, + IPASpacing, + IPATypography, +) +from pii_detector.gui.flet_app.config.settings import AppState +from pii_detector.gui.flet_app.ui.components.cards import ( + create_action_card, + create_status_card, +) + + +class DashboardScreen: + """Dashboard screen implementation.""" + + def __init__(self, page: ft.Page, state_manager): + """Initialize the dashboard screen. + + Args: + page: Flet page instance + state_manager: Application state manager + + """ + self.page = page + self.state_manager = state_manager + self._screen_name = AppConstants.SCREEN_DASHBOARD + + def build(self) -> ft.Container: + """Build the dashboard screen.""" + return ft.Container( + content=ft.Column( + [ + # Quick Actions Section + self._build_quick_actions(), + # System Status Section + self._build_system_status(), + ], + spacing=IPASpacing.XL, + scroll=ft.ScrollMode.AUTO, + ), + padding=IPASpacing.XL, + expand=True, + ) + + def _build_quick_actions(self) -> ft.Container: + """Build the quick actions grid.""" + return ft.Container( + content=ft.Column( + [ + # Section title + ft.Text( + "Quick Actions", + size=IPATypography.HEADER_2, + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ), + # Action cards grid + ft.Row( + [ + create_action_card( + title="Single Analysis", + description="Analyze one dataset file for PII detection and anonymization", + icon=ft.Icons.DESCRIPTION, + on_click_handler=self._handle_single_analysis, + ), + create_action_card( + title="Batch Process", + description="Process multiple dataset files in parallel for efficient workflow", + icon=ft.Icons.BAR_CHART, + on_click_handler=self._handle_batch_process, + enabled=False, # Feature disabled - focusing on single analysis + ), + create_action_card( + title="Recent Projects", + description="View and reopen previously analyzed datasets and results", + icon=ft.Icons.HISTORY, + on_click_handler=self._handle_recent_projects, + enabled=False, # Feature not implemented yet + ), + ], + alignment=ft.MainAxisAlignment.CENTER, + spacing=IPASpacing.LG, + ), + ], + spacing=IPASpacing.MD, + ), + ) + + def _build_system_status(self) -> ft.Container: + """Build the system status panel.""" + return ft.Container( + content=ft.Column( + [ + # Section title + ft.Text( + "System Status", + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ), + # Status indicators + ft.Column( + [ + create_status_card( + title="Detection Methods", + status="Active", + details="Standard and AI detection available", + status_color=IPAColors.SUCCESS, + ), + create_status_card( + title="Last Processing", + status="Idle", + details="No recent processing activity", + status_color=IPAColors.DARK_GREY, + ), + create_status_card( + title="Performance", + status="Optimal", + details="All systems running normally", + status_color=IPAColors.SUCCESS, + ), + ], + spacing=IPASpacing.SM, + ), + ], + spacing=IPASpacing.MD, + ), + padding=IPASpacing.MD, + bgcolor=IPAColors.WHITE, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_MD, + ) + + def _handle_single_analysis(self, e): + """Handle single analysis action.""" + # print("DEBUG: Single analysis clicked!") # Debug output + # Clear any previous file selections for single analysis + self.state_manager.update_state(selected_files=[]) + # print("DEBUG: About to navigate to file selection") # Debug output + self.state_manager.navigate_to(AppConstants.SCREEN_FILE_SELECTION) + # print("DEBUG: Navigation complete") # Debug output + + def _handle_batch_process(self, e): + """Handle batch process action.""" + # Clear any previous file selections for batch processing + self.state_manager.update_state(selected_files=[]) + # Navigate to file selection with batch mode indication + # For now, use the same file selection screen + self.state_manager.navigate_to(AppConstants.SCREEN_FILE_SELECTION) + + def _handle_recent_projects(self, e): + """Handle recent projects action.""" + + # Placeholder for recent projects functionality + def close_dialog(e): + dialog.open = False + self.page.update() + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Recent Projects"), + content=ft.Text( + "Recent projects functionality is coming soon. This will show your previously analyzed datasets and allow you to reopen results." + ), + actions=[ + ft.TextButton("Close", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.dialog = dialog + dialog.open = True + self.page.update() + + def on_state_changed(self, state: AppState): + """Handle state changes. + + Args: + state: Updated application state + + """ + # Dashboard doesn't need to react to most state changes + # but we could update the status panel based on recent activity + pass diff --git a/src/pii_detector/gui/flet_app/ui/screens/file_selection.py b/src/pii_detector/gui/flet_app/ui/screens/file_selection.py new file mode 100644 index 0000000..d514d47 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/screens/file_selection.py @@ -0,0 +1,547 @@ +"""File selection screen for choosing dataset files.""" + +from pathlib import Path + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import ( + AppConstants, + IPAColors, + IPASpacing, + IPATypography, +) +from pii_detector.gui.flet_app.config.settings import ( + AppState, + FileInfo, + ValidationResult, +) +from pii_detector.gui.flet_app.ui.components.buttons import ( + create_primary_button, + create_secondary_button, +) + + +class FileSelectionScreen: + """File selection screen implementation.""" + + def __init__(self, page: ft.Page, state_manager): + """Initialize the file selection screen. + + Args: + page: Flet page instance + state_manager: Application state manager + + """ + self.page = page + self.state_manager = state_manager + self._screen_name = AppConstants.SCREEN_FILE_SELECTION + + # File picker + self.file_picker = ft.FilePicker(on_result=self._handle_file_picker_result) + + # Add file picker to page overlay if not already added + if self.file_picker not in self.page.overlay: + self.page.overlay.append(self.file_picker) + + # UI components that need updating + self.file_list_container = None + self.next_button = None + self.drop_zone = None + + def build(self) -> ft.Container: + """Build the file selection screen.""" + container = ft.Container( + content=ft.Column( + [ + # Title and description + self._build_header(), + # User feedback messages + self._build_messages(), + # File drop zone + self._build_drop_zone(), + # Selected files list + self._build_selected_files_section(), + # Action buttons + self._build_action_buttons(), + ], + spacing=IPASpacing.LG, + scroll=ft.ScrollMode.AUTO, + ), + padding=IPASpacing.XL, + expand=True, + ) + + # Update file list display now that UI components are built + self._update_file_list_display() + + return container + + def _build_messages(self) -> ft.Container: + """Build user feedback messages.""" + messages = [] + + # Error messages + for error in self.state_manager.state.error_messages: + messages.append( + ft.Container( + content=ft.Row( + [ + ft.Icon(ft.Icons.ERROR, color=IPAColors.ERROR, size=16), + ft.Text( + error, + color=IPAColors.ERROR, + size=IPATypography.BODY_SMALL, + ), + ] + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.ERROR + "20", # 20% opacity + border_radius=IPASpacing.RADIUS_SM, + ) + ) + + # Success messages + for success in self.state_manager.state.success_messages: + messages.append( + ft.Container( + content=ft.Row( + [ + ft.Icon( + ft.Icons.CHECK_CIRCLE, color=IPAColors.SUCCESS, size=16 + ), + ft.Text( + success, + color=IPAColors.SUCCESS, + size=IPATypography.BODY_SMALL, + ), + ] + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.SUCCESS + "20", # 20% opacity + border_radius=IPASpacing.RADIUS_SM, + ) + ) + + return ft.Container( + content=ft.Column(messages, spacing=IPASpacing.XS), + visible=len(messages) > 0, + ) + + def _build_header(self) -> ft.Column: + """Build the screen header.""" + return ft.Column( + [ + ft.Text( + "Select Dataset Files", + size=IPATypography.HEADER_2, + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ), + ft.Text( + "Choose the dataset files you want to analyze for PII. You can select single or multiple files.", + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + ), + ], + spacing=IPASpacing.SM, + ) + + def _build_drop_zone(self) -> ft.Container: + """Build the file selection zone.""" + self.drop_zone = ft.Container( + content=ft.Column( + [ + ft.Icon( + ft.Icons.FOLDER_OPEN, + size=48, + color=IPAColors.IPA_GREEN, + ), + ft.Text( + "Click to browse files", + size=IPATypography.HEADER_3, + weight=ft.FontWeight.W_500, + color=IPAColors.CHARCOAL, + text_align=ft.TextAlign.CENTER, + ), + ft.Text( + f"Supports: {', '.join(AppConstants.SUPPORTED_FORMATS)} (max {AppConstants.MAX_FILE_SIZE_MB}MB each)", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=IPASpacing.SM, + ), + height=200, + padding=IPASpacing.XL, + border=ft.border.all(3, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_LG, + bgcolor=IPAColors.LIGHT_GREY, + alignment=ft.alignment.center, + on_click=self._handle_browse_click, + ) + + return self.drop_zone + + def _build_selected_files_section(self) -> ft.Column: + """Build the selected files section.""" + # Create the container that will hold the file list + self.file_list_container = ft.Container( + content=ft.Text( + "No files selected", + size=IPATypography.BODY_REGULAR, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + ), + padding=IPASpacing.MD, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_MD, + bgcolor=IPAColors.WHITE, + alignment=ft.alignment.center, + ) + + return ft.Column( + [ + ft.Text( + "Selected Files:", + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ), + self.file_list_container, + ], + spacing=IPASpacing.SM, + ) + + def _build_action_buttons(self) -> ft.Column: + """Build the action buttons.""" + self.next_button = create_primary_button( + text="Next: Configure Analysis", + on_click=self._handle_next_click, + icon=ft.Icons.ARROW_FORWARD, + disabled=len(self.state_manager.state.selected_files) == 0, + ) + + return ft.Column( + [ + # Demo data button + ft.Row( + [ + create_secondary_button( + text="Load Demo Data", + on_click=self._handle_load_demo, + icon=ft.Icons.DATASET, + ), + ], + alignment=ft.MainAxisAlignment.CENTER, + ), + # Main action buttons + ft.Row( + [ + create_secondary_button( + text="Clear All", + on_click=self._handle_clear_all, + icon=ft.Icons.CLEAR, + disabled=len(self.state_manager.state.selected_files) == 0, + ), + create_secondary_button( + text="Add More Files", + on_click=self._handle_browse_click, + icon=ft.Icons.ADD, + ), + self.next_button, + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + ], + spacing=IPASpacing.SM, + ) + + def _handle_browse_click(self, e): + """Handle browse button click.""" + self.file_picker.pick_files( + dialog_title="Select dataset files", + file_type=ft.FilePickerFileType.CUSTOM, + allowed_extensions=["csv", "xlsx", "xls", "dta"], + allow_multiple=True, + ) + + def _handle_file_picker_result(self, e: ft.FilePickerResultEvent): + """Handle file picker result.""" + if e.files: + new_files = [] + for file in e.files: + file_info = self._create_file_info(Path(file.path)) + validation_result = self._validate_file(file_info) + + new_files.append(file_info) + self.state_manager.state.file_validation_results[file.path] = ( + validation_result + ) + + # Add to selected files (avoiding duplicates) + existing_paths = {f.path for f in self.state_manager.state.selected_files} + filtered_new_files = [f for f in new_files if f.path not in existing_paths] + + if filtered_new_files: + updated_files = ( + self.state_manager.state.selected_files + filtered_new_files + ) + self.state_manager.update_state(selected_files=updated_files) + + # Show success message + self.state_manager.add_success_message( + f"Added {len(filtered_new_files)} file(s) to selection" + ) + else: + self.state_manager.add_error_message( + "All selected files are already in the list" + ) + + def _create_file_info(self, file_path: Path) -> FileInfo: + """Create FileInfo object from path.""" + try: + size_mb = file_path.stat().st_size / (1024 * 1024) # Convert to MB + return FileInfo( + path=file_path, + name=file_path.name, + size_mb=round(size_mb, 2), + format=file_path.suffix.lower(), + is_valid=True, + ) + except Exception as e: + return FileInfo( + path=file_path, + name=file_path.name, + size_mb=0, + format=file_path.suffix.lower(), + is_valid=False, + validation_message=f"Error reading file: {str(e)}", + ) + + def _validate_file(self, file_info: FileInfo) -> ValidationResult: + """Validate a selected file.""" + # Check file format + if file_info.format not in AppConstants.SUPPORTED_FORMATS: + return ValidationResult( + is_valid=False, + message=f"Unsupported format: {file_info.format}", + details={"supported_formats": AppConstants.SUPPORTED_FORMATS}, + ) + + # Check file size + if file_info.size_mb > AppConstants.MAX_FILE_SIZE_MB: + return ValidationResult( + is_valid=False, + message=f"File too large: {file_info.size_mb}MB (max: {AppConstants.MAX_FILE_SIZE_MB}MB)", + details={"max_size_mb": AppConstants.MAX_FILE_SIZE_MB}, + ) + + # Check if file exists and is readable + if not file_info.path.exists(): + return ValidationResult( + is_valid=False, + message="File does not exist", + ) + + if not file_info.path.is_file(): + return ValidationResult( + is_valid=False, + message="Path is not a file", + ) + + return ValidationResult( + is_valid=True, + message="File is valid", + ) + + def _update_file_list_display(self): + """Update the file list display.""" + # Only update if the UI components have been built + if not self.file_list_container or not self.next_button: + return + + files = self.state_manager.state.selected_files + + if not files: + # Double check that container still exists + if self.file_list_container: + self.file_list_container.content = ft.Text( + "No files selected", + size=IPATypography.BODY_REGULAR, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + ) + self.file_list_container.alignment = ft.alignment.center + else: + # Create file list items + file_items = [] + for i, file_info in enumerate(files): + validation = self.state_manager.state.file_validation_results.get( + str(file_info.path), + ValidationResult(is_valid=True, message="Valid"), + ) + + # Status icon and color + if validation.is_valid: + status_icon = ft.Icons.CHECK_CIRCLE + status_color = IPAColors.SUCCESS + status_text = "Valid" + else: + status_icon = ft.Icons.ERROR + status_color = IPAColors.ERROR + status_text = validation.message + + file_item = ft.Container( + content=ft.Row( + [ + ft.Icon( + status_icon, + size=20, + color=status_color, + ), + ft.Column( + [ + ft.Text( + file_info.name, + size=IPATypography.BODY_REGULAR, + weight=ft.FontWeight.W_500, + color=IPAColors.CHARCOAL, + ), + ft.Text( + f"{file_info.size_mb}MB • {file_info.format.upper()} • {status_text}", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY + if validation.is_valid + else status_color, + ), + ], + expand=True, + spacing=2, + ), + ft.IconButton( + icon=ft.Icons.CLOSE, + tooltip="Remove file", + icon_color=IPAColors.RED_ORANGE, + on_click=lambda e, idx=i: self._remove_file(idx), + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + padding=ft.padding.symmetric( + horizontal=IPASpacing.SM, vertical=IPASpacing.XS + ), + border=ft.border.only(bottom=ft.BorderSide(1, IPAColors.LIGHT_GREY)) + if i < len(files) - 1 + else None, + ) + + file_items.append(file_item) + + if self.file_list_container: + self.file_list_container.content = ft.Column( + file_items, + spacing=0, + tight=True, + ) + self.file_list_container.alignment = None + + # Update next button state + if self.next_button: + valid_files = [ + f + for f in files + if self.state_manager.state.file_validation_results.get( + str(f.path), ValidationResult(is_valid=True, message="Valid") + ).is_valid + ] + self.next_button.disabled = len(valid_files) == 0 + + self.page.update() + + def _remove_file(self, index: int): + """Remove a file from the selection.""" + files = self.state_manager.state.selected_files.copy() + if 0 <= index < len(files): + removed_file = files.pop(index) + # Also remove from validation results + validation_results = self.state_manager.state.file_validation_results.copy() + validation_results.pop(str(removed_file.path), None) + + self.state_manager.update_state( + selected_files=files, file_validation_results=validation_results + ) + + def _handle_load_demo(self, e): + """Load demo data for testing.""" + demo_file_path = Path("src/pii_detector/data/demo_data.csv") + + if demo_file_path.exists(): + file_info = self._create_file_info(demo_file_path) + validation_result = self._validate_file(file_info) + + # Replace current selection with demo data + self.state_manager.update_state( + selected_files=[file_info], + file_validation_results={str(demo_file_path): validation_result}, + ) + + self.state_manager.add_success_message("Demo data loaded successfully!") + else: + self.state_manager.add_error_message( + "Demo data file not found. Please use 'Add More Files' to select your own dataset." + ) + + def _handle_clear_all(self, e): + """Handle clear all button click.""" + self.state_manager.update_state(selected_files=[], file_validation_results={}) + + def _handle_next_click(self, e): + """Handle next button click.""" + # Validate that we have at least one valid file + valid_files = [] + for file_info in self.state_manager.state.selected_files: + validation = self.state_manager.state.file_validation_results.get( + str(file_info.path), ValidationResult(is_valid=True, message="Valid") + ) + if validation.is_valid: + valid_files.append(file_info) + + if not valid_files: + self.state_manager.add_error_message( + "Please select at least one valid file" + ) + return + + # Navigate to configuration screen + self.state_manager.navigate_to(AppConstants.SCREEN_CONFIGURATION) + + def on_state_changed(self, state: AppState): + """Handle state changes.""" + # Update file list display when selected files change, but only if components are initialized + if self.file_list_container is not None and self.next_button is not None: + self._update_file_list_display() + + # Clear messages after a few seconds (auto-dismiss) + if state.success_messages or state.error_messages: + # Auto-clear messages after 3 seconds + import threading + + def clear_messages(): + try: + import time + + time.sleep(3) + # Only clear if we're still on this screen and the app is running + if ( + hasattr(self.state_manager, "state") + and self.state_manager.state.current_screen + == AppConstants.SCREEN_FILE_SELECTION + ): + self.state_manager.clear_messages() + except Exception: + # Ignore errors if app is shutting down or screen changed + pass + + threading.Timer(3.0, clear_messages).start() diff --git a/src/pii_detector/gui/flet_app/ui/screens/progress.py b/src/pii_detector/gui/flet_app/ui/screens/progress.py new file mode 100644 index 0000000..8e50a41 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/screens/progress.py @@ -0,0 +1,629 @@ +"""Progress screen for tracking PII detection analysis.""" + +import contextlib +import threading +import time + +import flet as ft + +from pii_detector.gui.flet_app.backend_adapter import ( + BackgroundProcessor, + PIIDetectionAdapter, +) +from pii_detector.gui.flet_app.config.constants import ( + AppConstants, + IPAColors, + IPASpacing, + IPATypography, +) +from pii_detector.gui.flet_app.config.settings import AppState +from pii_detector.gui.flet_app.ui.components.buttons import ( + create_primary_button, + create_secondary_button, +) + + +class ProgressScreen: + """Progress screen implementation for tracking analysis progress.""" + + def __init__(self, page: ft.Page, state_manager): + """Initialize the progress screen. + + Args: + page: Flet page instance + state_manager: Application state manager + + """ + self.page = page + self.state_manager = state_manager + self._screen_name = AppConstants.SCREEN_PROGRESS + + # Progress tracking + self.overall_progress_bar = None + self.current_task_text = None + self.progress_log = None + self.cancel_button = None + self.view_results_button = None + self.progress_percentage_text = None + + # Analysis state + self.is_analysis_running = False + self.analysis_cancelled = False + self.analysis_complete = False + self.current_step = 0 + self.total_steps = 0 + self.progress_messages = [] + + # Backend integration + self.adapter = PIIDetectionAdapter() + self.background_processor = BackgroundProcessor(self.adapter) + + # Progress log storage for copying + self.progress_messages = [] + + def build(self) -> ft.Container: + """Build the progress screen.""" + return ft.Container( + content=ft.Column( + [ + # Header + self._build_header(), + # Progress Section + self._build_progress_section(), + # Progress Log + self._build_progress_log(), + # Action Buttons + self._build_action_buttons(), + ], + spacing=IPASpacing.LG, + ), + padding=IPASpacing.XL, + expand=True, + ) + + def _build_header(self) -> ft.Column: + """Build the screen header.""" + return ft.Column( + [ + ft.Text( + "Analysis Progress", + size=IPATypography.HEADER_2, + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ), + ft.Text( + "Analyzing your dataset for PII detection and preparing anonymized results.", + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + ), + ], + spacing=IPASpacing.SM, + ) + + def _build_progress_section(self) -> ft.Container: + """Build the progress tracking section.""" + # Overall progress bar + self.overall_progress_bar = ft.ProgressBar( + value=0, + color=IPAColors.IPA_GREEN, + bgcolor=IPAColors.LIGHT_GREY, + height=8, + ) + + # Current task indicator + self.current_task_text = ft.Text( + "Preparing analysis...", + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + ) + + # Progress percentage text + self.progress_percentage_text = ft.Text( + "0%", + size=IPATypography.BODY_REGULAR, + color=IPAColors.DARK_GREY, + ) + + return ft.Container( + content=ft.Column( + [ + ft.Text( + "Overall Progress", + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ), + self.overall_progress_bar, + ft.Row( + [ + self.current_task_text, + self.progress_percentage_text, + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + ], + spacing=IPASpacing.SM, + ), + padding=IPASpacing.MD, + bgcolor=IPAColors.WHITE, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_MD, + ) + + def _build_progress_log(self) -> ft.Container: + """Build the progress log section.""" + self.progress_log = ft.Column( + [ + ft.Text( + "Ready to start analysis", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ), + ], + spacing=IPASpacing.XS, + scroll=ft.ScrollMode.AUTO, + ) + + # Add initial message to progress messages list + self.progress_messages = ["Ready to start analysis"] + + return ft.Container( + content=ft.Column( + [ + ft.Row( + [ + ft.Text( + "Progress Log", + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ), + ft.IconButton( + icon=ft.Icons.COPY, + tooltip="Copy progress log to clipboard", + icon_size=20, + on_click=self._handle_copy_log, + style=ft.ButtonStyle( + color=IPAColors.DARK_BLUE, + ), + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + ft.Container( + content=self.progress_log, + height=200, + padding=IPASpacing.SM, + bgcolor=IPAColors.WHITE, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_SM, + ), + ], + spacing=IPASpacing.SM, + ), + ) + + def _build_action_buttons(self) -> ft.Row: + """Build the action buttons.""" + self.cancel_button = create_secondary_button( + text="Cancel Analysis", + on_click=self._handle_cancel_analysis, + icon=ft.Icons.CANCEL, + disabled=False, + ) + + self.view_results_button = create_primary_button( + text="View Results", + on_click=self._handle_view_results, + icon=ft.Icons.VISIBILITY, + disabled=True, + ) + + return ft.Row( + [ + create_secondary_button( + text="Back to Configuration", + on_click=self._handle_back_to_config, + icon=ft.Icons.ARROW_BACK, + disabled=False, + ), + self.cancel_button, + self.view_results_button, + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ) + + def _handle_cancel_analysis(self, e): + """Handle analysis cancellation.""" + if self.is_analysis_running: + self.analysis_cancelled = True + self._add_log_message("Analysis cancelled by user", IPAColors.WARNING) + self._update_current_task("Cancelling analysis...") + if self.cancel_button: + self.cancel_button.disabled = True + with contextlib.suppress(Exception): + # Ignore if page is no longer available + self.page.update() + + # Cancel the real backend processing + self.background_processor.cancel_analysis() + threading.Thread( + target=self._handle_cancellation_cleanup, daemon=True + ).start() + + def _handle_view_results(self, e): + """Handle view results button click.""" + if self.analysis_complete: + # Navigate to results screen + self.state_manager.add_success_message("Analysis completed successfully!") + self.state_manager.navigate_to(AppConstants.SCREEN_RESULTS) + + def _handle_back_to_config(self, e): + """Handle back to configuration button click.""" + if not self.is_analysis_running: + self.state_manager.navigate_to(AppConstants.SCREEN_CONFIGURATION) + + def _handle_copy_log(self, e): + """Handle copying the progress log to clipboard.""" + try: + # Create a formatted log text + log_text = "\n".join(self.progress_messages) + + # Add header with timestamp and analysis info + current_time = time.strftime("%Y-%m-%d %H:%M:%S") + header = f"PII Detection Analysis Progress Log\nGenerated: {current_time}\n{'-' * 50}\n\n" + full_text = header + log_text + + # Copy to clipboard using Flet's clipboard functionality + self.page.set_clipboard(full_text) + + # Show feedback to user + self.state_manager.add_success_message("Progress log copied to clipboard!") + + except Exception as ex: + self.state_manager.add_error_message(f"Failed to copy log: {str(ex)}") + + def _add_log_message(self, message: str, color: str = IPAColors.CHARCOAL): + """Add a message to the progress log.""" + timestamp = time.strftime("%H:%M:%S") + + # Store message with timestamp for copying + log_message_text = f"[{timestamp}] {message}" + self.progress_messages.append(log_message_text) + + # Keep only last 50 messages to prevent memory issues + if len(self.progress_messages) > 50: + self.progress_messages.pop(0) + + log_entry = ft.Row( + [ + ft.Text( + f"[{timestamp}]", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + font_family="Consolas, monospace", + ), + ft.Text( + message, + size=IPATypography.BODY_SMALL, + color=color, + expand=True, + selectable=True, # Make text selectable for manual copying + ), + ] + ) + + if self.progress_log: + self.progress_log.controls.append(log_entry) + # Keep only last 20 UI entries to prevent memory issues + if len(self.progress_log.controls) > 20: + self.progress_log.controls.pop(0) + with contextlib.suppress(Exception): + # Ignore if page is no longer available + self.page.update() + + def _update_progress(self, progress: float, task_description: str): + """Update the progress bar and current task.""" + if self.overall_progress_bar: + self.overall_progress_bar.value = progress + + if self.current_task_text: + self.current_task_text.value = task_description + + if self.progress_percentage_text: + self.progress_percentage_text.value = f"{int(progress * 100)}%" + + # Safe page update + with contextlib.suppress(Exception): + # Ignore if page is no longer available + self.page.update() + + def _update_current_task(self, task: str): + """Update just the current task text.""" + if self.current_task_text: + self.current_task_text.value = task + with contextlib.suppress(Exception): + # Ignore if page is no longer available + self.page.update() + + def start_analysis(self): + """Start the PII detection analysis.""" + print("DEBUG: start_analysis() called") + if not self.is_analysis_running: + print("DEBUG: Analysis is not running, starting new analysis") + self.is_analysis_running = True + self.analysis_cancelled = False + self.analysis_complete = False + + # Update UI only if components exist + if self.cancel_button: + self.cancel_button.disabled = False + if self.view_results_button: + self.view_results_button.disabled = True + + self._add_log_message( + "Starting PII detection analysis...", IPAColors.IPA_GREEN + ) + + # Load datasets first + self._load_datasets_and_start_analysis() + + def _load_datasets_and_start_analysis(self): + """Load datasets and start real PII analysis.""" + # print("DEBUG: _load_datasets_and_start_analysis() called") + + def load_and_analyze(): + # print("DEBUG: load_and_analyze() thread started") + try: + # Step 1: Load datasets + self._update_progress(0.1, "Loading dataset files...") + self._add_log_message("Loading dataset files...") + + files = self.state_manager.state.selected_files + if not files: + self._add_log_message( + "No files selected for analysis", IPAColors.ERROR + ) + self._update_current_task("Analysis failed - no files selected") + self.is_analysis_running = False + if self.cancel_button: + self.cancel_button.disabled = True + return + + # For now, process first file (could be extended for multiple files) + first_file = files[0] + success, message = self.adapter.load_dataset(str(first_file.path)) + + if not success: + self._add_log_message( + f"Failed to load dataset: {message}", IPAColors.ERROR + ) + self._update_current_task("Dataset loading failed") + self.is_analysis_running = False + return + + self._add_log_message(message, IPAColors.SUCCESS) + self._update_progress(0.2, "Dataset loaded successfully") + + # Step 2: Start real PII analysis + self._start_real_pii_analysis() + + except Exception as e: + self._add_log_message( + f"Error during dataset loading: {str(e)}", IPAColors.ERROR + ) + self._update_current_task("Analysis failed") + self.is_analysis_running = False + + threading.Thread(target=load_and_analyze, daemon=True).start() + + def _start_real_pii_analysis(self): + """Start the real PII detection analysis using the backend.""" + try: + + def progress_callback(progress: float, message: str): + """Handle progress updates from backend.""" + if not self.analysis_cancelled: + self._update_progress(0.2 + (0.7 * progress), message) + self._add_log_message(message) + + def completion_callback(success: bool, results): + """Handle analysis completion.""" + if self.analysis_cancelled: + return + + if success: + # Store results in state + self.state_manager.state.detection_results = results + + # Initialize smart default anonymization methods for each column + self._initialize_default_anonymization_methods(results) + + # Update UI + self.analysis_complete = True + self.is_analysis_running = False + + self._update_progress(1.0, "Analysis completed successfully!") + self._add_log_message( + f"Analysis completed! Found {len(results)} potentially sensitive columns.", + IPAColors.SUCCESS, + ) + + # Enable results button + if self.view_results_button: + self.view_results_button.disabled = False + if self.cancel_button: + self.cancel_button.disabled = True + + # Show completion notification + try: + self.page.update() + self._show_completion_notification() + except Exception: + pass + else: + self._add_log_message("Analysis failed", IPAColors.ERROR) + self._update_current_task("Analysis failed") + self.is_analysis_running = False + + # Start background processing + self.background_processor.run_analysis_async( + self.state_manager.state.detection_config, + self.state_manager.state.geonames_api_key, + progress_callback, + completion_callback, + ) + + except Exception as e: + self._add_log_message(f"Error starting analysis: {str(e)}", IPAColors.ERROR) + self._update_current_task("Analysis failed") + self.is_analysis_running = False + + def _handle_cancellation_cleanup(self): + """Handle cleanup after analysis cancellation.""" + time.sleep(1) # Brief delay to simulate cleanup + + self.is_analysis_running = False + self.analysis_cancelled = True + + self._update_current_task("Analysis cancelled") + self._add_log_message("Analysis cancelled successfully", IPAColors.WARNING) + + # Re-enable back button, keep cancel disabled + with contextlib.suppress(Exception): + # Ignore if page is no longer available + self.page.update() + + def _show_completion_notification(self): + """Show analysis completion notification.""" + + def close_dialog(e): + dialog.open = False + self.page.update() + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Analysis Complete!", color=IPAColors.SUCCESS), + content=ft.Column( + [ + ft.Icon(ft.Icons.CHECK_CIRCLE, color=IPAColors.SUCCESS, size=48), + ft.Text( + "Your dataset has been successfully analyzed for PII. The anonymized version is ready for review.", + text_align=ft.TextAlign.CENTER, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + tight=True, + ), + actions=[ + ft.TextButton( + "View Results", + on_click=lambda e: (close_dialog(e), self._handle_view_results(e)), + ), + ft.TextButton("Close", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(dialog) + + def on_state_changed(self, state: AppState): + "Handle state changes." + # Always start a new analysis if not currently running + if ( + state.current_screen == AppConstants.SCREEN_PROGRESS + and not self.is_analysis_running + ): + # Reset completed flag before starting a new run + self.analysis_complete = False + threading.Timer(0.5, self.start_analysis).start() + + def on_screen_enter(self): + """Enter this screen and reset analysis state.""" + # print("DEBUG: on_screen_enter() called") + # Reset analysis state for new analysis + self.is_analysis_running = False + self.analysis_cancelled = False + self.analysis_complete = False + print("DEBUG: Analysis state reset") + + if self.overall_progress_bar: + self.overall_progress_bar.value = 0 + + if self.current_task_text: + self.current_task_text.value = "Preparing analysis..." + + if self.progress_percentage_text: + self.progress_percentage_text.value = "0%" + + if self.progress_log: + self.progress_log.controls.clear() + self.progress_log.controls.append( + ft.Text( + "Ready to start analysis", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ) + ) + + # Reset progress messages for copying + self.progress_messages = ["Ready to start analysis"] + print("DEBUG: UI components reset") + + # Reset background processor + self.background_processor = BackgroundProcessor(self.adapter) + print("DEBUG: BackgroundProcessor recreated") + + def _initialize_default_anonymization_methods(self, results): + """Initialize smart default anonymization methods for detected PII columns. + + Args: + results: List of DetectionResult objects from analysis + + """ + default_methods = {} + + for result in results: + column_lower = result.column.lower() + pii_type_lower = result.pii_type.lower() + + # Smart defaults based on confidence, column name, and PII type + if result.confidence > 0.8: + # High confidence = high risk, default to remove + default_methods[result.column] = "remove" + elif any( + keyword in pii_type_lower + for keyword in ["email", "phone", "ssn", "credit"] + ): + # Sensitive patterns should be masked + default_methods[result.column] = "mask" + elif any( + keyword in column_lower + for keyword in ["age", "date", "year", "time", "dob", "birth"] + ): + # Date/age columns are good candidates for categorization + default_methods[result.column] = "categorize" + elif any( + keyword in column_lower + for keyword in ["location", "address", "city", "state", "zip"] + ): + # Location columns can be generalized/categorized + default_methods[result.column] = "categorize" + elif any( + keyword in column_lower + for keyword in ["income", "salary", "wage", "earnings"] + ): + # Financial data can be categorized into ranges + default_methods[result.column] = "categorize" + else: + # Default to encoding for everything else + default_methods[result.column] = "encode" + + # Update state with default methods + self.state_manager.state.column_anonymization_methods = default_methods + + self._add_log_message( + f"Initialized anonymization defaults for {len(default_methods)} columns (can be customized in Results)", + IPAColors.INFO, + ) diff --git a/src/pii_detector/gui/flet_app/ui/screens/results.py b/src/pii_detector/gui/flet_app/ui/screens/results.py new file mode 100644 index 0000000..53e38b2 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/screens/results.py @@ -0,0 +1,847 @@ +"""Results screen for displaying PII detection results.""" + +import time +from pathlib import Path + +import flet as ft + +from pii_detector.gui.flet_app.backend_adapter import PIIDetectionAdapter +from pii_detector.gui.flet_app.config.constants import ( + AppConstants, + IPAColors, + IPASpacing, + IPATypography, +) +from pii_detector.gui.flet_app.config.settings import AppState +from pii_detector.gui.flet_app.ui.components.buttons import ( + create_primary_button, + create_secondary_button, +) +from pii_detector.gui.flet_app.ui.components.cards import create_metric_card + + +class ResultsScreen: + """Results screen implementation (placeholder).""" + + def __init__(self, page: ft.Page, state_manager): + """Initialize the results screen. + + Args: + page: Flet page instance + state_manager: Application state manager + + """ + self.page = page + self.state_manager = state_manager + self._screen_name = AppConstants.SCREEN_RESULTS + + # Create adapter instance to access dataset + self.adapter = PIIDetectionAdapter() + + # Track dropdowns for each column + self.column_method_dropdowns = {} + + def build(self) -> ft.Container: + """Build the results screen.""" + return ft.Container( + content=ft.Column( + [ + # Title + ft.Text( + "PII Detection Results", + size=IPATypography.HEADER_2, + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ), + # Summary metrics + self._build_summary_metrics(), + # Results table + self._build_results_table(), + # Action buttons + ft.Row( + [ + create_secondary_button( + text="New Analysis", + on_click=lambda e: self.state_manager.navigate_to( + AppConstants.SCREEN_DASHBOARD + ), + icon=ft.Icons.ADD, + ), + create_secondary_button( + text="Preview Data", + on_click=self._handle_preview_data, + icon=ft.Icons.VISIBILITY, + ), + create_secondary_button( + text="Generate PII Report", + on_click=self._handle_generate_export, + icon=ft.Icons.FILE_DOWNLOAD, + ), + create_primary_button( + text="Export Deidentified Data", + on_click=self._handle_download_deidentified, + icon=ft.Icons.DOWNLOAD, + ), + ], + alignment=ft.MainAxisAlignment.SPACE_BETWEEN, + ), + ], + spacing=IPASpacing.LG, + scroll=ft.ScrollMode.AUTO, + ), + padding=IPASpacing.XL, + expand=True, + ) + + def _build_summary_metrics(self) -> ft.Row: + """Build the summary metrics cards.""" + # Get detection results from state + results = self.state_manager.state.detection_results + + # Calculate metrics + total_pii = len(results) + high_conf = sum(1 for r in results if r.confidence > 0.8) + medium_conf = sum(1 for r in results if 0.5 <= r.confidence <= 0.8) + low_conf = sum(1 for r in results if r.confidence < 0.5) + + return ft.Row( + [ + create_metric_card( + title="PII Detected", + value=str(total_pii), + subtitle="Columns", + color=IPAColors.RED_ORANGE, + icon=ft.Icons.WARNING, + ), + create_metric_card( + title="High Confidence", + value=str(high_conf), + subtitle="> 0.8 score", + color=IPAColors.HIGH_CONFIDENCE, + icon=ft.Icons.SECURITY, + ), + create_metric_card( + title="Medium Confidence", + value=str(medium_conf), + subtitle="0.5-0.8 score", + color=IPAColors.MED_CONFIDENCE, + icon=ft.Icons.INFO, + ), + create_metric_card( + title="Low Confidence", + value=str(low_conf), + subtitle="< 0.5 score", + color=IPAColors.DARK_GREY, + icon=ft.Icons.HELP, + ), + ], + alignment=ft.MainAxisAlignment.SPACE_EVENLY, + ) + + def _build_results_table(self) -> ft.Container: + """Build the results table showing detected PII columns.""" + results = self.state_manager.state.detection_results + + if not results: + # Show message if no results + return ft.Container( + content=ft.Column( + [ + ft.Icon( + ft.Icons.CHECK_CIRCLE, size=64, color=IPAColors.SUCCESS + ), + ft.Text( + "No PII Detected", + size=IPATypography.HEADER_3, + color=IPAColors.CHARCOAL, + ), + ft.Text( + "No personally identifiable information was found in the dataset.", + size=IPATypography.BODY_REGULAR, + color=IPAColors.DARK_GREY, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + spacing=IPASpacing.MD, + ), + padding=IPASpacing.XL, + alignment=ft.alignment.center, + ) + + # Create table rows + rows = [] + for result in results: + # Color code confidence + if result.confidence > 0.8: + conf_color = IPAColors.HIGH_CONFIDENCE + elif result.confidence >= 0.5: + conf_color = IPAColors.MED_CONFIDENCE + else: + conf_color = IPAColors.DARK_GREY + + # Get current anonymization method for this column + current_method = self.state_manager.state.column_anonymization_methods.get( + result.column, "remove" + ) + + # Create dropdown for anonymization method selection + method_dropdown = ft.Dropdown( + value=current_method, + options=[ + ft.dropdown.Option("unchanged", "Unchanged"), + ft.dropdown.Option("remove", "Remove"), + ft.dropdown.Option("encode", "Encode"), + ft.dropdown.Option("categorize", "Categorize"), + ft.dropdown.Option("mask", "Mask"), + ], + width=150, + dense=True, + text_size=IPATypography.BODY_SMALL, + content_padding=ft.padding.symmetric(horizontal=8, vertical=4), + on_change=lambda e, col=result.column: self._on_method_change( + col, e.control.value + ), + ) + + # Store reference to dropdown + self.column_method_dropdowns[result.column] = method_dropdown + + row = ft.DataRow( + cells=[ + ft.DataCell( + ft.Text( + result.column, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ) + ), + ft.DataCell(ft.Text(result.method, color=IPAColors.CHARCOAL)), + ft.DataCell( + ft.Container( + content=ft.Text( + f"{result.confidence:.2f}", + color="white", + size=IPATypography.BODY_SMALL, + weight=ft.FontWeight.W_600, + ), + bgcolor=conf_color, + padding=ft.padding.symmetric(horizontal=8, vertical=4), + border_radius=4, + ) + ), + ft.DataCell(ft.Text(result.pii_type, color=IPAColors.CHARCOAL)), + ft.DataCell(method_dropdown), # New column with dropdown + ], + ) + rows.append(row) + + # Create data table + table = ft.DataTable( + columns=[ + ft.DataColumn( + ft.Text( + "Column", weight=ft.FontWeight.W_600, color=IPAColors.DARK_BLUE + ) + ), + ft.DataColumn( + ft.Text( + "Method", weight=ft.FontWeight.W_600, color=IPAColors.DARK_BLUE + ) + ), + ft.DataColumn( + ft.Text( + "Confidence", + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ) + ), + ft.DataColumn( + ft.Text( + "PII Type", + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ) + ), + ft.DataColumn( + ft.Text( + "Anonymization", + weight=ft.FontWeight.W_600, + color=IPAColors.DARK_BLUE, + ) + ), + ], + rows=rows, + border=ft.border.all(1, IPAColors.DARK_GREY), + border_radius=IPASpacing.RADIUS_MD, + horizontal_lines=ft.BorderSide(1, IPAColors.LIGHT_GREY), + heading_row_color=IPAColors.BLUE_ACCENT, + ) + + return ft.Container( + content=ft.Column( + [ + ft.Text( + "Detected PII Columns", + size=IPATypography.BODY_LARGE, + weight=ft.FontWeight.W_600, + color=IPAColors.CHARCOAL, + ), + # Helper text + ft.Container( + content=ft.Row( + [ + ft.Icon( + ft.Icons.INFO_OUTLINE, + color=IPAColors.DARK_BLUE, + size=16, + ), + ft.Text( + "Select how to handle each PII column when exporting deidentified data. Smart defaults have been applied based on detection confidence and column type. Choose 'Unchanged' to preserve columns you disagree with the detection.", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ), + ], + spacing=IPASpacing.XS, + ), + padding=IPASpacing.SM, + bgcolor=IPAColors.BLUE_ACCENT, + border_radius=IPASpacing.RADIUS_SM, + ), + ft.Container( + content=table, + bgcolor=IPAColors.WHITE, + border_radius=IPASpacing.RADIUS_MD, + ), + ], + spacing=IPASpacing.SM, + ), + padding=IPASpacing.MD, + ) + + def _on_method_change(self, column: str, method: str): + """Handle anonymization method change for a column. + + Args: + column: Name of the column + method: Selected anonymization method (remove/encode/categorize/mask) + + """ + # Update state with new method + current_methods = self.state_manager.state.column_anonymization_methods.copy() + current_methods[column] = method + self.state_manager.state.column_anonymization_methods = current_methods + + def _handle_preview_data(self, e): + """Handle preview data button click.""" + # print("DEBUG: Preview Data button clicked") + + # Get the selected file information + files = self.state_manager.state.selected_files + results = self.state_manager.state.detection_results + + if not files: + # print("DEBUG: No files found, showing dialog") + self._show_dialog("No Data", "No dataset file is currently loaded.") + return + + file_info = files[0] + + # Load dataset to preview + try: + import pandas as pd + + # Determine file type and load accordingly + file_path = str(file_info.path) + + if file_path.endswith(".csv"): + # print("DEBUG: Loading as CSV") + df = pd.read_csv(file_path) + elif file_path.endswith(".xlsx"): + # print("DEBUG: Loading as Excel") + df = pd.read_excel(file_path) + elif file_path.endswith(".dta"): + # print("DEBUG: Loading as Stata") + df = pd.read_stata(file_path) + else: + self._show_dialog( + "Unsupported Format", + f"Unable to preview this file format: {file_path}", + ) + return + + # Get first 5 rows + preview_df = df.head(5) + + # Create preview dialog + self._show_data_preview_dialog(preview_df, results) + + except Exception as ex: + import traceback + + traceback.print_exc() + self._show_dialog( + "Preview Error", + f"Failed to load data preview:\n\n{type(ex).__name__}: {str(ex)}", + ) + + def _show_data_preview_dialog(self, df, results): + """Show data preview dialog with PII columns highlighted.""" + + def close_dialog(e): + dialog.open = False + self.page.update() + + # Get list of PII column names + pii_columns = [r.column for r in results] + + # Create table headers + headers = [] + for col in df.columns: + is_pii = col in pii_columns + headers.append( + ft.DataColumn( + ft.Container( + content=ft.Text( + col, + weight=ft.FontWeight.W_600, + color="white" if is_pii else IPAColors.DARK_BLUE, + size=IPATypography.BODY_SMALL, + ), + bgcolor=IPAColors.RED_ORANGE if is_pii else None, + padding=4, + border_radius=4, + ) + ) + ) + + # Create table rows + rows = [] + for idx, row in df.iterrows(): + cells = [] + for col in df.columns: + is_pii = col in pii_columns + value = str(row[col]) + # Truncate long values + if len(value) > 30: + value = value[:27] + "..." + + cells.append( + ft.DataCell( + ft.Container( + content=ft.Text( + value, + size=IPATypography.BODY_SMALL, + color=IPAColors.RED_ORANGE + if is_pii + else IPAColors.CHARCOAL, + weight=ft.FontWeight.W_600 + if is_pii + else ft.FontWeight.NORMAL, + ), + bgcolor=IPAColors.BLUE_ACCENT if is_pii else None, + padding=4, + border_radius=4, + ) + ) + ) + rows.append(ft.DataRow(cells=cells)) + + preview_table = ft.DataTable( + columns=headers, + rows=rows, + border=ft.border.all(1, IPAColors.DARK_GREY), + horizontal_lines=ft.BorderSide(1, IPAColors.LIGHT_GREY), + ) + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Data Preview (First 5 Rows)"), + content=ft.Container( + content=ft.Column( + [ + ft.Container( + content=ft.Row( + [ + ft.Icon( + ft.Icons.WARNING, + color=IPAColors.RED_ORANGE, + size=16, + ), + ft.Text( + "PII columns are highlighted in orange", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + ), + ], + spacing=8, + ), + bgcolor=IPAColors.BLUE_ACCENT, + padding=8, + border_radius=4, + ), + ft.Container( + content=ft.Row( + controls=[preview_table], + scroll=ft.ScrollMode.AUTO, + ), + width=800, + ), + ], + spacing=IPASpacing.SM, + scroll=ft.ScrollMode.AUTO, + ), + height=400, + ), + actions=[ + ft.TextButton("Close", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + # print("DEBUG: Opening dialog using page.open()") + self.page.open(dialog) + # print("DEBUG: Preview dialog opened") + + def _handle_download_deidentified(self, e): + """Handle download deidentified data button click.""" + + # Use file picker to select save location + def handle_save_result(e: ft.FilePickerResultEvent): + if e.path: + save_path = Path(e.path) + self._perform_anonymization_and_save(save_path) + + file_picker = ft.FilePicker(on_result=handle_save_result) + self.page.overlay.append(file_picker) + self.page.update() + + # Determine default filename from original file + files = self.state_manager.state.selected_files + if files: + original_name = files[0].name + original_ext = files[0].format + + # Remove the extension from the filename using Path + from pathlib import Path + + path_obj = Path(original_name) + base_name = path_obj.stem # Gets filename without extension + + # Create new filename with _deidentified suffix + if original_ext: + default_name = f"{base_name}_deidentified.{original_ext}" + else: + # If no extension info, try to get it from the filename itself + if path_obj.suffix: + default_name = f"{base_name}_deidentified{path_obj.suffix}" + else: + default_name = f"{base_name}_deidentified.csv" + else: + # Fallback for demo data or when no file info available + default_name = "demo_deidentified.csv" + + # Open save file dialog + file_picker.save_file( + dialog_title="Save Deidentified Dataset", + file_name=default_name, + allowed_extensions=["csv", "xlsx", "dta"], + ) + + def _perform_anonymization_and_save(self, save_path): + """Perform anonymization and save the file.""" + try: + # Get file info and load dataset + files = self.state_manager.state.selected_files + if not files: + self._show_dialog("No Data", "No dataset file is currently loaded.") + return + + # Load the dataset into the adapter + file_path = str(files[0].path) + success, message = self.adapter.load_dataset(file_path) + if not success: + self._show_dialog("Load Error", f"Failed to load dataset: {message}") + return + + # Get per-column anonymization methods and results from state + column_methods = self.state_manager.state.column_anonymization_methods + results = self.state_manager.state.detection_results + + # Perform per-column anonymization + success, anonymized_df, report = ( + self.adapter.generate_per_column_anonymized_dataset( + column_methods=column_methods, + detection_results=results, + ) + ) + + if not success: + self._show_dialog( + "Anonymization Failed", f"Failed to anonymize data: {report}" + ) + return + + # Save the file + # Determine file format and save accordingly + if save_path.suffix.lower() == ".csv": + anonymized_df.to_csv(save_path, index=False) + elif save_path.suffix.lower() == ".xlsx": + anonymized_df.to_excel(save_path, index=False) + elif save_path.suffix.lower() == ".dta": + anonymized_df.to_stata(save_path, write_index=False) + else: + # Default to CSV + anonymized_df.to_csv(save_path, index=False) + + # Save the report alongside the data + report_path = ( + save_path.parent / f"{save_path.stem}_anonymization_report.txt" + ) + with open(report_path, "w") as f: + f.write(report) + + # Show success dialog + self._show_download_success_dialog(save_path) + + except Exception as ex: + self._show_dialog( + "Download Failed", f"Failed to download deidentified data:\n\n{str(ex)}" + ) + + def _show_download_success_dialog(self, save_path): + """Show success dialog after download completes.""" + import platform + import subprocess + + def close_dialog(e): + dialog.open = False + self.page.update() + + def open_folder(e): + folder_path = save_path.parent + if platform.system() == "Windows": + subprocess.Popen(f'explorer /select,"{save_path}"') + elif platform.system() == "Darwin": + subprocess.Popen(["open", "-R", str(save_path)]) + else: + subprocess.Popen(["xdg-open", str(folder_path)]) + close_dialog(e) + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Download Complete!", color=IPAColors.SUCCESS), + content=ft.Column( + [ + ft.Icon(ft.Icons.CHECK_CIRCLE, color=IPAColors.SUCCESS, size=48), + ft.Text( + "Deidentified dataset has been saved successfully!", + text_align=ft.TextAlign.CENTER, + ), + ft.Text( + f"\nSaved to: {save_path.name}", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + selectable=True, + ), + ft.Text( + f"\nReport: {save_path.stem}_anonymization_report.txt", + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + ), + ft.Text( + "\nUsed per-column anonymization methods (see report for details)", + size=IPATypography.BODY_SMALL, + color=IPAColors.CHARCOAL, + text_align=ft.TextAlign.CENTER, + weight=ft.FontWeight.W_600, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + tight=True, + spacing=IPASpacing.SM, + ), + actions=[ + ft.TextButton("Open Folder", on_click=open_folder), + ft.TextButton("Close", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(dialog) + + def _handle_generate_export(self, e): + """Handle generate export button click.""" + + # Use file picker to select export location + def handle_export_result(e: ft.FilePickerResultEvent): + if e.path: + export_path = Path(e.path) + self._perform_export(export_path) + + file_picker = ft.FilePicker(on_result=handle_export_result) + self.page.overlay.append(file_picker) + self.page.update() + + # Open directory picker + file_picker.get_directory_path(dialog_title="Select Export Location") + + def _perform_export(self, export_dir: Path): + """Perform the actual export operation.""" + try: + # Create timestamped export folder + timestamp = time.strftime("%Y%m%d_%H%M%S") + export_folder = export_dir / f"pii_analysis_{timestamp}" + export_folder.mkdir(parents=True, exist_ok=True) + + # Generate report file + report_path = export_folder / "pii_detection_report.txt" + self._generate_report(report_path) + + # Show success dialog + def close_dialog(e): + dialog.open = False + self.page.update() + + def open_folder(e): + import platform + import subprocess + + if platform.system() == "Windows": + subprocess.Popen(f'explorer "{export_folder}"') + elif platform.system() == "Darwin": + subprocess.Popen(["open", str(export_folder)]) + else: + subprocess.Popen(["xdg-open", str(export_folder)]) + close_dialog(e) + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text("Export Successful", color=IPAColors.SUCCESS), + content=ft.Column( + [ + ft.Icon( + ft.Icons.CHECK_CIRCLE, color=IPAColors.SUCCESS, size=48 + ), + ft.Text( + "Analysis results exported to:", + text_align=ft.TextAlign.CENTER, + ), + ft.Text( + str(export_folder), + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + text_align=ft.TextAlign.CENTER, + selectable=True, + ), + ft.Text( + "\nExported files:", + weight=ft.FontWeight.W_600, + ), + ft.Text( + "• pii_detection_report.txt - Detailed analysis report", + size=IPATypography.BODY_SMALL, + ), + ], + horizontal_alignment=ft.CrossAxisAlignment.CENTER, + tight=True, + spacing=IPASpacing.SM, + ), + actions=[ + ft.TextButton("Open Folder", on_click=open_folder), + ft.TextButton("Close", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(dialog) + + except Exception as ex: + self._show_dialog("Export Failed", f"Failed to export results: {str(ex)}") + + def _generate_report(self, report_path: Path): + """Generate a text report of the analysis results.""" + results = self.state_manager.state.detection_results + files = self.state_manager.state.selected_files + + with open(report_path, "w") as f: + f.write("=" * 70 + "\n") + f.write("PII DETECTION ANALYSIS REPORT\n") + f.write("=" * 70 + "\n\n") + + # File information + f.write("ANALYZED FILES\n") + f.write("-" * 70 + "\n") + for file_info in files: + f.write(f"File: {file_info.name}\n") + f.write(f"Path: {file_info.path}\n") + f.write(f"Size: {file_info.size_mb:.2f} MB\n") + f.write(f"Format: {file_info.format}\n\n") + + # Summary statistics + f.write("\nDETECTION SUMMARY\n") + f.write("-" * 70 + "\n") + f.write(f"Total PII columns detected: {len(results)}\n") + + high_conf = sum(1 for r in results if r.confidence > 0.8) + medium_conf = sum(1 for r in results if 0.5 <= r.confidence <= 0.8) + low_conf = sum(1 for r in results if r.confidence < 0.5) + + f.write(f" - High confidence (>0.8): {high_conf}\n") + f.write(f" - Medium confidence (0.5-0.8): {medium_conf}\n") + f.write(f" - Low confidence (<0.5): {low_conf}\n\n") + + # Detailed results + f.write("\nDETAILED RESULTS\n") + f.write("-" * 70 + "\n\n") + + for i, result in enumerate(results, 1): + f.write(f"{i}. Column: {result.column}\n") + f.write(f" Detection Method: {result.method}\n") + f.write(f" Confidence Score: {result.confidence:.2%}\n") + f.write(f" PII Type: {result.pii_type}\n") + if result.entity_types: + f.write(f" Entity Types: {', '.join(result.entity_types)}\n") + if result.details: + f.write(f" Details: {result.details}\n") + f.write("\n") + + # Recommendations + f.write("\nRECOMMENDATIONS\n") + f.write("-" * 70 + "\n") + f.write("1. Review all detected PII columns carefully\n") + f.write("2. Consider anonymizing or removing high-confidence PII columns\n") + f.write("3. Manually verify medium and low confidence detections\n") + f.write( + "4. Ensure compliance with data protection regulations (GDPR, HIPAA, etc.)\n\n" + ) + + # Footer + f.write("=" * 70 + "\n") + f.write(f"Report generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write("Generated by: IPA PII Detector\n") + f.write("=" * 70 + "\n") + + def _show_dialog(self, title: str, message: str): + """Show a dialog with the given title and message.""" + + def close_dialog(e): + dialog.open = False + self.page.update() + + dialog = ft.AlertDialog( + modal=True, + title=ft.Text(title), + content=ft.Text(message), + actions=[ + ft.TextButton("Close", on_click=close_dialog), + ], + actions_alignment=ft.MainAxisAlignment.END, + ) + + self.page.open(dialog) + + def on_state_changed(self, state: AppState): + """Handle state changes.""" + pass diff --git a/src/pii_detector/gui/flet_app/ui/themes/__init__.py b/src/pii_detector/gui/flet_app/ui/themes/__init__.py new file mode 100644 index 0000000..f581e67 --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/themes/__init__.py @@ -0,0 +1 @@ +"""Themes package for the PII Detector Flet application.""" diff --git a/src/pii_detector/gui/flet_app/ui/themes/ipa_theme.py b/src/pii_detector/gui/flet_app/ui/themes/ipa_theme.py new file mode 100644 index 0000000..b583c8d --- /dev/null +++ b/src/pii_detector/gui/flet_app/ui/themes/ipa_theme.py @@ -0,0 +1,136 @@ +"""IPA theme implementation for Flet application.""" + +import flet as ft + +from pii_detector.gui.flet_app.config.constants import IPAColors, IPATypography + + +def create_ipa_theme() -> ft.Theme: + """Create Flet theme with IPA color palette and typography.""" + return ft.Theme( + color_scheme=ft.ColorScheme( + # Primary colors + primary=IPAColors.IPA_GREEN, + primary_container=IPAColors.LIGHT_BLUE, + on_primary=IPAColors.WHITE, + on_primary_container=IPAColors.CHARCOAL, + # Secondary colors + secondary=IPAColors.DARK_BLUE, + secondary_container=IPAColors.BLUE_ACCENT, + on_secondary=IPAColors.WHITE, + on_secondary_container=IPAColors.CHARCOAL, + # Surface colors + surface=IPAColors.WHITE, + surface_variant=IPAColors.LIGHT_GREY, + on_surface=IPAColors.CHARCOAL, + on_surface_variant=IPAColors.DARK_GREY, + # Background + background=IPAColors.LIGHT_GREY, + on_background=IPAColors.CHARCOAL, + # Error colors + error=IPAColors.RED_ORANGE, + on_error=IPAColors.WHITE, + error_container=IPAColors.RED_ORANGE + "20", # 20% opacity + on_error_container=IPAColors.RED_ORANGE, + # Additional colors + outline=IPAColors.DARK_GREY, + outline_variant=IPAColors.LIGHT_GREY, + shadow=IPAColors.CHARCOAL + "40", # 40% opacity + ), + text_theme=ft.TextTheme( + # Display styles + display_large=ft.TextStyle( + size=IPATypography.HEADER_1, + color=IPAColors.DARK_BLUE, + weight=ft.FontWeight.BOLD, + font_family=IPATypography.PRIMARY_FONT, + ), + display_medium=ft.TextStyle( + size=IPATypography.HEADER_2, + color=IPAColors.DARK_BLUE, + weight=ft.FontWeight.W_600, + font_family=IPATypography.PRIMARY_FONT, + ), + display_small=ft.TextStyle( + size=IPATypography.HEADER_3, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_600, + font_family=IPATypography.PRIMARY_FONT, + ), + # Headline styles + headline_large=ft.TextStyle( + size=IPATypography.HEADER_1, + color=IPAColors.DARK_BLUE, + weight=ft.FontWeight.BOLD, + font_family=IPATypography.PRIMARY_FONT, + ), + headline_medium=ft.TextStyle( + size=IPATypography.HEADER_2, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_600, + font_family=IPATypography.PRIMARY_FONT, + ), + headline_small=ft.TextStyle( + size=IPATypography.HEADER_3, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + # Title styles + title_large=ft.TextStyle( + size=IPATypography.BODY_LARGE, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + title_medium=ft.TextStyle( + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + title_small=ft.TextStyle( + size=IPATypography.BODY_SMALL, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + # Body styles + body_large=ft.TextStyle( + size=IPATypography.BODY_LARGE, + color=IPAColors.CHARCOAL, + font_family=IPATypography.PRIMARY_FONT, + ), + body_medium=ft.TextStyle( + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + font_family=IPATypography.PRIMARY_FONT, + ), + body_small=ft.TextStyle( + size=IPATypography.BODY_SMALL, + color=IPAColors.DARK_GREY, + font_family=IPATypography.PRIMARY_FONT, + ), + # Label styles + label_large=ft.TextStyle( + size=IPATypography.BODY_REGULAR, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + label_medium=ft.TextStyle( + size=IPATypography.BODY_SMALL, + color=IPAColors.CHARCOAL, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + label_small=ft.TextStyle( + size=10, + color=IPAColors.DARK_GREY, + weight=ft.FontWeight.W_500, + font_family=IPATypography.PRIMARY_FONT, + ), + ), + # Visual density + visual_density=ft.VisualDensity.STANDARD, + ) diff --git a/src/pii_detector/gui/flet_main.py b/src/pii_detector/gui/flet_main.py new file mode 100644 index 0000000..401f36d --- /dev/null +++ b/src/pii_detector/gui/flet_main.py @@ -0,0 +1,35 @@ +"""Main entry point for the Flet-based PII Detector application.""" + +import sys +from pathlib import Path + +import flet as ft + +# Add src to path for imports +sys.path.append(str(Path(__file__).parent.parent.parent)) + +from pii_detector.gui.flet_app.ui.app import PIIDetectorApp +from pii_detector.gui.flet_app.ui.themes.ipa_theme import create_ipa_theme + + +def main(page: ft.Page): + """Main application entry point.""" # noqa: D401 + # Configure desktop app + page.title = "IPA PII Detector" + page.window_width = 1200 + page.window_height = 800 + page.window_min_width = 800 + page.window_min_height = 600 + page.theme_mode = ft.ThemeMode.LIGHT + page.theme = create_ipa_theme() + page.padding = 0 + page.spacing = 0 + + # Initialize and run the app + app = PIIDetectorApp(page) + app.initialize() + + +if __name__ == "__main__": + # For development + ft.app(target=main) diff --git a/src/pii_detector/gui/frontend.py b/src/pii_detector/gui/frontend.py new file mode 100644 index 0000000..6aceaf0 --- /dev/null +++ b/src/pii_detector/gui/frontend.py @@ -0,0 +1,655 @@ +"""Graphical user interface for the PII detector application.""" + +import sys +import tkinter as tk +import webbrowser +from pathlib import Path +from tkinter import messagebox, ttk +from tkinter.filedialog import askopenfilename + +import pandas as pd +from PIL import Image, ImageTk + +from pii_detector.core import processor +from pii_detector.data import constants + +# Application constants +INTRO_TEXT = ( + "This script is meant to assist in the detection of PII " + "(personally identifiable information) and subsequent removal from a dataset. " + "This is an alpha program, not fully tested yet." +) + +INTRO_TEXT_P2 = ( + "You will first load a dataset that might contain PII variables. " + "The system will try to identify the PII candidates. " + "Please indicate if you would like to Drop, Encode or Keep them.\n\n" + "Once finished, you will be able to export a list of the PII detected, a do-file " + "to generate a deidentified dataset according to your options, and an already " + "deidentified dataset in case your input file is not a .dta\n\n" + "Please help improve the program by filling out the survey on your experience using it (Help -> Provide Feedback)." +) + +VERSION_NUMBER = "0.2.23" +APP_TITLE = f"IPA's PII Detector - v{VERSION_NUMBER}" + + +class PIIDetectorGUI: + """Main GUI application class.""" + + def __init__(self): + """Initialize the GUI application.""" + self.window = None + self.canvas = None + self.frame = None + + # Application state + self.dataset = None + self.dataset_path = None + self.new_file_path = None + self.label_dict = None + self.value_label_dict = None + + # UI elements + self.pii_candidates_to_dropdown_element = {} + self.find_piis_options = {} + + # Configuration variables + self.check_survey_cto_checkbutton_var = None + self.check_locations_pop_checkbutton_var = None + self.column_level_option_for_unstructured_text_checkbutton_var = None + self.keep_unstructured_text_option_checkbutton_var = None + + self.country_dropdown = None + self.language_dropdown = None + + # Frames for different sections + self.piis_frame = None + self.anonymized_dataset_creation_frame = None + self.new_dataset_message_frame = None + self.do_file_message_frame = None + + # Window dimensions + self.window_width = None + self.window_height = None + + self.setup_gui() + + def setup_gui(self): + """Set up the main GUI window and components.""" + self.window = tk.Tk() + self.window.title(APP_TITLE) + + # Set window size and position + self.window_width = 640 + self.window_height = 700 + screen_width = self.window.winfo_screenwidth() + screen_height = self.window.winfo_screenheight() + x = (screen_width - self.window_width) // 2 + y = (screen_height - self.window_height) // 2 + self.window.geometry(f"{self.window_width}x{self.window_height}+{x}+{y}") + + # Configure styles + self.setup_styles() + + # Set up main frame with scrolling + self.setup_scrollable_frame() + + # Set up menu + self.setup_menu() + + # Load and display logo + self.display_logo() + + # Display initial content + self.display_intro_text() + self.create_file_selection_section() + + def setup_styles(self): + """Configure tkinter styles.""" + style = ttk.Style() + style.configure("my.TLabel", background="white", foreground="black") + + def setup_scrollable_frame(self): + """Set up the main scrollable frame.""" + # Create canvas and scrollbar + self.canvas = tk.Canvas(self.window, bg="white") + scrollbar = ttk.Scrollbar( + self.window, orient="vertical", command=self.canvas.yview + ) + self.canvas.configure(yscrollcommand=scrollbar.set) + + # Pack canvas and scrollbar + self.canvas.pack(side="left", fill="both", expand=True) + scrollbar.pack(side="right", fill="y") + + # Create frame inside canvas + self.frame = tk.Frame(self.canvas, bg="white") + self.canvas.create_window((0, 0), window=self.frame, anchor="nw") + + # Bind frame configure event + self.frame.bind("", self.on_frame_configure) + + # Bind mousewheel to canvas + self.canvas.bind("", self.on_mousewheel) + + def on_frame_configure(self, event=None): + """Handle frame resize to update scroll region.""" + self.canvas.configure(scrollregion=self.canvas.bbox("all")) + + def on_mousewheel(self, event): + """Handle mouse wheel scrolling.""" + self.canvas.yview_scroll(int(-1 * (event.delta / 120)), "units") + + def setup_menu(self): + """Set up the application menu bar.""" + menubar = tk.Menu(self.window) + self.window.config(menu=menubar) + + # Help menu + help_menu = tk.Menu(menubar, tearoff=0) + menubar.add_cascade(label="Help", menu=help_menu) + help_menu.add_command(label="About", command=self.show_about) + help_menu.add_command( + label="Provide Feedback", command=self.open_feedback_survey + ) + + def display_logo(self): + """Display the IPA logo.""" + try: + # Get path to logo in assets + logo_path = ( + Path(__file__).parent.parent.parent.parent / "assets" / "ipa_logo.jpg" + ) + if logo_path.exists(): + img = Image.open(logo_path) + img = img.resize((120, 60), Image.Resampling.LANCZOS) + photo = ImageTk.PhotoImage(img) + + logo_label = ttk.Label(self.frame, image=photo, style="my.TLabel") + logo_label.image = photo # Keep a reference + logo_label.pack(anchor="nw", padx=(30, 30), pady=(10, 10)) + except Exception as e: + print(f"Could not load logo: {e}") + + def display_intro_text(self): + """Display the introductory text.""" + self.display_title("Welcome to the PII detector app") + self.display_message(INTRO_TEXT) + self.display_message(INTRO_TEXT_P2) + + def display_title(self, title): + """Display a title label.""" + label = ttk.Label( + self.frame, + text=title, + wraplength=546, + justify=tk.LEFT, + font=("Calibri", 12, "bold"), + style="my.TLabel", + ) + label.pack(anchor="nw", padx=(30, 30), pady=(0, 5)) + self.frame.update() + return label + + def display_message(self, message): + """Display a message label.""" + label = ttk.Label( + self.frame, + text=message, + wraplength=546, + justify=tk.LEFT, + font=("Calibri Italic", 11), + style="my.TLabel", + ) + label.pack(anchor="nw", padx=(30, 30), pady=(0, 5)) + self.frame.update() + return label + + def create_file_selection_section(self): + """Create the file selection section.""" + self.display_title("Step 1: Select your dataset") + self.display_message( + "Click the button below to select the dataset you want to analyze." + ) + + button_frame = tk.Frame(self.frame, bg="white") + button_frame.pack(anchor="nw", padx=(30, 30), pady=(10, 10)) + + select_button = ttk.Button( + button_frame, text="Select Dataset File", command=self.select_file + ) + select_button.pack(side=tk.LEFT) + + def select_file(self): + """Handle file selection.""" + file_path = askopenfilename( + title="Select dataset file", + filetypes=[ + ("All supported", "*.csv;*.xlsx;*.xls;*.dta"), + ("CSV files", "*.csv"), + ("Excel files", "*.xlsx;*.xls"), + ("Stata files", "*.dta"), + ("All files", "*.*"), + ], + ) + + if file_path: + self.load_dataset(file_path) + + def load_dataset(self, file_path): + """Load and process the selected dataset.""" + self.display_message(f"Loading dataset from: {file_path}") + + try: + success, result = processor.import_dataset(file_path) + + if success: + ( + self.dataset, + self.dataset_path, + self.label_dict, + self.value_label_dict, + ) = result + self.display_message( + f"Successfully loaded dataset with {len(self.dataset)} rows and {len(self.dataset.columns)} columns." + ) + + # Continue with PII detection workflow + self.create_pii_detection_options() + + else: + error_message = result + messagebox.showerror( + "Error", f"Failed to load dataset: {error_message}" + ) + self.display_message(f"Error: {error_message}") + + except Exception as e: + error_message = f"Unexpected error: {str(e)}" + messagebox.showerror("Error", error_message) + self.display_message(error_message) + + def create_pii_detection_options(self): + """Create the PII detection options section.""" + self.display_title("Step 2: Configure PII Detection") + self.display_message("Select the types of PII detection to perform:") + + options_frame = tk.Frame(self.frame, bg="white") + options_frame.pack(anchor="nw", padx=(30, 30), pady=(10, 10)) + + # Survey CTO variables option + self.check_survey_cto_checkbutton_var = tk.BooleanVar(value=True) + survey_cto_check = ttk.Checkbutton( + options_frame, + text="Check for SurveyCTO system variables", + variable=self.check_survey_cto_checkbutton_var, + ) + survey_cto_check.pack(anchor="w", pady=(0, 5)) + + # Location population option + self.check_locations_pop_checkbutton_var = tk.BooleanVar(value=False) + locations_check = ttk.Checkbutton( + options_frame, + text="Check location populations (requires internet)", + variable=self.check_locations_pop_checkbutton_var, + ) + locations_check.pack(anchor="w", pady=(0, 5)) + + # Country selection for location checking + country_frame = tk.Frame(options_frame, bg="white") + country_frame.pack(anchor="w", pady=(0, 10)) + + ttk.Label(country_frame, text="Country:", style="my.TLabel").pack(side=tk.LEFT) + self.country_dropdown = ttk.Combobox( + country_frame, values=constants.ALL_COUNTRIES, state="readonly", width=20 + ) + self.country_dropdown.pack(side=tk.LEFT, padx=(5, 0)) + if constants.ALL_COUNTRIES: + self.country_dropdown.set(constants.ALL_COUNTRIES[0]) + + # Language selection + language_frame = tk.Frame(options_frame, bg="white") + language_frame.pack(anchor="w", pady=(0, 10)) + + ttk.Label(language_frame, text="Language:", style="my.TLabel").pack( + side=tk.LEFT + ) + self.language_dropdown = ttk.Combobox( + language_frame, + values=[constants.ENGLISH, constants.SPANISH, constants.OTHER], + state="readonly", + width=20, + ) + self.language_dropdown.pack(side=tk.LEFT, padx=(5, 0)) + self.language_dropdown.set(constants.ENGLISH) + + # Start detection button + detect_button = ttk.Button( + options_frame, text="Start PII Detection", command=self.start_pii_detection + ) + detect_button.pack(anchor="w", pady=(20, 0)) + + def start_pii_detection(self): + """Start the PII detection process.""" + self.display_title("Step 3: PII Detection Results") + self.display_message("Analyzing dataset for potential PII...") + + # Configure detection options + self.find_piis_options = { + constants.CONSIDER_SURVEY_CTO_VARS: self.check_survey_cto_checkbutton_var.get(), + constants.CHECK_LOCATIONS_POP: self.check_locations_pop_checkbutton_var.get(), + } + + try: + # Run PII detection algorithms + all_pii_candidates = [] + + # 1. Column name/label matching + self.display_message("Checking column names and labels...") + column_name_piis = processor.find_piis_based_on_column_name( + self.dataset, + self.label_dict or {}, + self.language_dropdown.get(), + self.country_dropdown.get(), + constants.STRICT, + ) + all_pii_candidates.extend( + [(col, "Column Name Match") for col in column_name_piis] + ) + + # 2. Format pattern detection + self.display_message("Checking data formats...") + format_piis = processor.find_piis_based_on_column_format(self.dataset) + all_pii_candidates.extend([(col, "Format Pattern") for col in format_piis]) + + # 3. Sparsity analysis + self.display_message("Checking for sparse columns...") + sparse_piis = processor.find_piis_based_on_sparse_entries(self.dataset) + all_pii_candidates.extend([(col, "Sparse Data") for col in sparse_piis]) + + # 4. Location population check (if enabled) + if self.find_piis_options[constants.CHECK_LOCATIONS_POP]: + self.display_message( + "Checking location populations (this may take a moment)..." + ) + location_piis = processor.find_piis_based_on_locations_population( + self.dataset + ) + all_pii_candidates.extend( + [(col, "Small Location") for col in location_piis] + ) + + # Remove duplicates while preserving detection methods + unique_piis = {} + for col, method in all_pii_candidates: + if col not in unique_piis: + unique_piis[col] = [method] + else: + unique_piis[col].append(method) + + # Display results + if unique_piis: + self.display_pii_results(unique_piis) + else: + self.display_message("✅ No PII detected in this dataset.") + self.display_message( + "The dataset appears to be clean of obvious personally identifiable information." + ) + + except Exception as e: + error_msg = f"Error during PII detection: {str(e)}" + self.display_message(f"❌ {error_msg}") + messagebox.showerror("PII Detection Error", error_msg) + + def display_pii_results(self, unique_piis): + """Display PII detection results with action options.""" + self.display_message(f"🔍 Found {len(unique_piis)} potential PII columns:") + + # Store PII results for later processing + self.pii_results = unique_piis + self.pii_actions = {} # Will store user's chosen actions + + # Create frame for PII results + results_frame = tk.Frame(self.frame, bg="white") + results_frame.pack(anchor="nw", padx=(30, 30), pady=(10, 10), fill="x") + + # Header row + header_frame = tk.Frame(results_frame, bg="lightgray") + header_frame.pack(fill="x", pady=(0, 5)) + + ttk.Label( + header_frame, + text="Column", + font=("Calibri", 10, "bold"), + background="lightgray", + ).pack(side="left", padx=(5, 20)) + ttk.Label( + header_frame, + text="Detection Method", + font=("Calibri", 10, "bold"), + background="lightgray", + ).pack(side="left", padx=(0, 20)) + ttk.Label( + header_frame, + text="Action", + font=("Calibri", 10, "bold"), + background="lightgray", + ).pack(side="left", padx=(0, 20)) + + # Results rows + for column, methods in unique_piis.items(): + row_frame = tk.Frame(results_frame, bg="white", relief="solid", bd=1) + row_frame.pack(fill="x", pady=(0, 2)) + + # Column name + col_label = ttk.Label( + row_frame, text=column, font=("Calibri", 9), style="my.TLabel" + ) + col_label.pack(side="left", padx=(5, 20), anchor="w") + + # Detection methods + methods_text = ", ".join(methods) + methods_label = ttk.Label( + row_frame, text=methods_text, font=("Calibri", 9), style="my.TLabel" + ) + methods_label.pack(side="left", padx=(0, 20), anchor="w") + + # Action dropdown + action_var = tk.StringVar(value="Keep") + self.pii_actions[column] = action_var + + action_dropdown = ttk.Combobox( + row_frame, + textvariable=action_var, + values=["Keep", "Drop", "Encode"], + state="readonly", + width=10, + ) + action_dropdown.pack(side="left", padx=(0, 5)) + + # Export options + self.create_export_section() + + def create_export_section(self): + """Create the export options section.""" + self.display_title("Step 4: Export Options") + self.display_message( + "Choose your export options and generate the cleaned dataset:" + ) + + export_frame = tk.Frame(self.frame, bg="white") + export_frame.pack(anchor="nw", padx=(30, 30), pady=(10, 10)) + + # Export buttons + ttk.Button( + export_frame, + text="Generate Summary Report", + command=self.generate_summary_report, + ).pack(side="left", padx=(0, 10)) + + ttk.Button( + export_frame, + text="Export Cleaned Dataset", + command=self.export_cleaned_dataset, + ).pack(side="left", padx=(0, 10)) + + def generate_summary_report(self): + """Generate a summary report of PII detection results.""" + if not hasattr(self, "pii_results"): + messagebox.showwarning("No Results", "Please run PII detection first.") + return + + report_lines = [ + "PII Detection Summary Report", + "=" * 40, + f"Dataset: {self.dataset_path}", + f"Total columns analyzed: {len(self.dataset.columns)}", + f"Potential PII columns found: {len(self.pii_results)}", + "", + "Detection Results:", + ] + + for column, methods in self.pii_results.items(): + action = self.pii_actions[column].get() + report_lines.append(f" • {column}: {', '.join(methods)} → {action}") + + report_lines.extend( + [ + "", + "Actions Summary:", + f" • Keep: {sum(1 for var in self.pii_actions.values() if var.get() == 'Keep')} columns", + f" • Drop: {sum(1 for var in self.pii_actions.values() if var.get() == 'Drop')} columns", + f" • Encode: {sum(1 for var in self.pii_actions.values() if var.get() == 'Encode')} columns", + ] + ) + + report_text = "\n".join(report_lines) + + # Show in a new window + report_window = tk.Toplevel(self.window) + report_window.title("PII Detection Report") + report_window.geometry("600x400") + + text_widget = tk.Text(report_window, wrap="word", font=("Courier", 10)) + scrollbar = ttk.Scrollbar( + report_window, orient="vertical", command=text_widget.yview + ) + text_widget.configure(yscrollcommand=scrollbar.set) + + text_widget.pack(side="left", fill="both", expand=True) + scrollbar.pack(side="right", fill="y") + + text_widget.insert("1.0", report_text) + text_widget.config(state="disabled") + + def export_cleaned_dataset(self): + """Export the dataset with PII handling applied.""" + if not hasattr(self, "pii_results"): + messagebox.showwarning("No Results", "Please run PII detection first.") + return + + try: + # Create cleaned dataset based on user actions + cleaned_dataset = self.dataset.copy() + dropped_columns = [] + encoded_columns = [] + + for column, action_var in self.pii_actions.items(): + action = action_var.get() + + if action == "Drop": + if column in cleaned_dataset.columns: + cleaned_dataset = cleaned_dataset.drop(column, axis=1) + dropped_columns.append(column) + + elif action == "Encode" and column in cleaned_dataset.columns: + # Simple encoding - replace with hash + from pii_detector.core.hash_utils import generate_hash + + cleaned_dataset[f"{column}_encoded"] = ( + cleaned_dataset[column] + .astype(str) + .apply( + lambda x: generate_hash(str(x)) + if pd.notna(x) and x != "" + else x + ) + ) + cleaned_dataset = cleaned_dataset.drop(column, axis=1) + encoded_columns.append(column) + + # Save cleaned dataset + from tkinter.filedialog import asksaveasfilename + + save_path = asksaveasfilename( + title="Save cleaned dataset", + defaultextension=".csv", + filetypes=[ + ("CSV files", "*.csv"), + ("Excel files", "*.xlsx"), + ("All files", "*.*"), + ], + ) + + if save_path: + if save_path.endswith(".csv"): + cleaned_dataset.to_csv(save_path, index=False) + elif save_path.endswith(".xlsx"): + cleaned_dataset.to_excel(save_path, index=False) + + summary = f"Successfully exported cleaned dataset to: {save_path}\n\n" + summary += f"Original columns: {len(self.dataset.columns)}\n" + summary += f"Cleaned columns: {len(cleaned_dataset.columns)}\n" + if dropped_columns: + summary += f"Dropped: {', '.join(dropped_columns)}\n" + if encoded_columns: + summary += f"Encoded: {', '.join(encoded_columns)}" + + messagebox.showinfo("Export Complete", summary) + self.display_message(f"✅ Dataset exported successfully to {save_path}") + + except Exception as e: + error_msg = f"Error exporting dataset: {str(e)}" + messagebox.showerror("Export Error", error_msg) + self.display_message(f"❌ {error_msg}") + + def show_about(self): + """Show the About dialog.""" + about_text = ( + f"{APP_TITLE}\n\n" + "A tool for identifying and handling personally identifiable information (PII) in datasets.\n\n" + "Developed by IPA's Global Research and Data Science Team\n" + "License: MIT" + ) + messagebox.showinfo("About", about_text) + + def open_feedback_survey(self): + """Open the GitHub issues page for feedback.""" + github_issues_url = "https://github.com/PovertyAction/PII_detection/issues" + try: + webbrowser.open(github_issues_url) + except Exception as e: + messagebox.showerror("Error", f"Could not open web browser: {e}") + + def run(self): + """Start the GUI application.""" + try: + self.window.mainloop() + except KeyboardInterrupt: + self.window.destroy() + + +def main(): + """Launch the PII detector GUI application.""" + try: + app = PIIDetectorGUI() + app.run() + except Exception as e: + print(f"Error starting GUI application: {e}") + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..d466920 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test suite for PII detector.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..87cba07 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,221 @@ +""" +Pytest configuration and fixtures for PII detector tests. +""" + +from unittest.mock import Mock + +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture +def sample_dataset(): + """Create a sample dataset for testing.""" + np.random.seed(42) + return pd.DataFrame( + { + "id": range(1, 11), + "name": [ + "John Doe", + "Jane Smith", + "Bob Wilson", + "Alice Brown", + "Charlie Davis", + "Diana Evans", + "Frank Miller", + "Grace Lee", + "Henry Taylor", + "Ivy Chen", + ], + "email": [f"person{i}@test.com" for i in range(10)], + "phone": [f"555-{i:04d}" for i in range(10)], + "age": np.random.randint(18, 80, 10), + "notes": [f"Note about person {i}" for i in range(10)], + } + ) + + +@pytest.fixture +def large_sample_dataset(): + """Create a larger sample dataset for batch processing tests.""" + np.random.seed(42) + size = 1000 + + return pd.DataFrame( + { + "id": range(1, size + 1), + "name": [f"Person {i}" for i in range(size)], + "email": [f"person{i}@test.com" for i in range(size)], + "phone": [f"555-{i:04d}" for i in range(size)], + "address": [f"{i} Main St, City, State" for i in range(size)], + "age": np.random.randint(18, 80, size), + "salary": np.random.randint(30000, 150000, size), + "comments": [f"Comment about person {i}" for i in range(size)], + "notes": [f"Additional notes for {i}" for i in range(size)], + } + ) + + +@pytest.fixture +def text_heavy_dataset(): + """Create a dataset with lots of text content for Presidio testing.""" + return pd.DataFrame( + { + "participant_id": range(1, 6), + "full_name": [ + "John Doe", + "Jane Smith", + "Bob Wilson", + "Alice Brown", + "Charlie Davis", + ], + "contact_info": [ + "Email: john.doe@example.com, Phone: 555-123-4567", + "Reach Jane at jane.smith@test.org or call 555-987-6543", + "Bob Wilson can be contacted at bob@company.com", + "Alice's number is 555-111-2222 and email alice.brown@email.com", + "Charlie Davis, charlie.davis@workplace.net, office: 555-444-5555", + ], + "address_info": [ + "123 Main Street, Springfield, IL 62701", + "456 Oak Avenue, Chicago, IL 60601", + "789 Pine Road, Peoria, IL 61602", + "321 Elm Street, Rockford, IL 61103", + "654 Maple Drive, Naperville, IL 60540", + ], + "personal_notes": [ + "John mentioned his SSN is 123-45-6789 for verification", + "Jane's driver license number is D123456789", + "Bob shared his credit card info: 4532-1234-5678-9012", + "Alice provided her passport number: A12345678", + "Charlie's bank account: 987654321 at First National Bank", + ], + } + ) + + +@pytest.fixture +def mock_presidio_analyzer(): + """Create a mock Presidio analyzer for testing.""" + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = True + mock_analyzer.language = "en" + + # Mock analysis results + mock_analyzer.analyze_text.return_value = [ + { + "entity_type": "PERSON", + "start": 0, + "end": 8, + "score": 0.9, + "text": "John Doe", + } + ] + + # Mock column analysis results + mock_analyzer.analyze_column_text.return_value = { + "presidio_available": True, + "entities_found": {"PERSON": [{"text": "John Doe", "score": 0.9}]}, + "total_detections": 1, + "confidence_scores": [0.9], + "sample_analyzed": 10, + "average_confidence": 0.9, + } + + # Mock anonymization + mock_analyzer.anonymize_text.return_value = "[PERSON]" + + return mock_analyzer + + +@pytest.fixture +def mock_batch_processor(): + """Create a mock batch processor for testing.""" + from pii_detector.core.unified_processor import PIIDetectionResult + + mock_processor = Mock() + + # Mock detection results + mock_detection_results = { + "name": PIIDetectionResult( + column_name="name", + detection_method="column_name_matching", + confidence=0.9, + entity_types=["PERSON"], + ), + "email": PIIDetectionResult( + column_name="email", + detection_method="format_patterns", + confidence=0.95, + entity_types=["EMAIL_ADDRESS"], + ), + } + + mock_processor.detect_pii_batch.return_value = mock_detection_results + + # Mock anonymization results + def mock_anonymize_batch(df, pii_results, config=None, callback=None): + anonymized_df = df.copy() + for col in pii_results: + if col in anonymized_df.columns: + anonymized_df[col] = "[REDACTED]" + + report = { + "original_shape": df.shape, + "final_shape": anonymized_df.shape, + "columns_processed": list(pii_results.keys()), + "batch_processing": True, + } + + return anonymized_df, report + + mock_processor.anonymize_batch.side_effect = mock_anonymize_batch + + return mock_processor + + +@pytest.fixture +def detection_config(): + """Standard detection configuration for tests.""" + return { + "use_column_name_detection": True, + "use_format_detection": True, + "use_sparsity_detection": False, # Disabled for consistent testing + "use_location_detection": False, # Disabled to avoid external API calls + "use_presidio_detection": False, # Disabled by default for unit tests + "column_name_confidence": 0.8, + "format_pattern_confidence": 0.9, + "presidio_confidence_threshold": 0.7, + "presidio_sample_size": 50, + } + + +@pytest.fixture +def anonymization_config(): + """Standard anonymization configuration for tests.""" + return { + "consistent_hashing": True, + "age_bins": [0, 18, 30, 45, 60, 100], + "age_labels": ["Under 18", "18-29", "30-44", "45-59", "60+"], + "geo_level": "region", + "date_precision": "month", + } + + +# Pytest markers +pytest_plugins = [] + + +def pytest_configure(config): + """Configure custom pytest markers.""" + config.addinivalue_line( + "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')" + ) + config.addinivalue_line("markers", "integration: marks tests as integration tests") + config.addinivalue_line( + "markers", "presidio: marks tests that require Presidio installation" + ) + config.addinivalue_line( + "markers", "batch: marks tests for batch processing functionality" + ) diff --git a/tests/data/clean_data.csv b/tests/data/clean_data.csv new file mode 100644 index 0000000..a02d298 --- /dev/null +++ b/tests/data/clean_data.csv @@ -0,0 +1,9 @@ +survey_id,region,age_group,education_level,income_bracket,satisfaction_score,completion_date,product_category +S001,North,25-34,Bachelor,Medium,4.2,2024-01-15,Electronics +S002,South,25-34,Master,Medium,3.8,2024-01-16,Electronics +S003,East,25-34,Bachelor,Low,4.5,2024-01-17,Clothing +S004,West,25-34,PhD,High,3.9,2024-01-18,Electronics +S005,North,35-44,Bachelor,Medium,4.1,2024-01-19,Electronics +S006,South,35-44,Bachelor,Medium,4.0,2024-01-20,Clothing +S007,East,35-44,Master,Low,4.3,2024-01-21,Electronics +S008,West,35-44,PhD,High,3.7,2024-01-22,Clothing diff --git a/tests/data/comprehensive_pii_data.csv b/tests/data/comprehensive_pii_data.csv new file mode 100644 index 0000000..8ab2b0a --- /dev/null +++ b/tests/data/comprehensive_pii_data.csv @@ -0,0 +1,9 @@ +participant_id,first_name,last_name,full_name,email,phone_number,ssn,date_of_birth,age,income,address,city,state,zip_code,country,occupation,medical_condition,notes,gps_lat,gps_lon,device_id,session_duration +P001,John,Doe,John Doe,john.doe@email.com,555-123-4567,123-45-6789,1985-03-15,38,75000,"123 Main St","Springfield","IL","62701","USA","Engineer","Diabetes","Called about billing issue on March 3rd",39.7817,-89.6501,DEV001,1200 +P002,Jane,Smith,Jane Smith,jane.smith@gmail.com,555-987-6543,987-65-4321,1992-07-22,31,52000,"456 Oak Ave","Springfield","IL","62702","USA","Teacher","None","Requested password reset, lives on Oak Avenue",39.7900,-89.6400,DEV002,950 +P003,Maria,Rodriguez,Maria Rodriguez,maria.r@hotmail.com,555-555-1212,555-12-3456,1988-11-08,35,68000,"789 Pine Rd","Riverside","CA","92501","USA","Nurse","Hypertension","Mother of two, works at Riverside General Hospital",33.9533,-117.3958,DEV003,1450 +P004,Ahmed,Hassan,Ahmed Hassan,ahmed.hassan@yahoo.com,555-444-3333,444-33-2222,1990-01-25,33,45000,"321 Elm Dr","Riverside","CA","92502","USA","Clerk","None","Recently moved from Chicago to 321 Elm Drive",33.9700,-117.4000,DEV004,800 +P005,Sarah,Johnson,Sarah Johnson,sarah.j@outlook.com,555-777-8888,777-88-9999,1995-12-03,28,85000,"654 Birch Ln","Chicago","IL","60601","USA","Developer","None","Software developer at TechCorp, Sarah mentioned project deadlines",41.8781,-87.6298,DEV005,1800 +P006,Michael,Brown,Michael Brown,m.brown@company.com,555-222-3333,222-33-4444,1987-05-14,36,72000,"987 Cedar St","Chicago","IL","60602","USA","Manager","Asthma","Team lead for Project Alpha, Michael scheduled follow-up meeting",41.8800,-87.6200,DEV006,1350 +P007,Emily,Davis,Emily Davis,emily.davis@university.edu,555-666-7777,666-77-8888,1993-09-18,30,38000,"147 Maple Dr","Madison","WI","53703","USA","Student","None","Graduate student researching data privacy, Emily from University of Wisconsin",43.0731,-89.4012,DEV007,2100 +P008,David,Wilson,David Wilson,d.wilson@freelance.com,555-111-2222,111-22-3333,1980-04-07,43,95000,"258 Walnut Ave","Madison","WI","53704","USA","Consultant","None","David Wilson provides consulting services, mentioned traveling next week",43.0800,-89.3900,DEV008,900 diff --git a/tests/data/qualitative_data.csv b/tests/data/qualitative_data.csv new file mode 100644 index 0000000..2c29a1b --- /dev/null +++ b/tests/data/qualitative_data.csv @@ -0,0 +1,5 @@ +response_id,interview_transcript,researcher_notes,participant_background +R001,"I am John Smith from Chicago and I work at ABC Corp on 123 Main Street. My phone number is 555-1234 and my email is john@abc.com. I've lived here for 10 years with my wife Sarah and our two children.","Participant seemed nervous, mentioned specific workplace concerns","45-year-old male, married, works in finance sector" +R002,"My name is Maria and I live in Springfield at 456 Oak Road. You can reach me at maria.rodriguez@email.com or call 555-9876. I'm originally from Mexico but moved here 5 years ago for work.","Very open about immigration experience, provided detailed timeline","32-year-old female, immigrant, works in healthcare" +R003,"I'm David from the University of Wisconsin. My office is in Room 205 of the Science Building. Students can email me at d.professor@uwisconsin.edu. I've been researching climate change for 15 years.","Professor was enthusiastic about research, mentioned upcoming publications","Professor in environmental sciences, 50+ years old" +R004,"Hi, this is Sarah Johnson. I currently live at 789 Elm Street, Apartment 3B. My cell is 555-4444 and work number is 555-5555 ext 123. I work remotely for a tech company in California.","Works from home, mentioned challenges with remote collaboration","28-year-old software engineer, lives alone" diff --git a/tests/data/sample_pii_data.csv b/tests/data/sample_pii_data.csv new file mode 100644 index 0000000..f67b6ef --- /dev/null +++ b/tests/data/sample_pii_data.csv @@ -0,0 +1,5 @@ +participant_id,first_name,last_name,email,phone_number,date_of_birth,address,city,country,survey_score,deviceid,gps_lat,gps_lon +001,John,Doe,john.doe@email.com,555-123-4567,1990-01-15,"123 Main St",Springfield,USA,85,DEV001,40.7128,-74.0060 +002,Jane,Smith,jane.smith@gmail.com,555-987-6543,1985-06-22,"456 Oak Ave",Riverside,USA,92,DEV002,34.0522,-118.2437 +003,Maria,Rodriguez,maria.r@hotmail.com,555-555-1212,1992-03-10,"789 Pine Rd",Smalltown,USA,78,DEV003,41.8781,-87.6298 +004,Ahmed,Hassan,ahmed.hassan@yahoo.com,555-444-3333,1988-11-05,"321 Elm Dr",Riverside,USA,88,DEV004,25.7617,-80.1918 diff --git a/tests/data/test_data.csv b/tests/data/test_data.csv new file mode 100644 index 0000000..577e587 --- /dev/null +++ b/tests/data/test_data.csv @@ -0,0 +1,4 @@ +name,email,age,phone,employee_id +John Doe,john@email.com,25,555-1234,E001 +Jane Smith,jane@email.com,30,555-5678,E002 +Bob Johnson,bob@email.com,35,555-9999,E003 diff --git a/tests/test_anonymization.py b/tests/test_anonymization.py new file mode 100644 index 0000000..1e02b73 --- /dev/null +++ b/tests/test_anonymization.py @@ -0,0 +1,491 @@ +""" +Comprehensive tests for anonymization techniques. + +Tests various PII removal and anonymization methods based on FSD guidelines +and academic literature on statistical disclosure control. +""" + +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from pii_detector.core.anonymization import ( + AdvancedAnonymization, + AnonymizationTechniques, +) + + +class TestAnonymizationTechniques: + """Test suite for anonymization methods.""" + + @pytest.fixture + def anonymizer(self): + """Create anonymization techniques instance with fixed seed.""" + return AnonymizationTechniques(random_seed=42) + + @pytest.fixture + def test_data_dir(self): + """Get path to test data directory.""" + return Path(__file__).parent / "data" + + @pytest.fixture + def comprehensive_data(self, test_data_dir): + """Load comprehensive PII test dataset.""" + return pd.read_csv(test_data_dir / "comprehensive_pii_data.csv") + + @pytest.fixture + def qualitative_data(self, test_data_dir): + """Load qualitative data with text content.""" + return pd.read_csv(test_data_dir / "qualitative_data.csv") + + @pytest.fixture + def sample_numeric_data(self): + """Create sample numeric data for testing.""" + return pd.DataFrame( + { + "age": [25, 30, 45, 60, 35, 28, 50, 40], + "income": [45000, 65000, 85000, 120000, 55000, 48000, 95000, 72000], + "score": [85.5, 92.3, 78.1, 88.9, 91.2, 76.8, 89.4, 83.7], + } + ) + + # ==================== REMOVAL TECHNIQUES TESTS ==================== + + def test_remove_variables(self, anonymizer, comprehensive_data): + """Test complete variable removal.""" + columns_to_remove = ["first_name", "last_name", "ssn", "email"] + result = anonymizer.remove_variables(comprehensive_data, columns_to_remove) + + # Check that specified columns are removed + for col in columns_to_remove: + assert col not in result.columns + + # Check that other columns remain + assert "age" in result.columns + assert "city" in result.columns + assert len(result) == len(comprehensive_data) # Same number of rows + + def test_remove_records_with_unique_combinations(self, anonymizer): + """Test removal of records with unique quasi-identifier combinations.""" + # Create test data with some unique combinations + test_df = pd.DataFrame( + { + "age_group": ["20-30", "30-40", "20-30", "40-50", "20-30"], + "occupation": [ + "Engineer", + "Teacher", + "Engineer", + "Unique_Job", + "Engineer", + ], + "city": ["Chicago", "NYC", "Chicago", "SmallTown", "Chicago"], + "sensitive_data": ["A", "B", "C", "D", "E"], + } + ) + + result = anonymizer.remove_records_with_unique_combinations( + test_df, ["age_group", "occupation", "city"], threshold=1 + ) + + # Should remove records with unique combinations + assert len(result) < len(test_df) + # Records with repeated combinations should remain + remaining_combinations = result.groupby( + ["age_group", "occupation", "city"] + ).size() + assert all(remaining_combinations > 1) + + # ==================== PSEUDONYMIZATION TESTS ==================== + + def test_hash_pseudonymization(self, anonymizer, comprehensive_data): + """Test hash-based pseudonymization.""" + original_names = comprehensive_data["first_name"] + pseudonyms = anonymizer.hash_pseudonymization(original_names, prefix="ANON_") + + # Check that values are different but consistent + assert not pseudonyms.equals(original_names) + assert all( + pseudo.startswith("ANON_") for pseudo in pseudonyms if pd.notna(pseudo) + ) + + # Test consistency - same inputs should give same outputs + pseudonyms2 = anonymizer.hash_pseudonymization(original_names, prefix="ANON_") + assert pseudonyms.equals(pseudonyms2) + + def test_name_pseudonymization(self, anonymizer, comprehensive_data): + """Test name-specific pseudonymization.""" + original_names = comprehensive_data["first_name"] + + # Test different name types + for name_type in ["generic", "coded", "alphabetic"]: + pseudonyms = anonymizer.name_pseudonymization(original_names, name_type) + + assert not pseudonyms.equals(original_names) + assert len(pseudonyms) == len(original_names) + + # Check that unique names get consistent pseudonyms + unique_mapping = dict(zip(original_names.dropna(), pseudonyms.dropna())) + assert len(unique_mapping) == len(original_names.dropna().unique()) + + # ==================== RECODING/CATEGORIZATION TESTS ==================== + + def test_age_categorization(self, anonymizer, comprehensive_data): + """Test age categorization.""" + ages = comprehensive_data["age"] + categories = anonymizer.age_categorization(ages) + + # Check that ages are converted to categories + assert categories.dtype.name == "category" + assert all( + cat in ["Under 18", "18-29", "30-44", "45-59", "60+"] + for cat in categories.dropna() + ) + + # Test custom bins + custom_bins = [0, 25, 35, 50, 100] + custom_labels = ["Young", "Adult", "Middle", "Senior"] + custom_categories = anonymizer.age_categorization( + ages, custom_bins, custom_labels + ) + assert all(cat in custom_labels for cat in custom_categories.dropna()) + + def test_income_categorization(self, anonymizer, comprehensive_data): + """Test income categorization.""" + incomes = comprehensive_data["income"] + categories = anonymizer.income_categorization(incomes) + + assert categories.dtype.name == "category" + expected_categories = ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"] + assert all(cat in expected_categories for cat in categories.dropna()) + + def test_date_generalization(self, anonymizer, comprehensive_data): + """Test date generalization to different precision levels.""" + dates = comprehensive_data["date_of_birth"] + + # Test year precision + years = anonymizer.date_generalization(dates, precision="year") + assert all(isinstance(year, (int, np.integer)) for year in years.dropna()) + + # Test month precision + months = anonymizer.date_generalization(dates, precision="month") + assert all(hasattr(month, "year") for month in months.dropna()) + + # Test quarter precision + quarters = anonymizer.date_generalization(dates, precision="quarter") + assert all(hasattr(quarter, "year") for quarter in quarters.dropna()) + + def test_geographic_generalization(self, anonymizer, comprehensive_data): + """Test geographic generalization.""" + states = comprehensive_data["state"] + + # Test region mapping + regions = anonymizer.geographic_generalization(states, level="region") + # Should have fewer unique values than original + assert regions.nunique() <= states.nunique() + + def test_top_bottom_coding(self, anonymizer, sample_numeric_data): + """Test top and bottom coding of continuous variables.""" + incomes = sample_numeric_data["income"] + coded_incomes = anonymizer.top_bottom_coding( + incomes, top_percentile=90, bottom_percentile=10 + ) + + # Should have some string values for extreme values + assert any(isinstance(val, str) for val in coded_incomes) + # Should contain ≥ or ≤ symbols + string_values = [val for val in coded_incomes if isinstance(val, str)] + assert any("≥" in str(val) or "≤" in str(val) for val in string_values) + + # ==================== RANDOMIZATION TESTS ==================== + + def test_add_noise(self, anonymizer, sample_numeric_data): + """Test noise addition to numeric data.""" + original_scores = sample_numeric_data["score"] + + # Test Gaussian noise + noisy_scores_gaussian = anonymizer.add_noise( + original_scores, noise_type="gaussian", noise_level=0.1 + ) + assert not noisy_scores_gaussian.equals(original_scores) + assert len(noisy_scores_gaussian) == len(original_scores) + + # Test uniform noise + noisy_scores_uniform = anonymizer.add_noise( + original_scores, noise_type="uniform", noise_level=0.1 + ) + assert not noisy_scores_uniform.equals(original_scores) + + # Test that noise doesn't affect non-numeric data + text_series = pd.Series(["a", "b", "c"]) + text_with_noise = anonymizer.add_noise(text_series) + assert text_with_noise.equals(text_series) + + def test_permutation_swapping(self, anonymizer, comprehensive_data): + """Test permutation-based value swapping.""" + original_df = comprehensive_data.copy() + swapped_df = anonymizer.permutation_swapping( + original_df, ["age", "income"], swap_probability=0.3 + ) + + # DataFrames should have same shape + assert swapped_df.shape == original_df.shape + + # Some values should be different due to swapping + age_changes = (swapped_df["age"] != original_df["age"]).sum() + income_changes = (swapped_df["income"] != original_df["income"]).sum() + + # At least some swapping should have occurred + assert age_changes > 0 or income_changes > 0 + + # Total values should be preserved (just rearranged) + assert set(swapped_df["age"]) == set(original_df["age"]) + assert set(swapped_df["income"]) == set(original_df["income"]) + + # ==================== STATISTICAL ANONYMIZATION TESTS ==================== + + def test_k_anonymity_check(self, anonymizer): + """Test k-anonymity checking.""" + # Create test data that violates k-anonymity + test_df = pd.DataFrame( + { + "age_group": [ + "20-30", + "20-30", + "30-40", + "40-50", + ], # One unique combination + "occupation": ["Engineer", "Engineer", "Teacher", "UniqueJob"], + "salary": [50000, 55000, 60000, 100000], + } + ) + + is_anonymous, violations = anonymizer.k_anonymity_check( + test_df, ["age_group", "occupation"], k=2 + ) + + assert not is_anonymous # Should violate k-anonymity + assert len(violations) > 0 # Should have violations + assert any(violations["count"] < 2) # Some groups have count < k + + def test_achieve_k_anonymity(self, anonymizer): + """Test achieving k-anonymity through record removal.""" + # Create test data with k-anonymity violations + test_df = pd.DataFrame( + { + "age_group": ["20-30", "20-30", "20-30", "30-40", "40-50"], + "education": ["BS", "BS", "MS", "PhD", "HS"], + "sensitive": ["A", "B", "C", "D", "E"], + } + ) + + anonymized_df = anonymizer.achieve_k_anonymity( + test_df, ["age_group", "education"], k=2 + ) + + # Check that result satisfies k-anonymity + is_anonymous, _ = anonymizer.k_anonymity_check( + anonymized_df, ["age_group", "education"], k=2 + ) + assert is_anonymous + + # Should have fewer or equal rows + assert len(anonymized_df) <= len(test_df) + + # ==================== TEXT ANONYMIZATION TESTS ==================== + + def test_text_masking(self, anonymizer, qualitative_data): + """Test PII pattern masking in text.""" + sample_text = qualitative_data["interview_transcript"].iloc[0] + masked_text = anonymizer.text_masking(sample_text) + + # Should replace emails with [EMAIL] + assert "[EMAIL]" in masked_text + # Should replace phone numbers with [PHONE] + assert "[PHONE]" in masked_text + # Original PII should be removed + assert "john@abc.com" not in masked_text + assert "555-1234" not in masked_text + + def test_selective_text_suppression(self, anonymizer, qualitative_data): + """Test selective suppression of text content.""" + sample_text = qualitative_data["interview_transcript"].iloc[0] + + # Test name suppression + names_suppressed = anonymizer.selective_text_suppression( + sample_text, suppress_types=["names"] + ) + assert "[REDACTED]" in names_suppressed + + # Test location suppression + locations_suppressed = anonymizer.selective_text_suppression( + sample_text, suppress_types=["locations"] + ) + assert "[LOCATION]" in locations_suppressed + + # Test number suppression + numbers_suppressed = anonymizer.selective_text_suppression( + sample_text, suppress_types=["numbers"] + ) + assert "[NUMBER]" in numbers_suppressed + + def test_custom_text_patterns(self, anonymizer): + """Test custom pattern masking.""" + text = "My SSN is 123-45-6789 and credit card is 4532-1234-5678-9012" + custom_patterns = { + r"\b\d{3}-\d{2}-\d{4}\b": "[SSN_MASKED]", + r"\b\d{4}-\d{4}-\d{4}-\d{4}\b": "[CREDIT_CARD]", + } + + masked = anonymizer.text_masking(text, patterns=custom_patterns) + assert "[SSN_MASKED]" in masked + assert "[CREDIT_CARD]" in masked + assert "123-45-6789" not in masked + assert "4532-1234-5678-9012" not in masked + + # ==================== UTILITY TESTS ==================== + + def test_anonymization_report(self, anonymizer, comprehensive_data): + """Test anonymization reporting functionality.""" + # Apply some anonymization + anonymized = anonymizer.remove_variables( + comprehensive_data, ["first_name", "last_name"] + ) + anonymized = anonymized.iloc[:-2] # Remove some rows too + + report = anonymizer.anonymization_report(comprehensive_data, anonymized) + + # Check report structure + assert "original_rows" in report + assert "anonymized_rows" in report + assert "rows_removed" in report + assert "removal_percentage" in report + assert "columns_comparison" in report + + # Check values + assert report["original_rows"] == len(comprehensive_data) + assert report["anonymized_rows"] == len(anonymized) + assert report["rows_removed"] == 2 # We removed 2 rows + assert report["removal_percentage"] == (2 / len(comprehensive_data)) * 100 + + # ==================== EDGE CASES AND ERROR HANDLING ==================== + + def test_empty_data_handling(self, anonymizer): + """Test handling of empty datasets.""" + empty_df = pd.DataFrame() + + # Should not crash on empty data + result = anonymizer.remove_variables(empty_df, ["nonexistent"]) + assert len(result) == 0 + + empty_series = pd.Series(dtype="object") + result_series = anonymizer.hash_pseudonymization(empty_series) + assert len(result_series) == 0 + + def test_missing_values_handling(self, anonymizer): + """Test handling of missing values.""" + series_with_na = pd.Series(["John", None, "Jane", pd.NA, "Bob"]) + + # Pseudonymization should preserve NaN values + pseudonyms = anonymizer.hash_pseudonymization(series_with_na) + assert pseudonyms.isna().sum() == series_with_na.isna().sum() + + # Age categorization should handle NaN + ages_with_na = pd.Series([25, None, 35, pd.NA, 45]) + categories = anonymizer.age_categorization(ages_with_na) + assert categories.isna().sum() == ages_with_na.isna().sum() + + def test_non_numeric_noise_addition(self, anonymizer): + """Test that noise addition doesn't affect non-numeric data.""" + text_data = pd.Series(["apple", "banana", "cherry"]) + result = anonymizer.add_noise(text_data) + assert result.equals(text_data) + + def test_invalid_column_removal(self, anonymizer, comprehensive_data): + """Test removal of non-existent columns.""" + # Should not crash when removing non-existent columns + result = anonymizer.remove_variables(comprehensive_data, ["nonexistent_column"]) + assert result.equals(comprehensive_data) + + # ==================== INTEGRATION TESTS ==================== + + @pytest.mark.integration + def test_full_anonymization_workflow(self, anonymizer, comprehensive_data): + """Test complete anonymization workflow.""" + original_data = comprehensive_data.copy() + + # Step 1: Remove direct identifiers + step1 = anonymizer.remove_variables( + original_data, ["first_name", "last_name", "ssn", "email"] + ) + + # Step 2: Pseudonymize remaining identifiers + step2 = step1.copy() + step2["participant_id"] = anonymizer.hash_pseudonymization( + step1["participant_id"], prefix="P_" + ) + + # Step 3: Categorize continuous variables + step3 = step2.copy() + step3["age_group"] = anonymizer.age_categorization(step2["age"]) + step3["income_bracket"] = anonymizer.income_categorization(step2["income"]) + + # Step 4: Generalize geography + step4 = step3.copy() + step4["region"] = anonymizer.geographic_generalization(step3["state"]) + + # Step 5: Apply k-anonymity + final_result = anonymizer.achieve_k_anonymity( + step4, ["age_group", "occupation", "region"], k=2 + ) + + # Verify transformations + assert "first_name" not in final_result.columns + assert "ssn" not in final_result.columns + assert all(pid.startswith("P_") for pid in final_result["participant_id"]) + assert final_result["age_group"].dtype.name == "category" + + # Generate report + report = anonymizer.anonymization_report(original_data, final_result) + assert report["original_rows"] >= report["anonymized_rows"] + + +class TestAdvancedAnonymization: + """Test suite for advanced (mock) anonymization techniques.""" + + def test_l_diversity_check_mock(self): + """Test l-diversity mock implementation.""" + df = pd.DataFrame({"quasi": [1, 2, 3], "sensitive": ["A", "B", "C"]}) + result = AdvancedAnonymization.l_diversity_check( + df, ["quasi"], "sensitive", diversity_l=2 + ) + # Mock should return False + assert result is False + + def test_t_closeness_check_mock(self): + """Test t-closeness mock implementation.""" + df = pd.DataFrame({"quasi": [1, 2, 3], "sensitive": ["A", "B", "C"]}) + result = AdvancedAnonymization.t_closeness_check( + df, ["quasi"], "sensitive", t=0.2 + ) + # Mock should return False + assert result is False + + def test_differential_privacy_mock(self): + """Test differential privacy mock implementation.""" + series = pd.Series([1, 2, 3, 4, 5]) + result = AdvancedAnonymization.differential_privacy_noise(series, epsilon=1.0) + # Mock should return original series + assert result.equals(series) + + def test_synthetic_data_generation_mock(self): + """Test synthetic data generation mock.""" + df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + result = AdvancedAnonymization.synthetic_data_generation(df) + # Mock should return copy of original + assert result.equals(df) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_batch_processing.py b/tests/test_batch_processing.py new file mode 100644 index 0000000..04b00ee --- /dev/null +++ b/tests/test_batch_processing.py @@ -0,0 +1,706 @@ +"""Tests for batch processing functionality.""" + +from unittest.mock import Mock, patch + +import numpy as np +import pandas as pd +import pytest + +from pii_detector.core.batch_processor import BatchPIIProcessor, process_dataset_batch +from pii_detector.core.presidio_engine import ( + presidio_analyze_dataframe_batch, + presidio_anonymize_dataframe_batch, +) +from pii_detector.core.unified_processor import PIIDetectionResult + + +class TestBatchPIIProcessor: + """Test the batch PII processor.""" + + def test_processor_initialization(self): + """Test batch processor initialization with various configurations.""" + # Default initialization + processor = BatchPIIProcessor() + assert processor is not None + assert processor.language == "en" + assert processor.chunk_size == 1000 + assert processor.max_workers == 4 + + # Custom initialization + processor_custom = BatchPIIProcessor( + language="es", chunk_size=500, max_workers=2, use_structured_engine=False + ) + assert processor_custom.language == "es" + assert processor_custom.chunk_size == 500 + assert processor_custom.max_workers == 2 + assert processor_custom.use_structured_engine is False + + def test_get_processing_strategy(self): + """Test processing strategy selection logic.""" + processor = BatchPIIProcessor(chunk_size=1000) + + # Small dataset - standard processing + small_df = pd.DataFrame({"col1": range(100)}) + strategy = processor.get_processing_strategy(small_df) + assert strategy == "standard_processing" + + # Large dataset - chunked processing + large_df = pd.DataFrame({"col1": range(3000)}) + strategy = processor.get_processing_strategy(large_df) + assert strategy == "chunked_processing" + + def test_estimate_processing_time(self): + """Test processing time estimation.""" + processor = BatchPIIProcessor() + + # Create test dataset + df = pd.DataFrame({"text_col": ["sample text"] * 500, "num_col": range(500)}) + + estimates = processor.estimate_processing_time(df) + + # Should return estimates dictionary + assert isinstance(estimates, dict) + assert "standard_processing" in estimates + assert "chunked_processing" in estimates + + for strategy, estimate in estimates.items(): + assert "time_seconds" in estimate + assert "memory_mb" in estimate + assert "recommended" in estimate + assert isinstance(estimate["time_seconds"], (int, float)) + assert isinstance(estimate["memory_mb"], (int, float)) + assert isinstance(estimate["recommended"], bool) + + def create_test_dataset(self, size: int = 1000) -> pd.DataFrame: + """Create synthetic dataset for testing.""" + np.random.seed(42) + + data = { + "id": range(1, size + 1), + "name": [f"Person {i}" for i in range(size)], + "email": [f"person{i}@test.com" for i in range(size)], + "phone": [f"555-{i:04d}" for i in range(size)], + "address": [f"{i} Main St, City, State" for i in range(size)], + "age": np.random.randint(18, 80, size), + "salary": np.random.randint(30000, 150000, size), + "comments": [f"Comment about person {i}" for i in range(size)], + "notes": [f"Additional notes for {i}" for i in range(size)], + } + + return pd.DataFrame(data) + + def test_detect_pii_batch_small_dataset(self): + """Test batch detection on small dataset (standard processing).""" + processor = BatchPIIProcessor(chunk_size=1000) # Large chunk = no chunking + df = self.create_test_dataset(500) # Small dataset + + progress_calls = [] + + def progress_callback(percent, message): + progress_calls.append((percent, message)) + + results = processor.detect_pii_batch(df, progress_callback=progress_callback) + + # Should return detection results + assert isinstance(results, dict) + assert len(results) > 0 # Should find some PII + + # Check that some expected PII columns are detected + expected_pii_cols = ["name", "email", "phone", "address"] + found_pii = [col for col in expected_pii_cols if col in results] + assert len(found_pii) > 0, ( + f"Expected to find PII in {expected_pii_cols}, got {list(results.keys())}" + ) + + # Progress should have been called + assert len(progress_calls) > 0 + + # Verify result structure + for col_name, result in results.items(): + assert isinstance(result, PIIDetectionResult) + assert result.column_name == col_name + assert 0 <= result.confidence <= 1 + assert result.detection_method is not None + + def test_detect_pii_batch_large_dataset_chunked(self): + """Test batch detection on large dataset (chunked processing).""" + processor = BatchPIIProcessor(chunk_size=300, max_workers=2) + df = self.create_test_dataset(1000) # Large dataset + + progress_calls = [] + + def progress_callback(percent, message): + progress_calls.append((percent, message)) + + results = processor.detect_pii_batch(df, progress_callback=progress_callback) + + # Should return detection results + assert isinstance(results, dict) + assert len(results) > 0 + + # Should have used chunked processing (multiple progress updates) + assert len(progress_calls) > 1 + + # Verify that chunked processing produces similar results to standard + # by checking that major PII columns are still detected + expected_pii_cols = ["name", "email", "phone", "address"] + found_pii = [col for col in expected_pii_cols if col in results] + assert len(found_pii) > 0 + + def test_detect_with_custom_config(self): + """Test batch detection with custom configuration.""" + processor = BatchPIIProcessor() + df = self.create_test_dataset(200) + + # Custom detection config + detection_config = { + "use_presidio_detection": False, # Disable Presidio for consistent testing + "use_column_name_detection": True, + "use_format_detection": True, + "use_sparsity_detection": False, # Disable to reduce variability + "use_location_detection": False, # Disable to reduce external dependencies + "column_name_confidence": 0.9, + "format_pattern_confidence": 0.95, + } + + results = processor.detect_pii_batch(df, detection_config=detection_config) + + assert isinstance(results, dict) + # With restricted detection methods, should still find some PII + # (at minimum, format patterns like email should be detected) + + def test_analyze_chunk_text_content(self): + """Test chunk text content analysis.""" + processor = BatchPIIProcessor() + + # Create chunk with text data + chunk_data = { + "text_col1": ["John Doe", "jane@example.com", "normal text"], + "text_col2": ["555-123-4567", "more text", "even more text"], + "num_col": [1, 2, 3], + } + chunk = pd.DataFrame(chunk_data) + text_columns = ["text_col1", "text_col2"] + + config = processor._get_optimized_detection_config() + + # Mock the presidio analyzer to avoid external dependencies + with ( + patch.object( + processor.presidio_analyzer, "is_available", return_value=True + ), + patch.object( + processor.presidio_analyzer, "analyze_column_text" + ) as mock_analyze, + ): + # Mock return value + mock_analyze.return_value = { + "presidio_available": True, + "total_detections": 2, + "entities_found": {"PERSON": ["John Doe"]}, + "confidence_scores": [0.9, 0.8], + "sample_analyzed": 3, + } + + results = processor._analyze_chunk_text_content(chunk, text_columns, config) + + # Should analyze each text column + assert mock_analyze.call_count == len(text_columns) + + # Should return results for columns with detections + assert isinstance(results, dict) + + def test_aggregate_chunk_results(self): + """Test aggregation of results from multiple chunks.""" + processor = BatchPIIProcessor() + + # Mock chunk results + chunk_results = { + "email_col": [ + { + "total_detections": 3, + "sample_analyzed": 10, + "confidence_scores": [0.9, 0.8, 0.7], + "entities_found": {"EMAIL_ADDRESS": ["email1", "email2", "email3"]}, + }, + { + "total_detections": 2, + "sample_analyzed": 8, + "confidence_scores": [0.85, 0.75], + "entities_found": {"EMAIL_ADDRESS": ["email4", "email5"]}, + }, + ], + "phone_col": [ + { + "total_detections": 1, + "sample_analyzed": 5, + "confidence_scores": [0.6], # Below default threshold + "entities_found": {"PHONE_NUMBER": ["phone1"]}, + } + ], + } + + config = processor._get_optimized_detection_config() + results = processor._aggregate_chunk_results(chunk_results, config) + + # Should aggregate email_col (above threshold) + assert "email_col" in results + email_result = results["email_col"] + assert isinstance(email_result, PIIDetectionResult) + assert email_result.detection_method == "presidio_batch_text" + assert email_result.details["total_detections"] == 5 # 3 + 2 + assert email_result.details["total_samples"] == 18 # 10 + 8 + + # Should exclude phone_col (below threshold after aggregation) + # This depends on the confidence threshold calculation + + def test_anonymize_batch_small_dataset(self): + """Test batch anonymization on small dataset.""" + processor = BatchPIIProcessor() + df = self.create_test_dataset(200) + + # Create mock PII results + pii_results = { + "name": PIIDetectionResult( + column_name="name", + detection_method="column_name_matching", + confidence=0.9, + ), + "email": PIIDetectionResult( + column_name="email", detection_method="format_patterns", confidence=0.95 + ), + } + + anonymized_df, report = processor.anonymize_batch(df, pii_results) + + # Should return anonymized dataset and report + assert isinstance(anonymized_df, pd.DataFrame) + assert isinstance(report, dict) + assert anonymized_df.shape == df.shape + + # Should have processed the PII columns differently + assert not anonymized_df["name"].equals(df["name"]) + assert not anonymized_df["email"].equals(df["email"]) + + # Non-PII columns should be unchanged + assert anonymized_df["age"].equals(df["age"]) + assert anonymized_df["salary"].equals(df["salary"]) + + def test_anonymize_batch_large_dataset_chunked(self): + """Test batch anonymization with chunking.""" + processor = BatchPIIProcessor(chunk_size=100) # Force chunking + df = self.create_test_dataset(300) # Dataset larger than chunk size + + pii_results = { + "comments": PIIDetectionResult( + column_name="comments", + detection_method="presidio_text_analysis", + confidence=0.8, + entity_types=["PERSON"], + ) + } + + progress_calls = [] + + def progress_callback(percent, message): + progress_calls.append((percent, message)) + + anonymized_df, report = processor.anonymize_batch( + df, pii_results, progress_callback=progress_callback + ) + + # Should return results + assert isinstance(anonymized_df, pd.DataFrame) + assert isinstance(report, dict) + assert anonymized_df.shape == df.shape + + # Should have called progress callback multiple times (chunked processing) + assert len(progress_calls) > 1 + + # Report should indicate batch processing + assert report.get("batch_anonymization") is True + assert "chunks_processed" in report + + def test_optimized_detection_config(self): + """Test optimized configuration generation.""" + processor = BatchPIIProcessor(chunk_size=500) + config = processor._get_optimized_detection_config() + + # Should return configuration dictionary + assert isinstance(config, dict) + + # Should include batch processing optimizations + assert "batch_processing" in config + assert config["batch_processing"] is True + + # Sample size should be related to chunk size + expected_sample_size = min(200, 500 // 5) # From implementation + assert config["presidio_sample_size"] == expected_sample_size + + # Should have slightly lower confidence threshold for batch + assert config["presidio_confidence_threshold"] == 0.6 + + +class TestPresidioDataFrameFunctions: + """Test new DataFrame-level Presidio functions.""" + + def create_test_dataframe(self): + """Create test DataFrame with various text columns.""" + return pd.DataFrame( + { + "name": ["John Doe", "Jane Smith", "Bob Wilson"], + "email": ["john@test.com", "jane@test.com", "bob@test.com"], + "phone": ["555-0123", "555-0456", "555-0789"], + "comments": [ + "Contact John at his email", + "Jane prefers phone calls at 555-0456", + "Bob's address is 123 Main St", + ], + "age": [25, 30, 35], + "score": [85.5, 92.3, 78.1], + } + ) + + def test_presidio_analyze_dataframe_batch(self): + """Test batch DataFrame analysis function.""" + df = self.create_test_dataframe() + + # Test with Presidio not available + with patch( + "pii_detector.core.presidio_engine.get_presidio_analyzer" + ) as mock_get_analyzer: + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = False + mock_get_analyzer.return_value = mock_analyzer + + results = presidio_analyze_dataframe_batch(df) + assert results == {} + + def test_presidio_analyze_dataframe_batch_with_mock(self): + """Test batch DataFrame analysis with mocked Presidio.""" + df = self.create_test_dataframe() + text_columns = ["name", "email", "comments"] + + with patch( + "pii_detector.core.presidio_engine.get_presidio_analyzer" + ) as mock_get_analyzer: + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = True + mock_analyzer.analyze_column_text.return_value = { + "total_detections": 2, + "entities_found": { + "PERSON": ["John"], + "EMAIL_ADDRESS": ["john@test.com"], + }, + "confidence_scores": [0.9, 0.8], + } + mock_get_analyzer.return_value = mock_analyzer + + results = presidio_analyze_dataframe_batch( + df, text_columns=text_columns, confidence_threshold=0.7 + ) + + # Should analyze specified columns + assert mock_analyzer.analyze_column_text.call_count == len(text_columns) + + # Should return results for columns with detections + assert isinstance(results, dict) + assert len(results) == len(text_columns) # Mock returns detections for all + + def test_presidio_anonymize_dataframe_batch(self): + """Test batch DataFrame anonymization function.""" + df = self.create_test_dataframe() + columns_to_anonymize = ["name", "email"] + + # Test with Presidio not available + with patch( + "pii_detector.core.presidio_engine.get_presidio_analyzer" + ) as mock_get_analyzer: + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = False + mock_get_analyzer.return_value = mock_analyzer + + result_df = presidio_anonymize_dataframe_batch(df, columns_to_anonymize) + + # Should return original DataFrame when Presidio not available + assert result_df.equals(df) + + def test_presidio_anonymize_dataframe_batch_with_mock(self): + """Test batch DataFrame anonymization with mocked Presidio.""" + df = self.create_test_dataframe() + columns_to_anonymize = ["name", "email"] + + with patch( + "pii_detector.core.presidio_engine.get_presidio_analyzer" + ) as mock_get_analyzer: + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = True + mock_get_analyzer.return_value = mock_analyzer + + # Mock the anonymize_text method + with patch( + "pii_detector.core.presidio_engine.presidio_anonymize_text_column" + ) as mock_anonymize: + mock_anonymize.return_value = pd.Series( + ["[PERSON]", "[PERSON]", "[PERSON]"] + ) + + result_df = presidio_anonymize_dataframe_batch(df, columns_to_anonymize) + + # Should call anonymization for specified columns + assert mock_anonymize.call_count == len(columns_to_anonymize) + + # Should return modified DataFrame + assert isinstance(result_df, pd.DataFrame) + assert result_df.shape == df.shape + + +class TestBatchProcessingConvenienceFunctions: + """Test convenience functions for batch processing.""" + + def create_test_dataset(self, size: int = 500) -> pd.DataFrame: + """Create test dataset.""" + np.random.seed(42) + return pd.DataFrame( + { + "id": range(size), + "name": [f"Person {i}" for i in range(size)], + "email": [f"person{i}@test.com" for i in range(size)], + "phone": [f"555-{i:04d}" for i in range(size)], + "notes": [f"Note about person {i}" for i in range(size)], + } + ) + + def test_process_dataset_batch(self): + """Test complete batch processing workflow.""" + df = self.create_test_dataset(300) + + progress_calls = [] + + def progress_callback(percent, message): + progress_calls.append((percent, message)) + + # Mock to avoid external dependencies + with patch( + "pii_detector.core.batch_processor.BatchPIIProcessor" + ) as mock_processor_class: + mock_processor = Mock() + + # Mock detection results + mock_detection_results = { + "name": PIIDetectionResult( + column_name="name", + detection_method="column_name_matching", + confidence=0.9, + ), + "email": PIIDetectionResult( + column_name="email", + detection_method="format_patterns", + confidence=0.95, + ), + } + + # Mock anonymized dataset and report + mock_anonymized_df = df.copy() + mock_anonymized_df["name"] = "[REDACTED]" + mock_report = {"processed": True, "columns": 2} + + mock_processor.detect_pii_batch.return_value = mock_detection_results + mock_processor.anonymize_batch.return_value = ( + mock_anonymized_df, + mock_report, + ) + mock_processor_class.return_value = mock_processor + + # Test the function + detection_results, anonymized_df, report = process_dataset_batch( + df, + language="en", + chunk_size=100, + max_workers=2, + progress_callback=progress_callback, + ) + + # Verify processor was created with correct parameters + mock_processor_class.assert_called_once_with( + language="en", chunk_size=100, max_workers=2 + ) + + # Verify methods were called + mock_processor.detect_pii_batch.assert_called_once() + mock_processor.anonymize_batch.assert_called_once() + + # Verify results + assert detection_results == mock_detection_results + assert anonymized_df.equals(mock_anonymized_df) + assert report == mock_report + + def test_process_dataset_batch_with_configs(self): + """Test batch processing with custom configurations.""" + df = self.create_test_dataset(200) + + detection_config = { + "use_presidio_detection": False, + "presidio_confidence_threshold": 0.8, + } + + anonymization_config = { + "name": {"method": "hash_pseudonymization"}, + "email": {"method": "remove"}, + } + + # Mock processor + with patch( + "pii_detector.core.batch_processor.BatchPIIProcessor" + ) as mock_processor_class: + mock_processor = Mock() + mock_processor.detect_pii_batch.return_value = {} + mock_processor.anonymize_batch.return_value = (df, {}) + mock_processor_class.return_value = mock_processor + + # Test with configs + detection_results, anonymized_df, report = process_dataset_batch( + df, + detection_config=detection_config, + anonymization_config=anonymization_config, + ) + + # Verify configs were passed to methods + mock_processor.detect_pii_batch.assert_called_once() + mock_processor.anonymize_batch.assert_called_once() + + # Extract call arguments + detection_call_args = mock_processor.detect_pii_batch.call_args + anonymization_call_args = mock_processor.anonymize_batch.call_args + + # Verify detection config was passed + assert detection_call_args[0][1] is None # label_dict + assert detection_call_args[0][2] == detection_config + + # Verify anonymization config was passed + assert anonymization_call_args[0][2] == anonymization_config + + +class TestBatchProcessingEdgeCases: + """Test edge cases and error handling in batch processing.""" + + def test_empty_dataset(self): + """Test batch processing with empty dataset.""" + empty_df = pd.DataFrame() + processor = BatchPIIProcessor() + + results = processor.detect_pii_batch(empty_df) + assert isinstance(results, dict) + assert len(results) == 0 + + def test_single_column_dataset(self): + """Test batch processing with single column.""" + df = pd.DataFrame({"single_col": ["value1", "value2", "value3"]}) + processor = BatchPIIProcessor() + + results = processor.detect_pii_batch(df) + assert isinstance(results, dict) + + def test_no_text_columns(self): + """Test batch processing with no text columns.""" + df = pd.DataFrame( + { + "numeric_col": [1, 2, 3, 4, 5], + "boolean_col": [True, False, True, False, True], + } + ) + processor = BatchPIIProcessor() + + results = processor.detect_pii_batch(df) + assert isinstance(results, dict) + # Should still run structural analysis + + def test_chunk_size_larger_than_dataset(self): + """Test when chunk size is larger than dataset.""" + df = pd.DataFrame({"col": ["value1", "value2"]}) + processor = BatchPIIProcessor(chunk_size=1000) # Much larger than dataset + + results = processor.detect_pii_batch(df) + assert isinstance(results, dict) + + def test_invalid_progress_callback(self): + """Test with progress callback that raises exceptions.""" + df = pd.DataFrame({"col": range(100)}) + processor = BatchPIIProcessor() + + def failing_callback(percent, message): + raise Exception("Callback failed") + + # Should not crash even if callback fails + results = processor.detect_pii_batch(df, progress_callback=failing_callback) + assert isinstance(results, dict) + + +class TestBatchProcessingIntegration: + """Integration tests for batch processing (may require external dependencies).""" + + @pytest.mark.slow + def test_batch_vs_standard_consistency(self): + """Test that batch processing produces consistent results with standard processing.""" + # Create test dataset + np.random.seed(42) + df = pd.DataFrame( + { + "name": ["John Doe", "Jane Smith", "Bob Wilson"] * 100, + "email": ["john@test.com", "jane@test.com", "bob@test.com"] * 100, + "age": np.random.randint(18, 80, 300), + "notes": ["Some notes about the person"] * 300, + } + ) + + # Standard processing + standard_processor = BatchPIIProcessor(chunk_size=10000) # No chunking + standard_results = standard_processor.detect_pii_batch(df) + + # Batch processing + batch_processor = BatchPIIProcessor(chunk_size=100) # Force chunking + batch_results = batch_processor.detect_pii_batch(df) + + # Results should be similar (same columns detected) + standard_columns = set(standard_results.keys()) + batch_columns = set(batch_results.keys()) + + # Allow for some differences due to sampling and chunking + overlap = standard_columns & batch_columns + total_unique = standard_columns | batch_columns + + if total_unique: # Only check if any PII was detected + overlap_ratio = len(overlap) / len(total_unique) + assert overlap_ratio >= 0.7, ( + f"Batch and standard processing should produce similar results. Overlap: {overlap_ratio}" + ) + + @pytest.mark.skipif( + True, reason="Requires full Presidio installation - integration test only" + ) + def test_with_real_presidio(self): + """Integration test with real Presidio (if available).""" + df = pd.DataFrame( + { + "text": [ + "Contact John Doe at john.doe@example.com or call 555-123-4567", + "Jane Smith lives at 123 Main Street, Springfield, IL 62701", + "Bob's SSN is 123-45-6789 and he works at Acme Corp", + ] + } + ) + + processor = BatchPIIProcessor() + + # This will only work if Presidio is actually installed + if processor.presidio_analyzer.is_available(): + results = processor.detect_pii_batch(df) + + # Should detect PII in the text column + assert "text" in results + text_result = results["text"] + assert len(text_result.entity_types) > 0 + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/test_flet_gui_configuration.py b/tests/test_flet_gui_configuration.py new file mode 100644 index 0000000..0729485 --- /dev/null +++ b/tests/test_flet_gui_configuration.py @@ -0,0 +1,457 @@ +"""Integration tests for Flet GUI configuration screen.""" + +from unittest.mock import Mock + +import pytest + +from pii_detector.gui.flet_app.config.settings import DetectionConfig +from pii_detector.gui.flet_app.ui.app import StateManager + + +class TestConfigurationScreen: + """Test suite for configuration screen functionality.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + page.open = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + @pytest.fixture + def configuration_screen(self, mock_page, state_manager): + """Create a ConfigurationScreen instance.""" + # We can't directly import due to Flet dependency, so we'll test through state + return { + "page": mock_page, + "state_manager": state_manager, + } + + def test_preset_quick_configuration(self, state_manager): + """Test quick preset configuration values.""" + # Quick preset should enable only column name and format pattern detection + config = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=False, + location_population_enabled=False, + ai_text_enabled=False, + ) + + state_manager.update_state(detection_config=config, preset_mode="quick") + + assert state_manager.state.detection_config.column_name_enabled is True + assert state_manager.state.detection_config.format_pattern_enabled is True + assert state_manager.state.detection_config.sparsity_enabled is False + assert state_manager.state.preset_mode == "quick" + + def test_preset_balanced_configuration(self, state_manager): + """Test balanced preset configuration values.""" + # Balanced preset should enable most methods except location + config = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + location_population_enabled=False, + ai_text_enabled=True, + ) + + state_manager.update_state(detection_config=config, preset_mode="balanced") + + assert state_manager.state.detection_config.column_name_enabled is True + assert state_manager.state.detection_config.format_pattern_enabled is True + assert state_manager.state.detection_config.sparsity_enabled is True + assert state_manager.state.detection_config.ai_text_enabled is True + assert state_manager.state.preset_mode == "balanced" + + def test_preset_thorough_configuration(self, state_manager): + """Test thorough preset configuration values.""" + # Thorough preset should enable all methods + config = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + location_population_enabled=True, + ai_text_enabled=True, + ) + + state_manager.update_state(detection_config=config, preset_mode="thorough") + + assert state_manager.state.detection_config.column_name_enabled is True + assert state_manager.state.detection_config.format_pattern_enabled is True + assert state_manager.state.detection_config.sparsity_enabled is True + assert state_manager.state.detection_config.location_population_enabled is True + assert state_manager.state.detection_config.ai_text_enabled is True + assert state_manager.state.preset_mode == "thorough" + + def test_column_name_detection_settings(self, state_manager): + """Test column name detection configuration.""" + config = DetectionConfig( + column_name_enabled=True, + fuzzy_match_threshold=0.9, + matching_type="fuzzy", + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.column_name_enabled is True + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.9 + assert state_manager.state.detection_config.matching_type == "fuzzy" + + def test_column_name_strict_matching(self, state_manager): + """Test strict matching configuration.""" + config = DetectionConfig( + column_name_enabled=True, + matching_type="strict", + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.matching_type == "strict" + + def test_column_name_both_matching(self, state_manager): + """Test both (strict + fuzzy) matching configuration.""" + config = DetectionConfig( + column_name_enabled=True, + matching_type="both", + fuzzy_match_threshold=0.85, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.matching_type == "both" + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.85 + + def test_format_pattern_detection_settings(self, state_manager): + """Test format pattern detection configuration.""" + config = DetectionConfig( + format_pattern_enabled=True, + format_confidence_threshold=0.85, + detect_phone=True, + detect_email=True, + detect_ssn=False, + detect_dates=True, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.format_pattern_enabled is True + assert state_manager.state.detection_config.format_confidence_threshold == 0.85 + assert state_manager.state.detection_config.detect_phone is True + assert state_manager.state.detection_config.detect_email is True + assert state_manager.state.detection_config.detect_ssn is False + assert state_manager.state.detection_config.detect_dates is True + + def test_sparsity_analysis_settings(self, state_manager): + """Test sparsity analysis configuration.""" + config = DetectionConfig( + sparsity_enabled=True, + sparsity_threshold=0.75, + min_entries_required=15, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.sparsity_enabled is True + assert state_manager.state.detection_config.sparsity_threshold == 0.75 + assert state_manager.state.detection_config.min_entries_required == 15 + + def test_location_population_settings(self, state_manager): + """Test location population detection configuration.""" + config = DetectionConfig( + location_population_enabled=True, + population_threshold=75000, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.location_population_enabled is True + assert state_manager.state.detection_config.population_threshold == 75000 + + def test_presidio_ai_settings(self, state_manager): + """Test Presidio AI text detection configuration.""" + config = DetectionConfig( + ai_text_enabled=True, + presidio_confidence_threshold=0.75, + presidio_language_model="en_core_web_md", + presidio_detect_person=True, + presidio_detect_org=False, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.ai_text_enabled is True + assert ( + state_manager.state.detection_config.presidio_confidence_threshold == 0.75 + ) + assert ( + state_manager.state.detection_config.presidio_language_model + == "en_core_web_md" + ) + assert state_manager.state.detection_config.presidio_detect_person is True + assert state_manager.state.detection_config.presidio_detect_org is False + + def test_threshold_value_ranges(self, state_manager): + """Test that threshold values are within valid ranges.""" + # Test minimum values + config_min = DetectionConfig( + fuzzy_match_threshold=0.5, + format_confidence_threshold=0.5, + sparsity_threshold=0.5, + presidio_confidence_threshold=0.5, + ) + + state_manager.update_state(detection_config=config_min) + + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.5 + assert state_manager.state.detection_config.format_confidence_threshold == 0.5 + assert state_manager.state.detection_config.sparsity_threshold == 0.5 + assert state_manager.state.detection_config.presidio_confidence_threshold == 0.5 + + # Test maximum values + config_max = DetectionConfig( + fuzzy_match_threshold=1.0, + format_confidence_threshold=1.0, + sparsity_threshold=1.0, + presidio_confidence_threshold=1.0, + ) + + state_manager.update_state(detection_config=config_max) + + assert state_manager.state.detection_config.fuzzy_match_threshold == 1.0 + assert state_manager.state.detection_config.format_confidence_threshold == 1.0 + assert state_manager.state.detection_config.sparsity_threshold == 1.0 + assert state_manager.state.detection_config.presidio_confidence_threshold == 1.0 + + def test_api_key_configuration(self, state_manager): + """Test API key configuration.""" + # Set API key + state_manager.set_api_key("test_geonames_username") + + assert state_manager.state.geonames_api_key == "test_geonames_username" + + # Clear API key + state_manager.set_api_key(None) + assert state_manager.state.geonames_api_key is None + + def test_configuration_validation_no_methods_selected(self, state_manager): + """Test validation when no detection methods are selected.""" + # Create config with all methods disabled + config = DetectionConfig( + column_name_enabled=False, + format_pattern_enabled=False, + sparsity_enabled=False, + location_population_enabled=False, + ai_text_enabled=False, + ) + + state_manager.update_state(detection_config=config) + + # Check that at least one method should be enabled for valid config + any_enabled = any( + [ + state_manager.state.detection_config.column_name_enabled, + state_manager.state.detection_config.format_pattern_enabled, + state_manager.state.detection_config.sparsity_enabled, + state_manager.state.detection_config.location_population_enabled, + state_manager.state.detection_config.ai_text_enabled, + ] + ) + + assert any_enabled is False # This should trigger validation error in UI + + def test_configuration_with_all_methods(self, state_manager): + """Test configuration with all detection methods enabled.""" + config = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + location_population_enabled=True, + ai_text_enabled=True, + fuzzy_match_threshold=0.85, + format_confidence_threshold=0.8, + sparsity_threshold=0.75, + population_threshold=60000, + presidio_confidence_threshold=0.8, + ) + + state_manager.update_state(detection_config=config) + + # Verify all methods are enabled + assert state_manager.state.detection_config.column_name_enabled is True + assert state_manager.state.detection_config.format_pattern_enabled is True + assert state_manager.state.detection_config.sparsity_enabled is True + assert state_manager.state.detection_config.location_population_enabled is True + assert state_manager.state.detection_config.ai_text_enabled is True + + # Verify all thresholds are set + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.85 + assert state_manager.state.detection_config.format_confidence_threshold == 0.8 + assert state_manager.state.detection_config.sparsity_threshold == 0.75 + assert state_manager.state.detection_config.presidio_confidence_threshold == 0.8 + + def test_preset_switching(self, state_manager): + """Test switching between presets.""" + # Start with quick + config_quick = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=False, + location_population_enabled=False, + ai_text_enabled=False, + ) + state_manager.update_state(detection_config=config_quick, preset_mode="quick") + assert state_manager.state.preset_mode == "quick" + + # Switch to balanced + config_balanced = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + location_population_enabled=False, + ai_text_enabled=True, + ) + state_manager.update_state( + detection_config=config_balanced, preset_mode="balanced" + ) + assert state_manager.state.preset_mode == "balanced" + assert state_manager.state.detection_config.sparsity_enabled is True + + # Switch to thorough + config_thorough = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + location_population_enabled=True, + ai_text_enabled=True, + ) + state_manager.update_state( + detection_config=config_thorough, preset_mode="thorough" + ) + assert state_manager.state.preset_mode == "thorough" + assert state_manager.state.detection_config.location_population_enabled is True + + def test_language_model_selection(self, state_manager): + """Test Presidio language model selection.""" + # Test small model + config_small = DetectionConfig( + ai_text_enabled=True, + presidio_language_model="en_core_web_sm", + ) + state_manager.update_state(detection_config=config_small) + assert ( + state_manager.state.detection_config.presidio_language_model + == "en_core_web_sm" + ) + + # Test medium model + config_medium = DetectionConfig( + ai_text_enabled=True, + presidio_language_model="en_core_web_md", + ) + state_manager.update_state(detection_config=config_medium) + assert ( + state_manager.state.detection_config.presidio_language_model + == "en_core_web_md" + ) + + # Test large model + config_large = DetectionConfig( + ai_text_enabled=True, + presidio_language_model="en_core_web_lg", + ) + state_manager.update_state(detection_config=config_large) + assert ( + state_manager.state.detection_config.presidio_language_model + == "en_core_web_lg" + ) + + def test_pattern_type_selective_detection(self, state_manager): + """Test selective pattern type detection.""" + # Only detect phone and email + config = DetectionConfig( + format_pattern_enabled=True, + detect_phone=True, + detect_email=True, + detect_ssn=False, + detect_dates=False, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.detect_phone is True + assert state_manager.state.detection_config.detect_email is True + assert state_manager.state.detection_config.detect_ssn is False + assert state_manager.state.detection_config.detect_dates is False + + def test_presidio_entity_selective_detection(self, state_manager): + """Test selective Presidio entity detection.""" + # Only detect persons, not organizations + config = DetectionConfig( + ai_text_enabled=True, + presidio_detect_person=True, + presidio_detect_org=False, + ) + + state_manager.update_state(detection_config=config) + + assert state_manager.state.detection_config.presidio_detect_person is True + assert state_manager.state.detection_config.presidio_detect_org is False + + +class TestConfigurationPersistence: + """Test configuration persistence and state management.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + def test_configuration_persists_across_navigation(self, state_manager): + """Test that configuration persists when navigating between screens.""" + # Set configuration + config = DetectionConfig( + column_name_enabled=True, + fuzzy_match_threshold=0.92, + format_confidence_threshold=0.88, + ) + state_manager.update_state(detection_config=config) + + # Navigate to different screen + state_manager.navigate_to("progress") + + # Configuration should persist + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.92 + assert state_manager.state.detection_config.format_confidence_threshold == 0.88 + + def test_configuration_reset(self, state_manager): + """Test resetting configuration to defaults.""" + # Set custom configuration + config = DetectionConfig( + fuzzy_match_threshold=0.95, + format_confidence_threshold=0.9, + sparsity_threshold=0.7, + ) + state_manager.update_state(detection_config=config) + + # Reset to defaults + default_config = DetectionConfig() + state_manager.update_state(detection_config=default_config) + + # Should have default values + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.8 + assert state_manager.state.detection_config.format_confidence_threshold == 0.7 + assert state_manager.state.detection_config.sparsity_threshold == 0.8 diff --git a/tests/test_flet_gui_integration.py b/tests/test_flet_gui_integration.py new file mode 100644 index 0000000..41b96dc --- /dev/null +++ b/tests/test_flet_gui_integration.py @@ -0,0 +1,577 @@ +"""Integration tests for Flet GUI end-to-end workflows.""" + +from pathlib import Path +from unittest.mock import Mock + +import pandas as pd +import pytest + +from pii_detector.gui.flet_app.config.settings import ( + DetectionConfig, + DetectionResult, + FileInfo, +) +from pii_detector.gui.flet_app.ui.app import StateManager + + +class TestFileSelectionWorkflow: + """Test suite for file selection workflow.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + @pytest.fixture + def sample_csv_file(self, tmp_path): + """Create a sample CSV file for testing.""" + file_path = tmp_path / "test_data.csv" + df = pd.DataFrame( + { + "name": ["John Doe", "Jane Smith"], + "email": ["john@test.com", "jane@test.com"], + "age": [30, 25], + } + ) + df.to_csv(file_path, index=False) + return file_path + + @pytest.fixture + def sample_excel_file(self, tmp_path): + """Create a sample Excel file for testing.""" + file_path = tmp_path / "test_data.xlsx" + df = pd.DataFrame( + { + "participant_id": [1, 2, 3], + "full_name": ["Alice Brown", "Bob Wilson", "Carol Davis"], + "phone": ["555-1234", "555-5678", "555-9012"], + } + ) + df.to_excel(file_path, index=False) + return file_path + + def test_add_csv_file(self, state_manager, sample_csv_file): + """Test adding a CSV file to selection.""" + file_info = FileInfo( + path=sample_csv_file, + name=sample_csv_file.name, + size_mb=sample_csv_file.stat().st_size / (1024 * 1024), + format="csv", + is_valid=True, + ) + + state_manager.add_file(file_info) + + assert len(state_manager.state.selected_files) == 1 + assert state_manager.state.selected_files[0].format == "csv" + assert state_manager.state.selected_files[0].is_valid is True + + def test_add_excel_file(self, state_manager, sample_excel_file): + """Test adding an Excel file to selection.""" + file_info = FileInfo( + path=sample_excel_file, + name=sample_excel_file.name, + size_mb=sample_excel_file.stat().st_size / (1024 * 1024), + format="xlsx", + is_valid=True, + ) + + state_manager.add_file(file_info) + + assert len(state_manager.state.selected_files) == 1 + assert state_manager.state.selected_files[0].format == "xlsx" + + def test_add_multiple_files( + self, state_manager, sample_csv_file, sample_excel_file + ): + """Test adding multiple files.""" + csv_info = FileInfo( + path=sample_csv_file, + name=sample_csv_file.name, + size_mb=0.1, + format="csv", + ) + excel_info = FileInfo( + path=sample_excel_file, + name=sample_excel_file.name, + size_mb=0.1, + format="xlsx", + ) + + state_manager.add_file(csv_info) + state_manager.add_file(excel_info) + + assert len(state_manager.state.selected_files) == 2 + + def test_remove_file_from_selection(self, state_manager, sample_csv_file): + """Test removing a file from selection.""" + file_info = FileInfo( + path=sample_csv_file, + name=sample_csv_file.name, + size_mb=0.1, + format="csv", + ) + + state_manager.add_file(file_info) + assert len(state_manager.state.selected_files) == 1 + + state_manager.remove_file(sample_csv_file) + assert len(state_manager.state.selected_files) == 0 + + def test_file_validation_invalid_format(self, state_manager, tmp_path): + """Test file validation for invalid format.""" + invalid_file = tmp_path / "test.txt" + invalid_file.write_text("This is a text file") + + file_info = FileInfo( + path=invalid_file, + name=invalid_file.name, + size_mb=0.001, + format="txt", + is_valid=False, + validation_message="Unsupported file format", + ) + + state_manager.add_file(file_info) + + assert state_manager.state.selected_files[0].is_valid is False + assert "Unsupported" in state_manager.state.selected_files[0].validation_message + + +class TestDetectionWorkflow: + """Test suite for PII detection workflow.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + @pytest.fixture + def detection_config(self): + """Create a detection configuration.""" + return DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + fuzzy_match_threshold=0.8, + format_confidence_threshold=0.7, + ) + + def test_start_detection_with_config(self, state_manager, detection_config): + """Test starting detection with configuration.""" + state_manager.update_state(detection_config=detection_config) + state_manager.set_processing(True) + + assert state_manager.state.is_processing is True + assert state_manager.state.detection_config.column_name_enabled is True + + def test_detection_progress_updates(self, state_manager): + """Test detection progress updates.""" + state_manager.set_processing(True) + + # Update progress at different stages + state_manager.update_progress( + progress=0.25, + stage="Loading file", + current_file="test.csv", + ) + assert state_manager.state.current_progress == 0.25 + assert state_manager.state.processing_stage == "Loading file" + + state_manager.update_progress( + progress=0.5, + stage="Analyzing columns", + ) + assert state_manager.state.current_progress == 0.5 + + state_manager.update_progress( + progress=1.0, + stage="Complete", + ) + assert state_manager.state.current_progress == 1.0 + + def test_add_detection_results(self, state_manager): + """Test adding detection results.""" + result1 = DetectionResult( + column="name", + method="column_name", + confidence=0.9, + pii_type="PERSON", + entity_types=["PERSON"], + ) + result2 = DetectionResult( + column="email", + method="format_pattern", + confidence=0.95, + pii_type="EMAIL_ADDRESS", + entity_types=["EMAIL"], + ) + + state_manager.add_detection_result(result1) + state_manager.add_detection_result(result2) + + assert len(state_manager.state.detection_results) == 2 + assert state_manager.state.detection_results[0].column == "name" + assert state_manager.state.detection_results[1].column == "email" + + def test_detection_completion(self, state_manager): + """Test detection workflow completion.""" + # Start detection + state_manager.set_processing(True) + state_manager.update_progress(progress=0.0, stage="Starting") + + # Add results + result = DetectionResult( + column="phone", + method="format_pattern", + confidence=0.9, + pii_type="PHONE_NUMBER", + entity_types=["PHONE"], + ) + state_manager.add_detection_result(result) + + # Complete detection + state_manager.update_progress(progress=1.0, stage="Complete") + state_manager.set_processing(False) + + assert state_manager.state.is_processing is False + assert len(state_manager.state.detection_results) == 1 + + +class TestReviewAndActionWorkflow: + """Test suite for results review and action workflow.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager_with_results(self, mock_page): + """Create a StateManager with detection results.""" + manager = StateManager(mock_page) + + # Add detection results + results = [ + DetectionResult("name", "column_name", 0.9, "PERSON", ["PERSON"]), + DetectionResult("email", "format_pattern", 0.95, "EMAIL", ["EMAIL"]), + DetectionResult("phone", "format_pattern", 0.9, "PHONE", ["PHONE"]), + DetectionResult("comments", "sparsity", 0.85, "FREETEXT", []), + ] + + for result in results: + manager.add_detection_result(result) + + return manager + + def test_set_user_actions(self, state_manager_with_results): + """Test setting user actions for detected columns.""" + manager = state_manager_with_results + + # Set actions + manager.set_user_action("name", "remove") + manager.set_user_action("email", "encode") + manager.set_user_action("phone", "mask") + manager.set_user_action("comments", "keep") + + assert manager.state.user_actions["name"] == "remove" + assert manager.state.user_actions["email"] == "encode" + assert manager.state.user_actions["phone"] == "mask" + assert manager.state.user_actions["comments"] == "keep" + + def test_set_anonymization_methods(self, state_manager_with_results): + """Test setting anonymization methods for columns.""" + manager = state_manager_with_results + + # Set anonymization methods + manager.state.column_anonymization_methods["name"] = "remove" + manager.state.column_anonymization_methods["email"] = "hash" + manager.state.column_anonymization_methods["phone"] = "pattern_mask" + + assert manager.state.column_anonymization_methods["name"] == "remove" + assert manager.state.column_anonymization_methods["email"] == "hash" + assert manager.state.column_anonymization_methods["phone"] == "pattern_mask" + + def test_change_user_action(self, state_manager_with_results): + """Test changing user action for a column.""" + manager = state_manager_with_results + + # Initial action + manager.set_user_action("email", "remove") + assert manager.state.user_actions["email"] == "remove" + + # Change action + manager.set_user_action("email", "encode") + assert manager.state.user_actions["email"] == "encode" + + +class TestNavigationWorkflow: + """Test suite for screen navigation workflow.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + def test_complete_workflow_navigation(self, state_manager): + """Test navigation through complete workflow.""" + # Start at dashboard + assert state_manager.state.current_screen == "dashboard" + + # Navigate to file selection + state_manager.navigate_to("file_selection") + assert state_manager.state.current_screen == "file_selection" + assert "dashboard" in state_manager.state.screen_history + + # Navigate to configuration + state_manager.navigate_to("configuration") + assert state_manager.state.current_screen == "configuration" + assert "file_selection" in state_manager.state.screen_history + + # Navigate to progress + state_manager.navigate_to("progress") + assert state_manager.state.current_screen == "progress" + + # Navigate to results + state_manager.navigate_to("results") + assert state_manager.state.current_screen == "results" + + # Navigate to export + state_manager.navigate_to("export") + assert state_manager.state.current_screen == "export" + + def test_back_navigation(self, state_manager): + """Test back navigation through workflow.""" + # Navigate forward + state_manager.navigate_to("file_selection") + state_manager.navigate_to("configuration") + state_manager.navigate_to("progress") + + # Navigate back + state_manager.go_back() + assert state_manager.state.current_screen == "configuration" + + state_manager.go_back() + assert state_manager.state.current_screen == "file_selection" + + state_manager.go_back() + assert state_manager.state.current_screen == "dashboard" + + +class TestErrorHandling: + """Test suite for error handling and validation.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + def test_add_error_message(self, state_manager): + """Test adding error messages.""" + state_manager.add_error_message("File validation failed") + state_manager.add_error_message("Detection error") + + assert len(state_manager.state.error_messages) == 2 + assert "File validation failed" in state_manager.state.error_messages + + def test_add_success_message(self, state_manager): + """Test adding success messages.""" + state_manager.add_success_message("File loaded successfully") + state_manager.add_success_message("Detection complete") + + assert len(state_manager.state.success_messages) == 2 + assert "File loaded successfully" in state_manager.state.success_messages + + def test_clear_messages(self, state_manager): + """Test clearing messages.""" + state_manager.add_error_message("Error 1") + state_manager.add_success_message("Success 1") + + assert len(state_manager.state.error_messages) == 1 + assert len(state_manager.state.success_messages) == 1 + + state_manager.clear_messages() + + assert len(state_manager.state.error_messages) == 0 + assert len(state_manager.state.success_messages) == 0 + + +class TestBackendIntegration: + """Test suite for backend adapter integration.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance.""" + return StateManager(mock_page) + + @pytest.fixture + def sample_dataframe(self): + """Create a sample dataframe.""" + return pd.DataFrame( + { + "participant_name": ["John Doe", "Jane Smith", "Bob Wilson"], + "email_address": ["john@test.com", "jane@test.com", "bob@test.com"], + "phone_number": ["555-1234", "555-5678", "555-9012"], + "age_years": [30, 25, 35], + "survey_notes": ["Note 1", "Note 2", "Note 3"], + } + ) + + def test_config_to_backend_mapping(self, state_manager): + """Test mapping GUI config to backend processor config.""" + # Set GUI configuration + gui_config = DetectionConfig( + column_name_enabled=True, + format_pattern_enabled=True, + sparsity_enabled=True, + ai_text_enabled=True, + fuzzy_match_threshold=0.85, + format_confidence_threshold=0.8, + sparsity_threshold=0.75, + presidio_confidence_threshold=0.8, + ) + + state_manager.update_state(detection_config=gui_config) + + # Verify configuration values are accessible + config = state_manager.state.detection_config + + assert config.column_name_enabled is True + assert config.fuzzy_match_threshold == 0.85 + assert config.format_confidence_threshold == 0.8 + assert config.sparsity_threshold == 0.75 + assert config.presidio_confidence_threshold == 0.8 + + def test_detection_results_from_backend(self, state_manager): + """Test receiving detection results from backend.""" + # Simulate backend returning detection results + backend_results = [ + { + "column": "participant_name", + "method": "column_name_matching", + "confidence": 0.9, + "pii_type": "PERSON", + "entity_types": ["PERSON"], + }, + { + "column": "email_address", + "method": "format_patterns", + "confidence": 0.95, + "pii_type": "EMAIL_ADDRESS", + "entity_types": ["EMAIL"], + }, + ] + + # Convert to DetectionResult objects + for result in backend_results: + detection_result = DetectionResult( + column=result["column"], + method=result["method"], + confidence=result["confidence"], + pii_type=result["pii_type"], + entity_types=result["entity_types"], + ) + state_manager.add_detection_result(detection_result) + + assert len(state_manager.state.detection_results) == 2 + assert state_manager.state.detection_results[0].column == "participant_name" + assert state_manager.state.detection_results[1].column == "email_address" + + +class TestStateReset: + """Test suite for state reset and cleanup.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def populated_state_manager(self, mock_page): + """Create a StateManager with populated state.""" + manager = StateManager(mock_page) + + # Add files + file_info = FileInfo( + path=Path("test.csv"), + name="test.csv", + size_mb=1.0, + format="csv", + ) + manager.add_file(file_info) + + # Set configuration + config = DetectionConfig(fuzzy_match_threshold=0.95) + manager.update_state(detection_config=config) + + # Add results + result = DetectionResult("name", "column_name", 0.9, "PERSON", ["PERSON"]) + manager.add_detection_result(result) + + # Add messages + manager.add_error_message("Test error") + manager.add_success_message("Test success") + + # Set processing state + manager.set_processing(True) + + return manager + + def test_reset_state(self, populated_state_manager): + """Test resetting state to defaults.""" + manager = populated_state_manager + + # Verify state is populated + assert len(manager.state.selected_files) == 1 + assert len(manager.state.detection_results) == 1 + assert len(manager.state.error_messages) == 1 + assert manager.state.is_processing is True + + # Reset state + manager.reset_state() + + # Verify state is reset + assert len(manager.state.selected_files) == 0 + assert len(manager.state.detection_results) == 0 + assert len(manager.state.error_messages) == 0 + assert manager.state.is_processing is False + assert manager.state.current_screen == "dashboard" diff --git a/tests/test_flet_gui_state.py b/tests/test_flet_gui_state.py new file mode 100644 index 0000000..faf03a4 --- /dev/null +++ b/tests/test_flet_gui_state.py @@ -0,0 +1,498 @@ +"""Integration tests for Flet GUI state management.""" + +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from pii_detector.gui.flet_app.config.settings import ( + AppState, + DetectionConfig, + DetectionResult, + FileInfo, + ValidationResult, +) +from pii_detector.gui.flet_app.ui.app import StateManager + + +class TestAppState: + """Test suite for AppState dataclass.""" + + def test_app_state_initialization(self): + """Test that AppState initializes with correct defaults.""" + state = AppState() + + # Navigation state + assert state.current_screen == "dashboard" + assert state.screen_history == [] + + # File management + assert state.selected_files == [] + assert state.file_validation_results == {} + + # Configuration state + assert isinstance(state.detection_config, DetectionConfig) + assert state.preset_mode == "balanced" + + # Processing state + assert state.is_processing is False + assert state.current_progress == 0.0 + assert state.processing_stage == "" + assert state.estimated_time_remaining is None + assert state.current_file == "" + + # Results state + assert state.detection_results == [] + assert state.user_actions == {} + assert state.column_anonymization_methods == {} + + # UI state + assert state.panel_expansion_states == {} + assert state.error_messages == [] + assert state.success_messages == [] + + # API configuration + assert state.geonames_api_key is None + + def test_app_state_file_selection(self): + """Test file selection state management.""" + state = AppState() + + # Add a file + file_info = FileInfo( + path=Path("test.csv"), + name="test.csv", + size_mb=1.5, + format="csv", + is_valid=True, + validation_message="", + ) + state.selected_files.append(file_info) + + assert len(state.selected_files) == 1 + assert state.selected_files[0].name == "test.csv" + assert state.selected_files[0].format == "csv" + + def test_app_state_detection_results(self): + """Test detection results state management.""" + state = AppState() + + # Add detection results + result = DetectionResult( + column="email", + method="format_pattern", + confidence=0.95, + pii_type="EMAIL_ADDRESS", + entity_types=["EMAIL"], + details={ + "pattern_matched": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" + }, + ) + state.detection_results.append(result) + + assert len(state.detection_results) == 1 + assert state.detection_results[0].column == "email" + assert state.detection_results[0].confidence == 0.95 + + def test_app_state_user_actions(self): + """Test user action tracking.""" + state = AppState() + + # Set user actions for columns + state.user_actions["email"] = "remove" + state.user_actions["phone"] = "encode" + state.user_actions["age"] = "categorize" + + assert state.user_actions["email"] == "remove" + assert state.user_actions["phone"] == "encode" + assert state.user_actions["age"] == "categorize" + + +class TestDetectionConfig: + """Test suite for DetectionConfig dataclass.""" + + def test_detection_config_defaults(self): + """Test that DetectionConfig initializes with correct defaults.""" + config = DetectionConfig() + + # Method enable/disable states + assert config.column_name_enabled is True + assert config.format_pattern_enabled is True + assert config.sparsity_enabled is True + assert config.ai_text_enabled is True + assert config.location_population_enabled is False + + # Column Name Detection settings + assert config.fuzzy_match_threshold == 0.8 + assert config.matching_type == "fuzzy" + + # Format Pattern Detection settings + assert config.format_confidence_threshold == 0.7 + assert config.detect_phone is True + assert config.detect_email is True + assert config.detect_ssn is True + assert config.detect_dates is True + + # Sparsity analysis settings + assert config.sparsity_threshold == 0.8 + assert config.min_entries_required == 10 + + # Location population settings + assert config.population_threshold == 50000 + + # Presidio settings + assert config.presidio_confidence_threshold == 0.8 + assert config.presidio_language_model == "en_core_web_sm" + assert config.presidio_detect_person is True + assert config.presidio_detect_org is True + + def test_detection_config_custom_values(self): + """Test DetectionConfig with custom values.""" + config = DetectionConfig( + column_name_enabled=False, + fuzzy_match_threshold=0.9, + matching_type="strict", + format_confidence_threshold=0.85, + detect_phone=False, + sparsity_threshold=0.7, + population_threshold=100000, + presidio_confidence_threshold=0.75, + ) + + assert config.column_name_enabled is False + assert config.fuzzy_match_threshold == 0.9 + assert config.matching_type == "strict" + assert config.format_confidence_threshold == 0.85 + assert config.detect_phone is False + assert config.sparsity_threshold == 0.7 + assert config.population_threshold == 100000 + assert config.presidio_confidence_threshold == 0.75 + + def test_detection_config_validation_ranges(self): + """Test that config values are within expected ranges.""" + config = DetectionConfig() + + # Threshold values should be between 0 and 1 + assert 0.0 <= config.fuzzy_match_threshold <= 1.0 + assert 0.0 <= config.format_confidence_threshold <= 1.0 + assert 0.0 <= config.sparsity_threshold <= 1.0 + assert 0.0 <= config.presidio_confidence_threshold <= 1.0 + + # Population threshold should be positive + assert config.population_threshold > 0 + + # Min entries should be positive + assert config.min_entries_required > 0 + + +class TestStateManager: + """Test suite for StateManager class.""" + + @pytest.fixture + def mock_page(self): + """Create a mock Flet page.""" + page = Mock() + page.update = Mock() + return page + + @pytest.fixture + def state_manager(self, mock_page): + """Create a StateManager instance for testing.""" + return StateManager(mock_page) + + def test_state_manager_initialization(self, state_manager): + """Test StateManager initialization.""" + assert isinstance(state_manager.state, AppState) + assert state_manager.state.current_screen == "dashboard" + + def test_state_manager_update_state(self, state_manager): + """Test state update functionality.""" + # Update detection config + new_config = DetectionConfig( + fuzzy_match_threshold=0.95, + format_confidence_threshold=0.8, + ) + state_manager.update_state(detection_config=new_config) + + assert state_manager.state.detection_config.fuzzy_match_threshold == 0.95 + assert state_manager.state.detection_config.format_confidence_threshold == 0.8 + + def test_state_manager_navigation(self, state_manager): + """Test navigation state management.""" + # Navigate to file selection + state_manager.navigate_to("file_selection") + assert state_manager.state.current_screen == "file_selection" + assert "dashboard" in state_manager.state.screen_history + + # Navigate to configuration + state_manager.navigate_to("configuration") + assert state_manager.state.current_screen == "configuration" + assert "file_selection" in state_manager.state.screen_history + + def test_state_manager_go_back(self, state_manager): + """Test navigation back functionality.""" + # Navigate through screens + state_manager.navigate_to("file_selection") + state_manager.navigate_to("configuration") + state_manager.navigate_to("progress") + + # Go back + state_manager.go_back() + assert state_manager.state.current_screen == "configuration" + + state_manager.go_back() + assert state_manager.state.current_screen == "file_selection" + + def test_state_manager_add_file(self, state_manager): + """Test adding files to state.""" + file_info = FileInfo( + path=Path("test.csv"), + name="test.csv", + size_mb=2.0, + format="csv", + is_valid=True, + ) + + state_manager.add_file(file_info) + assert len(state_manager.state.selected_files) == 1 + assert state_manager.state.selected_files[0].name == "test.csv" + + def test_state_manager_remove_file(self, state_manager): + """Test removing files from state.""" + # Add files + file1 = FileInfo( + path=Path("test1.csv"), + name="test1.csv", + size_mb=1.0, + format="csv", + ) + file2 = FileInfo( + path=Path("test2.csv"), + name="test2.csv", + size_mb=1.5, + format="csv", + ) + + state_manager.add_file(file1) + state_manager.add_file(file2) + assert len(state_manager.state.selected_files) == 2 + + # Remove first file + state_manager.remove_file(Path("test1.csv")) + assert len(state_manager.state.selected_files) == 1 + assert state_manager.state.selected_files[0].name == "test2.csv" + + def test_state_manager_clear_files(self, state_manager): + """Test clearing all files from state.""" + # Add files + file1 = FileInfo( + path=Path("test1.csv"), name="test1.csv", size_mb=1.0, format="csv" + ) + file2 = FileInfo( + path=Path("test2.csv"), name="test2.csv", size_mb=1.5, format="csv" + ) + + state_manager.add_file(file1) + state_manager.add_file(file2) + + # Clear files + state_manager.clear_files() + assert len(state_manager.state.selected_files) == 0 + + def test_state_manager_add_detection_result(self, state_manager): + """Test adding detection results to state.""" + result = DetectionResult( + column="name", + method="column_name", + confidence=0.9, + pii_type="PERSON", + entity_types=["PERSON"], + ) + + state_manager.add_detection_result(result) + assert len(state_manager.state.detection_results) == 1 + assert state_manager.state.detection_results[0].column == "name" + + def test_state_manager_set_user_action(self, state_manager): + """Test setting user actions for columns.""" + state_manager.set_user_action("email", "remove") + state_manager.set_user_action("phone", "encode") + + assert state_manager.state.user_actions["email"] == "remove" + assert state_manager.state.user_actions["phone"] == "encode" + + def test_state_manager_add_error_message(self, state_manager): + """Test adding error messages.""" + state_manager.add_error_message("Test error message") + + assert len(state_manager.state.error_messages) == 1 + assert state_manager.state.error_messages[0] == "Test error message" + + def test_state_manager_add_success_message(self, state_manager): + """Test adding success messages.""" + state_manager.add_success_message("Test success message") + + assert len(state_manager.state.success_messages) == 1 + assert state_manager.state.success_messages[0] == "Test success message" + + def test_state_manager_clear_messages(self, state_manager): + """Test clearing all messages.""" + state_manager.add_error_message("Error 1") + state_manager.add_error_message("Error 2") + state_manager.add_success_message("Success 1") + + state_manager.clear_messages() + + assert len(state_manager.state.error_messages) == 0 + assert len(state_manager.state.success_messages) == 0 + + def test_state_manager_update_progress(self, state_manager): + """Test updating processing progress.""" + state_manager.update_progress( + progress=0.5, + stage="Analyzing columns", + current_file="test.csv", + estimated_time_remaining=120, + ) + + assert state_manager.state.current_progress == 0.5 + assert state_manager.state.processing_stage == "Analyzing columns" + assert state_manager.state.current_file == "test.csv" + assert state_manager.state.estimated_time_remaining == 120 + + def test_state_manager_set_processing(self, state_manager): + """Test setting processing state.""" + state_manager.set_processing(True) + assert state_manager.state.is_processing is True + + state_manager.set_processing(False) + assert state_manager.state.is_processing is False + + def test_state_manager_set_api_key(self, state_manager): + """Test setting API key.""" + state_manager.set_api_key("test_api_key_123") + assert state_manager.state.geonames_api_key == "test_api_key_123" + + def test_state_manager_reset_state(self, state_manager): + """Test resetting state to defaults.""" + # Modify state + state_manager.navigate_to("configuration") + state_manager.add_error_message("Error") + state_manager.set_processing(True) + + # Reset + state_manager.reset_state() + + # Verify reset + assert state_manager.state.current_screen == "dashboard" + assert len(state_manager.state.error_messages) == 0 + assert state_manager.state.is_processing is False + + +class TestFileInfo: + """Test suite for FileInfo dataclass.""" + + def test_file_info_creation(self): + """Test FileInfo creation.""" + file_info = FileInfo( + path=Path("test.csv"), + name="test.csv", + size_mb=2.5, + format="csv", + is_valid=True, + validation_message="File is valid", + ) + + assert file_info.path == Path("test.csv") + assert file_info.name == "test.csv" + assert file_info.size_mb == 2.5 + assert file_info.format == "csv" + assert file_info.is_valid is True + assert file_info.validation_message == "File is valid" + + def test_file_info_invalid_file(self): + """Test FileInfo for invalid file.""" + file_info = FileInfo( + path=Path("invalid.txt"), + name="invalid.txt", + size_mb=0.1, + format="txt", + is_valid=False, + validation_message="Unsupported file format", + ) + + assert file_info.is_valid is False + assert file_info.validation_message == "Unsupported file format" + + +class TestValidationResult: + """Test suite for ValidationResult dataclass.""" + + def test_validation_result_valid(self): + """Test ValidationResult for valid file.""" + result = ValidationResult( + is_valid=True, + message="File is valid", + details={"rows": 100, "columns": 5}, + ) + + assert result.is_valid is True + assert result.message == "File is valid" + assert result.details["rows"] == 100 + assert result.details["columns"] == 5 + + def test_validation_result_invalid(self): + """Test ValidationResult for invalid file.""" + result = ValidationResult( + is_valid=False, + message="File too large", + details={"size_mb": 500, "max_size_mb": 100}, + ) + + assert result.is_valid is False + assert result.message == "File too large" + assert result.details["size_mb"] == 500 + + +class TestDetectionResult: + """Test suite for DetectionResult dataclass.""" + + def test_detection_result_creation(self): + """Test DetectionResult creation.""" + result = DetectionResult( + column="email", + method="format_pattern", + confidence=0.95, + pii_type="EMAIL_ADDRESS", + entity_types=["EMAIL"], + details={"pattern": r".*@.*\..*"}, + ) + + assert result.column == "email" + assert result.method == "format_pattern" + assert result.confidence == 0.95 + assert result.pii_type == "EMAIL_ADDRESS" + assert result.entity_types == ["EMAIL"] + assert "pattern" in result.details + + def test_detection_result_multiple_entity_types(self): + """Test DetectionResult with multiple entity types.""" + result = DetectionResult( + column="contact_info", + method="presidio", + confidence=0.85, + pii_type="MIXED", + entity_types=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"], + details={ + "entities_found": { + "PERSON": 2, + "EMAIL_ADDRESS": 1, + "PHONE_NUMBER": 1, + } + }, + ) + + assert len(result.entity_types) == 3 + assert "PERSON" in result.entity_types + assert "EMAIL_ADDRESS" in result.entity_types + assert "PHONE_NUMBER" in result.entity_types diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..ab90c73 --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,262 @@ +"""Integration tests for PII detection using real test datasets.""" + +from pathlib import Path + +import pandas as pd +import pytest + +from pii_detector.core.processor import ( + find_piis_based_on_column_format, + find_piis_based_on_column_name, + find_piis_based_on_locations_population, + find_piis_based_on_sparse_entries, + import_dataset, +) +from pii_detector.data import constants + + +class TestDatasetIntegration: + """Integration tests using real test datasets.""" + + @pytest.fixture + def test_data_dir(self): + """Get path to test data directory.""" + return Path(__file__).parent / "data" + + @pytest.fixture + def pii_dataset_path(self, test_data_dir): + """Get path to PII-containing test dataset.""" + return test_data_dir / "sample_pii_data.csv" + + @pytest.fixture + def clean_dataset_path(self, test_data_dir): + """Get path to clean test dataset.""" + return test_data_dir / "clean_data.csv" + + def test_import_pii_dataset(self, pii_dataset_path): + """Test importing PII dataset.""" + success, result = import_dataset(str(pii_dataset_path)) + + assert success is True + assert isinstance(result, list) + assert len(result) == 4 # [dataset, path, label_dict, value_label_dict] + + dataset, path, label_dict, value_label_dict = result + assert isinstance(dataset, pd.DataFrame) + assert len(dataset) == 4 # 4 rows + assert len(dataset.columns) == 13 # 13 columns + + # Check expected columns are present + expected_columns = [ + "participant_id", + "first_name", + "email", + "phone_number", + "deviceid", + ] + for col in expected_columns: + assert col in dataset.columns + + def test_import_clean_dataset(self, clean_dataset_path): + """Test importing clean dataset.""" + success, result = import_dataset(str(clean_dataset_path)) + + assert success is True + dataset, _, _, _ = result + assert len(dataset) == 8 # 8 rows + assert len(dataset.columns) == 8 # 8 columns + + def test_column_name_detection_on_pii_data(self, pii_dataset_path): + """Test column name detection on PII-containing dataset.""" + success, result = import_dataset(str(pii_dataset_path)) + assert success is True + + dataset, _, label_dict, _ = result + + # Test column name detection + pii_columns = find_piis_based_on_column_name( + dataset, label_dict or {}, constants.ENGLISH, "USA", constants.STRICT + ) + + # Should detect obvious PII columns + expected_pii_columns = [ + "email", + "deviceid", + "gps_lat", + "gps_lon", + ] # GPS coordinates and deviceid are in restricted words + + # Check that at least some expected columns are detected + detected_count = sum(1 for col in expected_pii_columns if col in pii_columns) + assert detected_count > 0, ( + f"Expected to detect some of {expected_pii_columns}, got {pii_columns}" + ) + + def test_column_name_detection_on_clean_data(self, clean_dataset_path): + """Test column name detection on clean dataset.""" + success, result = import_dataset(str(clean_dataset_path)) + assert success is True + + dataset, _, label_dict, _ = result + + pii_columns = find_piis_based_on_column_name( + dataset, label_dict or {}, constants.ENGLISH, "USA", constants.STRICT + ) + + # Clean dataset should have fewer or no PII detections based on column names + assert len(pii_columns) <= 1, ( + f"Clean dataset shouldn't have many PII columns, got {pii_columns}" + ) + + def test_format_detection_on_pii_data(self, pii_dataset_path): + """Test format pattern detection on PII-containing dataset.""" + success, result = import_dataset(str(pii_dataset_path)) + assert success is True + + dataset, _, _, _ = result + + format_piis = find_piis_based_on_column_format(dataset) + + # Should detect email and phone number formats + # Note: The exact detection depends on the patterns and thresholds + assert isinstance(format_piis, list) + # Could detect email, phone_number columns based on format + potential_format_columns = ["email", "phone_number", "date_of_birth"] + + # At least one format-based detection should occur + if len(format_piis) > 0: + assert any(col in potential_format_columns for col in format_piis), ( + f"Expected format detection in {potential_format_columns}, got {format_piis}" + ) + + def test_sparsity_detection_on_pii_data(self, pii_dataset_path): + """Test sparsity detection on PII-containing dataset.""" + success, result = import_dataset(str(pii_dataset_path)) + assert success is True + + dataset, _, _, _ = result + + sparse_piis = find_piis_based_on_sparse_entries(dataset, sparse_threshold=0.7) + + # With small test dataset, most columns will be sparse (all unique values) + # Expected sparse columns: first_name, last_name, email, phone_number, address, etc. + assert isinstance(sparse_piis, list) + assert len(sparse_piis) > 0, "Should detect some sparse columns in PII dataset" + + def test_sparsity_detection_on_clean_data(self, clean_dataset_path): + """Test sparsity detection on clean dataset.""" + success, result = import_dataset(str(clean_dataset_path)) + assert success is True + + dataset, _, _, _ = result + + sparse_piis = find_piis_based_on_sparse_entries(dataset, sparse_threshold=0.8) + + # Clean dataset has some repeated values, should have fewer sparse columns + assert isinstance(sparse_piis, list) + # Most columns in clean dataset have unique values per row, so might still be sparse + + @pytest.mark.integration + @pytest.mark.slow + def test_location_detection_on_pii_data(self, pii_dataset_path): + """Test location population detection (marked as slow due to API calls).""" + success, result = import_dataset(str(pii_dataset_path)) + assert success is True + + dataset, _, _, _ = result + + # This test makes actual API calls, so it's marked as slow + # In practice, you'd mock these calls for faster testing + location_piis = find_piis_based_on_locations_population( + dataset, population_threshold=50000 + ) + + # Should return a list (might be empty if API calls fail or locations are large) + assert isinstance(location_piis, list) + + def test_full_pii_detection_workflow(self, pii_dataset_path): + """Test complete PII detection workflow.""" + success, result = import_dataset(str(pii_dataset_path)) + assert success is True + + dataset, _, label_dict, _ = result + + # Run all detection methods + all_pii_candidates = [] + + # Column name detection + column_name_piis = find_piis_based_on_column_name( + dataset, label_dict or {}, constants.ENGLISH, "USA", constants.STRICT + ) + all_pii_candidates.extend([(col, "Column Name") for col in column_name_piis]) + + # Format detection + format_piis = find_piis_based_on_column_format(dataset) + all_pii_candidates.extend([(col, "Format") for col in format_piis]) + + # Sparsity detection + sparse_piis = find_piis_based_on_sparse_entries(dataset) + all_pii_candidates.extend([(col, "Sparse") for col in sparse_piis]) + + # Combine results + unique_piis = {} + for col, method in all_pii_candidates: + if col not in unique_piis: + unique_piis[col] = [method] + else: + unique_piis[col].append(method) + + # Should detect multiple PII columns using various methods + assert len(unique_piis) > 0, "Should detect some PII in the test dataset" + assert len(unique_piis) < len(dataset.columns), ( + "Shouldn't flag ALL columns as PII" + ) + + # Verify that known PII columns are detected by at least one method + known_pii_indicators = ["email", "first_name", "last_name", "phone_number"] + detected_pii_indicators = sum( + 1 for indicator in known_pii_indicators if indicator in unique_piis + ) + + assert detected_pii_indicators > 0, ( + f"Should detect some known PII indicators from {known_pii_indicators}" + ) + + def test_clean_dataset_workflow(self, clean_dataset_path): + """Test PII detection on clean dataset (should detect fewer PIIs).""" + success, result = import_dataset(str(clean_dataset_path)) + assert success is True + + dataset, _, label_dict, _ = result + + # Run all detection methods with appropriate thresholds for clean data + column_name_piis = find_piis_based_on_column_name( + dataset, label_dict or {}, constants.ENGLISH, "USA", constants.STRICT + ) + format_piis = find_piis_based_on_column_format(dataset) + # Use higher threshold for sparsity to reduce false positives on clean data + sparse_piis = find_piis_based_on_sparse_entries(dataset, sparse_threshold=0.95) + + # Clean dataset should have no column name matches (no restricted words) + assert len(column_name_piis) == 0, ( + f"Clean dataset shouldn't match restricted words, got {column_name_piis}" + ) + + # Should have minimal format detections (dates might still be detected) + assert len(format_piis) <= 2, ( + f"Clean dataset should have few format detections, got {format_piis}" + ) + + # With more data and higher threshold, should have fewer sparse detections + print(f"Sparse detections: {sparse_piis}") + print(f"Dataset shape: {dataset.shape}") + + # The key insight: clean datasets have more repeated values, less sparsity + total_detections = len(set(column_name_piis + format_piis + sparse_piis)) + assert total_detections < len(dataset.columns), ( + f"Should not flag ALL columns as PII in clean dataset, got {total_detections}/{len(dataset.columns)}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_presidio_integration.py b/tests/test_presidio_integration.py new file mode 100644 index 0000000..8c03fa9 --- /dev/null +++ b/tests/test_presidio_integration.py @@ -0,0 +1,456 @@ +"""Tests for Presidio integration functionality.""" + +from unittest.mock import Mock, patch + +import pandas as pd +import pytest + +from pii_detector.core.hybrid_anonymizer import ( + HybridAnonymizer, + anonymize_dataset_hybrid, +) +from pii_detector.core.presidio_engine import ( + PresidioTextAnalyzer, + get_presidio_analyzer, + presidio_analyze_dataframe_batch, + presidio_analyze_text_column, + presidio_anonymize_dataframe_batch, + presidio_anonymize_text_column, +) +from pii_detector.core.unified_processor import ( + PIIDetectionResult, + UnifiedPIIProcessor, + detect_pii_unified, +) + + +class TestPresidioTextAnalyzer: + """Test the Presidio text analyzer wrapper.""" + + def test_analyzer_initialization(self): + """Test analyzer initialization with graceful degradation.""" + analyzer = PresidioTextAnalyzer() + # Should not raise error even if Presidio not available + assert analyzer is not None + assert isinstance(analyzer.available, bool) + + def test_analyze_text_without_presidio(self): + """Test text analysis when Presidio is not available.""" + with patch("pii_detector.core.presidio_engine.PRESIDIO_AVAILABLE", False): + analyzer = PresidioTextAnalyzer() + result = analyzer.analyze_text("John Doe's email is john@example.com") + assert result == [] + + @pytest.mark.skipif( + True, reason="Requires Presidio installation - integration test only" + ) + def test_analyze_text_with_presidio(self): + """Test text analysis when Presidio is available (integration test).""" + analyzer = PresidioTextAnalyzer() + if analyzer.is_available(): + result = analyzer.analyze_text("John Doe's email is john@example.com") + assert isinstance(result, list) + # Should detect PERSON and EMAIL_ADDRESS entities + + def test_analyze_column_text_empty_data(self): + """Test column analysis with empty data.""" + analyzer = PresidioTextAnalyzer() + empty_series = pd.Series([None, "", " "]) + result = analyzer.analyze_column_text(empty_series) + + expected_keys = [ + "presidio_available", + "entities_found", + "total_detections", + "confidence_scores", + "sample_analyzed", + ] + for key in expected_keys: + assert key in result + assert result["total_detections"] == 0 + assert result["sample_analyzed"] == 0 + + def test_anonymize_text_without_presidio(self): + """Test text anonymization fallback when Presidio not available.""" + with patch("pii_detector.core.presidio_engine.PRESIDIO_AVAILABLE", False): + analyzer = PresidioTextAnalyzer() + text = "Contact John at john@example.com" + result = analyzer.anonymize_text(text) + # Should return original text when Presidio not available + assert result == text + + def test_get_supported_entities_without_presidio(self): + """Test getting supported entities when Presidio not available.""" + with patch("pii_detector.core.presidio_engine.PRESIDIO_AVAILABLE", False): + analyzer = PresidioTextAnalyzer() + entities = analyzer.get_supported_entities() + assert entities == [] + + def test_singleton_analyzer(self): + """Test that get_presidio_analyzer returns singleton instance.""" + analyzer1 = get_presidio_analyzer() + analyzer2 = get_presidio_analyzer() + assert analyzer1 is analyzer2 + + def test_presidio_analyze_text_column_convenience(self): + """Test convenience function for column analysis.""" + test_data = pd.Series(["John Doe", "jane@example.com", "555-123-4567"]) + result = presidio_analyze_text_column(test_data) + + # Should return analysis dictionary + assert isinstance(result, dict) + assert "presidio_available" in result + + def test_presidio_anonymize_text_column_convenience(self): + """Test convenience function for column anonymization.""" + test_data = pd.Series(["John Doe", "jane@example.com", "normal text"]) + result = presidio_anonymize_text_column(test_data) + + # Should return pandas Series + assert isinstance(result, pd.Series) + assert len(result) == len(test_data) + + def test_analyze_column_text_with_batch_size(self): + """Test column text analysis with batch size parameter.""" + analyzer = PresidioTextAnalyzer() + test_data = pd.Series(["John Doe"] * 20) # Larger dataset + + # Test with batch processing + result = analyzer.analyze_column_text( + test_data, confidence_threshold=0.7, sample_size=20, batch_size=5 + ) + + # Should return analysis results + assert isinstance(result, dict) + assert "presidio_available" in result + + def test_presidio_analyze_dataframe_batch_function(self): + """Test DataFrame batch analysis function.""" + df = pd.DataFrame( + { + "name": ["John Doe", "Jane Smith"], + "email": ["john@test.com", "jane@test.com"], + "age": [25, 30], + } + ) + + # Mock to avoid external dependencies in unit tests + with patch( + "pii_detector.core.presidio_engine.get_presidio_analyzer" + ) as mock_get: + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = False + mock_get.return_value = mock_analyzer + + result = presidio_analyze_dataframe_batch(df) + assert result == {} + + def test_presidio_anonymize_dataframe_batch_function(self): + """Test DataFrame batch anonymization function.""" + df = pd.DataFrame( + { + "name": ["John Doe", "Jane Smith"], + "email": ["john@test.com", "jane@test.com"], + "notes": ["Contact info", "Personal data"], + } + ) + + # Mock to avoid external dependencies + with patch( + "pii_detector.core.presidio_engine.get_presidio_analyzer" + ) as mock_get: + mock_analyzer = Mock() + mock_analyzer.is_available.return_value = False + mock_get.return_value = mock_analyzer + + result = presidio_anonymize_dataframe_batch(df, ["name", "email"]) + + # Should return original DataFrame when Presidio not available + assert result.equals(df) + + +class TestUnifiedPIIProcessor: + """Test the unified PII processor.""" + + def test_processor_initialization(self): + """Test processor initialization.""" + processor = UnifiedPIIProcessor() + assert processor is not None + assert processor.language == "en" + assert processor.presidio_analyzer is not None + + def test_pii_detection_result(self): + """Test PIIDetectionResult class.""" + result = PIIDetectionResult( + column_name="email_col", + detection_method="presidio_text_analysis", + confidence=0.85, + entity_types=["EMAIL_ADDRESS"], + details={"sample_size": 10}, + ) + + assert result.column_name == "email_col" + assert result.confidence == 0.85 + assert "EMAIL_ADDRESS" in result.entity_types + assert result.details["sample_size"] == 10 + + def test_detect_pii_comprehensive_basic(self): + """Test comprehensive PII detection with basic dataset.""" + # Create test dataset + data = { + "name": ["John Doe", "Jane Smith", "Bob Johnson"], + "email": ["john@test.com", "jane@test.com", "bob@test.com"], + "age": [25, 30, 35], + "notes": ["Some notes", "More notes", "Extra info"], + } + df = pd.DataFrame(data) + + processor = UnifiedPIIProcessor() + results = processor.detect_pii_comprehensive(df) + + # Should return detection results + assert isinstance(results, dict) + + # Check that high-confidence detections include likely PII columns + processor.get_high_confidence_detections(results, threshold=0.7) + # At minimum, email format should be detected + + # Test summary generation + summary = processor.get_detection_summary(results) + assert "total_detections" in summary + assert isinstance(summary["total_detections"], int) + + def test_default_config(self): + """Test default configuration settings.""" + processor = UnifiedPIIProcessor() + config = processor._get_default_config() + + expected_keys = [ + "use_column_name_detection", + "use_format_detection", + "use_sparsity_detection", + "use_presidio_detection", + ] + for key in expected_keys: + assert key in config + assert isinstance(config[key], bool) + + def test_detect_pii_unified_convenience(self): + """Test convenience function for unified detection.""" + data = {"email": ["test@example.com", "user@test.org"]} + df = pd.DataFrame(data) + + results = detect_pii_unified(df) + assert isinstance(results, dict) + + def test_combine_detection_results(self): + """Test combining structural and text detection results.""" + processor = UnifiedPIIProcessor() + + structural_result = PIIDetectionResult( + column_name="test_col", + detection_method="column_name_matching", + confidence=0.8, + ) + + text_result = PIIDetectionResult( + column_name="test_col", + detection_method="presidio_text_analysis", + confidence=0.9, + entity_types=["EMAIL_ADDRESS"], + ) + + config = processor._get_default_config() + combined = processor._combine_detection_results( + "test_col", structural_result, text_result, config + ) + + assert combined is not None + assert combined.detection_method == "hybrid_detection" + assert "EMAIL_ADDRESS" in combined.entity_types + # Confidence should be weighted average + expected_conf = 0.8 * 0.6 + 0.9 * 0.4 # default weights + assert abs(combined.confidence - expected_conf) < 0.01 + + +class TestHybridAnonymizer: + """Test the hybrid anonymizer.""" + + def test_anonymizer_initialization(self): + """Test anonymizer initialization.""" + anonymizer = HybridAnonymizer() + assert anonymizer is not None + assert anonymizer.current_techniques is not None + assert anonymizer.presidio_analyzer is not None + + def test_anonymize_dataset_basic(self): + """Test basic dataset anonymization.""" + # Create test dataset + data = { + "name": ["John Doe", "Jane Smith"], + "email": ["john@test.com", "jane@test.com"], + "age": [25, 30], + } + df = pd.DataFrame(data) + pii_columns = ["name", "email"] + + anonymizer = HybridAnonymizer() + anonymized_df, report = anonymizer.anonymize_dataset(df, pii_columns) + + # Should return anonymized dataset and report + assert isinstance(anonymized_df, pd.DataFrame) + assert isinstance(report, dict) + assert anonymized_df.shape == df.shape + + # Check report structure + assert "original_shape" in report + assert "columns_processed" in report + assert "methods_applied" in report + + def test_determine_anonymization_method(self): + """Test method determination logic.""" + anonymizer = HybridAnonymizer() + + # Test with format patterns detection + detection_result = PIIDetectionResult( + column_name="phone", detection_method="format_patterns", confidence=0.9 + ) + + test_series = pd.Series(["555-123-4567", "555-987-6543"]) + method = anonymizer._determine_anonymization_method( + test_series, detection_result, {} + ) + + assert method == "text_masking" + + def test_get_available_methods(self): + """Test getting available anonymization methods.""" + anonymizer = HybridAnonymizer() + methods = anonymizer.get_available_methods() + + # Should include standard methods + expected_methods = [ + "remove", + "hash_pseudonymization", + "age_categorization", + "text_masking", + "add_noise", + ] + + for method in expected_methods: + assert method in methods + assert "description" in methods[method] + assert "suitable_for" in methods[method] + + def test_anonymize_text_content(self): + """Test text content anonymization.""" + anonymizer = HybridAnonymizer() + text = "Contact John Doe at john@example.com or 555-123-4567" + + # Should handle gracefully whether Presidio is available or not + result = anonymizer.anonymize_text_content(text) + assert isinstance(result, str) + assert len(result) > 0 + + def test_anonymize_dataset_hybrid_convenience(self): + """Test convenience function for hybrid anonymization.""" + data = {"email": ["test@example.com", "user@test.org"]} + df = pd.DataFrame(data) + + anonymized_df, report = anonymize_dataset_hybrid(df, ["email"]) + assert isinstance(anonymized_df, pd.DataFrame) + assert isinstance(report, dict) + + +class TestPresidioIntegrationEnd2End: + """End-to-end integration tests.""" + + def test_full_pipeline_without_presidio(self): + """Test full pipeline when Presidio is not available.""" + # Create test dataset + data = { + "participant_name": ["John Doe", "Jane Smith", "Bob Wilson"], + "email_address": ["john@test.com", "jane@test.com", "bob@test.com"], + "phone_number": ["555-0123", "555-0456", "555-0789"], + "age": [25, 30, 35], + "comments": ["Good participant", "Very helpful", "Cooperative"], + } + df = pd.DataFrame(data) + + # Step 1: Detection + processor = UnifiedPIIProcessor() + detection_results = processor.detect_pii_comprehensive(df) + + # Step 2: Anonymization + anonymizer = HybridAnonymizer() + anonymized_df, report = anonymizer.anonymize_dataset(df, detection_results) + + # Verify pipeline completed + assert isinstance(detection_results, dict) + assert isinstance(anonymized_df, pd.DataFrame) + assert isinstance(report, dict) + assert anonymized_df.shape[0] == df.shape[0] # Same number of rows + + def test_configuration_options(self): + """Test various configuration options.""" + data = {"test_col": ["test data"]} + df = pd.DataFrame(data) + + # Custom detection config + detection_config = { + "use_presidio_detection": False, # Disable Presidio + "use_column_name_detection": True, + "column_name_confidence": 0.9, + } + + processor = UnifiedPIIProcessor() + results = processor.detect_pii_comprehensive( + df, detection_config=detection_config + ) + + # Custom anonymization config + anonymization_config = {"test_col": {"method": "hash_pseudonymization"}} + + anonymizer = HybridAnonymizer() + anonymized_df, report = anonymizer.anonymize_dataset( + df, ["test_col"], anonymization_config + ) + + # Should complete without errors + assert isinstance(results, dict) + assert isinstance(anonymized_df, pd.DataFrame) + + @pytest.mark.slow + def test_large_dataset_performance(self): + """Test performance with larger dataset.""" + import numpy as np + + # Create larger test dataset + size = 1000 + data = { + "id": range(size), + "name": [f"Person_{i}" for i in range(size)], + "email": [f"person{i}@test.com" for i in range(size)], + "notes": [f"Note about person {i}" for i in range(size)], + "value": np.random.randint(1, 100, size), + } + df = pd.DataFrame(data) + + # Run detection + processor = UnifiedPIIProcessor() + detection_results = processor.detect_pii_comprehensive(df) + + # Run anonymization + anonymizer = HybridAnonymizer() + anonymized_df, report = anonymizer.anonymize_dataset(df, detection_results) + + # Verify results + assert len(anonymized_df) == size + assert "columns_processed" in report + + # Performance should be reasonable (this is a smoke test) + assert len(detection_results) >= 0 # At least runs without error + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 0000000..186fd3b --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,129 @@ +"""Tests for the core processor module.""" + +import pandas as pd +import pytest + +from pii_detector.core.processor import ( + clean_column, + column_is_sparse, + remove_other_refuse_and_dont_know, + word_match, +) +from pii_detector.data.constants import FUZZY, STRICT + + +class TestWordMatch: + """Test word matching functionality.""" + + def test_strict_match_exact(self): + """Test strict matching with exact match.""" + assert word_match("name", "name", STRICT) is True + + def test_strict_match_case_insensitive(self): + """Test strict matching is case insensitive.""" + assert word_match("NAME", "name", STRICT) is True + assert word_match("name", "NAME", STRICT) is True + + def test_strict_match_no_match(self): + """Test strict matching with no match.""" + assert word_match("first_name", "name", STRICT) is False + + def test_fuzzy_match_contained(self): + """Test fuzzy matching with contained word.""" + assert word_match("first_name", "name", FUZZY) is True + assert word_match("lastname", "name", FUZZY) is True + + def test_fuzzy_match_case_insensitive(self): + """Test fuzzy matching is case insensitive.""" + assert word_match("FIRST_NAME", "name", FUZZY) is True + + def test_fuzzy_match_no_match(self): + """Test fuzzy matching with no match.""" + assert word_match("age", "name", FUZZY) is False + + +class TestColumnCleaning: + """Test column cleaning functionality.""" + + def test_remove_other_refuse_and_dont_know(self): + """Test removal of survey response codes.""" + # Create test series with survey codes + test_data = pd.Series(["answer1", "999", "-999", "answer2", "777"]) + result = remove_other_refuse_and_dont_know(test_data) + + # Should remove 999, -999, 777 (3-digit repeated numbers) + expected_values = ["answer1", "answer2"] + assert list(result) == expected_values + + def test_clean_column_basic(self): + """Test basic column cleaning.""" + # Create test series with NaN, empty string, and survey codes + test_data = pd.Series(["valid1", None, "", "999", "valid2", "-777"]) + result = clean_column(test_data) + + # Should keep only valid entries + expected_values = ["valid1", "valid2"] + assert list(result) == expected_values + + def test_column_is_sparse_high_sparsity(self): + """Test sparse column detection with high sparsity.""" + # Create dataset with mostly unique values + test_df = pd.DataFrame( + {"sparse_col": ["value1", "value2", "value3", "value4", "value5"]} + ) + + # With threshold 0.8, should be considered sparse (5/5 = 1.0 > 0.8) + assert column_is_sparse(test_df, "sparse_col", 0.8) is True + + def test_column_is_sparse_low_sparsity(self): + """Test sparse column detection with low sparsity.""" + # Create dataset with repeated values + test_df = pd.DataFrame( + {"dense_col": ["value1", "value1", "value1", "value2", "value2"]} + ) + + # With threshold 0.8, should not be considered sparse (2/5 = 0.4 < 0.8) + assert column_is_sparse(test_df, "dense_col", 0.8) is False + + +class TestImportDataset: + """Test dataset import functionality.""" + + def test_unsupported_file_format(self): + """Test handling of unsupported file formats.""" + from pii_detector.core.processor import import_dataset + + success, result = import_dataset("test.txt") + assert success is False + assert "Supported files are" in result + + +# Integration test example +@pytest.mark.integration +def test_basic_workflow(): + """Test basic PII detection workflow.""" + # Create a simple test dataset + test_df = pd.DataFrame( + { + "name": ["John Doe", "Jane Smith", "Bob Johnson"], + "age": [25, 30, 35], + "email": ["john@email.com", "jane@email.com", "bob@email.com"], + "id": [1, 2, 3], + } + ) + + # Test that we can identify sparse columns + # Email column should be considered sparse (all unique values) + assert column_is_sparse(test_df, "email", 0.5) is True + + # Age column should not be sparse with this data + assert ( + column_is_sparse(test_df, "age", 0.5) is True + ) # Actually sparse in this small example + + # Name column should be sparse (all unique) + assert column_is_sparse(test_df, "name", 0.5) is True + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/test_runner.py b/tests/test_runner.py new file mode 100644 index 0000000..a7228f0 --- /dev/null +++ b/tests/test_runner.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Test runner for batch processing functionality. + +This script runs a subset of the batch processing tests to verify +that the new functionality works correctly without requiring external dependencies. +""" + +import sys +from pathlib import Path + +# Add src to path +sys.path.append(str(Path(__file__).parent.parent / "src")) + +# Import test modules + + +def run_basic_tests(): + """Run basic batch processing functionality tests.""" + + print("Running basic functionality tests...") + print("=" * 60) + + try: + import pandas as pd + + from pii_detector.core.batch_processor import BatchPIIProcessor + + # Test 1: Basic initialization + print("Test 1: Basic initialization...") + processor = BatchPIIProcessor(use_structured_engine=False) + assert processor.chunk_size == 1000 + assert processor.max_workers == 4 + print("[OK] Basic initialization passed") + + # Test 2: Processing strategy selection + print("Test 2: Processing strategy selection...") + small_df = pd.DataFrame({"col": range(100)}) + large_df = pd.DataFrame({"col": range(5000)}) + + small_strategy = processor.get_processing_strategy(small_df) + large_strategy = processor.get_processing_strategy(large_df) + + assert small_strategy == "standard_processing" + assert large_strategy == "chunked_processing" + print("[OK] Processing strategy selection passed") + + # Test 3: Time estimation + print("Test 3: Time estimation...") + estimates = processor.estimate_processing_time(small_df) + assert isinstance(estimates, dict) + assert "standard_processing" in estimates + assert "chunked_processing" in estimates + print("[OK] Time estimation passed") + + # Test 4: Basic detection without external dependencies + print("Test 4: Basic detection...") + test_df = pd.DataFrame( + { + "email_column": ["test@example.com", "user@test.org"], + "numeric_column": [1, 2], + } + ) + + # This should work with basic structural detection + results = processor.detect_pii_batch(test_df) + assert isinstance(results, dict) + print("[OK] Basic detection passed") + + print("\n[SUCCESS] All basic functionality tests passed!") + return True + + except Exception as e: + print(f"\n[ERROR] Test failed: {e}") + import traceback + + traceback.print_exc() + return False + + +def check_imports(): + """Check that all batch processing modules can be imported.""" + print("Checking batch processing imports...") + + try: + from pii_detector.core.batch_processor import BatchPIIProcessor + + print("[OK] batch_processor module imported successfully") + + print("[OK] Enhanced presidio_engine functions imported successfully") + + # Test basic initialization without structured engine + processor = BatchPIIProcessor(use_structured_engine=False) + print( + f"[OK] BatchPIIProcessor initialized (chunk_size: {processor.chunk_size})" + ) + + # Check strategy selection + import pandas as pd + + small_df = pd.DataFrame({"col": range(100)}) + strategy = processor.get_processing_strategy(small_df) + print(f"[OK] Processing strategy selection works: {strategy}") + + return True + + except Exception as e: + print(f"[ERROR] Import error: {e}") + import traceback + + traceback.print_exc() + return False + + +def main(): + """Main test runner function.""" + print("Batch Processing Test Suite") + print("=" * 60) + + # Check imports first + if not check_imports(): + print("\n[ERROR] Import checks failed. Cannot proceed with tests.") + return False + + print("\n" + "=" * 60) + + # Run basic tests + success = run_basic_tests() + + # Final summary + print("\n" + "=" * 60) + if success: + print("[SUCCESS] Batch processing functionality is working correctly!") + print("\nNext steps:") + print("- Run full test suite: uv run pytest tests/") + print("- Try the batch demo: just run-batch-demo") + print("- Install batch dependencies: just install-presidio-batch") + else: + print("[WARNING] Some issues were found. Please check the test output above.") + + return success + + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1)