11# privacy_metrics.py
2-
32import numpy as np
43import pandas as pd
54from sklearn .neighbors import NearestNeighbors
5+ from sklearn .preprocessing import MinMaxScaler , OneHotEncoder
66
77class DisclosureProtection :
88 """
99 A class to compute the disclosure protection metric for synthetic data.
1010
    The metric is defined as 1 minus the proportion of synthetic records that are too similar
    (i.e. within a risk threshold) to a record in the real dataset; higher values mean better protection.
1313
1414 Parameters
1515 ----------
1616 real_data : pd.DataFrame
        A DataFrame containing the real data. Supports both numerical and categorical features.
    synthetic_data : pd.DataFrame
        A DataFrame containing the synthetic data (with the same columns and dtypes as real_data).
2020 threshold : float, optional
2121 A distance threshold under which a synthetic record is considered a potential disclosure risk.
2222 If not provided, it is computed as the 10th percentile of the nearest-neighbor distances among real records.
2323 """
24-
24+
2525 def __init__ (self , real_data : pd .DataFrame , synthetic_data : pd .DataFrame , threshold : float = None ):
2626 self .real_data = real_data .copy ()
2727 self .synthetic_data = synthetic_data .copy ()
2828 self .threshold = threshold
29+
30+ # Preprocess data for distance computation
31+ self .real_data , self .synthetic_data = self ._preprocess_data (self .real_data , self .synthetic_data )
32+
33+ # Compute distance threshold if not provided
2934 self ._compute_threshold ()
3035
36+ def _preprocess_data (self , real_data : pd .DataFrame , synthetic_data : pd .DataFrame ):
37+ """
38+ Preprocess both real and synthetic datasets:
39+ - Standardize numerical columns
40+ - One-hot encode categorical columns
41+ - Align columns to ensure consistency
42+ """
43+
44+ # Identify numerical and categorical columns
45+ categorical_cols = real_data .select_dtypes (include = ["object" , "category" ]).columns .tolist ()
46+ numerical_cols = real_data .select_dtypes (include = [np .number ]).columns .tolist ()
47+
48+ # One-Hot Encode Categorical Columns
49+ if categorical_cols :
50+ encoder = OneHotEncoder (sparse_output = True , drop = "first" , handle_unknown = "ignore" )
51+ real_cats = encoder .fit_transform (real_data [categorical_cols ])
52+ synthetic_cats = encoder .transform (synthetic_data [categorical_cols ])
53+
54+ # Convert to DataFrame
55+ real_cat_df = pd .DataFrame (real_cats .toarray (), columns = encoder .get_feature_names_out (categorical_cols ))
56+ synthetic_cat_df = pd .DataFrame (synthetic_cats .toarray (), columns = encoder .get_feature_names_out (categorical_cols ))
57+
58+ # Drop original categorical columns and replace with encoded versions
59+ real_data = real_data .drop (columns = categorical_cols )
60+ synthetic_data = synthetic_data .drop (columns = categorical_cols )
61+ real_data = pd .concat ([real_data , real_cat_df ], axis = 1 )
62+ synthetic_data = pd .concat ([synthetic_data , synthetic_cat_df ], axis = 1 )
63+
64+ # Standardize numerical features
65+ if numerical_cols :
66+ scaler = MinMaxScaler ()
67+ real_data [numerical_cols ] = scaler .fit_transform (real_data [numerical_cols ])
68+ synthetic_data [numerical_cols ] = scaler .transform (synthetic_data [numerical_cols ])
69+
70+ # Align columns (in case some categories exist in one dataset but not the other)
71+ real_data , synthetic_data = real_data .align (synthetic_data , join = "left" , axis = 1 , fill_value = 0 )
72+
73+ return real_data , synthetic_data
74+
3175 def _compute_threshold (self ):
3276 """
3377 Compute the threshold if not provided. Uses the 10th percentile of the nearest-neighbor
3478 distances among real records (excluding self-distance).
3579 """
3680 if self .threshold is None :
37- # Fit a nearest neighbor model on the real data.
38- # n_neighbors=2 because the closest neighbor of a record is itself.
3981 nn = NearestNeighbors (n_neighbors = 2 )
4082 nn .fit (self .real_data )
4183 distances , _ = nn .kneighbors (self .real_data )
42- # distances[:, 1] are the distances to the closest distinct record.
43- self .threshold = np .percentile (distances [:, 1 ], 10 )
44-
84+ self .threshold = np .percentile (distances [:, 1 ], 10 ) # Exclude self-distance
85+
4586 def score (self ) -> float :
4687 """
4788 Compute the disclosure protection score.
48-
49- For each synthetic record, compute its distance to the nearest real record.
50- The risk rate is the proportion of synthetic records with distance below the threshold.
51- The disclosure protection score is 1 - risk_rate (higher is better).
5289
5390 Returns
5491 -------
@@ -61,7 +98,7 @@ def score(self) -> float:
6198 distances = distances .flatten ()
6299 risk_count = np .sum (distances < self .threshold )
63100 risk_rate = risk_count / len (distances )
64- return 1 - risk_rate
101+ return 1 - risk_rate # Higher score means better protection
65102
66103 def report (self ) -> dict :
67104 """
@@ -79,6 +116,7 @@ def report(self) -> dict:
79116 risk_count = np .sum (distances < self .threshold )
80117 risk_rate = risk_count / len (distances )
81118 score = 1 - risk_rate
119+
82120 return {
83121 "threshold" : self .threshold ,
84122 "risk_rate" : risk_rate ,
0 commit comments