
Commit 3201629

update law school admission notebook

Updates several functions so that the law school example notebook works.

1 parent e9a14a9 · commit 3201629

File tree

8 files changed: +139 −73 lines


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -93,4 +93,7 @@ ehthumbs.db
 # Other
 *.env
 *.env.*
-.envrc
+.envrc
+datasets/data/bar_pass_prediction.csv
+example_notebooks/CART_LawSchoolAdmissionBar.ipynb
+example_notebooks/helper_functions.py

synthpop/method/GC.py

Lines changed: 4 additions & 3 deletions
@@ -334,11 +334,12 @@ def _rebuild_gaussian_copula(self, model_parameters: Dict[str, Any], default_par
             univariates.append(univariate)
         model_parameters["univariates"] = univariates
         model_parameters["columns"] = columns
-        correlation = model_parameters.get("correlation")
+        correlation = model_parameters.get('correlation')
         if correlation:
-            model_parameters["correlation"] = self._rebuild_correlation_matrix(correlation)
+            model_parameters['correlation'] = (
+                self._rebuild_correlation_matrix(correlation))
         else:
-            model_parameters["correlation"] = [[1.0]]
+            model_parameters['correlation'] = [[1.0]]
         return model_parameters

     @classmethod

synthpop/metrics/diagnostic_report.py

Lines changed: 2 additions & 2 deletions
@@ -85,7 +85,7 @@ def generate_report(self) -> pd.DataFrame:
             col_report["range_coverage"] = range_coverage(real, synthetic)
             col_report["boundary_adherence"] = boundary_adherence(real, synthetic)
             col_report["ks_complement"] = ks_complement(real, synthetic)
-            col_report["tv_complement"] = tv_complement(real, synthetic)
+            col_report["tv_complement"] = "N/A"
             col_report["statistic_similarity"] = statistic_similarity(real, synthetic)
             col_report["category_coverage"] = "N/A"
             col_report["category_adherence"] = "N/A"
@@ -95,7 +95,7 @@ def generate_report(self) -> pd.DataFrame:
             col_report["range_coverage"] = "N/A"
             col_report["boundary_adherence"] = "N/A"
             col_report["ks_complement"] = "N/A"
-            col_report["tv_complement"] = "N/A"
+            col_report["tv_complement"] = tv_complement(real, synthetic)
             col_report["statistic_similarity"] = "N/A"
             col_report["category_coverage"] = category_coverage(real, synthetic)
             col_report["category_adherence"] = category_adherence(real, synthetic)

synthpop/metrics/efficacy_metrics.py

Lines changed: 12 additions & 1 deletion
@@ -75,7 +75,18 @@ def evaluate(self, real_df: pd.DataFrame, synthetic_df: pd.DataFrame) -> dict:
         X_real = real_df.drop(columns=[self.target_column])
         y_real = real_df[self.target_column]

-        # For the purposes of efficacy metrics, we train on synthetic data and test on real data.
+        # Handle categorical encoding only if it's a classification task
+        if self.task == 'classification':
+            categorical_cols = X_syn.select_dtypes(include=['object', 'category']).columns.tolist()
+
+            if categorical_cols:
+                X_syn = pd.get_dummies(X_syn, columns=categorical_cols, drop_first=True)
+                X_real = pd.get_dummies(X_real, columns=categorical_cols, drop_first=True)
+
+                # Align columns in case of different categorical levels between real and synthetic data
+                X_syn, X_real = X_syn.align(X_real, join='left', axis=1, fill_value=0)
+
+        # Model Training and Evaluation
         if self.task == 'regression':
             model = LinearRegression()
             model.fit(X_syn, y_syn)
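
The align call matters when one dataset contains category levels the other lacks: pd.get_dummies alone would produce mismatched feature matrices. A small illustration with a hypothetical color column:

import pandas as pd

X_syn = pd.get_dummies(pd.DataFrame({"color": ["red", "blue", "green"]}), drop_first=True)
X_real = pd.get_dummies(pd.DataFrame({"color": ["red", "blue"]}), drop_first=True)

# X_syn has columns ['color_green', 'color_red']; X_real has only ['color_red'].
X_syn, X_real = X_syn.align(X_real, join="left", axis=1, fill_value=0)
# X_real gains a zero-filled 'color_green', so both frames share one feature space.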

synthpop/metrics/privacy_metrics.py

Lines changed: 54 additions & 16 deletions
@@ -1,54 +1,91 @@
 # privacy_metrics.py
-
 import numpy as np
 import pandas as pd
 from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

 class DisclosureProtection:
     """
     A class to compute the disclosure protection metric for synthetic data.

-    The metric is defined as 1 minus the proportion of synthetic records that are too similar
-    (i.e. within a risk threshold) to a record in the real dataset.
+    This metric measures the proportion of synthetic records that are too similar
+    (within a defined threshold) to real records, posing a disclosure risk.

     Parameters
     ----------
     real_data : pd.DataFrame
-        A DataFrame containing the real data. The data should be numeric or preprocessed.
+        A DataFrame containing the real data. Supports both numerical and categorical features.
     synthetic_data : pd.DataFrame
-        A DataFrame containing the synthetic data (with the same columns as real_data).
+        A DataFrame containing the synthetic data (with the same structure as real_data).
     threshold : float, optional
         A distance threshold under which a synthetic record is considered a potential disclosure risk.
         If not provided, it is computed as the 10th percentile of the nearest-neighbor distances among real records.
     """
-
+
     def __init__(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame, threshold: float = None):
         self.real_data = real_data.copy()
         self.synthetic_data = synthetic_data.copy()
         self.threshold = threshold
+
+        # Preprocess data for distance computation
+        self.real_data, self.synthetic_data = self._preprocess_data(self.real_data, self.synthetic_data)
+
+        # Compute distance threshold if not provided
         self._compute_threshold()

+    def _preprocess_data(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame):
+        """
+        Preprocess both real and synthetic datasets:
+        - Standardize numerical columns
+        - One-hot encode categorical columns
+        - Align columns to ensure consistency
+        """
+
+        # Identify numerical and categorical columns
+        categorical_cols = real_data.select_dtypes(include=["object", "category"]).columns.tolist()
+        numerical_cols = real_data.select_dtypes(include=[np.number]).columns.tolist()
+
+        # One-Hot Encode Categorical Columns
+        if categorical_cols:
+            encoder = OneHotEncoder(sparse_output=True, drop="first", handle_unknown="ignore")
+            real_cats = encoder.fit_transform(real_data[categorical_cols])
+            synthetic_cats = encoder.transform(synthetic_data[categorical_cols])
+
+            # Convert to DataFrame
+            real_cat_df = pd.DataFrame(real_cats.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
+            synthetic_cat_df = pd.DataFrame(synthetic_cats.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
+
+            # Drop original categorical columns and replace with encoded versions
+            real_data = real_data.drop(columns=categorical_cols)
+            synthetic_data = synthetic_data.drop(columns=categorical_cols)
+            real_data = pd.concat([real_data, real_cat_df], axis=1)
+            synthetic_data = pd.concat([synthetic_data, synthetic_cat_df], axis=1)
+
+        # Standardize numerical features
+        if numerical_cols:
+            scaler = MinMaxScaler()
+            real_data[numerical_cols] = scaler.fit_transform(real_data[numerical_cols])
+            synthetic_data[numerical_cols] = scaler.transform(synthetic_data[numerical_cols])
+
+        # Align columns (in case some categories exist in one dataset but not the other)
+        real_data, synthetic_data = real_data.align(synthetic_data, join="left", axis=1, fill_value=0)
+
+        return real_data, synthetic_data
+
     def _compute_threshold(self):
         """
         Compute the threshold if not provided. Uses the 10th percentile of the nearest-neighbor
         distances among real records (excluding self-distance).
         """
         if self.threshold is None:
-            # Fit a nearest neighbor model on the real data.
-            # n_neighbors=2 because the closest neighbor of a record is itself.
             nn = NearestNeighbors(n_neighbors=2)
             nn.fit(self.real_data)
             distances, _ = nn.kneighbors(self.real_data)
-            # distances[:, 1] are the distances to the closest distinct record.
-            self.threshold = np.percentile(distances[:, 1], 10)
-
+            self.threshold = np.percentile(distances[:, 1], 10)  # Exclude self-distance
+
     def score(self) -> float:
         """
         Compute the disclosure protection score.
-
-        For each synthetic record, compute its distance to the nearest real record.
-        The risk rate is the proportion of synthetic records with distance below the threshold.
-        The disclosure protection score is 1 - risk_rate (higher is better).

         Returns
         -------
@@ -61,7 +98,7 @@ def score(self) -> float:
         distances = distances.flatten()
         risk_count = np.sum(distances < self.threshold)
         risk_rate = risk_count / len(distances)
-        return 1 - risk_rate
+        return 1 - risk_rate  # Higher score means better protection

     def report(self) -> dict:
         """
@@ -79,6 +116,7 @@ def report(self) -> dict:
         risk_count = np.sum(distances < self.threshold)
         risk_rate = risk_count / len(distances)
         score = 1 - risk_rate
+
         return {
             "threshold": self.threshold,
             "risk_rate": risk_rate,

synthpop/metrics/single_columns_metrics.py

Lines changed: 33 additions & 39 deletions
@@ -124,52 +124,46 @@ def ks_complement(real: pd.Series, synthetic: pd.Series) -> float:
     return 1 - ks_stat


-def tv_complement(real: pd.Series, synthetic: pd.Series, bins: int = 10) -> float:
+def tv_complement(real_series: pd.Series, synthetic_series: pd.Series) -> float:
     """
-    Compute the complement of the Total Variation (TV) distance between the histograms
-    of the real and synthetic data. A value of 1 indicates identical distributions.
+    Computes the TVComplement score between a real and a synthetic categorical column.

-    If the data is datetime or timedelta, convert it to numeric values (in seconds).
+    TVD is defined as:
+        TVD = 1/2 * sum(|R_ω - S_ω|) for all categories ω in the union of both series.
+
+    The TVComplement score is:
+        score = 1 - TVD
+
+    Parameters
+    ----------
+    real_series : pd.Series
+        Categorical data from the real dataset.
+    synthetic_series : pd.Series
+        Categorical data from the synthetic dataset.

-    Args:
-        real (pd.Series): Real numerical data.
-        synthetic (pd.Series): Synthetic numerical data.
-        bins (int, optional): Number of bins to use for the histograms. Defaults to 10.
+    Returns
+    -------
+    float
+        The TVComplement score (between 0 and 1).
+    """
+    # Compute normalized frequency distributions (probabilities)
+    real_freq = real_series.value_counts(normalize=True)
+    synthetic_freq = synthetic_series.value_counts(normalize=True)

-    Returns:
-        float: 1 - TV distance, where TV is computed over the normalized histograms.
-    """
-    real_clean = real.dropna()
-    synthetic_clean = synthetic.dropna()
+    # Get the union of categories present in both series
+    all_categories = real_freq.index.union(synthetic_freq.index)

-    if len(real_clean) == 0 or len(synthetic_clean) == 0:
-        return 0.0
-
-    # Convert datetime/timedelta to numeric values if necessary.
-    if np.issubdtype(real_clean.dtype, np.datetime64):
-        # Convert to seconds since epoch
-        real_clean = real_clean.astype('int64') / 1e9
-        synthetic_clean = synthetic_clean.astype('int64') / 1e9
-    elif np.issubdtype(real_clean.dtype, np.timedelta64):
-        # Convert to total seconds
-        if hasattr(real_clean, 'dt'):
-            real_clean = real_clean.dt.total_seconds()
-            synthetic_clean = synthetic_clean.dt.total_seconds()
-        else:
-            real_clean = real_clean.astype('int64') / 1e9
-            synthetic_clean = synthetic_clean.astype('int64') / 1e9
-
-    all_data = pd.concat([real_clean, synthetic_clean])
-    bin_edges = np.histogram_bin_edges(all_data, bins=bins)
-    real_hist, _ = np.histogram(real_clean, bins=bin_edges, density=True)
-    synth_hist, _ = np.histogram(synthetic_clean, bins=bin_edges, density=True)
+    # Reindex to ensure both distributions have the same categories, fill missing with 0
+    real_freq = real_freq.reindex(all_categories, fill_value=0)
+    synthetic_freq = synthetic_freq.reindex(all_categories, fill_value=0)
+
+    # Calculate Total Variation Distance (TVD)
+    tvd = 0.5 * np.abs(real_freq - synthetic_freq).sum()

-    # Normalize the histograms
-    real_hist = real_hist / np.sum(real_hist)
-    synth_hist = synth_hist / np.sum(synth_hist)
+    # Compute TVComplement: higher score means higher similarity
+    tv_complement_score = 1 - tvd

-    tv_distance = 0.5 * np.sum(np.abs(real_hist - synth_hist))
-    return 1 - tv_distance
+    return tv_complement_score


 # ------------------------------------------------------------------------------
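
A worked example of the new categorical formula (import path assumed from the file location):

import pandas as pd
from synthpop.metrics.single_columns_metrics import tv_complement

real = pd.Series(["a"] * 5 + ["b"] * 5)       # frequencies: a = 0.5, b = 0.5
synthetic = pd.Series(["a"] * 8 + ["b"] * 2)  # frequencies: a = 0.8, b = 0.2

# TVD = 0.5 * (|0.5 - 0.8| + |0.5 - 0.2|) = 0.3, so the score is 1 - 0.3 = 0.7
print(tv_complement(real, synthetic))  # 0.7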

synthpop/processor/data_processor.py

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ def _preprocess(self, data: pd.DataFrame) -> pd.DataFrame:
             elif dtype == "timedelta":
                 data[col] = pd.to_timedelta(data[col]).dt.total_seconds()

-        return data
+        return data[self.original_columns]

     def postprocess(self, synthetic_data: pd.DataFrame) -> pd.DataFrame:
         """Transform numerical synthetic data back to its original format."""

synthpop/processor/missing_data_handler.py

Lines changed: 29 additions & 10 deletions
@@ -5,6 +5,7 @@
 from sklearn.impute import SimpleImputer, IterativeImputer
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import LabelEncoder
+from .data_processor import DataProcessor
 import warnings


@@ -204,6 +205,11 @@ def detect_missingness(self, dfc: pd.DataFrame) -> dict:
     def apply_imputation(self, df: pd.DataFrame, missingness: dict) -> pd.DataFrame:
         """Automatically applies imputation based on missingness type and column data type."""
         df = df.copy()
+        metadata = self.get_column_dtypes(df)
+        processor = DataProcessor(metadata)
+        processed_data = processor.preprocess(df)
+        imputer = IterativeImputer(random_state=42)
+        df_iterative = pd.DataFrame(imputer.fit_transform(processed_data), columns=df.columns)
         for col, mtype in missingness.items():
             if df[col].isna().sum() == 0:
                 continue
@@ -218,16 +224,30 @@ def apply_imputation(self, df: pd.DataFrame, missingness: dict) -> pd.DataFrame:
                 df[col].fillna(df[col].mode()[0], inplace=True)
             elif mtype == "MAR":
                 # Use get_dummies encoding for categorical data
-                dummies = pd.get_dummies(df[col], prefix=col, dummy_na=True)
+                le = LabelEncoder()
+                non_missing = df[col].dropna()
+                le.fit(non_missing)
+                predictor_cols = [c for c in df.columns if c != col]
+                predictors = df_iterative[predictor_cols].copy()
+                df_copy = df.copy()
+                df_copy[f"{col}_encoded"] = df_copy[col].apply(lambda x: le.transform([x])[0] if pd.notna(x) else np.nan)
+
+                # Combine predictors and the encoded target.
+                combined = pd.concat([predictors, df_copy[[f"{col}_encoded"]]], axis=1)
+                # Impute missing values using IterativeImputer.
                 imputer = IterativeImputer(random_state=42)
-                imputed = imputer.fit_transform(dummies)
-                imputed_rounded = np.rint(imputed).astype(int)
-                imputed_df = pd.DataFrame(
-                    imputed_rounded, columns=dummies.columns, index=df.index
-                )
-                # Convert back to a single categorical column by taking the column with the maximum value.
-                predicted_category = imputed_df.idxmax(axis=1)
-                df[col] = predicted_category.str.split(f"{col}_").str[-1]
+                imputed_array = imputer.fit_transform(combined)
+                imputed_df = pd.DataFrame(imputed_array, columns=combined.columns, index=df.index)
+
+                # Extract the imputed encoded target column.
+                imputed_encoded = imputed_df[f"{col}_encoded"]
+                imputed_encoded = imputed_encoded.round().astype(int)
+                min_code = 0
+                max_code = len(le.classes_) - 1
+                imputed_encoded = imputed_encoded.clip(lower=min_code, upper=max_code)
+                # Decode back to the original categorical labels.
+                imputed_categories = le.inverse_transform(imputed_encoded)
+                df[col] = imputed_categories
             elif mtype == "MNAR":
                 df[col].fillna("Missing", inplace=True)

@@ -252,7 +272,6 @@ def apply_imputation(self, df: pd.DataFrame, missingness: dict) -> pd.DataFrame:

             # --- Datetime Data ---
             elif pd.api.types.is_datetime64_any_dtype(df[col]):
-                print("entering here")
                 numeric_series = df[col].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
                 if mtype == "MCAR":
                     imputer = SimpleImputer(strategy="median")
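
The new MAR branch replaces the dummy-encoding round trip with a label-encode, impute, round/clip, decode pipeline. A self-contained sketch of the same idea on hypothetical toy data (note that IterativeImputer also requires the experimental enable import):

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"income": [30.0, 32.0, 90.0, 95.0],
                   "city": ["A", "A", "B", np.nan]})

# Encode observed categories; leave missing entries as NaN for the imputer.
le = LabelEncoder().fit(df["city"].dropna())
encoded = df["city"].map(lambda x: le.transform([x])[0] if pd.notna(x) else np.nan)
combined = pd.concat([df[["income"]], encoded.rename("city_encoded")], axis=1)

# Impute, then round and clip to valid label codes before decoding.
imputed = IterativeImputer(random_state=42).fit_transform(combined)
codes = np.clip(np.rint(imputed[:, 1]).astype(int), 0, len(le.classes_) - 1)
df["city"] = le.inverse_transform(codes)
print(df["city"].tolist())  # expected: ['A', 'A', 'B', 'B']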
