@@ -1,15 +1,15 @@
 from logging import INFO
 from pathlib import Path
+
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import train_test_split
 
-from midst_toolkit.attacks.ensemble.utils import (
+from midst_toolkit.common.logger import log
+from src.midst_toolkit.attacks.ensemble.utils import (
     save_dataframe,
 )
 
-from midst_toolkit.common.logger import log
-
 
 def split_real_data(
     df_real: pd.DataFrame,
@@ -24,6 +24,7 @@ def split_real_data(
         column_to_stratify: Column name to use for stratified splitting.
         proportion: Proportions for train and validation splits.
         random_seed: Random seed for reproducibility.
+
     Returns:
         A tuple containing the train, validation, and test dataframes.
     """
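As a usage sketch based on the docstring above: the `proportion` keys follow the `test_size` expression later in this diff, while the import path, the split fractions, and the toy DataFrame are assumptions for illustration only.

```python
import pandas as pd

# Module path assumed for illustration; adjust to wherever split_real_data lives.
from src.midst_toolkit.attacks.ensemble.data_processing import split_real_data

# Placeholder population table with a binary column to stratify on.
df_real = pd.DataFrame({"feature": range(100), "label": [i % 2 for i in range(100)]})

df_train, df_val, df_test = split_real_data(
    df_real,
    column_to_stratify="label",
    proportion={"train": 0.5, "val": 0.25},  # the remaining 0.25 becomes the test split
    random_seed=42,
)
```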
@@ -41,8 +42,7 @@ def split_real_data(
     # Further split the control into val and test set:
     df_real_val, df_real_test = train_test_split(
         df_real_control,
-        test_size=(1 - proportion["train"] - proportion["val"])
-        / (1 - proportion["train"]),
+        test_size=(1 - proportion["train"] - proportion["val"]) / (1 - proportion["train"]),
         random_state=random_seed,
         stratify=df_real_control[column_to_stratify],
     )
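The joined `test_size` expression is a rescaling step: `train_test_split` interprets `test_size` relative to the control subset it is given, not the full dataset, so the intended overall test fraction `1 - train - val` has to be divided by the control fraction `1 - train`. A minimal sketch of that arithmetic, with illustrative proportions rather than values from the repository:

```python
# Illustrative proportions only; the real defaults are not visible in this diff.
proportion = {"train": 0.5, "val": 0.25}

control_fraction = 1 - proportion["train"]                            # 0.50 of the data remains after the train split
overall_test_fraction = 1 - proportion["train"] - proportion["val"]   # 0.25 of the full data should end up as test

# Rescale so that train_test_split, applied to the control subset, yields the intended overall fraction.
test_size = overall_test_fraction / control_fraction                  # 0.25 / 0.50 = 0.5
print(test_size)
```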
@@ -121,9 +121,7 @@ def generate_val_test(
         ignore_index=True,
     )
 
-    df_test = df_test.sample(frac=1, random_state=random_seed).reset_index(
-        drop=True
-    )
+    df_test = df_test.sample(frac=1, random_state=random_seed).reset_index(drop=True)
 
     y_test = df_test["is_train"].values
     df_test = df_test.drop(columns=["is_train"])
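The hunk above uses the common pandas idiom of shuffling with `sample(frac=1)` and then peeling off the label column. A toy illustration follows; only `is_train` is taken from the diff (it presumably marks rows drawn from the generator's training split), and the other column and values are made up.

```python
import pandas as pd

# Made-up member / non-member rows; is_train is the membership label.
members = pd.DataFrame({"feature": [1.0, 2.0], "is_train": [1, 1]})
non_members = pd.DataFrame({"feature": [3.0, 4.0], "is_train": [0, 0]})

df_test = pd.concat([members, non_members], ignore_index=True)

# sample(frac=1) shuffles every row; reset_index(drop=True) discards the permuted index.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

y_test = df_test["is_train"].values           # label vector as a NumPy array
df_test = df_test.drop(columns=["is_train"])  # feature table without the label
```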
@@ -148,20 +146,17 @@ def process_split_data(
         num_total_samples: Number of samples randomly selected from the population. Defaults to 40000.
         random_seed: Random seed used for reproducibility. Defaults to 42.
     """
-
     # Original Ensemble attack samples 40k data points to construct
     # 1) the main population (real data) used for training the synthetic data generator model,
     # 2) evaluation that is the meta train data used to train the meta classifier,
     # 3) test to evaluate the meta classifier.
 
-    df_real_data = all_population_data.sample(
-        n=num_total_samples, random_state=random_seed
-    )
+    df_real_data = all_population_data.sample(n=num_total_samples, random_state=random_seed)
 
     # Split the data. df_real_train is used for training the synthetic data generator model.
     df_real_train, df_real_val, df_real_test = split_real_data(
         df_real_data,
-        column_to_stratify=column_to_stratify,
+        column_to_stratify=column_to_stratify,
         random_seed=random_seed,
     )
     # Generate validation and test sets with labels. Validation is used for training the meta classifier
@@ -172,9 +167,7 @@ def process_split_data(
         df_real_train,
         df_real_val,
         df_real_test,
-        stratify=df_real_train[
-            column_to_stratify
-        ],  # TODO: This value is not documented in the original codebase.
+        stratify=df_real_train[column_to_stratify],  # TODO: This value is not documented in the original codebase.
         random_seed=random_seed,
     )
 
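Taken together, `process_split_data` appears to be the entry point that samples the population, splits it, and builds the labelled evaluation sets. A hedged usage sketch follows; the import path and the stand-in DataFrame are assumptions, and further parameters (e.g. output paths for `save_dataframe`) may exist but are not visible in this diff.

```python
import pandas as pd

# Module path assumed for illustration; adjust to wherever process_split_data lives.
from src.midst_toolkit.attacks.ensemble.data_processing import process_split_data

# Synthetic stand-in for the full population table.
all_population_data = pd.DataFrame(
    {"feature": range(50_000), "label": [i % 2 for i in range(50_000)]}
)

# Keyword names follow the docstring and hunks above.
process_split_data(
    all_population_data,
    column_to_stratify="label",
    num_total_samples=40_000,
    random_seed=42,
)
```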