Skip to content

Commit 1813d26

Browse files
committed
Packaged directory iterations, switched to the library's ENUM, resolved other comments.
1 parent 779c7c3 commit 1813d26

File tree

4 files changed

+59
-43
lines changed

4 files changed

+59
-43
lines changed

examples/common/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
def iterate_model_folders(input_data_path: Path, diffusion_model_names: list[str]) -> Generator[tuple[str, Path, str]]:
    """
    Iterates over the competition's shadow model folder structure and yields model information.

    Args:
        input_data_path: The base path for the input data.
        diffusion_model_names: A list of diffusion model names to iterate over.

    Yields:
        A tuple containing the model name, the path to the model's data, and the model folder name.
    """
    for model_name in diffusion_model_names:
        # Each model's data lives under "<model_name>_black_box" in the competition layout.
        base_path = input_data_path / f"{model_name}_black_box"
        for split in ("train", "dev", "final"):
            split_path = base_path / split
            # Some splits may be absent for a given model; skip them silently.
            if not split_path.exists():
                continue
            # Only subdirectories are model folders; plain files are ignored.
            for entry in split_path.iterdir():
                if entry.is_dir():
                    yield model_name, entry, entry.name

examples/ept_attack/run_ept_attack.py

Lines changed: 22 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import hydra
1414
from omegaconf import DictConfig
1515

16+
from examples.common.utils import iterate_model_folders
1617
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe
1718
from midst_toolkit.attacks.ept.feature_extraction import extract_features
1819
from midst_toolkit.common.logger import log
@@ -32,7 +33,6 @@ def run_attribute_prediction(config: DictConfig) -> None:
3233
log(INFO, "Running attribute prediction model training.")
3334

3435
diffusion_model_names = ["tabddpm", "tabsyn"] if config.attack_settings.single_table else ["clavaddpm"]
35-
modes = ["train", "dev", "final"]
3636
input_data_path = Path(config.data_paths.input_data_path)
3737
output_features_path = Path(config.data_paths.output_data_path, "attribute_prediction_features")
3838

@@ -48,41 +48,33 @@ def run_attribute_prediction(config: DictConfig) -> None:
4848

4949
# TODO: Package iterating over competition structure (maybe into a utility function)
5050
# Iterating over directories specific to the shadow models folder structure in the competition
51-
for model_name in diffusion_model_names:
52-
model_path = input_data_path / f"{model_name}_black_box"
53-
for mode in modes:
54-
current_path = model_path / mode
51+
for model_name, model_data_path, model_folder in iterate_model_folders(input_data_path, diffusion_model_names):
52+
# Load the data files as dataframes
53+
df_synthetic_data = load_dataframe(model_data_path, "trans_synthetic.csv")
54+
df_challenge_data = load_dataframe(model_data_path, "challenge_with_id.csv")
5555

56-
model_folders = [entry.name for entry in current_path.iterdir() if entry.is_dir()]
57-
for model_folder in model_folders:
58-
# Load the data files as dataframes
59-
model_data_path = current_path / model_folder
56+
# Keep only the columns that are present in feature_column_types
57+
columns_to_keep = feature_column_types["numerical"] + feature_column_types["categorical"]
58+
df_synthetic_data = df_synthetic_data[columns_to_keep]
59+
df_challenge_data = df_challenge_data[columns_to_keep]
6060

61-
df_synthetic_data = load_dataframe(model_data_path, "trans_synthetic.csv")
62-
df_challenge_data = load_dataframe(model_data_path, "challenge_with_id.csv")
61+
# Run feature extraction
62+
df_extracted_features = extract_features(
63+
synthetic_data=df_synthetic_data,
64+
challenge_data=df_challenge_data,
65+
column_types=feature_column_types,
66+
random_seed=config.random_seed,
67+
)
6368

64-
# Keep only the columns that are present in feature_column_types
65-
columns_to_keep = feature_column_types["numerical"] + feature_column_types["categorical"]
66-
df_synthetic_data = df_synthetic_data[columns_to_keep]
67-
df_challenge_data = df_challenge_data[columns_to_keep]
69+
final_output_dir = output_features_path / f"{model_name}_black_box"
6870

69-
# Run feature extraction
70-
df_extracted_features = extract_features(
71-
synthetic_data=df_synthetic_data,
72-
challenge_data=df_challenge_data,
73-
column_types=feature_column_types,
74-
random_seed=config.random_seed,
75-
)
71+
final_output_dir.mkdir(parents=True, exist_ok=True)
7672

77-
final_output_dir = output_features_path / f"{model_name}_black_box"
73+
# Extract the number at the end of model_folder
74+
model_folder_number = int(model_folder.split("_")[-1])
75+
file_name = f"attribute_prediction_features_{model_folder_number}.csv"
7876

79-
final_output_dir.mkdir(parents=True, exist_ok=True)
80-
81-
# Extract the number at the end of model_folder
82-
model_folder_number = int(model_folder.split("_")[-1])
83-
file_name = f"attribute_prediction_features_{model_folder_number}.csv"
84-
85-
save_dataframe(df=df_extracted_features, file_path=final_output_dir, file_name=file_name)
77+
save_dataframe(df=df_extracted_features, file_path=final_output_dir, file_name=file_name)
8678

8779

8880
@hydra.main(config_path=".", config_name="config", version_base=None)

src/midst_toolkit/attacks/ept/feature_extraction.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
66
"""
77

8-
from enum import Enum
98
from logging import INFO
109

1110
import numpy as np
@@ -15,14 +14,10 @@
1514
from sklearn.pipeline import Pipeline
1615
from sklearn.preprocessing import OneHotEncoder, StandardScaler
1716

17+
from midst_toolkit.common.enumerations import TaskType
1818
from midst_toolkit.common.logger import log
1919

2020

21-
class TaskType(Enum):
22-
CLASSIFICATION = "classification"
23-
REGRESSION = "regression"
24-
25-
2621
def preprocess_train_predict(
2722
train_points: pd.DataFrame,
2823
test_points: pd.DataFrame,
@@ -77,7 +72,7 @@ def preprocess_train_predict(
7772
"The union of numeric_columns and categorical_columns must match the columns in the combined dataframe"
7873
)
7974

80-
task_type = TaskType.CLASSIFICATION if target_col in categorical_columns else TaskType.REGRESSION
75+
task_type = TaskType.MULTICLASS_CLASSIFICATION if target_col in categorical_columns else TaskType.REGRESSION
8176

8277
# Remove target column from feature columns
8378
numeric_columns = [col for col in numeric_columns if col != target_col]
@@ -95,7 +90,7 @@ def preprocess_train_predict(
9590

9691
model = (
9792
RandomForestClassifier(random_state=random_seed)
98-
if task_type == TaskType.CLASSIFICATION
93+
if task_type == TaskType.MULTICLASS_CLASSIFICATION
9994
else RandomForestRegressor(random_state=random_seed)
10095
)
10196

@@ -124,8 +119,8 @@ def extract_features(
124119
4. Compile the results into a DataFrame.
125120
126121
Args:
127-
synthetic_data: Synthetic data generated by the target model without ID columns; the data we want to
128-
extract features from.
122+
synthetic_data: Synthetic data to extract features from. Note: This data should not contain any identifier
123+
columns, as the function will attempt to train a prediction model for every column included.
129124
challenge_data: The data the predictions are compared against, to compute prediction accuracy/errors.
130125
column_types: A dictionary specifying the types of columns (numerical or categorical) in the data.
131126
random_seed: Random seed for reproducibility. Defaults to None.
@@ -160,7 +155,7 @@ def extract_features(
160155
features.append(y_test)
161156
columns.append(column)
162157

163-
if task_type == TaskType.CLASSIFICATION:
158+
if task_type == TaskType.MULTICLASS_CLASSIFICATION:
164159
# TODO: Maybe change the variable name from accuracy to correctness
165160
# Calculate accuracy
166161
accuracy = predictions == y_test

tests/unit/attacks/ept_attack/test_feature_extraction.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import pandas as pd
33
import pytest
44

5-
from midst_toolkit.attacks.ept.feature_extraction import TaskType, extract_features, preprocess_train_predict
5+
from midst_toolkit.attacks.ept.feature_extraction import extract_features, preprocess_train_predict
6+
from midst_toolkit.common.enumerations import TaskType
67

78

89
@pytest.fixture
@@ -51,7 +52,7 @@ def test_preprocess_train_predict_classification(sample_dataframes, sample_colum
5152
random_seed=42,
5253
)
5354

54-
assert task_type == TaskType.CLASSIFICATION
55+
assert task_type == TaskType.MULTICLASS_CLASSIFICATION
5556
assert len(predictions) == len(test_df)
5657
assert predictions.dtype == "object" # RandomForestClassifier predicts original class
5758
pd.testing.assert_series_equal(y_test, test_df[target_col], check_dtype=False, check_names=False)

0 commit comments

Comments
 (0)