Fixed docstrings

fatemetkl · fatemetkl · commit 6a6b99da61ad · 2025-09-04T07:35:26.000-06:00
diff --git a/examples/ensemble_attack_example/real_data_collection.py b/examples/ensemble_attack_example/real_data_collection.py
@@ -30,11 +30,11 @@ def collect_midst_attack_data(
     Collect the real data in a specific setting of the provided MIDST challenge resources.
 
     Args:
-        attack_type (str): The attack setting.
-        data_dir (Path): The path where the data is stored.
-        data_split (str): Indicates if this is train, dev, or final data.
-        dataset (str): The dataset to be collected. Either "train" or "challenge".
-        data_config (dict): Configuration dictionary containing data paths and file names.
+        attack_type: The attack setting.
+        data_dir: The path where the data is stored.
+        data_split: Indicates if this is train, dev, or final data.
+        dataset: The dataset to be collected. Either "train" or "challenge".
+        data_processing_config: Configuration dictionary containing data-specific information.
 
     Returns:
         pd.DataFrame: The specified dataset in this setting.
@@ -77,21 +77,22 @@ def collect_midst_data(
     attack_types: list[str],
     data_splits: list[str],
     dataset: str,
-    data_config: DictConfig,
+    data_processing_config: DictConfig,
 ) -> pd.DataFrame:
     """
     Collect train or challenge data of the specified attack type from the provided data folders
     in the MIDST competition.
 
     Args:
-        attack_types (list[str]): List of attack names to be collected.
-        data_splits (list[str]): A list indicating the data split to be collected.
+        midst_data_input_dir: The path where the MIDST data folders are stored.
+        attack_types: List of attack names for data collection.
+        data_splits: A list indicating the data split to be collected.
             Could be any of train, dev, or final data splits.
-        dataset (str): The dataset to be collected. Either "train" or "challenge".
-        data_config (dict): Configuration dictionary containing data paths and file names.
+        dataset: The dataset to be collected. Either "train" or "challenge".
+        data_processing_config: Configuration dictionary containing data paths and file names.
 
     Returns:
-        pd.DataFrame: Collected train or challenge data as a DataFrame.
+        Collected train or challenge data as a dataframe.
     """
     assert dataset in [
         "train",
@@ -105,7 +106,7 @@ def collect_midst_data(
                 data_dir=midst_data_input_dir,
                 data_split=data_split,
                 dataset=dataset,
-                data_processing_config=data_config,
+                data_processing_config=data_processing_config,
             )
 
         population.append(df_real)
@@ -119,19 +120,19 @@ def collect_population_data_ensemble(
     save_dir: Path,
 ) -> pd.DataFrame:
     """
-    Collect the population data from the MIDST competition based on ensemble mia implementation.
+    Collect the population data from the MIDST competition based on Ensemble Attack implementation.
     Returns real data population that consists of the train data of all the attacks
     (black box and white box), and challenge points from train, dev and final of
     "tabddpm_black_box" attack. The population data is saved in the provided path,
     and returned as a dataframe.
 
     Args:
-        data_config (dict): Configuration dictionary containing data paths and file names.
-        attack_types (list[str] | None): List of attack names to be collected.
-            If None, all the attacks are collected based on ensemble mia implementation.
+        midst_data_input_dir: The path where the MIDST data folders are stored.
+        data_processing_config: Configuration dictionary containing data information and file names.
+        save_dir: The path where the collected population data should be saved.
 
     Returns:
-        pd.DataFrame: The collected population data.
+        The collected population data as a dataframe.
     """
 
     # Ensemble Attack collects train data of all the attack types (back box and white box)
@@ -141,7 +142,7 @@ def collect_population_data_ensemble(
         attack_types,
         data_splits=["train"],
         dataset="train",
-        data_config=data_processing_config,
+        data_processing_config=data_processing_config,
     )
     # Drop ids.
     df_population_no_id = df_population.drop(columns=["trans_id", "account_id"])
@@ -156,7 +157,7 @@ def collect_population_data_ensemble(
         attack_types=challenge_attack_types,
         data_splits=["train", "dev", "final"],
         dataset="challenge",
-        data_config=data_processing_config,
+        data_processing_config=data_processing_config,
     )
     # Save the challenge points
     save_dataframe(df_challenge, save_dir, "challenge_points_all.csv")
diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py
@@ -20,13 +20,12 @@ def split_real_data(
     """Splits a real dataset into train, validation, and test sets, saves them as CSV files, and returns the splits.
 
     Args:
-        df_real (pd.DataFrame): The input real dataset to be split.
-        column_to_stratify (str, optional): Column name to use for stratified splitting. Defaults to None.
-        proportion (dict, optional): Proportions for train and validation splits.
-        random_seed (int, optional): Random seed for reproducibility. Defaults to None.
-
+        df_real: The input real dataset to be split.
+        column_to_stratify: Column name to use for stratified splitting.
+        proportion: Proportions for train and validation splits.
+        random_seed: Random seed for reproducibility.
     Returns:
-        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the train, validation, and test DataFrames.
+        A tuple containing the train, validation, and test dataframes.
     """
     if proportion is None:
         proportion = {"train": 0.5, "val": 0.25}
@@ -67,14 +66,14 @@ def generate_val_test(
     The resulting validation and test sets are used for meta classifier training and evaluation, respectively.
 
     Args:
-        df_real_train (pd.DataFrame): Real training data.
-        df_real_control_val (pd.DataFrame): Real control data for validation.
-        df_real_control_test (pd.DataFrame): Real control data for final evaluation.
-        stratify (pd.Series): Series used to stratify the real training data.
-        random_seed (int): Random seed for reproducibility.
+        df_real_train: Real training data.
+        df_real_control_val: Real control data for validation.
+        df_real_control_test: Real control data for final evaluation.
+        stratify: Series used to stratify the real training data.
+        random_seed: Random seed for reproducibility.
 
     Returns:
-        Tuple[pd.DataFrame, np.ndarray, pd.DataFrame, np.ndarray]: Features and labels for validation and test sets.
+        Features and labels for validation and test sets, respectively.
     """
     df_real_train["stratify"] = stratify
 
@@ -141,6 +140,13 @@ def process_split_data(
 ) -> None:
     """
     Splits the data into train, validation, and test sets according to the attack design.
+
+    Args:
+        all_population_data: The total population data that the attacker has access to as a DataFrame.
+        processed_attack_data_path: Path where the processed attack data will be saved.
+        column_to_stratify: Column name to use for stratified splitting.
+        num_total_samples: Number of samples that are randomly selected from the population. Defaults to 40000.
+        random_seed: Random seed used for reproducibility. Defaults to 42.
     """
 
     # Original Ensemble attack samples 40k data points to construct
diff --git a/src/midst_toolkit/attacks/ensemble/utils.py b/src/midst_toolkit/attacks/ensemble/utils.py
@@ -29,11 +29,11 @@ def load_dataframe(file_path: Path, file_name: str) -> pd.DataFrame:
     Load a DataFrame from a CSV file.
 
     Args:
-        file_path (str): Path where the file is stored.
-        file_name (str): Name of the file to load the DataFrame from.
+        file_path: Path where the file is stored.
+        file_name: Name of the file to load the DataFrame from.
 
     Returns:
-        pd.DataFrame: Loaded DataFrame.
+        Loaded dataframe.
     """
     full_path = file_path / file_name
     assert Path.exists(full_path), f"File {full_path} does not exist."