Skip to content

Commit d5eeb0d

Browse files
committed
Add anonymization option to preprocessing
1 parent 0ead8e8 commit d5eeb0d

File tree

6 files changed

+48
-11
lines changed

6 files changed

+48
-11
lines changed

notebooks/analysis/Requested and Used VRAM.ipynb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@
132132
"preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(\n",
133133
" db_path=Path(project_root) / \"data/slurm_data.db\",\n",
134134
" table_name=\"Jobs\",\n",
135+
" anonymize=True,\n",
135136
")\n",
136137
"display(preprocessed_jobs_df.head(10))\n",
137138
"print(preprocessed_jobs_df.shape)"

src/analysis/efficiency_analysis.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def load_preprocessed_jobs_dataframe_from_duckdb(
2323
table_name: str = "Jobs",
2424
sample_size: int | None = None,
2525
random_state: pd._typing.RandomState | None = None,
26+
anonymize: bool = False,
2627
) -> pd.DataFrame:
2728
"""
2829
Load jobs DataFrame from a DuckDB database and preprocess it.
@@ -32,6 +33,7 @@ def load_preprocessed_jobs_dataframe_from_duckdb(
3233
table_name (str, optional): Table name to query. Defaults to 'Jobs'.
3334
sample_size (int, optional): Number of rows to sample from the DataFrame. Defaults to None (no sampling).
3435
random_state (pd._typing.RandomState, optional): Random state for reproducibility. Defaults to None.
36+
anonymize (bool, optional): Whether to anonymize the DataFrame. Defaults to False.
3537
3638
Returns:
3739
pd.DataFrame: DataFrame containing the table data.
@@ -46,7 +48,11 @@ def load_preprocessed_jobs_dataframe_from_duckdb(
4648

4749
jobs_df = db.fetch_all_jobs(table_name=table_name)
4850
processed_data = preprocess_data(
49-
jobs_df, min_elapsed_seconds=0, include_failed_cancelled_jobs=False, include_cpu_only_jobs=False
51+
jobs_df,
52+
min_elapsed_seconds=0,
53+
include_failed_cancelled_jobs=False,
54+
include_cpu_only_jobs=False,
55+
anonymize=anonymize,
5056
)
5157
if sample_size is not None:
5258
processed_data = processed_data.sample(n=sample_size, random_state=random_state)

src/config/snapshots/partition_info.json

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
11
[
2+
{
3+
"name": "arm",
4+
"type": "cpu",
5+
"node_count": 9,
6+
"maxtime": "14 days",
7+
"deftime": "1 hour",
8+
"max_ram": 470,
9+
"max_cpus": 144
10+
},
211
{
312
"name": "arm-gpu",
413
"type": "gpu",
5-
"node_count": 3,
14+
"node_count": 4,
615
"maxtime": "14 days",
716
"deftime": "1 hour",
817
"max_ram": 560,
@@ -83,7 +92,7 @@
8392
{
8493
"name": "cpu",
8594
"type": "cpu",
86-
"node_count": 147,
95+
"node_count": 153,
8796
"maxtime": "2 days",
8897
"deftime": "1 hour",
8998
"max_ram": 1510,
@@ -92,7 +101,7 @@
92101
{
93102
"name": "cpu-preempt",
94103
"type": "cpu",
95-
"node_count": 138,
104+
"node_count": 144,
96105
"maxtime": "2 days",
97106
"deftime": "1 hour",
98107
"max_ram": 1510,
@@ -155,7 +164,7 @@
155164
{
156165
"name": "gpupod-l40s",
157166
"type": "gpu",
158-
"node_count": 10,
167+
"node_count": 13,
159168
"maxtime": "14 days",
160169
"deftime": "1 hour",
161170
"max_ram": 500,
@@ -164,11 +173,11 @@
164173
{
165174
"name": "ials-gpu",
166175
"type": "gpu",
167-
"node_count": 31,
176+
"node_count": 28,
168177
"maxtime": "14 days",
169178
"deftime": "1 hour",
170-
"max_ram": 500,
171-
"max_cpus": 32
179+
"max_ram": 180,
180+
"max_cpus": 24
172181
},
173182
{
174183
"name": "jdelhommelle",
@@ -272,7 +281,7 @@
272281
{
273282
"name": "uri-cpu",
274283
"type": "cpu",
275-
"node_count": 43,
284+
"node_count": 49,
276285
"maxtime": "30 days",
277286
"deftime": "1 hour",
278287
"max_ram": 1000,

src/preprocess/preprocess.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,25 @@ def _write_preprocessing_error_logs(preprocessing_error_logs: list[dict]) -> Non
180180
f.writelines(summary_lines)
181181

182182

183+
def anonymize_str_column(column: pd.Series, prefix: str) -> pd.Series:
    """Replace each distinct value in *column* with a pseudonymous label.

    Values are dense-ranked, so equal originals always receive the same
    label and labels are numbered consecutively starting at 1. Each rank
    is zero-padded to two digits and prefixed, e.g. ``"user_01"``.

    Args:
        column (pd.Series): The column whose values should be anonymized.
        prefix (str): String prepended to every generated identifier.

    Returns:
        pd.Series: Series of anonymized labels aligned with the input index.
    """
    # Dense rank gives consecutive integers (1, 2, 3, ...) per distinct value.
    dense_ranks = column.rank(method="dense").astype(int)
    padded = dense_ranks.astype(str).str.zfill(2)
    return prefix + padded
194+
195+
183196
def preprocess_data(
184197
input_df: pd.DataFrame,
185198
min_elapsed_seconds: int = DEFAULT_MIN_ELAPSED_SECONDS,
186199
include_failed_cancelled_jobs: bool = False,
187200
include_cpu_only_jobs: bool = False,
201+
anonymize: bool = False,
188202
) -> pd.DataFrame:
189203
"""
190204
Preprocess dataframe, filtering out unwanted rows and columns, filling missing values and converting types.
@@ -196,6 +210,7 @@ def preprocess_data(
196210
min_elapsed_seconds (int, optional): Minimum elapsed time in seconds to keep a job record. Defaults to 600.
197211
include_failed_cancelled_jobs (bool, optional): Whether to include jobs with status FAILED or CANCELLED.
198212
include_cpu_only_jobs (bool, optional): Whether to include jobs that do not use GPUs (CPU-only jobs).
213+
anonymize (bool, optional): Whether to anonymize user and account information.
199214
200215
Returns:
201216
pd.DataFrame: The preprocessed dataframe
@@ -288,9 +303,15 @@ def preprocess_data(
288303
if error_indices:
289304
data = data.drop(index=list(error_indices)).reset_index(drop=True)
290305

306+
# TODO (Tan): remove these two columns as they are calculated during analysis
291307
data.loc[:, "user_jobs"] = data.groupby("User")["User"].transform("size")
292308
data.loc[:, "account_jobs"] = data.groupby("Account")["Account"].transform("size")
293309

310+
# Anonymize user and account information
311+
if anonymize:
312+
data.loc[:, "User"] = anonymize_str_column(data["User"], "user_")
313+
data.loc[:, "Account"] = anonymize_str_column(data["Account"], "account_")
314+
294315
# Convert columns to categorical
295316
for col, enum_obj in ATTRIBUTE_CATEGORIES.items():
296317
enum_values = [e.value for e in enum_obj]

src/visualization/efficiency_metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ def visualize_metric_distribution(self, output_dir_path: Path | None = None, **k
322322
)
323323
column = validated_kwargs.column
324324
figsize = validated_kwargs.figsize
325-
output_dir_path = self.validate_output_dir(output_dir_path)
325+
output_dir_path = self.validate_output_dir(output_dir_path)
326326

327327
# Distribution of Avg Requested VRAM Efficiency Score (actual values; all are <= 0)
328328
# We keep scores as-is (negative or zero) and construct bins that respect the skew while

src/visualization/visualization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def validate_dataframe(self) -> pd.DataFrame:
4646
if self.df.columns.empty:
4747
raise ValueError("DataFrame has no columns.")
4848
return self.df
49-
49+
5050
@staticmethod
5151
def anonymize_str_column(column: pd.Series, prefix: str) -> pd.Series:
5252
"""Anonymize a DataFrame column by replacing its values with a unique identifier.

Comments (0)