
Commit 364eb68

Feature/update column types (#17)
* updating documentation for GPUType
* added ArrayJobID documentation
* committing changes to units
* Merge branch 'main' into feature/update-column-types
* committing changes to column types -- TESTING TIMELIMIT AS MINUTES
* committing changes for TimeLimit units
* changing TimeLimit to seconds in preprocess.py
* resolving comments on PR
* committing changes to feature visualize columns
* committing changes checking if mem_db is not None
* Add disconnect() call to avoid teardown error in pytest
* merging main and resolving conflicts for preprocess
1 parent 202f611 commit 364eb68

File tree

7 files changed: +15 -16 lines

README.md

Lines changed: 2 additions & 1 deletion

```diff
@@ -166,6 +166,7 @@ contains tools to add a number of useful derived columns for plotting and analys
 | UUID | VARCHAR | Unique identifier |
 | JobID | INTEGER | Slurm job ID |
 | ArrayID | INTEGER | Position in job array |
+| ArrayJobID | INTEGER | Slurm job ID within array |
 | JobName | VARCHAR | Name of job |
 | IsArray | BOOLEAN | Indicator if job is part of an array |
 | Interactive | VARCHAR | Indicator if job was interactive |
@@ -187,7 +188,7 @@ contains tools to add a number of useful derived columns for plotting and analys
 | CPUs | SMALLINT | Number of CPU cores |
 | Memory | INTEGER | Job allocated memory (bytes) |
 | GPUs | SMALLINT | Number of GPUs requested |
-| GPUType | VARCHAR[] | List of GPU types |
+| GPUType | DICT | Dictionary mapping GPU type (str) to the number of GPUs of that type (int) |
 | GPUMemUsage | FLOAT | GPU memory usage (bytes) |
 | GPUComputeUsage | FLOAT | GPU compute usage (pct) |
 | CPUMemUsage | FLOAT | CPU memory usage (bytes) |
```
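For illustration, a cell under the new GPUType shape might look like the following sketch (the GPU names are hypothetical; actual keys depend on the cluster's GRES labels):

```python
# Hypothetical GPUType cell after the change: GPU type name -> count.
gpu_type = {"a100": 2, "v100": 1}

# The counts should sum to the job's GPUs column.
assert sum(gpu_type.values()) == 3
```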

src/preprocess/preprocess.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -290,9 +290,9 @@ def preprocess_data(
     for col in time_columns:
         data[col] = pd.to_datetime(data[col], errors="coerce")
 
-    timedelta_columns = ["TimeLimit", "Elapsed"]
-    for col in timedelta_columns:
-        data[col] = pd.to_timedelta(data[col], unit="s", errors="coerce")
+    time_limit_in_seconds = data["TimeLimit"] * 60
+    data["TimeLimit"] = pd.to_timedelta(time_limit_in_seconds, unit="s", errors="coerce")
+    data["Elapsed"] = pd.to_timedelta(data["Elapsed"], unit="s", errors="coerce")
 
     # Added parameters for calculating VRAM metrics
     data.loc[:, "Queued"] = data["StartTime"] - data["SubmitTime"]
```
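Per the commit message, raw TimeLimit values arrive in minutes while Elapsed is in seconds, which is why only TimeLimit is scaled by 60 before conversion. A minimal standalone sketch with made-up values:

```python
import pandas as pd

# Hypothetical raw values: TimeLimit in minutes, Elapsed in seconds.
data = pd.DataFrame({"TimeLimit": [60, 120], "Elapsed": [3600, 5400]})

# Scale TimeLimit to seconds so both columns end up as
# timedelta64[ns] in a consistent unit.
time_limit_in_seconds = data["TimeLimit"] * 60
data["TimeLimit"] = pd.to_timedelta(time_limit_in_seconds, unit="s", errors="coerce")
data["Elapsed"] = pd.to_timedelta(data["Elapsed"], unit="s", errors="coerce")

print(data["TimeLimit"].iloc[0])  # 0 days 01:00:00
print(data["Elapsed"].iloc[1])    # 0 days 01:30:00
```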

src/visualization/columns.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -70,11 +70,6 @@ def _plot_duration_histogram(self, jobs_df: pd.DataFrame, col: str, output_dir_p
         None
         """
         col_data = jobs_df[col]
-        # Determine unit for conversion
-        if col == "Elapsed" and not pd.api.types.is_timedelta64_dtype(col_data):
-            col_data = pd.to_timedelta(col_data, unit="seconds", errors="coerce")
-        elif col == "TimeLimit" and not pd.api.types.is_timedelta64_dtype(col_data):
-            col_data = pd.to_timedelta(col_data, unit="minutes", errors="coerce")
 
         # Convert to minutes for plotting
         timelimit_minutes = col_data.dropna().dt.total_seconds() / 60
```
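The removed branch guarded against non-timedelta inputs per column; with preprocessing now guaranteeing timedelta64 dtypes for both Elapsed and TimeLimit, one uniform conversion suffices. A small sketch of the surviving logic, with hypothetical values:

```python
import pandas as pd

# After preprocessing, duration columns are already timedelta64[ns],
# so a single conversion to minutes covers both Elapsed and TimeLimit.
col_data = pd.Series(pd.to_timedelta([3600, 7200, None], unit="s"))
timelimit_minutes = col_data.dropna().dt.total_seconds() / 60
print(timelimit_minutes.tolist())  # [60.0, 120.0]
```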

tests/conftest.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -8,7 +8,7 @@
 @pytest.fixture(scope="module")
 def mock_data_frame():
     temp_db_dir = tempfile.mkdtemp()
-    db = None
+    mem_db = None
     try:
         temp_db_path = f"{temp_db_dir}/mock.db"
         convert_csv_to_db("tests/mock_data/mock.csv", temp_db_path)
@@ -17,5 +17,8 @@ def mock_data_frame():
     except Exception as e:
         raise Exception("Exception at mock_data_frame") from e
     finally:
-        del db
+        if mem_db is not None:
+            mem_db._disconnect()
+            del mem_db
         shutil.rmtree(temp_db_dir)
+
```
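The teardown change follows a common fixture pattern: initialize the handle to None, then disconnect explicitly in `finally` before removing the temp directory. A runnable sketch of the same pattern, using sqlite3 in place of the project's connection class (which this diff does not show):

```python
import shutil
import sqlite3
import tempfile

import pytest

@pytest.fixture(scope="module")
def mock_db():
    # None up front so `finally` can safely test whether setup got far
    # enough to open a connection.
    temp_db_dir = tempfile.mkdtemp()
    conn = None
    try:
        conn = sqlite3.connect(f"{temp_db_dir}/mock.db")
        yield conn
    finally:
        # Close explicitly instead of relying on `del` alone, so teardown
        # does not depend on garbage collection releasing the handle.
        if conn is not None:
            conn.close()
            del conn
        shutil.rmtree(temp_db_dir)
```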

tests/mock_data/convert_csv_to_db.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -69,4 +69,4 @@ def convert_csv_to_db(path_to_csv: str, path_to_db: str):
         raise e
     finally:
         if conn is not None:
-            conn.close()
\ No newline at end of file
+            conn.close()
```

tests/test_database_connection.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -87,6 +87,7 @@ def temp_file_db():
         raise e
     finally:
         if mem_db is not None:
+            mem_db._disconnect()
             del mem_db
         shutil.rmtree(temp_db_dir)
 
@@ -164,4 +165,4 @@ def test_fetch_query_with_invalid_column(temp_file_db):
     with pytest.raises(Exception) as exc_info:
         temp_file_db.fetch_query(query)
     msg = str(exc_info.value)
-    assert "This query does not match the database schema." in msg
\ No newline at end of file
+    assert "This query does not match the database schema." in msg
```

tests/test_preprocess.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -380,8 +380,8 @@ def test_preprocess_timedelta_conversion(mock_data_frame):
     time_limit = data["TimeLimit"]
 
     assert time_limit.dtype == "timedelta64[ns]"
-    assert time_limit[0].total_seconds() == ground_truth["TimeLimit"][0]
-    assert time_limit[max_len - 1].total_seconds() == ground_truth["TimeLimit"][max_len - 1]
+    assert time_limit[0].total_seconds() / 60 == ground_truth["TimeLimit"][0]
+    assert time_limit[max_len - 1].total_seconds() / 60 == ground_truth["TimeLimit"][max_len - 1]
 
 
 def test_preprocess_gpu_type(mock_data_frame):
@@ -394,7 +394,6 @@ def test_preprocess_gpu_type(mock_data_frame):
         include_cpu_only_jobs=True,
     )
 
-    # Check that GPUType is filled with 'cpu' for CPU-only jobs
     assert all(row == ["cpu"] for row in data.loc[data["GPUType"].isna(), "GPUType"])
 
     # Check that numpy arrays in GPUType are converted to lists
```
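The updated assertions reflect the round trip: ground-truth TimeLimit values stay in minutes, while the preprocessed column is a timedelta, so the test divides total seconds back by 60. A self-contained illustration with a hypothetical value:

```python
import pandas as pd

ground_truth_minutes = 90  # hypothetical raw TimeLimit value, in minutes
processed = pd.to_timedelta(ground_truth_minutes * 60, unit="s")

# Dividing total_seconds() by 60 recovers the original minutes value.
assert processed.total_seconds() / 60 == ground_truth_minutes
```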
