
Commit 364eb68

Feature/update column types (#17)
* updating documentation for GPUType
* added ArrayJobID documentation
* committing changes to units
* Merge branch 'main' into feature/update-column-types
* committing changes to column types -- TESTING TIMELIMIT AS MINUTES
* committing changes for TimeLimit units
* changing TimeLimit to seconds in preprocess.py
* resolving comments on PR
* committing changes to feature visualize columns
* committing changes checking if mem_db is not None
* Add disconnect() call to avoid teardown error in pytest
* merging main and resolving conflicts for preprocess
1 parent 202f611 commit 364eb68

File tree

7 files changed: +15 -16 lines

README.md

Lines changed: 2 additions & 1 deletion

```diff
@@ -166,6 +166,7 @@ contains tools to add a number of useful derived columns for plotting and analys
 | UUID | VARCHAR | Unique identifier |
 | JobID | INTEGER | Slurm job ID |
 | ArrayID | INTEGER | Position in job array |
+| ArrayJobID | INTEGER | Slurm job ID within array |
 | JobName | VARCHAR | Name of job |
 | IsArray | BOOLEAN | Indicator if job is part of an array |
 | Interactive | VARCHAR | Indicator if job was interactive |
@@ -187,7 +188,7 @@ contains tools to add a number of useful derived columns for plotting and analys
 | CPUs | SMALLINT | Number of CPU cores |
 | Memory | INTEGER | Job allocated memory (bytes) |
 | GPUs | SMALLINT | Number of GPUs requested |
-| GPUType | VARCHAR[] | List of GPU types |
+| GPUType | DICT | Dictionary mapping GPU type (str) to the number of GPUs of that type (int) |
 | GPUMemUsage | FLOAT | GPU memory usage (bytes) |
 | GPUComputeUsage | FLOAT | GPU compute usage (pct) |
 | CPUMemUsage | FLOAT | CPU memory usage (bytes) |
```
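For illustration, a cell under the new GPUType shape might look like the following sketch (the GPU names are hypothetical; actual keys depend on the cluster's GRES labels):

```python
# Hypothetical GPUType cell after the change: GPU type name -> count.
gpu_type = {"a100": 2, "v100": 1}

# The counts should sum to the job's GPUs column.
assert sum(gpu_type.values()) == 3
```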

src/preprocess/preprocess.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -290,9 +290,9 @@ def preprocess_data(
     for col in time_columns:
         data[col] = pd.to_datetime(data[col], errors="coerce")
 
-    timedelta_columns = ["TimeLimit", "Elapsed"]
-    for col in timedelta_columns:
-        data[col] = pd.to_timedelta(data[col], unit="s", errors="coerce")
+    time_limit_in_seconds = data["TimeLimit"] * 60
+    data["TimeLimit"] = pd.to_timedelta(time_limit_in_seconds, unit="s", errors="coerce")
+    data["Elapsed"] = pd.to_timedelta(data["Elapsed"], unit="s", errors="coerce")
 
     # Added parameters for calculating VRAM metrics
     data.loc[:, "Queued"] = data["StartTime"] - data["SubmitTime"]
```
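Per the commit message, raw TimeLimit values arrive in minutes while Elapsed is in seconds, which is why only TimeLimit is scaled by 60 before conversion. A minimal standalone sketch with made-up values:

```python
import pandas as pd

# Hypothetical raw values: TimeLimit in minutes, Elapsed in seconds.
data = pd.DataFrame({"TimeLimit": [60, 120], "Elapsed": [3600, 5400]})

# Scale TimeLimit to seconds so both columns end up as
# timedelta64[ns] in a consistent unit.
time_limit_in_seconds = data["TimeLimit"] * 60
data["TimeLimit"] = pd.to_timedelta(time_limit_in_seconds, unit="s", errors="coerce")
data["Elapsed"] = pd.to_timedelta(data["Elapsed"], unit="s", errors="coerce")

print(data["TimeLimit"].iloc[0])  # 0 days 01:00:00
print(data["Elapsed"].iloc[1])    # 0 days 01:30:00
```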

src/visualization/columns.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -70,11 +70,6 @@ def _plot_duration_histogram(self, jobs_df: pd.DataFrame, col: str, output_dir_p
         None
         """
         col_data = jobs_df[col]
-        # Determine unit for conversion
-        if col == "Elapsed" and not pd.api.types.is_timedelta64_dtype(col_data):
-            col_data = pd.to_timedelta(col_data, unit="seconds", errors="coerce")
-        elif col == "TimeLimit" and not pd.api.types.is_timedelta64_dtype(col_data):
-            col_data = pd.to_timedelta(col_data, unit="minutes", errors="coerce")
 
         # Convert to minutes for plotting
         timelimit_minutes = col_data.dropna().dt.total_seconds() / 60
```
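The removed branch guarded against non-timedelta inputs per column; with preprocessing now guaranteeing timedelta64 dtypes for both Elapsed and TimeLimit, one uniform conversion suffices. A small sketch of the surviving logic, with hypothetical values:

```python
import pandas as pd

# After preprocessing, duration columns are already timedelta64[ns],
# so a single conversion to minutes covers both Elapsed and TimeLimit.
col_data = pd.Series(pd.to_timedelta([3600, 7200, None], unit="s"))
timelimit_minutes = col_data.dropna().dt.total_seconds() / 60
print(timelimit_minutes.tolist())  # [60.0, 120.0]
```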

tests/conftest.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -8,7 +8,7 @@
 @pytest.fixture(scope="module")
 def mock_data_frame():
     temp_db_dir = tempfile.mkdtemp()
-    db = None
+    mem_db = None
     try:
         temp_db_path = f"{temp_db_dir}/mock.db"
         convert_csv_to_db("tests/mock_data/mock.csv", temp_db_path)
@@ -17,5 +17,8 @@ def mock_data_frame():
     except Exception as e:
         raise Exception("Exception at mock_data_frame") from e
     finally:
-        del db
+        if mem_db is not None:
+            mem_db._disconnect()
+            del mem_db
         shutil.rmtree(temp_db_dir)
+
```
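The teardown change follows a common fixture pattern: initialize the handle to None, then disconnect explicitly in `finally` before removing the temp directory. A runnable sketch of the same pattern, using sqlite3 in place of the project's connection class (which this diff does not show):

```python
import shutil
import sqlite3
import tempfile

import pytest

@pytest.fixture(scope="module")
def mock_db():
    # None up front so `finally` can safely test whether setup got far
    # enough to open a connection.
    temp_db_dir = tempfile.mkdtemp()
    conn = None
    try:
        conn = sqlite3.connect(f"{temp_db_dir}/mock.db")
        yield conn
    finally:
        # Close explicitly instead of relying on `del` alone, so teardown
        # does not depend on garbage collection releasing the handle.
        if conn is not None:
            conn.close()
            del conn
        shutil.rmtree(temp_db_dir)
```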

tests/mock_data/convert_csv_to_db.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -69,4 +69,4 @@ def convert_csv_to_db(path_to_csv: str, path_to_db: str):
         raise e
     finally:
         if conn is not None:
-            conn.close()
\ No newline at end of file
+            conn.close()
```

tests/test_database_connection.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -87,6 +87,7 @@ def temp_file_db():
         raise e
     finally:
         if mem_db is not None:
+            mem_db._disconnect()
             del mem_db
         shutil.rmtree(temp_db_dir)
 
@@ -164,4 +165,4 @@ def test_fetch_query_with_invalid_column(temp_file_db):
     with pytest.raises(Exception) as exc_info:
         temp_file_db.fetch_query(query)
     msg = str(exc_info.value)
-    assert "This query does not match the database schema." in msg
\ No newline at end of file
+    assert "This query does not match the database schema." in msg
```

tests/test_preprocess.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -380,8 +380,8 @@ def test_preprocess_timedelta_conversion(mock_data_frame):
     time_limit = data["TimeLimit"]
 
     assert time_limit.dtype == "timedelta64[ns]"
-    assert time_limit[0].total_seconds() == ground_truth["TimeLimit"][0]
-    assert time_limit[max_len - 1].total_seconds() == ground_truth["TimeLimit"][max_len - 1]
+    assert time_limit[0].total_seconds() / 60 == ground_truth["TimeLimit"][0]
+    assert time_limit[max_len - 1].total_seconds() / 60 == ground_truth["TimeLimit"][max_len - 1]
 
 
 def test_preprocess_gpu_type(mock_data_frame):
@@ -394,7 +394,6 @@ def test_preprocess_gpu_type(mock_data_frame):
         include_cpu_only_jobs=True,
     )
 
-    # Check that GPUType is filled with 'cpu' for CPU-only jobs
     assert all(row == ["cpu"] for row in data.loc[data["GPUType"].isna(), "GPUType"])
 
     # Check that numpy arrays in GPUType are converted to lists
```
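The updated assertions reflect the round trip: ground-truth TimeLimit values stay in minutes, while the preprocessed column is a timedelta, so the test divides total seconds back by 60. A self-contained illustration with a hypothetical value:

```python
import pandas as pd

ground_truth_minutes = 90  # hypothetical raw TimeLimit value, in minutes
processed = pd.to_timedelta(ground_truth_minutes * 60, unit="s")

# Dividing total_seconds() by 60 recovers the original minutes value.
assert processed.total_seconds() / 60 == ground_truth_minutes
```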
