Skip to content

Commit 22e2dbf

Browse files
author
Douglas Blank
committed
WIP gpu_report
1 parent 313e2c1 commit 22e2dbf

File tree

3 files changed

+1048
-53
lines changed

3 files changed

+1048
-53
lines changed

cometx/cli/admin.py

Lines changed: 152 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,33 @@
6363
- GPU utilization charts (if GPU data is available)
6464
- GPU memory utilization charts (if GPU data is available)
6565
66+
gpu-report
67+
Generate a GPU usage report for one or more workspaces/projects.
68+
69+
Usage:
70+
cometx admin gpu-report WORKSPACE [WORKSPACE ...] --start-date DATE
71+
cometx admin gpu-report WORKSPACE/PROJECT [WORKSPACE/PROJECT ...] --start-date DATE
72+
73+
Arguments:
74+
WORKSPACE_PROJECT (required, one or more)
75+
One or more WORKSPACE or WORKSPACE/PROJECT to run GPU report for.
76+
If WORKSPACE is provided without a project, all projects in that workspace will be included.
77+
78+
Options:
79+
--start-date DATE
80+
Start date for the report in YYYY-MM-DD format (required).
81+
82+
--end-date DATE
83+
End date for the report in YYYY-MM-DD format (optional).
84+
If not provided, the report covers everything from the start date onwards.
85+
86+
--metrics METRIC [METRIC ...]
87+
List of metrics to track (optional).
88+
If not provided, uses default GPU metrics.
89+
90+
Output:
91+
Generates a PDF report containing summary statistics and GPU metric charts.
92+
6693
Global Options (available for all commands):
6794
--api-key KEY
6895
Set the COMET_API_KEY for authentication.
@@ -83,6 +110,9 @@
83110
cometx admin usage-report my-workspace/project1 my-workspace/project2
84111
cometx admin usage-report workspace1 workspace2 --units week
85112
cometx admin usage-report workspace --units day --no-open
113+
cometx admin gpu-report my-workspace --start-date 2024-01-01
114+
cometx admin gpu-report my-workspace --start-date 2024-01-01 --end-date 2024-12-31
115+
cometx admin gpu-report workspace1/project1 workspace2 --start-date 2024-01-01 --metrics sys.gpu.0.gpu_utilization
86116
87117
"""
88118

@@ -93,6 +123,7 @@
93123

94124
from comet_ml import API
95125

126+
from .admin_gpu_report import main as gpu_report_main
96127
from .admin_usage_report import generate_usage_report
97128

98129
ADDITIONAL_ARGS = False
@@ -234,6 +265,88 @@ def get_parser_arguments(parser):
234265
type=str,
235266
)
236267

268+
# gpu-report subcommand
269+
gpu_report_description = """Generate a GPU usage report for one or more workspaces/projects.
270+
271+
Arguments:
272+
WORKSPACE_PROJECT (required, one or more)
273+
One or more WORKSPACE or WORKSPACE/PROJECT to run GPU report for.
274+
If WORKSPACE is provided without a project, all projects in that workspace will be included.
275+
276+
Options:
277+
--start-date DATE
278+
Start date for the report in YYYY-MM-DD format (required).
279+
280+
--end-date DATE
281+
End date for the report in YYYY-MM-DD format (optional).
282+
If not provided, reports from start-date onwards.
283+
284+
--metrics METRIC [METRIC ...]
285+
List of metrics to track (optional).
286+
If not provided, uses default GPU metrics:
287+
- sys.gpu.0.gpu_utilization
288+
- sys.gpu.0.memory_utilization
289+
- sys.gpu.0.used_memory
290+
- sys.gpu.0.power_usage
291+
- sys.gpu.0.temperature
292+
293+
--open
294+
Automatically open the generated PDF file after generation.
295+
296+
Output:
297+
Generates a PDF report containing:
298+
- Summary statistics (total experiments, workspaces, metrics tracked)
299+
- Average metrics by workspace charts
300+
- Maximum metrics by month charts
301+
302+
Examples:
303+
cometx admin gpu-report my-workspace --start-date 2024-01-01
304+
cometx admin gpu-report my-workspace --start-date 2024-01-01 --end-date 2024-12-31
305+
cometx admin gpu-report workspace1/project1 workspace2 --start-date 2024-01-01
306+
cometx admin gpu-report my-workspace --start-date 2024-01-01 --metrics sys.gpu.0.gpu_utilization sys.gpu.0.memory_utilization
307+
cometx admin gpu-report my-workspace --start-date 2024-01-01 --open
308+
"""
309+
gpu_parser = subparsers.add_parser(
310+
"gpu-report",
311+
help="Generate a GPU usage report for one or more workspaces/projects",
312+
description=gpu_report_description,
313+
formatter_class=argparse.RawDescriptionHelpFormatter,
314+
)
315+
# Add global arguments to subparser so they show in help
316+
add_global_arguments(gpu_parser)
317+
gpu_parser.add_argument(
318+
"WORKSPACE_PROJECT",
319+
nargs="+",
320+
help="One or more WORKSPACE or WORKSPACE/PROJECT to run GPU report for",
321+
metavar="WORKSPACE",
322+
type=str,
323+
)
324+
gpu_parser.add_argument(
325+
"--start-date",
326+
help="Start date for the report in YYYY-MM-DD format (required)",
327+
type=str,
328+
required=True,
329+
)
330+
gpu_parser.add_argument(
331+
"--end-date",
332+
help="End date for the report in YYYY-MM-DD format (optional)",
333+
type=str,
334+
default=None,
335+
)
336+
gpu_parser.add_argument(
337+
"--metrics",
338+
help="List of metrics to track (optional, uses defaults if not provided)",
339+
nargs="+",
340+
type=str,
341+
default=None,
342+
)
343+
gpu_parser.add_argument(
344+
"--open",
345+
help="Automatically open the generated PDF file after generation",
346+
default=False,
347+
action="store_true",
348+
)
349+
237350

238351
def admin(parsed_args, remaining=None):
239352
# Called via `cometx admin ...`
@@ -326,6 +439,45 @@ def admin(parsed_args, remaining=None):
326439
except Exception as e:
327440
print("ERROR: " + str(e))
328441
return
442+
elif parsed_args.ACTION == "gpu-report":
443+
workspace_projects = parsed_args.WORKSPACE_PROJECT
444+
start_date = parsed_args.start_date
445+
end_date = parsed_args.end_date
446+
metrics = parsed_args.metrics
447+
448+
try:
449+
result = gpu_report_main(
450+
workspace_projects=workspace_projects,
451+
start_date=start_date,
452+
end_date=end_date,
453+
metrics=metrics,
454+
max_workers=None, # Use default
455+
)
456+
if result:
457+
num_experiments = len(result.get("metrics", {}))
458+
num_charts = len(result.get("charts", []))
459+
pdf_file = result.get("pdf_file")
460+
print(
461+
f"\nGPU report completed. Processed {num_experiments} experiments."
462+
)
463+
if num_charts > 0:
464+
print(f"Generated {num_charts} charts:")
465+
for chart_file in result.get("charts", []):
466+
print(f" - {chart_file}")
467+
if pdf_file:
468+
print(f"PDF report: {pdf_file}")
469+
# Open PDF if --open flag is set
470+
if parsed_args.open:
471+
from .admin_gpu_report import open_pdf
472+
473+
open_pdf(pdf_file, debug=parsed_args.debug)
474+
except Exception as e:
475+
print("ERROR: " + str(e))
476+
if parsed_args.debug:
477+
import traceback
478+
479+
traceback.print_exc()
480+
return
329481

330482
except KeyboardInterrupt:
331483
if parsed_args.debug:

0 commit comments

Comments
 (0)