@@ -39,8 +39,6 @@ def _command(self, args: argparse.Namespace):
3939 run = self .api .runs .get (run_name = args .run_name )
4040 if run is None :
4141 raise CLIError (f"Run { args .run_name } not found" )
42- if run .status .is_finished ():
43- raise CLIError (f"Run { args .run_name } is finished" )
4442 metrics = _get_run_jobs_metrics (api = self .api , run = run )
4543
4644 if not args .watch :
@@ -55,8 +53,6 @@ def _command(self, args: argparse.Namespace):
5553 run = self .api .runs .get (run_name = args .run_name )
5654 if run is None :
5755 raise CLIError (f"Run { args .run_name } not found" )
58- if run .status .is_finished ():
59- raise CLIError (f"Run { args .run_name } is finished" )
6056 metrics = _get_run_jobs_metrics (api = self .api , run = run )
6157 except KeyboardInterrupt :
6258 pass
@@ -78,11 +74,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
7874def _get_metrics_table (run : Run , metrics : List [JobMetrics ]) -> Table :
7975 table = Table (box = None )
8076 table .add_column ("NAME" , style = "bold" , no_wrap = True )
77+ table .add_column ("STATUS" )
8178 table .add_column ("CPU" )
8279 table .add_column ("MEMORY" )
8380 table .add_column ("GPU" )
8481
85- run_row : Dict [Union [str , int ], Any ] = {"NAME" : run .name }
82+ run_row : Dict [Union [str , int ], Any ] = {"NAME" : run .name , "STATUS" : run . status . value }
8683 if len (run ._run .jobs ) != 1 :
8784 add_row_from_dict (table , run_row )
8885
@@ -101,9 +98,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
10198 cpu_usage = f"{ cpu_usage :.0f} %"
10299 memory_usage = _get_metric_value (job_metrics , "memory_working_set_bytes" )
103100 if memory_usage is not None :
104- memory_usage = f" { round (memory_usage / 1024 / 1024 ) } MB"
101+ memory_usage = _format_memory (memory_usage , 2 )
105102 if resources is not None :
106- memory_usage += f"/{ resources .memory_mib } MB "
103+ memory_usage += f"/{ _format_memory ( resources .memory_mib * 1024 * 1024 , 2 ) } "
107104 gpu_metrics = ""
108105 gpus_detected_num = _get_metric_value (job_metrics , "gpus_detected_num" )
109106 if gpus_detected_num is not None :
@@ -113,13 +110,16 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
113110 if gpu_memory_usage is not None :
114111 if i != 0 :
115112 gpu_metrics += "\n "
116- gpu_metrics += f"# { i } { round (gpu_memory_usage / 1024 / 1024 ) } MB "
113+ gpu_metrics += f"gpu= { i } mem= { _format_memory (gpu_memory_usage , 2 ) } "
117114 if resources is not None :
118- gpu_metrics += f"/{ resources .gpus [i ].memory_mib } MB"
119- gpu_metrics += f" { gpu_util_percent } % Util"
115+ gpu_metrics += (
116+ f"/{ _format_memory (resources .gpus [i ].memory_mib * 1024 * 1024 , 2 )} "
117+ )
118+ gpu_metrics += f" util={ gpu_util_percent } %"
120119
121120 job_row : Dict [Union [str , int ], Any ] = {
122121 "NAME" : f" replica={ job .job_spec .replica_num } job={ job .job_spec .job_num } " ,
122+ "STATUS" : job .job_submissions [- 1 ].status .value ,
123123 "CPU" : cpu_usage or "-" ,
124124 "MEMORY" : memory_usage or "-" ,
125125 "GPU" : gpu_metrics or "-" ,
@@ -136,3 +136,18 @@ def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]:
136136 if metric .name == name :
137137 return metric .values [- 1 ]
138138 return None
139+
140+
141+ def _format_memory (memory_bytes : int , decimal_places : int ) -> str :
142+ """See test_format_memory in tests/_internal/cli/commands/test_metrics.py for examples."""
143+ memory_mb = memory_bytes / 1024 / 1024
144+ if memory_mb >= 1024 :
145+ value = memory_mb / 1024
146+ unit = "GB"
147+ else :
148+ value = memory_mb
149+ unit = "MB"
150+
151+ if decimal_places == 0 :
152+ return f"{ round (value )} { unit } "
153+ return f"{ value :.{decimal_places }f} " .rstrip ("0" ).rstrip ("." ) + unit
0 commit comments