@@ -549,20 +549,26 @@ def run_cmd(self, server_idx: int) -> List[str]:
549549 server_cmd_with_port = add_host_port_to_cmd (server_cmd , server_hostname , server_port )
550550
551551 server_file_path = os .path .join (self .output_dir , f"trtllm-serve.{ server_idx } .log" )
552+ server_error_file_path = os .path .join (
553+ self .output_dir , f"trtllm-serve.{ server_idx } .error.log"
554+ )
552555
553556 print_info (f"Starting server. cmd is { server_cmd_with_port } " )
554- with open (server_file_path , "w" ) as server_ctx :
557+ with (
558+ open (server_file_path , "w" ) as server_ctx ,
559+ open (server_error_file_path , "w" ) as server_err_ctx ,
560+ ):
555561 server_proc = subprocess .Popen (
556562 server_cmd_with_port ,
557563 stdout = server_ctx ,
558- stderr = subprocess . STDOUT ,
564+ stderr = server_err_ctx ,
559565 env = copy .deepcopy (os .environ ),
560566 )
561567
562568 wait_for_endpoint_ready (
563569 f"http://{ server_hostname } :{ server_port } /health" ,
564570 timeout = self .timeout ,
565- check_files = [server_file_path ],
571+ check_files = [server_file_path , server_error_file_path ],
566572 server_proc = server_proc ,
567573 )
568574
@@ -571,20 +577,27 @@ def run_cmd(self, server_idx: int) -> List[str]:
571577 client_file_path = os .path .join (
572578 self .output_dir , f"trtllm-benchmark.{ server_idx } .{ client_idx } .log"
573579 )
580+ client_error_file_path = os .path .join (
581+ self .output_dir , f"trtllm-benchmark.{ server_idx } .{ client_idx } .error.log"
582+ )
574583
575584 client_cmd_with_port = add_host_port_to_cmd (
576585 client_cmd , server_hostname , server_port
577586 )
578587 print_info (f"Starting client. cmd is { client_cmd_with_port } " )
579588
580- output = subprocess .check_output (
589+ result = subprocess .run (
581590 client_cmd_with_port ,
582- stderr = subprocess . STDOUT ,
591+ capture_output = True ,
583592 env = copy .deepcopy (os .environ ),
584- ).decode ()
593+ check = True ,
594+ )
595+ output = result .stdout .decode ()
585596
586597 with open (client_file_path , "w" ) as client_ctx :
587598 client_ctx .write (output )
599+ with open (client_error_file_path , "w" ) as client_err_ctx :
600+ client_err_ctx .write (result .stderr .decode ())
588601
589602 outputs .append (output )
590603
@@ -723,7 +736,10 @@ def run_cmd(self, server_idx: int) -> List[str]:
723736 if "CTX" in self .disagg_serving_type or "GEN" in self .disagg_serving_type :
724737 self ._generate_hostname_file (server_idx , port )
725738 server_file_path = os .path .join (
726- self .output_dir , f"trtllm-serve.{ server_idx } .{ self .disagg_serving_type } .log"
739+ self .output_dir , f"trtllm-serve.{ self .disagg_serving_type } .{ server_idx } .log"
740+ )
741+ server_error_file_path = os .path .join (
742+ self .output_dir , f"trtllm-serve.{ self .disagg_serving_type } .{ server_idx } .error.log"
727743 )
728744 is_ctx = "CTX" in self .disagg_serving_type
729745 server_cmd = ctx_cmd if is_ctx else gen_cmd
@@ -732,11 +748,14 @@ def run_cmd(self, server_idx: int) -> List[str]:
732748 print_info (
733749 f"Starting server. disagg_serving_type: { self .disagg_serving_type } cmd is { server_cmd } "
734750 )
735- with open (server_file_path , "w" ) as server_ctx :
751+ with (
752+ open (server_file_path , "w" ) as server_ctx ,
753+ open (server_error_file_path , "w" ) as server_err_ctx ,
754+ ):
736755 server_proc = subprocess .Popen (
737756 server_cmd ,
738757 stdout = server_ctx ,
739- stderr = subprocess . STDOUT ,
758+ stderr = server_err_ctx ,
740759 env = copy .deepcopy (os .environ ),
741760 )
742761 self .wait_for_benchmark_ready (benchmark_status_file )
@@ -747,16 +766,22 @@ def run_cmd(self, server_idx: int) -> List[str]:
747766
748767 elif self .disagg_serving_type == "DISAGG_SERVER" :
749768 disagg_server_file_path = os .path .join (
750- self .output_dir , f"trtllm-serve.{ server_idx } .{ self .disagg_serving_type } .log"
769+ self .output_dir , f"trtllm-serve.{ self .disagg_serving_type } .{ server_idx } .log"
770+ )
771+ disagg_server_error_file_path = os .path .join (
772+ self .output_dir , f"trtllm-serve.{ self .disagg_serving_type } .{ server_idx } .error.log"
751773 )
752774 try :
753775 self ._generate_disagg_server_config (server_idx , port )
754776 print_info (f"Starting disagg server. cmd is { disagg_cmd } " )
755- with open (disagg_server_file_path , "w" ) as disagg_server_ctx :
777+ with (
778+ open (disagg_server_file_path , "w" ) as disagg_server_ctx ,
779+ open (disagg_server_error_file_path , "w" ) as disagg_server_err_ctx ,
780+ ):
756781 disagg_server_proc = subprocess .Popen (
757782 disagg_cmd ,
758783 stdout = disagg_server_ctx ,
759- stderr = subprocess . STDOUT ,
784+ stderr = disagg_server_err_ctx ,
760785 env = copy .deepcopy (os .environ ),
761786 )
762787 self .wait_for_benchmark_ready (benchmark_status_file )
@@ -770,21 +795,28 @@ def run_cmd(self, server_idx: int) -> List[str]:
770795 disagg_server_hostname , disagg_server_port = (
771796 self ._get_disagg_server_hostname_and_port (server_idx )
772797 )
773- server_files = [
774- os . path . join ( self . output_dir , f"trtllm-serve. { server_idx } .DISAGG_SERVER.log" ),
775- ]
776- for ctx_idx in range ( self .num_ctx_servers ):
777- server_files . append (
798+ server_files = (
799+ [
800+ os . path . join (
801+ self .output_dir , f"trtllm-serve.DISAGG_SERVER. { server_idx } .log"
802+ ),
778803 os .path .join (
779- self .output_dir , f"trtllm-serve.{ server_idx } .CTX_{ ctx_idx } .log"
804+ self .output_dir , f"trtllm-serve.DISAGG_SERVER.{ server_idx } .error.log"
805+ ),
806+ ]
807+ + [
808+ os .path .join (
809+ self .output_dir , f"trtllm-serve.CTX_{ ctx_idx } .{ server_idx } .log"
780810 )
781- )
782- for gen_idx in range ( self . num_gen_servers ):
783- server_files . append (
811+ for ctx_idx in range ( self . num_ctx_servers )
812+ ]
813+ + [
784814 os .path .join (
785- self .output_dir , f"trtllm-serve.{ server_idx } .GEN_ { gen_idx } .log"
815+ self .output_dir , f"trtllm-serve.GEN_ { gen_idx } . { server_idx } .log"
786816 )
787- )
817+ for gen_idx in range (self .num_gen_servers )
818+ ]
819+ )
788820 wait_for_endpoint_ready (
789821 f"http://{ disagg_server_hostname } :{ disagg_server_port } /health" ,
790822 timeout = self .timeout ,
@@ -796,20 +828,27 @@ def run_cmd(self, server_idx: int) -> List[str]:
796828 benchmark_file_path = os .path .join (
797829 self .output_dir , f"trtllm-benchmark.{ server_idx } .{ client_idx } .log"
798830 )
831+ benchmark_error_file_path = os .path .join (
832+ self .output_dir , f"trtllm-benchmark.{ server_idx } .{ client_idx } .error.log"
833+ )
799834
800835 client_cmd_with_port = add_host_port_to_cmd (
801836 client_cmd , disagg_server_hostname , disagg_server_port
802837 )
803838 print_info (f"Starting benchmark. cmd is { client_cmd_with_port } " )
804839
805- output = subprocess .check_output (
840+ result = subprocess .run (
806841 client_cmd_with_port ,
842+ capture_output = True ,
807843 env = copy .deepcopy (os .environ ),
808- stderr = subprocess .STDOUT ,
809- ).decode ()
844+ check = True ,
845+ )
846+ output = result .stdout .decode ()
810847
811848 with open (benchmark_file_path , "w" ) as benchmark_ctx :
812849 benchmark_ctx .write (output )
850+ with open (benchmark_error_file_path , "w" ) as benchmark_err_ctx :
851+ benchmark_err_ctx .write (result .stderr .decode ())
813852 outputs .append (output )
814853
815854 finally :
@@ -1197,11 +1236,21 @@ def run_ex(self, commands) -> Dict[int, List[str]]:
11971236
11981237 except Exception as e :
11991238 print_error (f"Test command failed for server { server_idx } . Error: { e } " )
1200- if isinstance (e , subprocess .CalledProcessError ):
1201- print_error ("--- stdout ---" )
1202- if e .stdout :
1203- print_error (e .stdout .decode () if isinstance (e .stdout , bytes ) else e .stdout )
1204- print_error ("--------------" )
1239+ # Print content of trtllm-serve error log files
1240+ error_log_pattern = os .path .join (
1241+ commands .output_dir , f"trtllm-serve*{ server_idx } .error.log"
1242+ )
1243+ error_log_files = glob .glob (error_log_pattern )
1244+ for error_log_file in error_log_files :
1245+ if os .path .exists (error_log_file ):
1246+ print_error (f"--- { error_log_file } ---" )
1247+ with open (error_log_file , "r" ) as f :
1248+ content = f .read ()
1249+ if content .strip ():
1250+ print_error (content )
1251+ else :
1252+ print_error ("(empty)" )
1253+ print_error ("-" * len (f"--- { error_log_file } ---" ))
12051254 outputs [server_idx ] = []
12061255
12071256 return outputs
0 commit comments