@@ -594,10 +594,29 @@ public void run() {
594594 Map <HboxContainerId , String > containersAppFinishTime =
595595 applicationContext .getContainersAppFinishTime ();
596596
597+ // container info schema (since 1.3a b30b967d7c283e8ac542b66413fcc2bc5c5fb95c):
598+ // 0: node url
599+ // 1: gpu device id
600+ // 2: role
601+ // 3: status
602+ // 4: cpu metrics
603+ // 5: gpu mem metrics
604+ // 6: gpu util metrics
605+ // 7: start time
606+ // 8: finish time
607+ // 9: percent progress
608+ // 10: log url
609+ // 11: stats - cpu
610+ // 12: stats - gpu mem
611+ // 13: stats - gpu utils
612+ // 14: stats - mem usage warn (before 1.9.2: only present if cpuStatistics.size > 0; since 1.9.2: always present, "-" placeholder otherwise)
613+ // 15: rank (since 1.9.2)
614+ int workerIdx = 0 ;
597615 for (Container container : workerContainers ) {
598616 List <String > containerMessage = new ArrayList <>();
599617 containerMessage .add (container .getNodeHttpAddress ());
600618 HboxContainerId currentContainerID = new HboxContainerId (container .getId ());
619+ String rank = "-" ;
601620 if (applicationContext .getContainerGPUDevice (currentContainerID ) != null ) {
602621 if (applicationContext
603622 .getContainerGPUDevice (currentContainerID )
@@ -617,6 +636,7 @@ public void run() {
617636 containerMessage .add (HboxConstants .CHIEF );
618637 } else {
619638 containerMessage .add (HboxConstants .WORKER );
639+ rank = "" + workerIdx ++;
620640 }
621641
622642 HboxContainerStatus status = applicationContext .getContainerStatus (currentContainerID );
@@ -677,6 +697,8 @@ public void run() {
677697 } else {
678698 usageStatistics .add ("false" );
679699 }
700+ } else {
701+ usageStatistics .add ("-" ); // container info schema idx=14
680702 }
681703
682704 if (containersAppStartTime .get (currentContainerID ) != null
@@ -729,13 +751,16 @@ public void run() {
729751 container .getId ().toString (),
730752 userName ));
731753 containerMessage .addAll (usageStatistics );
754+ containerMessage .add (rank ); // container info schema idx=15
732755 logMessage .put (container .getId ().toString (), containerMessage );
733756 }
734757
758+ int psIdx = 0 ;
735759 for (Container container : psContainers ) {
736760 List <String > containerMessage = new ArrayList <>();
737761 containerMessage .add (container .getNodeHttpAddress ());
738762 HboxContainerId currentContainerID = new HboxContainerId (container .getId ());
763+ String rank = "-" ;
739764 if (applicationContext .getContainerGPUDevice (currentContainerID ) != null ) {
740765 if (applicationContext
741766 .getContainerGPUDevice (currentContainerID )
@@ -750,17 +775,22 @@ public void run() {
750775 containerMessage .add ("-" );
751776 }
752777 if (hboxAppType .equals ("TENSORFLOW" ) || "TENSOR2TENSOR" .equals (hboxAppType )) {
753- containerMessage .add ("ps" );
778+ containerMessage .add (HboxConstants .PS );
779+ rank = "" + psIdx ++;
754780 } else if (hboxAppType .equals ("MXNET" )
755781 || hboxAppType .equals ("DISTLIGHTLDA" )
756782 || hboxAppType .equals ("XFLOW" )) {
757- containerMessage .add ("server" );
783+ containerMessage .add (HboxConstants . SERVER );
758784 } else if (hboxAppType .equals ("XDL" )) {
759785 if (currentContainerID .toString ().equals (schedulerContainerId )) {
760786 containerMessage .add (HboxConstants .SCHEDULER );
761787 } else {
762- containerMessage .add ("ps" );
788+ containerMessage .add (HboxConstants .PS );
789+ rank = "" + psIdx ++;
763790 }
791+ } else {
792+ containerMessage .add (HboxConstants .PS );
793+ rank = "" + psIdx ++;
764794 }
765795 HboxContainerStatus status = applicationContext .getContainerStatus (currentContainerID );
766796 if (status != null ) {
@@ -814,6 +844,8 @@ public void run() {
814844 } else {
815845 usageStatistics .add ("false" );
816846 }
847+ } else {
848+ usageStatistics .add ("-" ); // container info schema idx=14
817849 }
818850
819851 if (containersAppStartTime .get (currentContainerID ) != null
@@ -841,6 +873,7 @@ public void run() {
841873 container .getId ().toString (),
842874 userName ));
843875 containerMessage .addAll (usageStatistics );
876+ containerMessage .add (rank ); // container info schema idx=15
844877 logMessage .put (container .getId ().toString (), containerMessage );
845878 }
846879
0 commit comments