@@ -507,6 +507,7 @@ def process_monitor(self, sproc: "subprocess.Popen[str]", kill_switch: threading
507
507
memory_usage : MutableSequence [Optional [int ]] = [None ]
508
508
509
509
mem_tm : "Optional[Timer]" = None
510
+ ks_tm : "Optional[Timer]" = None
510
511
511
512
def get_tree_mem_usage (memory_usage : MutableSequence [Optional [int ]]) -> None :
512
513
nonlocal mem_tm
@@ -528,10 +529,27 @@ def get_tree_mem_usage(memory_usage: MutableSequence[Optional[int]]) -> None:
528
529
if mem_tm is not None :
529
530
mem_tm .cancel ()
530
531
532
+ def monitor_kill_switch () -> None :
533
+ nonlocal ks_tm
534
+ if kill_switch .is_set ():
535
+ _logger .error ("[job %s] terminating by kill switch" , self .name )
536
+ if sproc .stdin : sproc .stdin .close ()
537
+ sproc .terminate ()
538
+ else :
539
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
540
+ ks_tm .daemon = True
541
+ ks_tm .start ()
542
+
543
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
544
+ ks_tm .daemon = True
545
+ ks_tm .start ()
546
+
531
547
mem_tm = Timer (interval = 1 , function = get_tree_mem_usage , args = (memory_usage ,))
532
548
mem_tm .daemon = True
533
549
mem_tm .start ()
550
+
534
551
sproc .wait ()
552
+ ks_tm .cancel ()
535
553
mem_tm .cancel ()
536
554
if memory_usage [0 ] is not None :
537
555
_logger .info (
@@ -845,20 +863,48 @@ def docker_monitor(
845
863
process : "subprocess.Popen[str]" ,
846
864
kill_switch : threading .Event ,
847
865
) -> None :
848
- """Record memory usage of the running Docker container."""
866
+ """Record memory usage of the running Docker container. Terminate if kill_switch is activated."""
867
+
868
+ ks_tm : "Optional[Timer]" = None
869
+ cid : Optional [str ] = None
870
+
871
+ def monitor_kill_switch () -> None :
872
+ nonlocal ks_tm
873
+ if kill_switch .is_set ():
874
+ _logger .error ("[job %s] terminating by kill switch" , self .name )
875
+ if process .stdin :
876
+ process .stdin .close ()
877
+ if cid is not None :
878
+ kill_proc = subprocess .Popen ( # nosec
879
+ [docker_exe , "kill" , cid ], shell = False # nosec
880
+ )
881
+ try :
882
+ kill_proc .wait (timeout = 10 )
883
+ except subprocess .TimeoutExpired :
884
+ kill_proc .kill ()
885
+ process .terminate () # Always terminate, even if we tried with the cidfile
886
+ else :
887
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
888
+ ks_tm .daemon = True
889
+ ks_tm .start ()
890
+
891
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
892
+ ks_tm .daemon = True
893
+ ks_tm .start ()
894
+
849
895
# Todo: consider switching to `docker create` / `docker start`
850
896
# instead of `docker run` as `docker create` outputs the container ID
851
897
# to stdout, but the container is frozen, thus allowing us to start the
852
898
# monitoring process without dealing with the cidfile or too-fast
853
899
# container execution
854
- cid : Optional [str ] = None
855
900
while cid is None :
856
901
time .sleep (1 )
857
902
# This is needed to avoid a race condition where the job
858
903
# was so fast that it already finished when it arrives here
859
904
if process .returncode is None :
860
905
process .poll ()
861
906
if process .returncode is not None :
907
+ ks_tm .cancel ()
862
908
if cleanup_cidfile :
863
909
try :
864
910
os .remove (cidfile )
@@ -890,6 +936,9 @@ def docker_monitor(
890
936
except OSError as exc :
891
937
_logger .warning ("Ignored error with %s stats: %s" , docker_exe , exc )
892
938
return
939
+ finally :
940
+ ks_tm .cancel ()
941
+
893
942
max_mem_percent : float = 0.0
894
943
mem_percent : float = 0.0
895
944
with open (stats_file_name ) as stats :
@@ -924,7 +973,7 @@ def _job_popen(
924
973
job_script_contents : Optional [str ] = None ,
925
974
timelimit : Optional [int ] = None ,
926
975
name : Optional [str ] = None ,
927
- monitor_function : Optional [Callable [["subprocess.Popen[str]" ], None ]] = None ,
976
+ monitor_function : Optional [Callable [["subprocess.Popen[str]" , "threading.Event" ], None ]] = None ,
928
977
default_stdout : Optional [Union [IO [bytes ], TextIO ]] = None ,
929
978
default_stderr : Optional [Union [IO [bytes ], TextIO ]] = None ,
930
979
) -> int :
@@ -979,7 +1028,7 @@ def terminate(): # type: () -> None
979
1028
tm .daemon = True
980
1029
tm .start ()
981
1030
if monitor_function :
982
- monitor_function (sproc )
1031
+ monitor_function (sproc , kill_switch )
983
1032
rcode = sproc .wait ()
984
1033
985
1034
if tm is not None :
@@ -1055,7 +1104,7 @@ def terminate(): # type: () -> None
1055
1104
tm .daemon = True
1056
1105
tm .start ()
1057
1106
if monitor_function :
1058
- monitor_function (sproc )
1107
+ monitor_function (sproc , kill_switch )
1059
1108
1060
1109
rcode = sproc .wait ()
1061
1110
0 commit comments