Skip to content

Commit 3824995

Browse files
committed
add option to disable timeout
1 parent 7ae5925 commit 3824995

File tree

1 file changed

+10
-9
lines changed

1 file changed

+10
-9
lines changed

tensorflowonspark/TFCluster.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ def shutdown(self, ssc=None, grace_secs=0, timeout=259200):
121121
Args:
122122
:ssc: *For Streaming applications only*. Spark StreamingContext
123123
:grace_secs: Grace period to wait after all executors have completed their tasks before terminating the Spark application, e.g. to allow the chief worker to perform any final/cleanup duties like exporting or evaluating the model. Default is 0.
124-
:timeout: Time in seconds to wait for TF cluster to complete before terminating the Spark application. This can be useful if the TF code hangs for any reason. Default is 3 days.
124+
:timeout: Time in seconds to wait for TF cluster to complete before terminating the Spark application. This can be useful if the TF code hangs for any reason. Default is 3 days. Use -1 (or any value <= 0) to disable the timeout.
125125
"""
126126
logging.info("Stopping TensorFlow nodes")
127127

@@ -131,14 +131,15 @@ def shutdown(self, ssc=None, grace_secs=0, timeout=259200):
131131
(ps_list if node['job_name'] == 'ps' else worker_list).append(node)
132132

133133
# setup execution timeout
134-
def timeout_handler(signum, frame):
135-
logging.error("TensorFlow execution timed out, exiting Spark application with error status")
136-
self.sc.cancelAllJobs()
137-
self.sc.stop()
138-
sys.exit(1)
139-
140-
signal.signal(signal.SIGALRM, timeout_handler)
141-
signal.alarm(timeout)
134+
if timeout > 0:
135+
def timeout_handler(signum, frame):
136+
logging.error("TensorFlow execution timed out, exiting Spark application with error status")
137+
self.sc.cancelAllJobs()
138+
self.sc.stop()
139+
sys.exit(1)
140+
141+
signal.signal(signal.SIGALRM, timeout_handler)
142+
signal.alarm(timeout)
142143

143144
# wait for Spark Streaming termination or TF app completion for InputMode.TENSORFLOW
144145
if ssc is not None:

0 commit comments

Comments
 (0)