Skip to content

Commit aef2e05

Browse files
SinaChavoshi authored and
Tensorflow Cloud maintainers committed
Capture tracelog when SIGSEGV happens during remote execution.
PiperOrigin-RevId: 378647010
1 parent ec327d0 commit aef2e05

File tree

2 files changed

+20
-0
lines changed

2 files changed

+20
-0
lines changed

src/python/tensorflow_cloud/core/preprocess.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,18 @@ def get_preprocessed_entry_point(
128128
'os.environ["TF_KERAS_RUNNING_REMOTELY"]="1"\n',
129129
]
130130

131+
# Capture job trace log when SIGSEGV occurs
132+
133+
if worker_count > 0 and machine_config.is_tpu_config(worker_config):
134+
# faulthandler is not supported in TPU v1.x images
135+
logger.info("Faulthandler is not supported with TF < v2.2 images.")
136+
137+
else:
138+
script_lines.extend([
139+
"import faulthandler\n",
140+
"faulthandler.enable()\n",
141+
])
142+
131143
# Setting default Tuner_ID if one is provided in args
132144
script_lines.extend([
133145
"import sys\n",

src/python/tensorflow_cloud/core/tests/unit/preprocess_test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ def test_auto_one_device_strategy(self):
6464
"import os\n",
6565
"import tensorflow as tf\n",
6666
'os.environ["TF_KERAS_RUNNING_REMOTELY"]="1"\n',
67+
"import faulthandler\n",
68+
"faulthandler.enable()\n",
6769
"import sys\n",
6870
"for flag in sys.argv[1:]:\n",
6971
' if flag.startswith("TUNER_ID"):\n',
@@ -83,6 +85,8 @@ def test_auto_mirrored_strategy(self):
8385
"import os\n",
8486
"import tensorflow as tf\n",
8587
'os.environ["TF_KERAS_RUNNING_REMOTELY"]="1"\n',
88+
"import faulthandler\n",
89+
"faulthandler.enable()\n",
8690
"import sys\n",
8791
"for flag in sys.argv[1:]:\n",
8892
' if flag.startswith("TUNER_ID"):\n',
@@ -100,6 +104,8 @@ def test_auto_multi_worker_strategy(self):
100104
"import os\n",
101105
"import tensorflow as tf\n",
102106
'os.environ["TF_KERAS_RUNNING_REMOTELY"]="1"\n',
107+
"import faulthandler\n",
108+
"faulthandler.enable()\n",
103109
"import sys\n",
104110
"for flag in sys.argv[1:]:\n",
105111
' if flag.startswith("TUNER_ID"):\n',
@@ -188,6 +194,8 @@ def test_ipython_notebook(self, mock_python_exporter):
188194
"import os\n",
189195
"import tensorflow as tf\n",
190196
'os.environ["TF_KERAS_RUNNING_REMOTELY"]="1"\n',
197+
"import faulthandler\n",
198+
"faulthandler.enable()\n",
191199
"import sys\n",
192200
"for flag in sys.argv[1:]:\n",
193201
' if flag.startswith("TUNER_ID"):\n',

0 commit comments

Comments
 (0)