Skip to content

Commit 61c8eac

Browse files
fyrestone, 留宝, 刘宝
authored
Refine failure recovery log and exception (#2633)
* Refine failure recovery (fo) log and exception
* Pin xgboost_ray to 0.1.5

Co-authored-by: 留宝 <[email protected]>
Co-authored-by: 刘宝 <[email protected]>
1 parent bacb2b5 commit 61c8eac

File tree

5 files changed

+109
-25
lines changed

5 files changed

+109
-25
lines changed

.github/workflows/platform-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ jobs:
8383
fi
8484
if [ -n "$WITH_RAY" ]; then
8585
pip install pip install ray[default]
86-
pip install xgboost_ray
86+
pip install xgboost_ray==0.1.5
8787
pip install --upgrade numpy
8888
fi
8989
if [ -n "$RUN_DASK" ]; then

mars/deploy/oscar/tests/test_fault_injection.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import os
1616
import pytest
17+
import traceback
1718
import numpy as np
1819
import pandas as pd
1920

@@ -239,22 +240,43 @@ async def test_rerun_subtask_describe(fault_cluster, fault_config):
239240
@pytest.mark.parametrize(
240241
"fault_cluster", [{"config": RERUN_SUBTASK_CONFIG_FILE}], indirect=True
241242
)
243+
@pytest.mark.parametrize(
244+
"fault_config",
245+
[
246+
[
247+
FaultType.UnhandledException,
248+
{FaultPosition.ON_EXECUTE_OPERAND: 1},
249+
pytest.raises(FaultInjectionUnhandledError),
250+
["_UnhandledException", "handle_fault"],
251+
],
252+
[
253+
FaultType.Exception,
254+
{FaultPosition.ON_EXECUTE_OPERAND: 100},
255+
pytest.raises(FaultInjectionError),
256+
["_ExceedMaxRerun", "handle_fault"],
257+
],
258+
],
259+
)
242260
@pytest.mark.asyncio
243-
async def test_rerun_subtask_unhandled(fault_cluster):
261+
async def test_rerun_subtask_fail(fault_cluster, fault_config):
262+
fault_type, fault_count, expect_raises, exception_match = fault_config
244263
name = await create_fault_injection_manager(
245264
session_id=fault_cluster.session.session_id,
246265
address=fault_cluster.session.address,
247-
fault_count={FaultPosition.ON_EXECUTE_OPERAND: 1},
248-
fault_type=FaultType.UnhandledException,
266+
fault_count=fault_count,
267+
fault_type=fault_type,
249268
)
269+
exception_typename, stack_string = exception_match
250270
extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name}
251271

252272
raw = np.random.RandomState(0).rand(10, 10)
253273
a = mt.tensor(raw, chunk_size=5)
254274
b = a + 1
255275

256-
with pytest.raises(FaultInjectionUnhandledError):
276+
with expect_raises as e:
257277
b.execute(extra_config=extra_config)
278+
assert e.typename == exception_typename, "".join(traceback.format_tb(e.tb))
279+
assert e.traceback[-1].name == stack_string, "".join(traceback.format_tb(e.tb))
258280

259281

260282
@pytest.mark.parametrize(
@@ -266,29 +288,36 @@ async def test_rerun_subtask_unhandled(fault_cluster):
266288
[
267289
FaultType.Exception,
268290
{FaultPosition.ON_EXECUTE_OPERAND: 1},
269-
pytest.raises(FaultInjectionError, match="Fault Injection"),
291+
pytest.raises(FaultInjectionError, match="RemoteFunction"),
292+
["_UnretryableException", "handle_fault"],
270293
],
271294
[
272295
FaultType.ProcessExit,
273296
{FaultPosition.ON_EXECUTE_OPERAND: 1},
274297
pytest.raises(ServerClosed),
298+
["_UnretryableException", "*"],
275299
],
276300
],
277301
)
278302
@pytest.mark.asyncio
279303
async def test_retryable(fault_cluster, fault_config):
280-
fault_type, fault_count, expect_raises = fault_config
304+
fault_type, fault_count, expect_raises, exception_match = fault_config
281305
name = await create_fault_injection_manager(
282306
session_id=fault_cluster.session.session_id,
283307
address=fault_cluster.session.address,
284308
fault_count=fault_count,
285309
fault_type=fault_type,
286310
)
311+
exception_typename, stack_string = exception_match
287312
extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name}
288313

289314
def f(x):
290315
return x + 1
291316

292317
r = spawn(f, args=(1,), retry_when_fail=False)
293-
with expect_raises:
318+
with expect_raises as e:
294319
r.execute(extra_config=extra_config)
320+
assert e.typename == exception_typename, "".join(traceback.format_tb(e.tb))
321+
assert stack_string == "*" or e.traceback[-1].name == stack_string, "".join(
322+
traceback.format_tb(e.tb)
323+
)

mars/deploy/oscar/tests/test_ray_fault_injection.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,26 @@ async def test_rerun_subtask_describe(ray_start_regular, fault_cluster, fault_co
152152
@pytest.mark.parametrize(
153153
"fault_cluster", [{"config": SUBTASK_RERUN_CONFIG}], indirect=True
154154
)
155+
@pytest.mark.parametrize(
156+
"fault_config",
157+
[
158+
[
159+
FaultType.UnhandledException,
160+
{FaultPosition.ON_EXECUTE_OPERAND: 1},
161+
pytest.raises(FaultInjectionUnhandledError),
162+
["_UnhandledException", "handle_fault"],
163+
],
164+
[
165+
FaultType.Exception,
166+
{FaultPosition.ON_EXECUTE_OPERAND: 100},
167+
pytest.raises(FaultInjectionError),
168+
["_ExceedMaxRerun", "handle_fault"],
169+
],
170+
],
171+
)
155172
@pytest.mark.asyncio
156-
async def test_rerun_subtask_unhandled(ray_start_regular, fault_cluster):
157-
await test_fault_injection.test_rerun_subtask_unhandled(fault_cluster)
173+
async def test_rerun_subtask_fail(ray_start_regular, fault_cluster, fault_config):
174+
await test_fault_injection.test_rerun_subtask_fail(fault_cluster, fault_config)
158175

159176

160177
@require_ray
@@ -167,12 +184,14 @@ async def test_rerun_subtask_unhandled(ray_start_regular, fault_cluster):
167184
[
168185
FaultType.Exception,
169186
{FaultPosition.ON_EXECUTE_OPERAND: 1},
170-
pytest.raises(FaultInjectionError, match="Fault Injection"),
187+
pytest.raises(FaultInjectionError, match="RemoteFunction"),
188+
["_UnretryableException", "handle_fault"],
171189
],
172190
[
173191
FaultType.ProcessExit,
174192
{FaultPosition.ON_EXECUTE_OPERAND: 1},
175193
pytest.raises(ServerClosed),
194+
["_UnretryableException", "*"],
176195
],
177196
],
178197
)

mars/services/scheduling/worker/execution.py

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import functools
1717
import logging
1818
import operator
19+
import pprint
1920
import sys
2021
from collections import defaultdict
2122
from dataclasses import dataclass, field
@@ -67,28 +68,45 @@ async def _retry_run(
6768
except (OSError, MarsError) as ex:
6869
if subtask_info.num_retries < subtask_info.max_retries:
6970
logger.error(
70-
"Rerun the %s of subtask %s due to %s",
71+
"Rerun[%s/%s] the %s of subtask %s due to %s.",
72+
subtask_info.num_retries,
73+
subtask_info.max_retries,
7174
target_async_func,
7275
subtask.subtask_id,
7376
ex,
7477
)
7578
subtask_info.num_retries += 1
7679
continue
77-
raise ex
80+
if subtask_info.max_retries > 0:
81+
message = (
82+
f"Exceed max rerun[{subtask_info.num_retries}/{subtask_info.max_retries}]:"
83+
f" {target_async_func} of subtask {subtask.subtask_id} due to {ex}."
84+
)
85+
logger.error(message)
86+
87+
class _ExceedMaxRerun(type(ex)):
88+
pass
89+
90+
raise _ExceedMaxRerun(message).with_traceback(ex.__traceback__)
91+
else:
92+
raise ex
7893
except asyncio.CancelledError:
7994
raise
8095
except Exception as ex:
81-
if subtask_info.num_retries < subtask_info.max_retries:
82-
logger.error(
83-
"Failed to rerun the %s of subtask %s, "
84-
"num_retries: %s, max_retries: %s, unhandled exception: %s",
85-
target_async_func,
86-
subtask.subtask_id,
87-
subtask_info.num_retries,
88-
subtask_info.max_retries,
89-
ex,
96+
if subtask_info.max_retries > 0:
97+
message = (
98+
f"Failed to rerun the {target_async_func} of subtask {subtask.subtask_id}, "
99+
f"num_retries: {subtask_info.num_retries}, max_retries: {subtask_info.max_retries} "
100+
f"due to unhandled exception: {ex}."
90101
)
91-
raise ex
102+
logger.error(message)
103+
104+
class _UnhandledException(type(ex)):
105+
pass
106+
107+
raise _UnhandledException(message).with_traceback(ex.__traceback__)
108+
else:
109+
raise ex
92110

93111

94112
def _fill_subtask_result_with_exception(
@@ -403,7 +421,25 @@ async def _run_subtask_once():
403421
if subtask.retryable:
404422
return await _retry_run(subtask, subtask_info, _run_subtask_once)
405423
else:
406-
return await _run_subtask_once()
424+
try:
425+
return await _run_subtask_once()
426+
except Exception as e:
427+
unretryable_op = [
428+
chunk.op
429+
for chunk in subtask.chunk_graph
430+
if not getattr(chunk.op, "retryable", True)
431+
]
432+
message = (
433+
f"Run subtask failed due to {e}, the subtask {subtask.subtask_id} is "
434+
f"not retryable, it contains unretryable op: \n"
435+
f"{pprint.pformat(unretryable_op)}"
436+
)
437+
logger.error(message)
438+
439+
class _UnretryableException(type(e)):
440+
pass
441+
442+
raise _UnretryableException(message).with_traceback(e.__traceback__)
407443

408444
async def run_subtask(
409445
self, subtask: Subtask, band_name: str, supervisor_address: str

mars/services/task/tests/test_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def f1():
168168

169169
await task_api.wait_task(task_id, timeout=10)
170170
results = await task_api.get_task_results(progress=True)
171-
assert type(results[0].error) is SystemError
171+
assert isinstance(results[0].error, SystemError)
172172

173173

174174
@pytest.mark.asyncio

0 commit comments

Comments
 (0)