Skip to content

Commit 553bae7

Browse files
feat: adding python side retry mechanism (#3354)
* feat: adding python retry/reconnect mechanism * feat: connecting only one time per retry * fix: avoid running commands when the instance has exited and using exited function more broadly. * feat: check if MAPDL has died properly during processes kill * fix: looping * refactor: error message * chore: adding changelog file 3354.miscellaneous.md * chore: adding changelog file 3354.dependencies.md * chore: adding changelog file 3354.miscellaneous.md * fix: test * fix: wrong name in variable. --------- Co-authored-by: pyansys-ci-bot <[email protected]>
1 parent 6fca235 commit 553bae7

File tree

5 files changed

+151
-48
lines changed

5 files changed

+151
-48
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
feat: adding python side retry mechanism

src/ansys/mapdl/core/errors.py

Lines changed: 87 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from functools import wraps
2525
import signal
2626
import threading
27+
from time import sleep
2728
from typing import Callable, Optional
2829

2930
import grpc
@@ -306,26 +307,70 @@ def wrapper(*args, **kwargs):
306307
old_handler = signal.signal(signal.SIGINT, handler)
307308

308309
# Capture gRPC exceptions
309-
try:
310-
out = func(*args, **kwargs)
311-
except grpc.RpcError as error:
312-
# Custom errors
313-
if error.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
314-
if "Received message larger than max" in error.details():
315-
try:
316-
lim_ = int(error.details().split("(")[1].split("vs")[0])
317-
except IndexError:
318-
lim_ = int(512 * 1024**2)
319-
320-
raise MapdlgRPCError(
321-
f"RESOURCE_EXHAUSTED: {error.details()}. "
322-
"You can try to increase the gRPC message length size using 'PYMAPDL_MAX_MESSAGE_LENGTH'"
323-
" environment variable. For instance:\n\n"
324-
f"$ export PYMAPDL_MAX_MESSAGE_LENGTH={lim_}"
310+
n_attempts = 3
311+
initial_backoff = 0.05
312+
multiplier_backoff = 3
313+
314+
i_attemps = 0
315+
316+
while True:
317+
try:
318+
out = func(*args, **kwargs)
319+
320+
# Exit while-loop if success
321+
break
322+
323+
except grpc.RpcError as error:
324+
325+
mapdl = retrieve_mapdl_from_args(args)
326+
327+
i_attemps += 1
328+
if i_attemps <= n_attempts:
329+
330+
wait = (
331+
initial_backoff * multiplier_backoff**i_attemps
332+
) # Exponential backoff
333+
sleep(wait)
334+
335+
# reconnect
336+
mapdl._log.debug(
337+
f"Re-connection attempt {i_attemps} after waiting {wait:0.3f} seconds"
338+
)
339+
340+
connected = mapdl._connect(timeout=wait)
341+
342+
# Retry again
343+
continue
344+
345+
# Custom errors
346+
reason = ""
347+
suggestion = ""
348+
349+
if error.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
350+
if "Received message larger than max" in error.details():
351+
try:
352+
lim_ = int(error.details().split("(")[1].split("vs")[0])
353+
except IndexError:
354+
lim_ = int(512 * 1024**2)
355+
356+
raise MapdlgRPCError(
357+
f"RESOURCE_EXHAUSTED: {error.details()}. "
358+
"You can try to increase the gRPC message length size using 'PYMAPDL_MAX_MESSAGE_LENGTH'"
359+
" environment variable. For instance:\n\n"
360+
f"$ export PYMAPDL_MAX_MESSAGE_LENGTH={lim_}"
361+
)
362+
363+
if error.code() == grpc.StatusCode.UNAVAILABLE:
364+
# Very likely the MAPDL server has died.
365+
suggestion = (
366+
" MAPDL *might* have died because it executed a not-allowed command or ran out of memory.\n"
367+
" Check the MAPDL command output for more details.\n"
368+
" Open an issue on GitHub if you need assistance: "
369+
"https://github.com/ansys/pymapdl/issues"
325370
)
326371

327-
# Generic error
328-
handle_generic_grpc_error(error, func, args, kwargs)
372+
# Generic error
373+
handle_generic_grpc_error(error, func, args, kwargs, reason, suggestion)
329374

330375
# No exceptions
331376
if threading.current_thread().__class__.__name__ == "_MainThread":
@@ -344,15 +389,26 @@ def wrapper(*args, **kwargs):
344389
return wrapper
345390

346391

347-
def handle_generic_grpc_error(error, func, args, kwargs):
348-
"""Handle non-custom gRPC errors"""
349-
392+
def retrieve_mapdl_from_args(args):
    """Return the MAPDL instance carried by the first positional argument.

    Parameters
    ----------
    args : tuple
        Positional arguments of the wrapped call. The first item must be
        either an object whose class is named ``MapdlGrpc`` or an object
        exposing the instance through a ``_mapdl`` attribute.

    Returns
    -------
    object
        ``args[0]`` when its class is named ``MapdlGrpc``; otherwise
        ``args[0]._mapdl``.

    Raises
    ------
    ValueError
        If ``args`` is empty, or if the first argument is neither a
        ``MapdlGrpc`` object nor has a ``_mapdl`` attribute. This replaces
        the bare ``IndexError``/``UnboundLocalError`` that previously
        surfaced in those cases.
    """
    if not args:
        raise ValueError(
            "Cannot retrieve the MAPDL instance: no positional arguments were given."
        )

    first = args[0]

    # can't use isinstance here due to circular imports
    class_name = getattr(first.__class__, "__name__", "")
    if class_name == "MapdlGrpc":
        return first

    if hasattr(first, "_mapdl"):
        return first._mapdl

    raise ValueError(
        "Cannot retrieve the MAPDL instance: the first argument "
        f"(of type '{class_name}') is not a 'MapdlGrpc' object and "
        "does not have a '_mapdl' attribute."
    )
405+
406+
407+
def handle_generic_grpc_error(error, func, args, kwargs, reason="", suggestion=""):
408+
"""Handle non-custom gRPC errors"""
409+
410+
mapdl = retrieve_mapdl_from_args(args)
411+
356412
# trying to get "cmd" argument:
357413
cmd = args[1] if len(args) >= 2 else ""
358414
cmd = kwargs.get("cmd", cmd)
@@ -364,28 +420,30 @@ def handle_generic_grpc_error(error, func, args, kwargs):
364420
else:
365421
msg_ = f"calling:{caller}\nwith the following arguments:\n args: {args}\n kwargs: {kwargs}"
366422

367-
if class_name == "MapdlGrpc":
368-
mapdl = args[0]
369-
elif hasattr(args[0], "_mapdl"):
370-
mapdl = args[0]._mapdl
423+
if reason:
424+
reason = f"Possible reason:\n{reason}\n"
425+
426+
if suggestion:
427+
suggestion = f"Suggestions:\n{suggestion}\n"
371428

372429
msg = (
373430
f"Error:\nMAPDL server connection terminated unexpectedly while {msg_}\n"
431+
f"{reason}"
432+
f"{suggestion}"
374433
"Error:\n"
375434
f" {error.details()}\n"
376435
f"Full error:\n{error}"
377436
)
378437

379-
# MAPDL gRPC is unavailable.
380-
if error.code() == grpc.StatusCode.UNAVAILABLE:
381-
raise MapdlExitedError(msg)
382-
383438
# Generic error
384439
# Test if MAPDL is alive or not.
385440
if mapdl.is_alive:
386441
raise MapdlRuntimeError(msg)
387442

388443
else:
444+
# Making sure we do not keep executing gRPC calls.
445+
mapdl._exited = True
446+
389447
# Must close unfinished processes
390448
mapdl._close_process()
391449
raise MapdlExitedError(msg)

src/ansys/mapdl/core/mapdl_core.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from ansys.mapdl.core.errors import (
5757
ComponentNoData,
5858
MapdlCommandIgnoredError,
59+
MapdlExitedError,
5960
MapdlFileNotFoundError,
6061
MapdlInvalidRoutineError,
6162
MapdlRuntimeError,
@@ -434,7 +435,7 @@ def components(self) -> "ComponentManager":
434435
435436
>>> mapdl.solution.converged
436437
"""
437-
if self._exited: # pragma: no cover
438+
if self.exited: # pragma: no cover
438439
raise MapdlRuntimeError("MAPDL exited.")
439440
return self._componentmanager
440441

@@ -844,7 +845,7 @@ def post_processing(self) -> "PostProcessing":
844845
array([1.07512979e-04, 8.59137773e-05, 5.70690047e-05, ...,
845846
5.70333124e-05, 8.58600402e-05, 1.07445726e-04])
846847
"""
847-
if self._exited:
848+
if self.exited:
848849
raise MapdlRuntimeError(
849850
"MAPDL exited.\n\nCan only postprocess a live " "MAPDL instance."
850851
)
@@ -963,7 +964,7 @@ def solution(self) -> "Solution":
963964
964965
>>> mapdl.solution.converged
965966
"""
966-
if self._exited:
967+
if self.exited:
967968
raise MapdlRuntimeError("MAPDL exited.")
968969
return self._solution
969970

@@ -2110,6 +2111,11 @@ def run(
21102111
>>> mapdl.prep7()
21112112
21122113
"""
2114+
if self.exited:
2115+
raise MapdlExitedError(
2116+
f"The MAPDL instance has been exited before running the command: {command}"
2117+
)
2118+
21132119
# check if multiline
21142120
if "\n" in command or "\r" in command:
21152121
raise ValueError("Use ``input_strings`` for multi-line commands")

src/ansys/mapdl/core/mapdl_grpc.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -534,9 +534,9 @@ def _multi_connect(self, n_attempts=5, timeout=15):
534534
attempt_timeout = int(timeout / n_attempts)
535535

536536
max_time = time.time() + timeout
537-
i = 0
537+
i = 1
538538
while time.time() < max_time and i <= n_attempts:
539-
self._log.debug("Connection attempt %d", i + 1)
539+
self._log.debug("Connection attempt %d", i)
540540
connected = self._connect(timeout=attempt_timeout)
541541
i += 1
542542
if connected:
@@ -564,7 +564,7 @@ def _multi_connect(self, n_attempts=5, timeout=15):
564564
else ""
565565
)
566566
raise MapdlConnectionError(
567-
msg + f"The MAPDL process has died{pid_msg}."
567+
msg + f" The MAPDL process has died{pid_msg}."
568568
)
569569

570570
self._exited = False
@@ -1194,6 +1194,11 @@ def _close_process(self, timeout=2): # pragma: no cover
11941194
# Killing child processes
11951195
self._kill_child_processes(timeout=timeout)
11961196

1197+
if self.is_alive:
1198+
raise MapdlRuntimeError("MAPDL could not be exited.")
1199+
else:
1200+
self._exited = True
1201+
11971202
def _cache_pids(self):
11981203
"""Store the process IDs used when launching MAPDL.
11991204

tests/test_grpc.py

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
MapdlExitedError,
3737
MapdlgRPCError,
3838
MapdlRuntimeError,
39+
protect_grpc,
3940
)
4041
from ansys.mapdl.core.mapdl_grpc import MAX_MESSAGE_LENGTH, MapdlGrpc
4142
from ansys.mapdl.core.misc import random_string
@@ -48,6 +49,19 @@
4849
pytestmark = requires("grpc")
4950

5051

52+
class UnavailableError(grpc.RpcError):
    """Fake gRPC error that reports the ``UNAVAILABLE`` status code.

    Used by the tests to make a call fail with ``grpc.StatusCode.UNAVAILABLE``
    without needing a real server failure.
    """

    def __init__(self, message="Service is temporarily unavailable."):
        super().__init__(message)
        # Values handed back by ``code()`` and ``details()``.
        self._code = grpc.StatusCode.UNAVAILABLE
        self._message = message

    def code(self):
        """Return the stored status code (``grpc.StatusCode.UNAVAILABLE``)."""
        return self._code

    def details(self):
        """Return the message given at construction time."""
        return self._message
63+
64+
5165
def write_tmp_in_mapdl_instance(mapdl, filename, ext="txt"):
5266
"""Write a temporary file from MAPDL."""
5367
with mapdl.non_interactive:
@@ -627,26 +641,45 @@ def test_generic_grpc_exception(monkeypatch, grpc_channel):
627641
mapdl = MapdlGrpc(channel=grpc_channel)
628642
assert mapdl.is_alive
629643

630-
class UnavailableError(grpc.RpcError):
631-
def __init__(self, message="Service is temporarily unavailable."):
632-
self._message = message
633-
self._code = grpc.StatusCode.UNAVAILABLE
634-
super().__init__(message)
644+
@protect_grpc
645+
def _raise_error_code(*args, **kwargs):
646+
raise UnavailableError()
635647

636-
def code(self):
637-
return self._code
648+
# Monkey patch to raise the same issue.
649+
monkeypatch.setattr(mapdl, "prep7", _raise_error_code)
638650

639-
def details(self):
640-
return self._message
651+
with pytest.raises(
652+
MapdlRuntimeError, match="MAPDL server connection terminated unexpectedly while"
653+
):
654+
# Pass `mapdl` explicitly so that `_raise_error_code` is called as if it were a method.
655+
mapdl.prep7(mapdl)
656+
657+
assert mapdl.is_alive
641658

642-
def _raise_error_code(args, **kwargs):
659+
660+
def test_generic_grpc_exception_exited(monkeypatch, grpc_channel):
661+
mapdl = MapdlGrpc(channel=grpc_channel)
662+
assert mapdl.is_alive
663+
664+
@protect_grpc
665+
def _raise_error_code(*args, **kwargs):
643666
raise UnavailableError()
644667

645-
monkeypatch.setattr(mapdl._stub, "SendCommand", _raise_error_code)
668+
def _null_close_process():
669+
return None
670+
671+
# faking exiting MAPDL
672+
mapdl._exited = True
673+
674+
# Monkey patch to raise the same issue.
675+
monkeypatch.setattr(mapdl, "prep7", _raise_error_code)
676+
677+
# Monkey patch `_close_process` so the MAPDL instance is not actually closed when the error is handled.
678+
monkeypatch.setattr(mapdl, "_close_process", _null_close_process)
646679

647680
with pytest.raises(
648681
MapdlExitedError, match="MAPDL server connection terminated unexpectedly while"
649682
):
650-
mapdl.prep7()
683+
mapdl.prep7(mapdl)
651684

652-
assert mapdl.is_alive
685+
mapdl._exited = False # Restoring

0 commit comments

Comments
 (0)