Skip to content

Commit 3278699

Browse files
authored
Wrap errors in operand execution to protect scheduling service (#2964)
1 parent b93c02b commit 3278699

File tree

18 files changed

+392
-57
lines changed

18 files changed

+392
-57
lines changed

mars/core/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
# noinspection PyUnresolvedReferences
1616
from ..typing import ChunkType, TileableType, EntityType, OperandType
17+
from .base import ExecutionError
1718
from .entity import (
1819
Entity,
1920
EntityData,

mars/core/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,9 @@ def serial(self, obj: Base, context: Dict):
142142

143143
class MarsError(Exception):
144144
pass
145+
146+
147+
class ExecutionError(MarsError):
148+
def __init__(self, nested_error: BaseException):
149+
super().__init__(nested_error)
150+
self.nested_error = nested_error

mars/dataframe/groupby/aggregation.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@
1515
import functools
1616
import itertools
1717
import logging
18-
import typing
1918
import uuid
20-
from typing import List
19+
from typing import Callable, Dict, List
2120

2221
import numpy as np
2322
import pandas as pd
@@ -129,8 +128,8 @@ def _group_kurt(x, *args, **kwargs):
129128

130129
def build_mock_agg_result(
131130
groupby: GROUPBY_TYPE,
132-
groupby_params: typing.Dict,
133-
raw_func: typing.Callable,
131+
groupby_params: Dict,
132+
raw_func: Callable,
134133
**raw_func_kw,
135134
):
136135
try:

mars/lib/cython/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 1999-2022 Alibaba Group Holding Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

mars/lib/cython/libcpp.pxd

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright 1999-2022 Alibaba Group Holding Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# supplementary declarations for C++ STL libraries that Cython does not ship
16+
17+
from libc.stdint cimport uint_fast64_t
18+
19+
20+
cdef extern from "<random>" namespace "std" nogil:
21+
cdef cppclass mt19937_64:
22+
ctypedef uint_fast64_t result_type
23+
24+
mt19937_64() except +
25+
mt19937_64(result_type seed) except +
26+
result_type operator()() except +
27+
result_type min() except +
28+
result_type max() except +
29+
void discard(size_t z) except +
30+
void seed(result_type seed) except +

mars/oscar/backends/mars/pool.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
from ....utils import get_next_port, dataslots, ensure_coverage
2929
from ..config import ActorPoolConfig
30-
from ..message import CreateActorMessage
30+
from ..message import CreateActorMessage, reset_random_seed as reset_message_seed
3131
from ..pool import MainActorPoolBase, SubActorPoolBase, _register_message_handler
3232

3333

@@ -168,6 +168,7 @@ def _start_sub_pool(
168168

169169
# make sure there is enough randomness in every sub pool
170170
random.seed(uuid.uuid1().bytes)
171+
reset_message_seed()
171172

172173
conf = actor_config.get_pool_config(process_index)
173174
suspend_sigint = conf["suspend_sigint"]

mars/oscar/backends/message.pyi

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
# Copyright 1999-2022 Alibaba Group Holding Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from enum import Enum
16+
from types import TracebackType
17+
from typing import Any, Type
18+
19+
from ..core import ActorRef
20+
21+
DEFAULT_PROTOCOL: int = 0
22+
23+
class MessageType(Enum):
24+
control = 0
25+
result = 1
26+
error = 2
27+
create_actor = 3
28+
destroy_actor = 4
29+
has_actor = 5
30+
actor_ref = 6
31+
send = 7
32+
tell = 8
33+
cancel = 9
34+
35+
class ControlMessageType(Enum):
36+
stop = 0
37+
restart = 1
38+
sync_config = 2
39+
get_config = 3
40+
wait_pool_recovered = 4
41+
add_sub_pool_actor = 5
42+
43+
class _MessageBase:
44+
message_type: MessageType
45+
protocol: int
46+
message_id: bytes
47+
message_trace: list
48+
profiling_context: Any
49+
50+
def __init__(
51+
self,
52+
message_id: bytes = None,
53+
protocol: int = DEFAULT_PROTOCOL,
54+
message_trace: list = None,
55+
profiling_context: Any = None,
56+
): ...
57+
def __repr__(self): ...
58+
59+
class ControlMessage(_MessageBase):
60+
message_type = MessageType.control
61+
62+
address: str
63+
control_message_type: ControlMessageType
64+
content: Any
65+
66+
def __init__(
67+
self,
68+
message_id: bytes = None,
69+
address: str = None,
70+
control_message_type: ControlMessageType = None,
71+
content: Any = None,
72+
protocol: int = DEFAULT_PROTOCOL,
73+
): ...
74+
75+
class ResultMessage(_MessageBase):
76+
message_type = MessageType.result
77+
78+
result: Any
79+
80+
def __init__(
81+
self,
82+
message_id: bytes = None,
83+
result: Any = None,
84+
protocol: int = DEFAULT_PROTOCOL,
85+
message_trace: list = None,
86+
profiling_context: Any = None,
87+
): ...
88+
89+
class ErrorMessage(_MessageBase):
90+
message_type = MessageType.error
91+
92+
address: str
93+
pid: int
94+
error_type: Type
95+
error: BaseException
96+
traceback: TracebackType
97+
98+
def __init__(
99+
self,
100+
message_id: bytes = None,
101+
address: str = None,
102+
pid: int = -1,
103+
error_type: Type[BaseException] = None,
104+
error: BaseException = None,
105+
traceback: TracebackType = None,
106+
protocol: int = DEFAULT_PROTOCOL,
107+
message_trace: list = None,
108+
): ...
109+
def as_instanceof_cause(self) -> BaseException: ...
110+
111+
class CreateActorMessage(_MessageBase):
112+
message_type = MessageType.create_actor
113+
114+
actor_cls: Type
115+
actor_id: bytes
116+
args: tuple
117+
kwargs: dict
118+
allocate_strategy: Any
119+
from_main: bool
120+
121+
def __init__(
122+
self,
123+
message_id: bytes = None,
124+
actor_cls: Type = None,
125+
actor_id: bytes = None,
126+
args: tuple = None,
127+
kwargs: dict = None,
128+
allocate_strategy: Any = None,
129+
from_main: bool = False,
130+
protocol: int = DEFAULT_PROTOCOL,
131+
message_trace: list = None,
132+
): ...
133+
134+
class DestroyActorMessage(_MessageBase):
135+
message_type = MessageType.destroy_actor
136+
137+
actor_ref: ActorRef
138+
from_main: bool
139+
140+
def __init__(
141+
self,
142+
message_id: bytes = None,
143+
actor_ref: ActorRef = None,
144+
from_main: bool = False,
145+
protocol: int = DEFAULT_PROTOCOL,
146+
message_trace: list = None,
147+
): ...
148+
149+
class HasActorMessage(_MessageBase):
150+
message_type = MessageType.has_actor
151+
152+
actor_ref: ActorRef
153+
154+
def __init__(
155+
self,
156+
message_id: bytes = None,
157+
actor_ref: ActorRef = None,
158+
protocol: int = DEFAULT_PROTOCOL,
159+
message_trace: list = None,
160+
): ...
161+
162+
class ActorRefMessage(_MessageBase):
163+
message_type = MessageType.actor_ref
164+
165+
actor_ref: ActorRef
166+
167+
def __init__(
168+
self,
169+
message_id: bytes = None,
170+
actor_ref: ActorRef = None,
171+
protocol: int = DEFAULT_PROTOCOL,
172+
message_trace: list = None,
173+
): ...
174+
175+
class SendMessage(_MessageBase):
176+
message_type = MessageType.send
177+
178+
actor_ref: ActorRef
179+
content: Any
180+
181+
def __init__(
182+
self,
183+
message_id: bytes = None,
184+
actor_ref: ActorRef = None,
185+
content: object = None,
186+
protocol: int = DEFAULT_PROTOCOL,
187+
message_trace: list = None,
188+
profiling_context: Any = None,
189+
): ...
190+
191+
class TellMessage(SendMessage):
192+
message_type = MessageType.tell
193+
194+
class CancelMessage(_MessageBase):
195+
message_type = MessageType.cancel
196+
197+
address: str
198+
cancel_message_id: bytes
199+
200+
def __init__(
201+
self,
202+
message_id: bytes = None,
203+
address: str = None,
204+
cancel_message_id: bytes = None,
205+
protocol: int = DEFAULT_PROTOCOL,
206+
message_trace: list = None,
207+
): ...
208+
209+
class DeserializeMessageFailed(RuntimeError):
210+
def __init__(self, message_id: bytes): ...
211+
def __str__(self): ...
212+
213+
def reset_random_seed(): ...
214+
def new_message_id() -> bytes: ...

mars/oscar/backends/message.pyx

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# Copyright 1999-2021 Alibaba Group Holding Ltd.
1+
# distutils: language = c++
2+
# Copyright 1999-2022 Alibaba Group Holding Ltd.
23
#
34
# Licensed under the Apache License, Version 2.0 (the "License");
45
# you may not use this file except in compliance with the License.
@@ -13,29 +14,27 @@
1314
# limitations under the License.
1415

1516
from enum import Enum
17+
from random import getrandbits
1618
from types import TracebackType
1719
from typing import Any, Type
1820

21+
from libc.stdint cimport uint_fast64_t
22+
23+
from ...lib.cython.libcpp cimport mt19937_64
1924
from ...lib.tblib import pickling_support
2025
from ...serialization.core cimport Serializer
2126
from ...utils import wrap_exception
2227
from ..core cimport ActorRef
2328

24-
try:
25-
from random import randbytes
26-
except ImportError: # pragma: no cover
27-
from random import getrandbits
28-
29-
def randbytes(long n) -> bytes:
30-
return getrandbits(n * 8).to_bytes(n, "little")
31-
32-
3329
# make sure traceback can be pickled
3430
pickling_support.install()
3531

3632
cdef int _DEFAULT_PROTOCOL = 0
3733
DEFAULT_PROTOCOL = _DEFAULT_PROTOCOL
3834

35+
cdef mt19937_64 _rnd_gen
36+
cdef bint _rnd_is_seed_set = False
37+
3938

4039
class MessageType(Enum):
4140
control = 0
@@ -552,5 +551,23 @@ cdef class MessageSerializer(Serializer):
552551
MessageSerializer.register(_MessageBase)
553552

554553

554+
cpdef reset_random_seed():
555+
cdef bytes seed_bytes
556+
global _rnd_is_seed_set
557+
558+
seed_bytes = getrandbits(64).to_bytes(8, "little")
559+
# reinterpret the 8 seed bytes as an integer; equivalent to memcpy(&seed, <char *>seed_bytes, 8)
560+
_rnd_gen.seed((<uint_fast64_t *><char *>seed_bytes)[0])
561+
_rnd_is_seed_set = True
562+
563+
555564
cpdef bytes new_message_id():
556-
return randbytes(32)
565+
cdef uint_fast64_t res_array[4]
566+
cdef int i
567+
568+
if not _rnd_is_seed_set:
569+
reset_random_seed()
570+
571+
for i in range(4):
572+
res_array[i] = _rnd_gen()
573+
return <bytes>((<char *>&(res_array[0]))[:32])

0 commit comments

Comments
 (0)