Skip to content

Commit ea8dc9f

Browse files
authored
Add basic profiling support for supervisor (#2586)
1 parent 578cf63 commit ea8dc9f

File tree

20 files changed

+516
-84
lines changed

20 files changed

+516
-84
lines changed

mars/_utils.pyx

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import importlib
1616
import os
1717
import pickle
1818
import pkgutil
19+
import time
1920
import types
2021
import uuid
2122
from datetime import date, datetime, timedelta, tzinfo
@@ -411,5 +412,17 @@ cpdef long long ceildiv(long long x, long long y) nogil:
411412
return x // y + (x % y != 0)
412413

413414

415+
cdef class Timer:
416+
cdef object _start
417+
cdef readonly object duration
418+
419+
def __enter__(self):
420+
self._start = time.time()
421+
return self
422+
423+
def __exit__(self, exc_type, exc_value, traceback):
424+
self.duration = time.time() - self._start
425+
426+
414427
__all__ = ['to_str', 'to_binary', 'to_text', 'TypeDispatcher', 'tokenize', 'tokenize_int',
415-
'register_tokenizer', 'insert_reversed_tuple', 'ceildiv']
428+
'register_tokenizer', 'insert_reversed_tuple', 'ceildiv', 'Timer']

mars/deploy/oscar/session.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import concurrent.futures
1717
import itertools
1818
import logging
19+
import json
1920
import random
2021
import string
2122
import threading
@@ -71,15 +72,22 @@ class Progress:
7172
value: float = 0.0
7273

7374

75+
@dataclass
76+
class Profiling:
77+
result: dict = None
78+
79+
7480
class ExecutionInfo:
7581
def __init__(
7682
self,
7783
aio_task: asyncio.Task,
7884
progress: Progress,
85+
profiling: Profiling,
7986
loop: asyncio.AbstractEventLoop,
8087
):
8188
self._aio_task = aio_task
8289
self._progress = progress
90+
self._profiling = profiling
8391
self._loop = loop
8492

8593
self._future_local = threading.local()
@@ -108,6 +116,9 @@ def aio_task(self):
108116
def progress(self) -> float:
109117
return self._progress.value
110118

119+
def profiling_result(self) -> dict:
120+
return self._profiling.result
121+
111122
def result(self, timeout=None):
112123
self._ensure_future()
113124
return self._future_local.future.result(timeout=timeout)
@@ -805,7 +816,11 @@ async def init(
805816
return await cls._init(address, session_id, new=new, timeout=timeout)
806817

807818
async def _run_in_background(
808-
self, tileables: list, task_id: str, progress: Progress
819+
self,
820+
tileables: list,
821+
task_id: str,
822+
progress: Progress,
823+
profiling: Profiling,
809824
):
810825
with enter_mode(build=True, kernel=True):
811826
# wait for task to finish
@@ -850,6 +865,13 @@ async def _run_in_background(
850865
raise TimeoutError(
851866
f"Task({task_id}) running time > {self.timeout}"
852867
)
868+
profiling.result = task_result.profiling
869+
if task_result.profiling:
870+
logger.warning(
871+
"Profile task %s execution result:\n%s",
872+
task_id,
873+
json.dumps(task_result.profiling, indent=4),
874+
)
853875
if task_result.error:
854876
raise task_result.error.with_traceback(task_result.traceback)
855877
if cancelled:
@@ -886,11 +908,12 @@ async def execute(self, *tileables, **kwargs) -> ExecutionInfo:
886908
)
887909

888910
progress = Progress()
911+
profiling = Profiling()
889912
# create asyncio.Task
890913
aio_task = asyncio.create_task(
891-
self._run_in_background(tileables, task_id, progress)
914+
self._run_in_background(tileables, task_id, progress, profiling)
892915
)
893-
return ExecutionInfo(aio_task, progress, asyncio.get_running_loop())
916+
return ExecutionInfo(aio_task, progress, profiling, asyncio.get_running_loop())
894917

895918
def _get_to_fetch_tileable(
896919
self, tileable: TileableType
@@ -1537,7 +1560,10 @@ async def driver():
15371560
driver(), execution_info.loop
15381561
).result()
15391562
new_execution_info = ExecutionInfo(
1540-
new_aio_task, execution_info._progress, execution_info.loop
1563+
new_aio_task,
1564+
execution_info._progress,
1565+
execution_info._profiling,
1566+
execution_info.loop,
15411567
)
15421568
return new_execution_info
15431569

mars/deploy/oscar/tests/test_local.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from ....storage import StorageLevel
3838
from ....services.storage import StorageAPI
3939
from ....tensor.arithmetic.add import TensorAdd
40+
from ....tests.core import check_dict_structure_same
4041
from ..local import new_cluster
4142
from ..service import load_config
4243
from ..session import (
@@ -67,6 +68,22 @@
6768
os.path.dirname(__file__), "local_test_with_third_parity_modules_config.yml"
6869
)
6970

71+
EXPECT_PROFILING_STRUCTURE = {
72+
"supervisor": {
73+
"general": {
74+
"optimize": 0.0005879402160644531,
75+
"incref_fetch_tileables": 0.0010840892791748047,
76+
"stage_*": {
77+
"tile": 0.008243083953857422,
78+
"gen_subtask_graph": 0.012202978134155273,
79+
"run": 0.27870702743530273,
80+
"total": 0.30318617820739746,
81+
},
82+
"total": 0.30951380729675293,
83+
},
84+
"serialization": {},
85+
}
86+
}
7087

7188
params = ["default"]
7289
if vineyard is not None:
@@ -147,8 +164,15 @@ async def test_vineyard_operators(create_cluster):
147164
pd.testing.assert_frame_equal(df, raw)
148165

149166

167+
@pytest.mark.parametrize(
168+
"config",
169+
[
170+
[{"enable_profiling": True}, EXPECT_PROFILING_STRUCTURE],
171+
[{}, {}],
172+
],
173+
)
150174
@pytest.mark.asyncio
151-
async def test_execute(create_cluster):
175+
async def test_execute(create_cluster, config):
152176
session = get_default_async_session()
153177
assert session.address is not None
154178
assert session.session_id is not None
@@ -157,8 +181,14 @@ async def test_execute(create_cluster):
157181
a = mt.tensor(raw, chunk_size=5)
158182
b = a + 1
159183

160-
info = await session.execute(b)
184+
extra_config, expect_profiling_structure = config
185+
186+
info = await session.execute(b, extra_config=extra_config)
161187
await info
188+
if extra_config:
189+
check_dict_structure_same(info.profiling_result(), expect_profiling_structure)
190+
else:
191+
assert not info.profiling_result()
162192
assert info.result() is None
163193
assert info.exception() is None
164194
assert info.progress() == 1
@@ -296,16 +326,23 @@ def _my_func():
296326
await session.destroy()
297327

298328

329+
@pytest.mark.parametrize(
330+
"config",
331+
[
332+
[{"enable_profiling": True}, EXPECT_PROFILING_STRUCTURE],
333+
[{}, {}],
334+
],
335+
)
299336
@pytest.mark.asyncio
300-
async def test_web_session(create_cluster):
337+
async def test_web_session(create_cluster, config):
301338
client = create_cluster[0]
302339
session_id = str(uuid.uuid4())
303340
web_address = client.web_address
304341
session = await AsyncSession.init(web_address, session_id)
305342
assert await session.get_web_endpoint() == web_address
306343
session.as_default()
307344
assert isinstance(session._isolated_session, _IsolatedWebSession)
308-
await test_execute(client)
345+
await test_execute(client, config)
309346
await test_iterative_tiling(client)
310347
AsyncSession.reset_default()
311348
await session.destroy()

mars/deploy/oscar/tests/test_ray.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,27 @@
5656
os.path.dirname(__file__), "ray_test_with_third_parity_modules_config.yml"
5757
)
5858

59+
EXPECT_PROFILING_STRUCTURE = {
60+
"supervisor": {
61+
"general": {
62+
"optimize": 0.0005879402160644531,
63+
"incref_fetch_tileables": 0.0010840892791748047,
64+
"stage_*": {
65+
"tile": 0.008243083953857422,
66+
"gen_subtask_graph": 0.012202978134155273,
67+
"run": 0.27870702743530273,
68+
"total": 0.30318617820739746,
69+
},
70+
"total": 0.30951380729675293,
71+
},
72+
"serialization": {
73+
"serialize": 0.014928340911865234,
74+
"deserialize": 0.0011813640594482422,
75+
"total": 0.016109704971313477,
76+
},
77+
}
78+
}
79+
5980

6081
@pytest.fixture
6182
async def create_cluster(request):
@@ -74,9 +95,16 @@ async def create_cluster(request):
7495

7596

7697
@require_ray
98+
@pytest.mark.parametrize(
99+
"config",
100+
[
101+
[{"enable_profiling": True}, EXPECT_PROFILING_STRUCTURE],
102+
[{}, {}],
103+
],
104+
)
77105
@pytest.mark.asyncio
78-
async def test_execute(ray_large_cluster, create_cluster):
79-
await test_local.test_execute(create_cluster)
106+
async def test_execute(ray_large_cluster, create_cluster, config):
107+
await test_local.test_execute(create_cluster, config)
80108

81109

82110
@require_ray
@@ -194,10 +222,17 @@ async def test_optional_supervisor_node(ray_large_cluster, test_option):
194222

195223

196224
@require_ray
225+
@pytest.mark.parametrize(
226+
"config",
227+
[
228+
[{"enable_profiling": True}, EXPECT_PROFILING_STRUCTURE],
229+
[{}, {}],
230+
],
231+
)
197232
@pytest.mark.asyncio
198-
async def test_web_session(ray_large_cluster, create_cluster):
233+
async def test_web_session(ray_large_cluster, create_cluster, config):
199234
client = create_cluster[0]
200-
await test_local.test_web_session(create_cluster)
235+
await test_local.test_web_session(create_cluster, config)
201236
web_address = client.web_address
202237
assert await ray.remote(_run_web_session).remote(web_address)
203238
assert await ray.remote(_sync_web_session_test).remote(web_address)

mars/oscar/backends/context.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
CancelMessage,
3939
ControlMessage,
4040
ControlMessageType,
41+
ProfilingContext,
4142
)
4243
from .router import Router
4344

@@ -155,10 +156,18 @@ async def actor_ref(self, *args, **kwargs):
155156
return self._process_result_message(result)
156157

157158
async def send(
158-
self, actor_ref: ActorRef, message: Tuple, wait_response: bool = True
159+
self,
160+
actor_ref: ActorRef,
161+
message: Tuple,
162+
wait_response: bool = True,
163+
profiling_context: ProfilingContext = None,
159164
):
160165
message = SendMessage(
161-
new_message_id(), actor_ref, message, protocol=DEFAULT_PROTOCOL
166+
new_message_id(),
167+
actor_ref,
168+
message,
169+
protocol=DEFAULT_PROTOCOL,
170+
profiling_context=profiling_context,
162171
)
163172

164173
with debug_async_timeout(

mars/oscar/backends/message.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,21 @@ class MessageTraceItem:
6969
method: str
7070

7171

72+
@dataslots
73+
@dataclass
74+
class ProfilingContext:
75+
task_id: str
76+
77+
7278
class _MessageBase(ABC):
73-
__slots__ = "protocol", "message_id", "message_trace"
79+
__slots__ = "protocol", "message_id", "message_trace", "profiling_context"
7480

7581
def __init__(
7682
self,
7783
message_id: bytes,
7884
protocol: int = None,
7985
message_trace: List[MessageTraceItem] = None,
86+
profiling_context: ProfilingContext = None,
8087
):
8188
self.message_id = message_id
8289
if protocol is None:
@@ -91,6 +98,7 @@ def __init__(
9198
# `A` will find that id:1 already exists in inbox,
9299
# thus deadlock detected.
93100
self.message_trace = message_trace
101+
self.profiling_context = profiling_context
94102

95103
@classproperty
96104
@abstractmethod
@@ -144,8 +152,14 @@ def __init__(
144152
result: Any,
145153
protocol: int = None,
146154
message_trace: List[MessageTraceItem] = None,
155+
profiling_context: ProfilingContext = None,
147156
):
148-
super().__init__(message_id, protocol=protocol, message_trace=message_trace)
157+
super().__init__(
158+
message_id,
159+
protocol=protocol,
160+
message_trace=message_trace,
161+
profiling_context=profiling_context,
162+
)
149163
self.result = result
150164

151165
@classproperty
@@ -276,8 +290,14 @@ def __init__(
276290
content: Any,
277291
protocol: int = None,
278292
message_trace: List[MessageTraceItem] = None,
293+
profiling_context: ProfilingContext = None,
279294
):
280-
super().__init__(message_id, protocol=protocol, message_trace=message_trace)
295+
super().__init__(
296+
message_id,
297+
protocol=protocol,
298+
message_trace=message_trace,
299+
profiling_context=profiling_context,
300+
)
281301
self.actor_ref = actor_ref
282302
self.content = content
283303

mars/oscar/backends/pool.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,10 @@ async def send(self, message: SendMessage) -> result_message_type:
523523
with self._run_coro(message.message_id, coro) as future:
524524
result = await future
525525
processor.result = ResultMessage(
526-
message.message_id, result, protocol=message.protocol
526+
message.message_id,
527+
result,
528+
protocol=message.protocol,
529+
profiling_context=message.profiling_context,
527530
)
528531
return processor.result
529532

@@ -538,7 +541,10 @@ async def tell(self, message: TellMessage) -> result_message_type:
538541
asyncio.create_task(call)
539542
await asyncio.sleep(0)
540543
processor.result = ResultMessage(
541-
message.message_id, None, protocol=message.protocol
544+
message.message_id,
545+
None,
546+
protocol=message.protocol,
547+
profiling_context=message.profiling_context,
542548
)
543549
return processor.result
544550

0 commit comments

Comments
 (0)