1717import itertools
1818import logging
1919import time
20+ import typing
2021from abc import ABC
2122from collections import namedtuple
2223from dataclasses import dataclass
2526
2627from ....oscar .profiling import ProfilingData
2728from ....serialization import serialize , deserialize
29+ from ....metrics import Metrics
2830from ....utils import lazy_import , implements , classproperty , Timer
2931from ...debug import debug_async_timeout
3032from ...errors import ServerClosed
3133from ..communication .base import Channel , ChannelType , Server , Client
3234from ..communication .core import register_client , register_server
3335from ..communication .errors import ChannelClosed
36+ from .utils import report_event
3437
3538ray = lazy_import ("ray" )
3639logger = logging .getLogger (__name__ )
3942 "ChannelID" , ["local_address" , "client_id" , "channel_index" , "dest_address" ]
4043)
4144
45+ SERIALIZATION_TIMEOUT_MILLS = 1000
46+ DESERIALIZATION_TIMEOUT_MILLS = 1000
47+
48+
49+ def msg_to_simple_str (msg ): # pragma: no cover
50+ """An helper that prints message structure without generate a big str."""
51+ from ..message import SendMessage , _MessageBase
52+
53+ if type (msg ) == _ArgWrapper :
54+ msg = msg .message
55+ if isinstance (msg , SendMessage ):
56+ return f"{ str (type (msg ))} (actor_ref={ msg .actor_ref } , content={ msg_to_simple_str (msg .content )} )"
57+ if isinstance (msg , _MessageBase ):
58+ return str (msg )
59+ if msg and isinstance (msg , typing .List ):
60+ part_str = ", " .join ([msg_to_simple_str (item ) for item in msg [:5 ]])
61+ return f"List<{ part_str } ...{ len (msg )} >"
62+ if msg and isinstance (msg , typing .Tuple ):
63+ part_str = ", " .join ([msg_to_simple_str (item ) for item in msg [:5 ]])
64+ return f"Tuple<{ part_str } ...{ len (msg )} >"
65+ if msg and isinstance (msg , typing .Dict ):
66+ part_str = []
67+ it = iter (msg .items ())
68+ try :
69+ while len (part_str ) < 5 :
70+ entry = next (it )
71+ part_str .append (
72+ f"k={ msg_to_simple_str (entry [0 ])} , v={ msg_to_simple_str (entry [1 ])} "
73+ )
74+ except StopIteration :
75+ pass
76+ part_str = ", " .join (part_str )
77+ return f"Dict<k={ part_str } ...{ len (msg )} >"
78+ if isinstance (msg , (str , float , int )):
79+ return "{!s:.50}" .format (msg )
80+ return str (type (msg ))
81+
4282
4383def _argwrapper_unpickler (serialized_message ):
4484 return _ArgWrapper (deserialize (* serialized_message ))
@@ -63,20 +103,45 @@ def __reduce__(self):
63103 init_metrics ("ray" )
64104 _ray_serialize = ray .serialization .SerializationContext .serialize
65105 _ray_deserialize_object = ray .serialization .SerializationContext ._deserialize_object
106+ serialized_bytes_counter = Metrics .counter (
107+ "mars.channel_serialized_bytes" ,
108+ "The bytes serialized by mars ray channel." ,
109+ )
110+ deserialized_bytes_counter = Metrics .counter (
111+ "mars.channel_deserialized_bytes" ,
112+ "The bytes deserialized by mars ray channel." ,
113+ )
114+ serialization_time_mills = Metrics .counter (
115+ "mars.channel_serialization_time_mills" ,
116+ "The time used by mars ray channel serialization." ,
117+ )
118+ deserialization_time_mills = Metrics .counter (
119+ "mars.channel_deserialization_time_mills" ,
120+ "The time used by mars ray channel deserialization." ,
121+ )
66122
67123 def _serialize (self , value ):
68124 if type (value ) is _ArgWrapper : # pylint: disable=unidiomatic-typecheck
69125 message = value .message
70126 with Timer () as timer :
71127 serialized_object = _ray_serialize (self , value )
128+ bytes_length = serialized_object .total_bytes
129+ serialized_bytes_counter .record (bytes_length )
130+ serialization_time_mills .record (timer .duration * 1000 )
131+ if timer .duration * 1000 > SERIALIZATION_TIMEOUT_MILLS : # pragma: no cover
132+ report_event (
133+ "WARNING" ,
134+ "SERIALIZATION_TIMEOUT" ,
135+ f"Serialization took { timer .duration } seconds for { bytes_length } sized message { msg_to_simple_str (message )} ." ,
136+ )
72137 try :
73138 if message .profiling_context is not None :
74139 task_id = message .profiling_context .task_id
75140 ProfilingData [task_id , "serialization" ].inc (
76141 "serialize" , timer .duration
77142 )
78- except AttributeError :
79- logger .debug (
143+ except AttributeError : # pragma: no cover
144+ logger .info (
80145 "Profiling serialization got error, the send "
81146 "message %s may not be an instance of message" ,
82147 type (message ),
@@ -87,7 +152,20 @@ def _serialize(self, value):
87152
88153 def _deserialize_object (self , data , metadata , object_ref ):
89154 start_time = time .time ()
155+ bytes_length = 0
156+ if data :
157+ bytes_length = len (data )
158+ deserialized_bytes_counter .record (bytes_length )
90159 value = _ray_deserialize_object (self , data , metadata , object_ref )
160+ duration = time .time () - start_time
161+ deserialization_time_mills .record (duration * 1000 )
162+ if duration * 1000 > DESERIALIZATION_TIMEOUT_MILLS : # pragma: no cover
163+ report_event (
164+ "WARNING" ,
165+ "DESERIALIZATION_TIMEOUT" ,
166+ f"Deserialization took { duration } seconds for "
167+ f"{ bytes_length } sized msg { msg_to_simple_str (value )} " ,
168+ )
91169 if type (value ) is _ArgWrapper : # pylint: disable=unidiomatic-typecheck
92170 message = value .message
93171 try :
@@ -96,8 +174,8 @@ def _deserialize_object(self, data, metadata, object_ref):
96174 ProfilingData [task_id , "serialization" ].inc (
97175 "deserialize" , time .time () - start_time
98176 )
99- except AttributeError :
100- logger .debug (
177+ except AttributeError : # pragma: no cover
178+ logger .info (
101179 "Profiling serialization got error, the recv "
102180 "message %s may not be an instance of message" ,
103181 type (message ),
@@ -187,9 +265,22 @@ def _submit_task(self, message: Any, object_ref: "ray.ObjectRef"):
187265 async def handle_task (message : Any , object_ref : "ray.ObjectRef" ):
188266 # use `%.500` to avoid print too long messages
189267 with debug_async_timeout (
190- "ray_object_retrieval_timeout" , "Client sent message is %.500s" , message
268+ "ray_object_retrieval_timeout" ,
269+ "Message that client sent to actor %s is %.500s and object_ref is %s" ,
270+ self .dest_address ,
271+ message ,
272+ object_ref ,
191273 ):
192- result = await object_ref
274+ try :
275+ result = await object_ref
276+ except Exception as e : # pragma: no cover
277+ logger .exception (
278+ "Get object %s from %s failed, got exception %s." ,
279+ object_ref ,
280+ self .dest_address ,
281+ e ,
282+ )
283+ raise
193284 if isinstance (result , RayChannelException ):
194285 raise result .exc_value .with_traceback (result .exc_traceback )
195286 return result .message
0 commit comments