1515from parsl .log_utils import set_file_logger
1616from parsl .monitoring .errors import MonitoringHubStartError
1717from parsl .monitoring .radios .multiprocessing import MultiprocessingQueueRadioSender
18- from parsl .monitoring .router import router_starter
18+ from parsl .monitoring .radios .udp_router import udp_router_starter
19+ from parsl .monitoring .radios .zmq_router import zmq_router_starter
1920from parsl .monitoring .types import TaggedMonitoringMessage
2021from parsl .multiprocessing import ForkProcess , SizedQueue
2122from parsl .process_loggers import wrap_with_logs
@@ -121,11 +122,14 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
121122 # in the future, Queue will allow runtime subscripts.
122123
123124 if TYPE_CHECKING :
124- comm_q : Queue [Union [Tuple [int , int ], str ]]
125+ zmq_comm_q : Queue [Union [int , str ]]
126+ udp_comm_q : Queue [Union [int , str ]]
125127 else :
126- comm_q : Queue
128+ zmq_comm_q : Queue
129+ udp_comm_q : Queue
127130
128- comm_q = SizedQueue (maxsize = 10 )
131+ zmq_comm_q = SizedQueue (maxsize = 10 )
132+ udp_comm_q = SizedQueue (maxsize = 10 )
129133
130134 self .exception_q : Queue [Tuple [str , str ]]
131135 self .exception_q = SizedQueue (maxsize = 10 )
@@ -136,21 +140,35 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
136140 self .router_exit_event : ms .Event
137141 self .router_exit_event = Event ()
138142
139- self .router_proc = ForkProcess (target = router_starter ,
140- kwargs = {"comm_q" : comm_q ,
141- "exception_q" : self .exception_q ,
142- "resource_msgs" : self .resource_msgs ,
143- "exit_event" : self .router_exit_event ,
144- "hub_address" : self .hub_address ,
145- "udp_port" : self .hub_port ,
146- "zmq_port_range" : self .hub_port_range ,
147- "run_dir" : dfk_run_dir ,
148- "logging_level" : logging .DEBUG if self .monitoring_debug else logging .INFO ,
149- },
150- name = "Monitoring-Router-Process" ,
151- daemon = True ,
152- )
153- self .router_proc .start ()
143+ self .zmq_router_proc = ForkProcess (target = zmq_router_starter ,
144+ kwargs = {"comm_q" : zmq_comm_q ,
145+ "exception_q" : self .exception_q ,
146+ "resource_msgs" : self .resource_msgs ,
147+ "exit_event" : self .router_exit_event ,
148+ "hub_address" : self .hub_address ,
149+ "zmq_port_range" : self .hub_port_range ,
150+ "run_dir" : dfk_run_dir ,
151+ "logging_level" : logging .DEBUG if self .monitoring_debug else logging .INFO ,
152+ },
153+ name = "Monitoring-ZMQ-Router-Process" ,
154+ daemon = True ,
155+ )
156+ self .zmq_router_proc .start ()
157+
158+ self .udp_router_proc = ForkProcess (target = udp_router_starter ,
159+ kwargs = {"comm_q" : udp_comm_q ,
160+ "exception_q" : self .exception_q ,
161+ "resource_msgs" : self .resource_msgs ,
162+ "exit_event" : self .router_exit_event ,
163+ "hub_address" : self .hub_address ,
164+ "udp_port" : self .hub_port ,
165+ "run_dir" : dfk_run_dir ,
166+ "logging_level" : logging .DEBUG if self .monitoring_debug else logging .INFO ,
167+ },
168+ name = "Monitoring-UDP-Router-Process" ,
169+ daemon = True ,
170+ )
171+ self .udp_router_proc .start ()
154172
155173 self .dbm_proc = ForkProcess (target = dbm_starter ,
156174 args = (self .exception_q , self .resource_msgs ,),
@@ -162,7 +180,8 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
162180 daemon = True ,
163181 )
164182 self .dbm_proc .start ()
165- logger .info ("Started the router process %s and DBM process %s" , self .router_proc .pid , self .dbm_proc .pid )
183+ logger .info ("Started ZMQ router process %s, UDP router process %s and DBM process %s" ,
184+ self .zmq_router_proc .pid , self .udp_router_proc .pid , self .dbm_proc .pid )
166185
167186 self .filesystem_proc = ForkProcess (target = filesystem_receiver ,
168187 args = (self .resource_msgs , dfk_run_dir ),
@@ -175,25 +194,36 @@ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> No
175194 self .radio = MultiprocessingQueueRadioSender (self .resource_msgs )
176195
177196 try :
178- comm_q_result = comm_q .get (block = True , timeout = 120 )
179- comm_q .close ()
180- comm_q .join_thread ()
197+ zmq_comm_q_result = zmq_comm_q .get (block = True , timeout = 120 )
198+ zmq_comm_q .close ()
199+ zmq_comm_q .join_thread ()
181200 except queue .Empty :
182- logger .error ("MonitoringRouter has not reported ports in 120s. Aborting" )
201+ logger .error ("Monitoring ZMQ Router has not reported port in 120s. Aborting" )
183202 raise MonitoringHubStartError ()
184203
185- if isinstance (comm_q_result , str ):
186- logger .error ("MonitoringRouter sent an error message: %s" , comm_q_result )
187- raise RuntimeError (f"MonitoringRouter failed to start: { comm_q_result } " )
204+ if isinstance (zmq_comm_q_result , str ):
205+ logger .error ("MonitoringRouter sent an error message: %s" , zmq_comm_q_result )
206+ raise RuntimeError (f"MonitoringRouter failed to start: { zmq_comm_q_result } " )
207+
208+ self .hub_zmq_port = zmq_comm_q_result
188209
189- udp_port , zmq_port = comm_q_result
210+ try :
211+ udp_comm_q_result = udp_comm_q .get (block = True , timeout = 120 )
212+ udp_comm_q .close ()
213+ udp_comm_q .join_thread ()
214+ except queue .Empty :
215+ logger .error ("Monitoring UDP router has not reported port in 120s. Aborting" )
216+ raise MonitoringHubStartError ()
190217
218+ if isinstance (udp_comm_q_result , str ):
219+ logger .error ("MonitoringRouter sent an error message: %s" , udp_comm_q_result )
220+ raise RuntimeError (f"MonitoringRouter failed to start: { udp_comm_q_result } " )
221+
222+ udp_port = udp_comm_q_result
191223 self .monitoring_hub_url = "udp://{}:{}" .format (self .hub_address , udp_port )
192224
193225 logger .info ("Monitoring Hub initialized" )
194226
195- self .hub_zmq_port = zmq_port
196-
197227 def send (self , message : TaggedMonitoringMessage ) -> None :
198228 logger .debug ("Sending message type %s" , message [0 ])
199229 self .radio .send (message )
@@ -216,14 +246,21 @@ def close(self) -> None:
216246 exception_msg [0 ],
217247 exception_msg [1 ]
218248 )
219- self .router_proc .terminate ()
249+ self .zmq_router_proc .terminate ()
250+ self .udp_router_proc .terminate ()
220251 self .dbm_proc .terminate ()
221252 self .filesystem_proc .terminate ()
222253 logger .info ("Setting router termination event" )
223254 self .router_exit_event .set ()
224- logger .info ("Waiting for router to terminate" )
225- self .router_proc .join ()
226- self .router_proc .close ()
255+
256+ logger .info ("Waiting for ZMQ router to terminate" )
257+ self .zmq_router_proc .join ()
258+ self .zmq_router_proc .close ()
259+
260+ logger .info ("Waiting for UDP router to terminate" )
261+ self .udp_router_proc .join ()
262+ self .udp_router_proc .close ()
263+
227264 logger .debug ("Finished waiting for router termination" )
228265 if len (exception_msgs ) == 0 :
229266 logger .debug ("Sending STOP to DBM" )
0 commit comments