Skip to content

Commit 5e03e1f

Browse files
authored
Launch interchange as a fresh process (#3463)
This PR removes a use of multiprocessing fork-without-exec. At heart, this is how the interchange has wanted to be launched for some time (because of earlier remote interchange work). Launching multiprocessing fork caused a bunch of problems related to inheriteing state from from the parent submitting process that go away with this (jumbled logging topics, race conditions around at least logging-while-forking, inherited signal handlers). The configuration dictionary, previously passed in memory over a fork, is now sent in pickled form over stdin. Using pickle here rather than (eg.) JSON keeps the path open for sending richer configuration objects, beyond what can be encoded in JSON. This isn't something needed right now, but at least configurable monitoring radios (the immediate driving force behind this PR) are modelled around passing arbitrary configuration objects around to configure things - and so it seems likely that if interchange monitoring configuration is exposed to the user, richer objects would be passed here. See PR #3315 for monitoring radio prototype.
1 parent 00520e3 commit 5e03e1f

File tree

4 files changed

+66
-46
lines changed

4 files changed

+66
-46
lines changed

parsl/executors/high_throughput/executor.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import logging
22
import math
33
import pickle
4+
import subprocess
45
import threading
56
import typing
67
import warnings
78
from collections import defaultdict
89
from concurrent.futures import Future
910
from dataclasses import dataclass
10-
from multiprocessing import Process
1111
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
1212

1313
import typeguard
@@ -18,15 +18,14 @@
1818
from parsl.app.errors import RemoteExceptionWrapper
1919
from parsl.data_provider.staging import Staging
2020
from parsl.executors.errors import BadMessage, ScalingFailed
21-
from parsl.executors.high_throughput import interchange, zmq_pipes
21+
from parsl.executors.high_throughput import zmq_pipes
2222
from parsl.executors.high_throughput.errors import CommandClientTimeoutError
2323
from parsl.executors.high_throughput.mpi_prefix_composer import (
2424
VALID_LAUNCHERS,
2525
validate_resource_spec,
2626
)
2727
from parsl.executors.status_handling import BlockProviderExecutor
2828
from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
29-
from parsl.multiprocessing import ForkProcess
3029
from parsl.process_loggers import wrap_with_logs
3130
from parsl.providers import LocalProvider
3231
from parsl.providers.base import ExecutionProvider
@@ -305,7 +304,7 @@ def __init__(self,
305304
self._task_counter = 0
306305
self.worker_ports = worker_ports
307306
self.worker_port_range = worker_port_range
308-
self.interchange_proc: Optional[Process] = None
307+
self.interchange_proc: Optional[subprocess.Popen] = None
309308
self.interchange_port_range = interchange_port_range
310309
self.heartbeat_threshold = heartbeat_threshold
311310
self.heartbeat_period = heartbeat_period
@@ -520,38 +519,45 @@ def _queue_management_worker(self):
520519

521520
logger.info("Queue management worker finished")
522521

523-
def _start_local_interchange_process(self):
522+
def _start_local_interchange_process(self) -> None:
524523
""" Starts the interchange process locally
525524
526-
Starts the interchange process locally and uses an internal command queue to
525+
Starts the interchange process locally and uses the command queue to
527526
get the worker task and result ports that the interchange has bound to.
528527
"""
529-
self.interchange_proc = ForkProcess(target=interchange.starter,
530-
kwargs={"client_address": "127.0.0.1",
531-
"client_ports": (self.outgoing_q.port,
532-
self.incoming_q.port,
533-
self.command_client.port),
534-
"interchange_address": self.address,
535-
"worker_ports": self.worker_ports,
536-
"worker_port_range": self.worker_port_range,
537-
"hub_address": self.hub_address,
538-
"hub_zmq_port": self.hub_zmq_port,
539-
"logdir": self.logdir,
540-
"heartbeat_threshold": self.heartbeat_threshold,
541-
"poll_period": self.poll_period,
542-
"logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
543-
"cert_dir": self.cert_dir,
544-
},
545-
daemon=True,
546-
name="HTEX-Interchange"
547-
)
548-
self.interchange_proc.start()
549528

529+
interchange_config = {"client_address": "127.0.0.1",
530+
"client_ports": (self.outgoing_q.port,
531+
self.incoming_q.port,
532+
self.command_client.port),
533+
"interchange_address": self.address,
534+
"worker_ports": self.worker_ports,
535+
"worker_port_range": self.worker_port_range,
536+
"hub_address": self.hub_address,
537+
"hub_zmq_port": self.hub_zmq_port,
538+
"logdir": self.logdir,
539+
"heartbeat_threshold": self.heartbeat_threshold,
540+
"poll_period": self.poll_period,
541+
"logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
542+
"cert_dir": self.cert_dir,
543+
}
544+
545+
config_pickle = pickle.dumps(interchange_config)
546+
547+
self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE)
548+
stdin = self.interchange_proc.stdin
549+
assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
550+
551+
logger.debug("Popened interchange process. Writing config object")
552+
stdin.write(config_pickle)
553+
stdin.flush()
554+
logger.debug("Sent config object. Requesting worker ports")
550555
try:
551556
(self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
552557
except CommandClientTimeoutError:
553-
logger.error("Interchange has not completed initialization in 120s. Aborting")
558+
logger.error("Interchange has not completed initialization. Aborting")
554559
raise Exception("Interchange failed to start")
560+
logger.debug("Got worker ports")
555561

556562
def _start_queue_management_thread(self):
557563
"""Method to start the management thread as a daemon.
@@ -810,13 +816,12 @@ def shutdown(self, timeout: float = 10.0):
810816
logger.info("Attempting HighThroughputExecutor shutdown")
811817

812818
self.interchange_proc.terminate()
813-
self.interchange_proc.join(timeout=timeout)
814-
if self.interchange_proc.is_alive():
819+
try:
820+
self.interchange_proc.wait(timeout=timeout)
821+
except subprocess.TimeoutExpired:
815822
logger.info("Unable to terminate Interchange process; sending SIGKILL")
816823
self.interchange_proc.kill()
817824

818-
self.interchange_proc.close()
819-
820825
logger.info("Finished HighThroughputExecutor shutdown attempt")
821826

822827
def get_usage_information(self):

parsl/executors/high_throughput/interchange.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -672,13 +672,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
672672
logger.addHandler(handler)
673673

674674

675-
@wrap_with_logs(target="interchange")
676-
def starter(*args: Any, **kwargs: Any) -> None:
677-
"""Start the interchange process
678-
679-
The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
680-
"""
675+
if __name__ == "__main__":
681676
setproctitle("parsl: HTEX interchange")
682-
# logger = multiprocessing.get_logger()
683-
ic = Interchange(*args, **kwargs)
677+
678+
config = pickle.load(sys.stdin.buffer)
679+
680+
ic = Interchange(**config)
684681
ic.start()

parsl/tests/test_htex/test_htex.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import pathlib
22
import warnings
3+
from subprocess import Popen, TimeoutExpired
34
from unittest import mock
45

56
import pytest
67

78
from parsl import HighThroughputExecutor, curvezmq
8-
from parsl.multiprocessing import ForkProcess
99

1010
_MOCK_BASE = "parsl.executors.high_throughput.executor"
1111

@@ -78,16 +78,33 @@ def test_htex_shutdown(
7878
timeout_expires: bool,
7979
htex: HighThroughputExecutor,
8080
):
81-
mock_ix_proc = mock.Mock(spec=ForkProcess)
81+
mock_ix_proc = mock.Mock(spec=Popen)
8282

8383
if started:
8484
htex.interchange_proc = mock_ix_proc
85-
mock_ix_proc.is_alive.return_value = True
85+
86+
# This will, in the absence of any exit trigger, block forever if
87+
# no timeout is given and if the interchange does not terminate.
88+
# Raise an exception to report that, rather than actually block,
89+
# and hope that nothing is catching that exception.
90+
91+
# this function implements the behaviour if the interchange has
92+
# not received a termination call
93+
def proc_wait_alive(timeout):
94+
if timeout:
95+
raise TimeoutExpired(cmd="mock-interchange", timeout=timeout)
96+
else:
97+
raise RuntimeError("This wait call would hang forever")
98+
99+
def proc_wait_terminated(timeout):
100+
return 0
101+
102+
mock_ix_proc.wait.side_effect = proc_wait_alive
86103

87104
if not timeout_expires:
88105
# Simulate termination of the Interchange process
89106
def kill_interchange(*args, **kwargs):
90-
mock_ix_proc.is_alive.return_value = False
107+
mock_ix_proc.wait.side_effect = proc_wait_terminated
91108

92109
mock_ix_proc.terminate.side_effect = kill_interchange
93110

@@ -96,16 +113,16 @@ def kill_interchange(*args, **kwargs):
96113
mock_logs = mock_logger.info.call_args_list
97114
if started:
98115
assert mock_ix_proc.terminate.called
99-
assert mock_ix_proc.join.called
100-
assert {"timeout": 10} == mock_ix_proc.join.call_args[1]
116+
assert mock_ix_proc.wait.called
117+
assert {"timeout": 10} == mock_ix_proc.wait.call_args[1]
101118
if timeout_expires:
102119
assert "Unable to terminate Interchange" in mock_logs[1][0][0]
103120
assert mock_ix_proc.kill.called
104121
assert "Attempting" in mock_logs[0][0][0]
105122
assert "Finished" in mock_logs[-1][0][0]
106123
else:
107124
assert not mock_ix_proc.terminate.called
108-
assert not mock_ix_proc.join.called
125+
assert not mock_ix_proc.wait.called
109126
assert "has not started" in mock_logs[0][0][0]
110127

111128

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
python_requires=">=3.8.0",
5757
install_requires=install_requires,
5858
scripts = ['parsl/executors/high_throughput/process_worker_pool.py',
59+
'parsl/executors/high_throughput/interchange.py',
5960
'parsl/executors/workqueue/exec_parsl_function.py',
6061
'parsl/executors/workqueue/parsl_coprocess.py',
6162
],

0 commit comments

Comments
 (0)