Replies: 1 comment
-
This simple Python class monitors the `broker.online` group. The interface as it is here only has a blocking `poll()` method.
import time
from collections import namedtuple, deque
import flux
from flux.idset import IDset
from flux.hostlist import Hostlist
# Record for a single rank that dropped out of the online group: when the
# loss was noticed, the rank number, and the host name looked up for it.
OfflineEvent = namedtuple("OfflineEvent", ["timestamp", "rank", "name"])
class NodeMonitor:
    """Report ranks that drop out of the ``broker.online`` group.

    The monitor issues a streaming ``groups.get`` RPC to rank 0 and compares
    each membership response with the previous one. Every rank present in
    the previous set but missing from the new one is queued as an
    ``OfflineEvent`` and handed out one at a time by :meth:`poll`.
    """

    def __init__(self, handle):
        """Set up monitor state; no RPC is sent until :meth:`start`.

        Args:
            handle: Flux handle used to read the ``hostlist`` attribute and,
                later, to send the RPC.
        """
        self.handle = handle
        # Indexed by rank when building events, to turn a rank into a name.
        self.hostlist = Hostlist(handle.attr_get("hostlist"))
        # Events produced by membership updates but not yet returned.
        self.backlog = deque()
        # Streaming RPC future; stays None until start() is called.
        self.rpc = None
        # Membership from the most recent response. It starts out empty, so
        # the first response has nothing to be compared against.
        self.last_online = IDset()

    def start(self):
        """Send the streaming ``groups.get`` request for ``broker.online``.

        Returns:
            This monitor, so construction and start can be chained.
        """
        payload = {"name": "broker.online"}
        self.rpc = self.handle.rpc(
            "groups.get",
            payload,
            nodeid=0,
            flags=flux.constants.FLUX_RPC_STREAMING,
        )
        return self

    def poll(self, timeout=-1.0):
        """Return the next offline event, waiting for one if none is queued.

        Args:
            timeout: Passed unchanged to the RPC future's ``wait_for()`` for
                each response that has to be awaited.
                NOTE(review): the -1.0 default presumably means "wait
                indefinitely" -- confirm against the Flux Future docs.

        Returns:
            The oldest queued ``OfflineEvent``.

        Raises:
            RuntimeError: If called before :meth:`start`.
        """
        if self.rpc is None:
            raise RuntimeError("poll() called before start()")
        while True:
            if self.backlog:
                return self.__next_event()
            # Nothing queued: consume one more membership response, then
            # reset the future so the next response can be waited on.
            future = self.rpc.wait_for(timeout)
            self.__online_group_update(future.get())
            self.rpc.reset()

    def __next_event(self):
        """Remove and return the oldest event in the backlog."""
        return self.backlog.popleft()

    def __online_group_update(self, resp):
        """Queue an event for each rank missing from a membership response.

        Args:
            resp: Response payload; its ``"members"`` entry is used to build
                the current online ``IDset``.
        """
        # One timestamp is taken per response, so every rank lost in the
        # same update reports the same time.
        observed_at = time.time()
        current = IDset(resp["members"])
        # Previous members minus current members leaves the ranks that left.
        departed = self.last_online - current
        for lost_rank in departed:
            self.__append_event(observed_at, lost_rank)
        # The current membership is the baseline for the next comparison.
        self.last_online = current

    def __append_event(self, timestamp, rank):
        """Append an ``OfflineEvent`` for ``rank`` to the backlog."""
        hostname = self.hostlist[rank]
        self.backlog.append(OfflineEvent(timestamp, rank, hostname))
# Script entry point: open a Flux handle, start the monitor, and print one
# line per lost rank. The __main__ guard keeps the endless polling loop from
# running (and blocking) when this module is merely imported.
if __name__ == "__main__":
    handle = flux.Flux()
    nodemon = NodeMonitor(handle).start()
    while True:
        # poll() returns only once a rank has left the broker.online group.
        timestamp, rank, hostname = nodemon.poll()
        print(f"rank {rank} ({hostname}) lost at {timestamp}")

# Example:
#   $ flux start --test-size=8 --test-exit-mode=leader -Stbon.topo=kary:2
$ flux overlay status
0 tuolumne2150: full
├─ 1 tuolumne2150: full
│ ├─ 3 tuolumne2150: full
│ │ └─ 7 tuolumne2150: full
│ └─ 4 tuolumne2150: full
└─ 2 tuolumne2150: full
├─ 5 tuolumne2150: full
└─ 6 tuolumne2150: full
$ flux python monitor.py &
[1] 3520708
$ flux overlay disconnect 2
flux-overlay: asking tuolumne2150 (rank 0) to disconnect child tuolumne2150 (rank 2)
Mar 27 07:17:57.286358 PDT 2025 broker.err[0]: tuolumne2150 (rank 2) disconnected by request and severed contact with 2 other nodes
Mar 27 07:17:57.286450 PDT 2025 broker.crit[2]: tuolumne2150 (rank 0) sent disconnect control message
Mar 27 07:17:57.386963 PDT 2025 broker.err[0]: dead to Flux: tuolumne[2150,2150,2150] (rank 2,5-6)
rank 2 (tuolumne2150) lost at 1743085077.3871646
rank 5 (tuolumne2150) lost at 1743085077.3871646
rank 6 (tuolumne2150) lost at 1743085077.3871646
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
A user had asked how to monitor for lost nodes in a Flux instance.
In v0.72.0 the ResourceJournalConsumer class was added, which could be used to monitor for `offline` events in the resource eventlog. In v0.71.0, however, that class does not exist and in fact those events are not posted to the eventlog, so a different approach is needed. This discussion will be used to document possible approaches.
Beta Was this translation helpful? Give feedback.
All reactions