1717import logging
1818import threading
1919import time
20- from typing import Annotated
2120from uuid import UUID
2221
23- from pydantic import BaseModel , conint , model_validator
24-
2522from zenml .enums import ExecutionStatus
2623
2724logger = logging .getLogger (__name__ )
@@ -33,36 +30,18 @@ class StepHeartBeatTerminationException(Exception):
3330 pass
3431
3532
36- class StepHeartBeatOptions (BaseModel ):
37- """Options group for step heartbeat execution."""
38-
39- step_id : UUID
40- interval : Annotated [int , conint (ge = 10 , le = 60 )]
41- name : str | None = None
42-
43- @model_validator (mode = "after" )
44- def set_default_name (self ) -> "StepHeartBeatOptions" :
45- """Model validator - set name value if missing.
46-
47- Returns:
48- The validated step heartbeat options.
49- """
50- if not self .name :
51- self .name = f"HeartBeatWorker-{ self .step_id } "
52-
53- return self
54-
55-
56- class HeartbeatWorker :
33+ class StepHeartbeatWorker :
5734 """Worker class implementing heartbeat polling and remote termination."""
5835
59- def __init__ (self , options : StepHeartBeatOptions ):
36+ STEP_HEARTBEAT_INTERVAL_SECONDS = 60
37+
38+ def __init__ (self , step_id : UUID ):
6039 """Heartbeat worker constructor.
6140
6241 Args:
63- options: Parameter group - polling interval, step id, etc .
42+ step_id: The id of the step the heartbeat is running for.
6443 """
65- self .options = options
44+ self ._step_id = step_id
6645
6746 self ._thread : threading .Thread | None = None
6847 self ._running : bool = False
@@ -72,14 +51,23 @@ def __init__(self, options: StepHeartBeatOptions):
7251
7352 # properties
7453
54+ @property
55+ def is_terminated (self ) -> bool :
56+ """Property function for termination signal.
57+
58+ Returns:
59+ True if the worker has been terminated.
60+ """
61+ return self ._terminated
62+
7563 @property
7664 def interval (self ) -> int :
7765 """Property function for heartbeat interval.
7866
7967 Returns:
8068 The heartbeat polling interval value.
8169 """
82- return self .options . interval
70+ return self .STEP_HEARTBEAT_INTERVAL_SECONDS
8371
8472 @property
8573 def name (self ) -> str :
@@ -88,7 +76,7 @@ def name(self) -> str:
8876 Returns:
8977 The name of the heartbeat worker.
9078 """
91- return str ( self .options . name )
79+ return f"HeartBeatWorker- { self .step_id } "
9280
9381 @property
9482 def step_id (self ) -> UUID :
@@ -97,14 +85,13 @@ def step_id(self) -> UUID:
9785 Returns:
9886 The id of the step the heartbeat is running for.
9987 """
100- return self .options . step_id
88+ return self ._step_id
10189
10290 # public functions
10391
10492 def start (self ) -> None :
10593 """Start the heartbeat worker on a background thread."""
10694 if self ._thread and self ._thread .is_alive ():
107- logger .info ("%s already running; start() is a no-op" , self .name )
10895 return
10996
11097 self ._running = True
@@ -113,7 +100,7 @@ def start(self) -> None:
113100 target = self ._run , name = self .name , daemon = True
114101 )
115102 self ._thread .start ()
116- logger .info (
103+ logger .debug (
117104 "Daemon thread %s started (interval=%s)" , self .name , self .interval
118105 )
119106
@@ -122,7 +109,7 @@ def stop(self) -> None:
122109 if not self ._running :
123110 return
124111 self ._running = False
125- logger .info ("%s stop requested" , self .name )
112+ logger .debug ("%s stop requested" , self .name )
126113
127114 def is_alive (self ) -> bool :
128115 """Liveness of the heartbeat worker thread.
@@ -134,7 +121,7 @@ def is_alive(self) -> bool:
134121 return bool (t and t .is_alive ())
135122
136123 def _run (self ) -> None :
137- logger .info ("%s run() loop entered" , self .name )
124+ logger .debug ("%s run() loop entered" , self .name )
138125 try :
139126 while self ._running :
140127 try :
@@ -151,22 +138,21 @@ def _run(self) -> None:
151138 _thread .interrupt_main () # raises KeyboardInterrupt in main thread
152139 # Ensure we stop our own loop as well.
153140 self ._running = False
154- except Exception :
141+ except Exception as exc :
155142 # Log-and-continue policy for all other errors.
156- logger .exception (
157- "%s heartbeat() failed; continuing " , self .name
143+ logger .debug (
144+ "%s heartbeat() failed with %s " , self .name , str ( exc )
158145 )
159146 # Sleep after each attempt (even after errors, unless stopped).
160147 if self ._running :
161148 time .sleep (self .interval )
162149 finally :
163- logger .info ("%s run() loop exiting" , self .name )
150+ logger .debug ("%s run() loop exiting" , self .name )
164151
165152 def _heartbeat (self ) -> None :
166153 from zenml .config .global_config import GlobalConfiguration
167154
168155 store = GlobalConfiguration ().zen_store
169-
170156 response = store .update_step_heartbeat (step_run_id = self .step_id )
171157
172158 if response .status in {
0 commit comments