1717import logging
1818import threading
1919import time
20- from typing import Annotated
2120from uuid import UUID
2221
23- from pydantic import BaseModel , conint , model_validator
24-
2522from zenml .enums import ExecutionStatus
2623
2724logger = logging .getLogger (__name__ )
2825
2926
3027class StepHeartBeatTerminationException (Exception ):
3128 """Custom exception class for heartbeat termination."""
32-
3329 pass
3430
3531
36- class StepHeartBeatOptions (BaseModel ):
37- """Options group for step heartbeat execution."""
38-
39- step_id : UUID
40- interval : Annotated [int , conint (ge = 10 , le = 60 )]
41- name : str | None = None
42-
43- @model_validator (mode = "after" )
44- def set_default_name (self ) -> "StepHeartBeatOptions" :
45- """Model validator - set name value if missing.
46-
47- Returns:
48- The validated step heartbeat options.
49- """
50- if not self .name :
51- self .name = f"HeartBeatWorker-{ self .step_id } "
52-
53- return self
54-
55-
56- class HeartbeatWorker :
32+ class StepHeartbeatWorker :
5733 """Worker class implementing heartbeat polling and remote termination."""
5834
59- def __init__ (self , options : StepHeartBeatOptions ):
35+ STEP_HEARTBEAT_INTERVAL_SECONDS = 60
36+
37+ def __init__ (self , step_id : UUID ):
6038 """Heartbeat worker constructor.
6139
6240 Args:
63- options: Parameter group - polling interval, step id, etc .
41+ step_id: The step id heartbeat is running for .
6442 """
65- self .options = options
43+ self ._step_id = step_id
6644
6745 self ._thread : threading .Thread | None = None
6846 self ._running : bool = False
@@ -79,7 +57,7 @@ def interval(self) -> int:
7957 Returns:
8058 The heartbeat polling interval value.
8159 """
82- return self .options . interval
60+ return self .STEP_HEARTBEAT_INTERVAL_SECONDS
8361
8462 @property
8563 def name (self ) -> str :
@@ -88,7 +66,7 @@ def name(self) -> str:
8866 Returns:
8967 The name of the heartbeat worker.
9068 """
91- return str ( self .options . name )
69+ return f"HeartBeatWorker- { self .step_id } "
9270
9371 @property
9472 def step_id (self ) -> UUID :
@@ -97,14 +75,13 @@ def step_id(self) -> UUID:
9775 Returns:
9876 The id of the step heartbeat is running for.
9977 """
100- return self .options . step_id
78+ return self ._step_id
10179
10280 # public functions
10381
10482 def start (self ) -> None :
10583 """Start the heartbeat worker on a background thread."""
10684 if self ._thread and self ._thread .is_alive ():
107- logger .info ("%s already running; start() is a no-op" , self .name )
10885 return
10986
11087 self ._running = True
@@ -113,7 +90,7 @@ def start(self) -> None:
11390 target = self ._run , name = self .name , daemon = True
11491 )
11592 self ._thread .start ()
116- logger .info (
93+ logger .debug (
11794 "Daemon thread %s started (interval=%s)" , self .name , self .interval
11895 )
11996
@@ -122,7 +99,7 @@ def stop(self) -> None:
12299 if not self ._running :
123100 return
124101 self ._running = False
125- logger .info ("%s stop requested" , self .name )
102+ logger .debug ("%s stop requested" , self .name )
126103
127104 def is_alive (self ) -> bool :
128105 """Liveness of the heartbeat worker thread.
@@ -134,7 +111,7 @@ def is_alive(self) -> bool:
134111 return bool (t and t .is_alive ())
135112
136113 def _run (self ) -> None :
137- logger .info ("%s run() loop entered" , self .name )
114+ logger .debug ("%s run() loop entered" , self .name )
138115 try :
139116 while self ._running :
140117 try :
@@ -151,22 +128,21 @@ def _run(self) -> None:
151128 _thread .interrupt_main () # raises KeyboardInterrupt in main thread
152129 # Ensure we stop our own loop as well.
153130 self ._running = False
154- except Exception :
131+ except Exception as exc :
155132 # Log-and-continue policy for all other errors.
156- logger .exception (
157- "%s heartbeat() failed; continuing " , self .name
133+ logger .debug (
134+ "%s heartbeat() failed with %s " , self .name , str ( exc )
158135 )
159136 # Sleep after each attempt (even after errors, unless stopped).
160137 if self ._running :
161138 time .sleep (self .interval )
162139 finally :
163- logger .info ("%s run() loop exiting" , self .name )
140+ logger .debug ("%s run() loop exiting" , self .name )
164141
165142 def _heartbeat (self ) -> None :
166143 from zenml .config .global_config import GlobalConfiguration
167144
168145 store = GlobalConfiguration ().zen_store
169-
170146 response = store .update_step_heartbeat (step_run_id = self .step_id )
171147
172148 if response .status in {
0 commit comments