@@ -250,6 +250,11 @@ def coordinator(self):
         else:
             return self.coordinator_id
 
+    def connected(self):
+        """Return True iff the coordinator node is connected"""
+        with self._lock:
+            return self.coordinator_id is not None and self._client.connected(self.coordinator_id)
+
     def ensure_coordinator_ready(self, timeout_ms=None):
         """Block until the coordinator for this group is known.
 
@@ -1058,28 +1063,32 @@ def _run_once(self):
         self.coordinator._client._lock.acquire()
         self.coordinator._lock.acquire()
         try:
-            if self.enabled and self.coordinator.state is MemberState.STABLE:
-                # TODO: When consumer.wakeup() is implemented, we need to
-                # disable here to prevent propagating an exception to this
-                # heartbeat thread
-                # must get client._lock, or maybe deadlock at heartbeat
-                # failure callback in consumer poll
-                self.coordinator._client.poll(timeout_ms=0)
-
             if not self.enabled:
                 heartbeat_log.debug('Heartbeat disabled. Waiting')
                 self.coordinator._client._lock.release()
                 self.coordinator._lock.wait()
-                heartbeat_log.debug('Heartbeat re-enabled.')
+                if self.enabled:
+                    heartbeat_log.debug('Heartbeat re-enabled.')
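+                # return so the outer run loop invokes _run_once again and
+                # re-checks every condition under freshly acquired locks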
+                return
 
1075- elif self .coordinator .state is not MemberState .STABLE :
1074+ if self .coordinator .state is not MemberState .STABLE :
10761075 # the group is not stable (perhaps because we left the
10771076 # group or because the coordinator kicked us out), so
10781077 # disable heartbeats and wait for the main thread to rejoin.
10791078 heartbeat_log .debug ('Group state is not stable, disabling heartbeats' )
10801079 self .disable ()
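+                # exit; heartbeats stay disabled until the main thread rejoins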
+                return
+
+            # TODO: When consumer.wakeup() is implemented, we need to
+            # disable here to prevent propagating an exception to this
+            # heartbeat thread
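+            # non-blocking poll so the client keeps processing network IO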
+            self.coordinator._client.poll(timeout_ms=0)
 
-            elif self.coordinator.coordinator_unknown():
+            if self.coordinator.coordinator_unknown():
                 future = self.coordinator.lookup_coordinator()
                 if not future.is_done or future.failed():
                     # the immediate future check ensures that we backoff
@@ -1088,6 +1097,11 @@ def _run_once(self):
                     self.coordinator._client._lock.release()
                     self.coordinator._lock.wait(self.coordinator.config['retry_backoff_ms'] / 1000)
 
+            elif not self.coordinator.connected():
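+                # coordinator is known but not yet connected; retry after backoff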
+                self.coordinator._client._lock.release()
+                self.coordinator._lock.wait(self.coordinator.config['retry_backoff_ms'] / 1000)
+
             elif self.coordinator.heartbeat.session_timeout_expired():
                 # the session timeout has expired without seeing a
                 # successful heartbeat, so we should probably make sure
@@ -1103,11 +1117,11 @@ def _run_once(self):
                 self.coordinator.maybe_leave_group()
 
             elif not self.coordinator.heartbeat.should_heartbeat():
-                # poll again after waiting for the retry backoff in case
-                # the heartbeat failed or the coordinator disconnected
-                heartbeat_log.log(0, 'Not ready to heartbeat, waiting')
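+                # sleep until the next heartbeat is due, not a fixed retry backoff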
+                next_hb = self.coordinator.heartbeat.time_to_next_heartbeat()
+                heartbeat_log.debug('Waiting %0.1f secs to send next heartbeat', next_hb)
                 self.coordinator._client._lock.release()
-                self.coordinator._lock.wait(self.coordinator.config['retry_backoff_ms'] / 1000)
+                self.coordinator._lock.wait(next_hb)
 
             else:
                 self.coordinator.heartbeat.sent_heartbeat()