@@ -28,6 +28,7 @@ import (
2828 "time"
2929
3030 "github.com/pborman/uuid"
31+ "go.uber.org/cadence/internal/common/backoff"
3132 "go.uber.org/zap"
3233)
3334
@@ -121,8 +122,8 @@ const (
121122
122123 errTooManySessionsMsg string = "too many outstanding sessions"
123124
124- defaultSessionHeartBeatTimeout time.Duration = time .Second * 20
125- maxSessionHeartBeatInterval time.Duration = time .Second * 10
125+ defaultSessionHeartbeatTimeout time.Duration = time .Second * 20
126+ maxSessionHeartbeatInterval time.Duration = time .Second * 10
126127)
127128
128129var (
@@ -305,7 +306,7 @@ func createSession(ctx Context, creationTasklist string, options *SessionOptions
305306 },
306307 }
307308
308- heartbeatTimeout := defaultSessionHeartBeatTimeout
309+ heartbeatTimeout := defaultSessionHeartbeatTimeout
309310 if options .HeartbeatTimeout != time .Duration (0 ) {
310311 heartbeatTimeout = options .HeartbeatTimeout
311312 }
@@ -410,22 +411,47 @@ func sessionCreationActivity(ctx context.Context, sessionID string) error {
410411
411412 activityEnv := getActivityEnv (ctx )
412413 heartbeatInterval := activityEnv .heartbeatTimeout / 3
413- if heartbeatInterval > maxSessionHeartBeatInterval {
414- heartbeatInterval = maxSessionHeartBeatInterval
414+ if heartbeatInterval > maxSessionHeartbeatInterval {
415+ heartbeatInterval = maxSessionHeartbeatInterval
415416 }
416417 ticker := time .NewTicker (heartbeatInterval )
417418 defer ticker .Stop ()
418419
420+ heartbeatRetryPolicy := backoff .NewExponentialRetryPolicy (time .Second )
421+ heartbeatRetryPolicy .SetMaximumInterval (time .Second * 2 )
422+ heartbeatRetryPolicy .SetExpirationInterval (heartbeatInterval )
423+
419424 for {
420425 select {
421426 case <- ctx .Done ():
422427 sessionEnv .CompleteSession (sessionID )
423428 return ctx .Err ()
424429 case <- ticker .C :
425- err := activityEnv .serviceInvoker .Heartbeat ([]byte {})
430+ heartbeatOp := func () error {
431+ // here we skip the internal heartbeat batching, as otherwise the activity has only one chance
432+ // for heartbeating and if that fails, the entire session will fail due to heartbeat timeout.
433+ // since the heartbeat interval is controlled by the session framework, we don't need to worry about
434+ // calling heartbeat too frequently and causing trouble for the server. (note the min heartbeat timeout
435+ // is 1 sec.)
436+ return activityEnv .serviceInvoker .Heartbeat ([]byte {}, true )
437+ }
438+ isRetryable := func (_ error ) bool {
439+ // there will be two types of error here:
440+ // 1. transient errors like timeout, in which case we should not fail the session
441+ // 2. non-retryable errors like activity cancelled, activity not found or domain
442+ // not active. In those cases, the internal implementation will cancel the context,
443+ // so in the next iteration, ctx.Done() will be selected. Here we rely on the heartbeat
444+ // internal implementation to tell which error is non-retryable.
445+ select {
446+ case <- ctx .Done ():
447+ return false
448+ default :
449+ return true
450+ }
451+ }
452+ err := backoff .Retry (ctx , heartbeatOp , heartbeatRetryPolicy , isRetryable )
426453 if err != nil {
427- sessionEnv .CompleteSession (sessionID )
428- return err
454+ GetActivityLogger (ctx ).Info ("session heartbeat failed" , zap .Error (err ), zap .String ("sessionID" , sessionID ))
429455 }
430456 case <- doneCh :
431457 return nil
0 commit comments