Skip to content

Commit f223388

Browse files
author
Arta Asadi
committed
fix: handle timeout
1 parent 6d33547 commit f223388

File tree

2 files changed

+141
-39
lines changed

2 files changed

+141
-39
lines changed

discovery/pkg/worker/service.go

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,52 @@ package worker
33
import (
44
"github.com/spf13/cobra"
55
"go.uber.org/zap"
6+
"os"
7+
"os/signal"
8+
"syscall"
9+
"time"
610
)
711

812
func WorkerCommand() *cobra.Command {
913
cmd := &cobra.Command{
1014
RunE: func(cmd *cobra.Command, args []string) error {
11-
ctx := cmd.Context()
15+
setupCtx := cmd.Context()
1216
cmd.SilenceUsage = true
1317
logger, err := zap.NewProduction()
1418
if err != nil {
1519
return err
1620
}
1721

22+
sigChan := make(chan os.Signal, 1)
23+
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
24+
go func() {
25+
sig := <-sigChan
26+
logger.Warn("Received termination signal, worker will shut down when idle timeout is reached or on next check.", zap.String("signal", sig.String()))
27+
}()
28+
1829
w, err := NewWorker(
1930
logger,
20-
cmd.Context(),
31+
setupCtx,
2132
)
2233
if err != nil {
2334
return err
2435
}
2536

26-
return w.Run(ctx)
37+
// Define the idle timeout duration
38+
idleTimeout := 2 * time.Minute
39+
40+
// Run the worker with a background context, independent of signals,
41+
// and pass the idle timeout.
42+
logger.Info("Starting worker with idle timeout", zap.Duration("timeout", idleTimeout))
43+
runErr := w.Run(context.Background(), idleTimeout) // Use context.Background()
44+
45+
if runErr != nil && !errors.Is(runErr, context.Canceled) { // Avoid logging error for clean context cancellations if any were added back
46+
logger.Error("Worker Run exited with error", zap.Error(runErr))
47+
return runErr // Propagate actual errors
48+
}
49+
50+
logger.Info("Worker Run finished.")
51+
return nil
2752
},
2853
}
2954

discovery/pkg/worker/worker.go

Lines changed: 113 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -65,49 +65,126 @@ func NewWorker(
6565
return w, nil
6666
}
6767

68-
func (w *Worker) Run(ctx context.Context) error {
69-
w.logger.Info("starting to consume", zap.String("url", envs.NatsURL), zap.String("consumer", envs.NatsConsumer),
70-
zap.String("stream", envs.StreamName), zap.String("topic", envs.TopicName))
71-
72-
consumeCtx, err := w.jq.ConsumeWithConfig(ctx, envs.NatsConsumer, envs.StreamName, []string{envs.TopicName}, jetstream.ConsumerConfig{
73-
Replicas: 1,
74-
AckPolicy: jetstream.AckExplicitPolicy,
75-
DeliverPolicy: jetstream.DeliverAllPolicy,
76-
MaxAckPending: -1,
77-
AckWait: time.Minute * 30,
78-
InactiveThreshold: time.Hour,
79-
}, []jetstream.PullConsumeOpt{
80-
jetstream.PullMaxMessages(1),
81-
}, func(msg jetstream.Msg) {
82-
w.logger.Info("received a new job")
83-
84-
err := w.ProcessMessage(ctx, msg)
85-
if err != nil {
86-
// Log error from ProcessMessage itself (e.g., initial setup failure)
87-
// Note: Errors during task.RunTask are handled within ProcessMessage's defer
88-
w.logger.Error("failed during message processing setup", zap.Error(err))
89-
}
68+
func (w *Worker) Run(ctx context.Context, idleTimeout time.Duration) error {
69+
w.logger.Info("Starting Run loop",
70+
zap.String("nats_consumer", envs.NatsConsumer),
71+
zap.String("stream", envs.StreamName),
72+
zap.String("topic", envs.TopicName),
73+
zap.Duration("idleTimeout", idleTimeout))
74+
75+
idleTimer := time.NewTimer(idleTimeout)
9076

91-
// Ack is always sent by Run after ProcessMessage finishes or fails setup
92-
if ackErr := msg.Ack(); ackErr != nil {
93-
w.logger.Error("failed to send the ack message", zap.Error(ackErr))
77+
if !idleTimer.Stop() {
78+
select {
79+
case <-idleTimer.C:
80+
default:
9481
}
82+
}
83+
idleTimer.Reset(idleTimeout)
9584

96-
w.logger.Info("processing a job completed")
97-
})
98-
if err != nil {
99-
w.logger.Error("failed to start consuming messages", zap.Error(err))
100-
return err
85+
var consumer jetstream.ConsumeContext
86+
87+
consumerCtx, cancelConsumer := context.WithCancel(ctx)
88+
defer cancelConsumer()
89+
90+
consumerErrChan := make(chan error, 1)
91+
activityChan := make(chan struct{}, 1)
92+
93+
go func() {
94+
var consErr error
95+
consumer, consErr = w.jq.ConsumeWithConfig(
96+
consumerCtx,
97+
envs.NatsConsumer,
98+
envs.StreamName,
99+
[]string{envs.TopicName},
100+
jetstream.ConsumerConfig{
101+
Replicas: 1,
102+
AckPolicy: jetstream.AckExplicitPolicy,
103+
DeliverPolicy: jetstream.DeliverAllPolicy,
104+
MaxAckPending: -1,
105+
AckWait: time.Minute * 30,
106+
InactiveThreshold: time.Hour,
107+
},
108+
[]jetstream.PullConsumeOpt{
109+
jetstream.PullMaxMessages(1), // Process one message at a time
110+
},
111+
func(msg jetstream.Msg) {
112+
w.logger.Info("Received a new job")
113+
err := w.ProcessMessage(consumerCtx, msg)
114+
if err != nil {
115+
w.logger.Error("Failed during message processing", zap.Error(err))
116+
}
117+
118+
if ackErr := msg.Ack(); ackErr != nil {
119+
w.logger.Error("Failed to send the ack message", zap.Error(ackErr))
120+
121+
} else {
122+
w.logger.Info("Message Ack'd successfully.")
123+
}
124+
125+
w.logger.Info("Processing a job completed, signaling activity.")
126+
127+
select {
128+
case activityChan <- struct{}{}:
129+
default:
130+
w.logger.Warn("Activity channel buffer full?")
131+
}
132+
})
133+
134+
consumerErrChan <- consErr
135+
136+
<-consumerCtx.Done()
137+
w.logger.Info("Consumer goroutine exiting.")
138+
139+
}()
140+
141+
setupErr := <-consumerErrChan
142+
if setupErr != nil {
143+
w.logger.Error("Failed to start NATS consumer", zap.Error(setupErr))
144+
if !idleTimer.Stop() {
145+
select {
146+
case <-idleTimer.C:
147+
default:
148+
}
149+
}
150+
return setupErr
101151
}
102152

103-
w.logger.Info("consuming messages...")
153+
w.logger.Info("NATS consumer started successfully. Waiting for messages or idle timeout...")
154+
155+
for {
156+
select {
157+
case <-ctx.Done():
158+
w.logger.Info("Worker Run context cancelled. Initiating shutdown.")
159+
cancelConsumer()
160+
if consumer != nil {
161+
w.logger.Info("Draining consumer due to context cancellation...")
162+
consumer.Drain()
163+
w.logger.Info("Consumer drained.")
164+
}
165+
return ctx.Err()
104166

105-
<-ctx.Done()
106-
w.logger.Info("Main context cancelled, draining consumer...")
107-
consumeCtx.Drain()
108-
w.logger.Info("Consumer stopped.")
167+
case <-activityChan:
168+
w.logger.Debug("Activity detected, resetting idle timer.")
169+
if !idleTimer.Stop() {
170+
select {
171+
case <-idleTimer.C:
172+
default:
173+
}
174+
}
175+
idleTimer.Reset(idleTimeout)
109176

110-
return nil
177+
case <-idleTimer.C:
178+
w.logger.Info("Idle timeout reached. Initiating shutdown.")
179+
cancelConsumer()
180+
if consumer != nil {
181+
w.logger.Info("Draining consumer due to idle timeout...")
182+
consumer.Drain()
183+
w.logger.Info("Consumer drained.")
184+
}
185+
return nil
186+
}
187+
}
111188
}
112189

113190
func (w *Worker) ProcessMessage(ctx context.Context, msg jetstream.Msg) (err error) {

0 commit comments

Comments
 (0)