-
Notifications
You must be signed in to change notification settings - Fork 181
improve processor stability #473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -257,7 +257,7 @@ func (pp *PartitionProcessor) Start(setupCtx, ctx context.Context) error { | |
| join := join | ||
| pp.runnerGroup.Go(func() error { | ||
| defer pp.state.SetState(PPStateStopping) | ||
| return join.CatchupForever(runnerCtx, false) | ||
| return join.CatchupForever(runnerCtx, true) | ||
| }) | ||
| } | ||
|
|
||
|
|
@@ -274,7 +274,7 @@ func (pp *PartitionProcessor) Start(setupCtx, ctx context.Context) error { | |
| // (b) run the processor table in catchup mode so it keeps updating it's state. | ||
| case runModePassive: | ||
| if pp.table != nil { | ||
| err = pp.table.CatchupForever(runnerCtx, false) | ||
| err = pp.table.CatchupForever(runnerCtx, true) | ||
| } | ||
| default: | ||
| err = fmt.Errorf("processor has invalid run mode") | ||
|
|
@@ -298,16 +298,16 @@ func (pp *PartitionProcessor) Stop() error { | |
| pp.state.SetState(PPStateStopping) | ||
| defer pp.state.SetState(PPStateStopped) | ||
|
|
||
| close(pp.input) | ||
| close(pp.visitInput) | ||
|
|
||
| if pp.cancelRunnerGroup != nil { | ||
| pp.cancelRunnerGroup() | ||
| } | ||
|
|
||
| // wait for the runner to be done | ||
| runningErrs := multierror.Append(pp.runnerGroup.Wait().ErrorOrNil()) | ||
|
|
||
| close(pp.input) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. channels are now closed after the runner-group is done — visitors attach to the runner-group for this. |
||
| close(pp.visitInput) | ||
|
|
||
| // close all the tables | ||
| stopErrg, _ := multierr.NewErrGroup(context.Background()) | ||
| for _, join := range pp.joins { | ||
|
|
@@ -637,15 +637,6 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta | |
|
|
||
| var wg sync.WaitGroup | ||
|
|
||
| // drains the channel and drops out when closed. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. there was actually no point in distinguishing between draining until close and draining until empty, because this function is writing to the channel. |
||
| // This is done when the processor shuts down during visit | ||
| // and makes sure the waitgroup is fully counted down. | ||
| drainUntilClose := func() { | ||
| for range pp.visitInput { | ||
| wg.Done() | ||
| } | ||
| } | ||
|
|
||
| // drains the input channel until there are no more items. | ||
| // does not wait for close, because the channel stays open for the next visit | ||
| drainUntilEmpty := func() { | ||
|
|
@@ -662,6 +653,17 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta | |
| } | ||
| } | ||
|
|
||
| // register a channel that will close once the visitor itself is done. | ||
| visitDone := make(chan struct{}) | ||
| defer close(visitDone) | ||
|
|
||
| // start a goroutine in the processor's runner-errgroup that prevents the broker from shutting down | ||
| // while the visitor is running. | ||
| pp.runnerGroup.Go(func() error { | ||
| <-visitDone | ||
| return nil | ||
| }) | ||
|
|
||
| defer it.Release() | ||
|
|
||
| stopping, doneWaitingForStop := pp.stopping() | ||
|
|
@@ -673,7 +675,7 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta | |
| wg.Add(1) | ||
| select { | ||
| case <-stopping: | ||
| drainUntilClose() | ||
| drainUntilEmpty() | ||
| wg.Done() | ||
| return ErrVisitAborted | ||
| case <-ctx.Done(): | ||
|
|
@@ -703,7 +705,7 @@ func (pp *PartitionProcessor) VisitValues(ctx context.Context, name string, meta | |
| }() | ||
| select { | ||
| case <-stopping: | ||
| drainUntilClose() | ||
| drainUntilEmpty() | ||
| return ErrVisitAborted | ||
| case <-ctx.Done(): | ||
| drainUntilEmpty() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -421,11 +421,11 @@ func (g *Processor) handleSessionErrors(ctx, sessionCtx context.Context, session | |
| ) | ||
|
|
||
| if errors.As(err, &errProc) { | ||
| g.log.Debugf("error processing message (non-transient), shutting down processor: %v", err) | ||
| g.log.Printf("error processing message (non-transient), shutting down processor: %v", err) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. let's log those important errors at a level higher than debug. |
||
| sessionCtxCancel() | ||
| } | ||
| if errors.As(err, &errSetup) { | ||
| g.log.Debugf("setup error (non-transient), shutting down processor: %v", err) | ||
| g.log.Printf("setup error (non-transient), shutting down processor: %v", err) | ||
| sessionCtxCancel() | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ func TestProcessorShutdown_KafkaDisconnect(t *testing.T) { | |
| brokers := initSystemTest(t) | ||
| var ( | ||
| topic = goka.Stream(fmt.Sprintf("goka_systemtest_proc_shutdown_disconnect-%d", time.Now().Unix())) | ||
| join = goka.Stream(fmt.Sprintf("goka_systemtest_proc_shutdown_disconnect-%d-join", time.Now().Unix())) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. adding some join tables to the tests so we can test the reconnecting-joins change from above. |
||
| group = goka.Group(topic) | ||
| ) | ||
|
|
||
|
|
@@ -29,6 +30,7 @@ func TestProcessorShutdown_KafkaDisconnect(t *testing.T) { | |
| tmgr, err := goka.DefaultTopicManagerBuilder(brokers) | ||
| require.NoError(t, err) | ||
| require.NoError(t, tmgr.EnsureStreamExists(string(topic), 10)) | ||
| require.NoError(t, tmgr.EnsureTableExists(string(join), 10)) | ||
|
|
||
| // emit values | ||
| errg.Go(func() error { | ||
|
|
@@ -69,6 +71,7 @@ func TestProcessorShutdown_KafkaDisconnect(t *testing.T) { | |
| ctx.SetValue(msg) | ||
| } | ||
| }), | ||
| goka.Join(goka.Table(join), new(codec.String)), | ||
| goka.Persist(new(codec.Int64)), | ||
| ), | ||
| goka.WithConsumerGroupBuilder(goka.ConsumerGroupBuilderWithConfig(cfg)), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -89,7 +89,7 @@ func checkBroker(broker Broker, config *sarama.Config) error { | |
| } | ||
|
|
||
| err := broker.Open(config) | ||
| if err != nil { | ||
| if err != nil && !errors.Is(err, sarama.ErrAlreadyConnected) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. according to the docs, `sarama.ErrAlreadyConnected` means the broker connection is already open, so it should not be treated as a failure here. |
||
| return fmt.Errorf("error opening broker connection: %v", err) | ||
| } | ||
| connected, err := broker.Connected() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this makes the join-table try to reconnect while the processor is running (together with the other table a few lines below)