Skip to content
This repository was archived by the owner on Jul 30, 2025. It is now read-only.

Commit 18aca5e

Browse files
committed
kgo-repeater: add additional error handling for txn to inc stability
1 parent 3c01706 commit 18aca5e

File tree

2 files changed

+42
-0
lines changed

2 files changed

+42
-0
lines changed

pkg/worker/repeater/repeater_worker.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"bytes"
2222
"context"
2323
"encoding/binary"
24+
"errors"
2425
"fmt"
2526
"math/rand"
2627
_ "net/http/pprof"
@@ -29,6 +30,7 @@ import (
2930

3031
"github.com/google/uuid"
3132
log "github.com/sirupsen/logrus"
33+
"github.com/twmb/franz-go/pkg/kerr"
3234
"github.com/twmb/franz-go/pkg/kgo"
3335

3436
"github.com/redpanda-data/kgo-verifier/pkg/util"
@@ -381,6 +383,14 @@ loop:
381383

382384
if v.transactionsEnabled {
383385
if _, err := v.transactionSTM.BeforeMessageSent(); err != nil {
386+
if errors.Is(err, kerr.OperationNotAttempted) {
387+
// Try to recover this producer by rolling back the transaction.
388+
err = v.transactionSTM.TryRollbackTransaction()
389+
if err == nil {
390+
continue
391+
}
392+
}
393+
384394
log.Errorf("Produce error; transaction failure: %v", err)
385395
break loop
386396
}
@@ -430,6 +440,18 @@ loop:
430440
// consumer needs logic to handle the unexpected token
431441
log.Debugf("Produce %s acked %d on partition %d offset %d", v.config.workerCfg.Name, token, r.Partition, r.Offset)
432442
if err != nil {
443+
// For transactions an INVALID_TXN_STATE is encountered often while restarting nodes
444+
// Try to be tolerant of this error.
445+
// TODO: Is there a way to avoid this?
446+
if v.transactionsEnabled && (errors.Is(err, kerr.OperationNotAttempted) || errors.Is(err, kerr.InvalidTxnState)) {
447+
err = v.transactionSTM.TryRollbackTransaction()
448+
if err == nil {
449+
v.pending <- token
450+
ackWait.Done()
451+
return
452+
}
453+
}
454+
433455
// On produce error, we drop the token: we rely on producer errors
434456
// being rare and/or a background Tuner re-injecting fresh tokens
435457
log.Errorf("Produce %s error, dropped token %d: %v", v.config.workerCfg.Name, token, err)
@@ -440,6 +462,7 @@ loop:
440462
v.globalStats.Ack_latency.Update(ackLatency.Microseconds())
441463
v.totalProduced += 1
442464
}
465+
443466
ackWait.Done()
444467
}
445468

pkg/worker/transaction_stm.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,25 @@ func (t *TransactionSTM) TryEndTransaction() error {
6464
return nil
6565
}
6666

67+
// If an OPERATION_NOT_ATTEMPTED error occurs on the producer it
68+
// can sometimes recover by rolling back the current transaction.
69+
func (t *TransactionSTM) TryRollbackTransaction() error {
70+
if err := t.client.AbortBufferedRecords(t.ctx); err != nil {
71+
log.Errorf("Error aborting buffered records: %v", err)
72+
return err
73+
}
74+
if err := t.client.EndTransaction(t.ctx, kgo.TryAbort); err != nil {
75+
log.Errorf("Error rolling back transaction: %v", err)
76+
return err
77+
}
78+
79+
log.Debugf("Rolled back a transaction; currentMgsProduced = %d aborted = %t", t.currentMgsProduced, t.abortedTransaction)
80+
81+
t.currentMgsProduced = 0
82+
t.activeTransaction = false
83+
return nil
84+
}
85+
6786
// Returns true iff a new transaction was started and/or a current
6887
// transaction ended. This is to notify any producers that control
6988
// markers will be added to a partition's log.

0 commit comments

Comments
 (0)