@@ -21,6 +21,7 @@ import (
2121 "bytes"
2222 "context"
2323 "encoding/binary"
24+ "errors"
2425 "fmt"
2526 "math/rand"
2627 _ "net/http/pprof"
@@ -29,6 +30,7 @@ import (
2930
3031 "github.com/google/uuid"
3132 log "github.com/sirupsen/logrus"
33+ "github.com/twmb/franz-go/pkg/kerr"
3234 "github.com/twmb/franz-go/pkg/kgo"
3335
3436 "github.com/redpanda-data/kgo-verifier/pkg/util"
@@ -381,6 +383,14 @@ loop:
381383
382384 if v .transactionsEnabled {
383385 if _ , err := v .transactionSTM .BeforeMessageSent (); err != nil {
386+ if errors .Is (err , kerr .OperationNotAttempted ) {
387+ // Try to recover this producer by rolling back the transaction.
388+ err = v .transactionSTM .TryRollbackTransaction ()
389+ if err == nil {
390+ continue
391+ }
392+ }
393+
384394 log .Errorf ("Produce error; transaction failure: %v" , err )
385395 break loop
386396 }
@@ -430,6 +440,18 @@ loop:
430440 // consumer needs logic to handle the unexpected token
431441 log .Debugf ("Produce %s acked %d on partition %d offset %d" , v .config .workerCfg .Name , token , r .Partition , r .Offset )
432442 if err != nil {
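443+ // With transactions, INVALID_TXN_STATE is encountered often while nodes are restarting;
444+ // OperationNotAttempted is handled the same way. Tolerate both by trying to roll back the transaction and, if that succeeds, re-queueing the token.
445+ // TODO: Is there a way to avoid this?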
446+ if v .transactionsEnabled && (errors .Is (err , kerr .OperationNotAttempted ) || errors .Is (err , kerr .InvalidTxnState )) {
447+ err = v .transactionSTM .TryRollbackTransaction ()
448+ if err == nil {
449+ v .pending <- token
450+ ackWait .Done ()
451+ return
452+ }
453+ }
454+
433455 // On produce error, we drop the token: we rely on producer errors
434456 // being rare and/or a background Tuner re-injecting fresh tokens
435457 log .Errorf ("Produce %s error, dropped token %d: %v" , v .config .workerCfg .Name , token , err )
@@ -440,6 +462,7 @@ loop:
440462 v .globalStats .Ack_latency .Update (ackLatency .Microseconds ())
441463 v .totalProduced += 1
442464 }
465+
443466 ackWait .Done ()
444467 }
445468
0 commit comments