@@ -28,9 +28,11 @@ const (
2828 // DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport.
2929 DefaultTimeoutScale = 256 * 1024 // 256KB
3030
31- // rpcMaxPipeline controls the maximum number of outstanding
32- // AppendEntries RPC calls.
33- rpcMaxPipeline = 128
31+ // DefaultMaxRPCsInFlight is the default value used for pipelining configuration
32+ // if a zero value is passed. See https://github.com/hashicorp/raft/pull/541
33+ // for rationale. Note, if this is changed we should update the doc comments
34+ // below for NetworkTransportConfig.MaxRPCsInFlight.
35+ DefaultMaxRPCsInFlight = 2
3436
3537 // connReceiveBufferSize is the size of the buffer we will use for reading RPC requests into
3638 // on followers
@@ -39,6 +41,16 @@ const (
3941 // connSendBufferSize is the size of the buffer we will use for sending RPC request data from
4042 // the leader to followers.
4143 connSendBufferSize = 256 * 1024 // 256KB
44+
45+ // minInFlightForPipelining is a property of our current pipelining
46+ // implementation and must not be changed unless we change the invariants of
47+ // that implementation. Roughly speaking, even with a zero-length in-flight
48+ // buffer we still allow 2 requests to be in-flight before we block because we
49+ // only block after sending and the receiving go-routine always unblocks the
50+ // chan right after first send. This is a constant just to provide context
51+ // rather than a magic number in the few places where we have to check
52+ // invariants to avoid panics etc.
53+ minInFlightForPipelining = 2
4254)
4355
4456var (
@@ -76,7 +88,8 @@ type NetworkTransport struct {
7688
7789 logger hclog.Logger
7890
79- maxPool int
91+ maxPool int
92+ maxInFlight int
8093
8194 serverAddressProvider ServerAddressProvider
8295
@@ -108,6 +121,39 @@ type NetworkTransportConfig struct {
108121 // MaxPool controls how many connections we will pool
109122 MaxPool int
110123
124+ // MaxRPCsInFlight controls the pipelining "optimization" when replicating
125+ // entries to followers.
126+ //
127+ // Setting this to 1 explicitly disables pipelining since no overlapping of
128+ // request processing is allowed. If set to 1 the pipelining code path is
129+ // skipped entirely and every request is processed synchronously.
130+ //
131+ // If zero is set (or left as default), DefaultMaxRPCsInFlight is used which
132+ // is currently 2. A value of 2 overlaps the preparation and sending of the
133+ // next request while waiting for the previous response, but avoids additional
134+ // queuing.
135+ //
136+ // Historically this was internally fixed at (effectively) 130; however,
137+ // performance testing has shown that in practice the pipelining optimization
138+ // combines badly with batching and actually has a very large negative impact
139+ // on commit latency when throughput is high, whilst having very little
140+ // benefit on latency or throughput in any other case! See
141+ // [#541](https://github.com/hashicorp/raft/pull/541) for more analysis of the
142+ // performance impacts.
143+ //
144+ // Increasing this beyond 2 is likely to be beneficial only in very
145+ // high-latency network conditions. HashiCorp doesn't recommend using our own
146+ // products this way.
147+ //
148+ // To maintain the behavior from before version 1.4.1 exactly, set this to
149+ // 130. The old internal constant was 128 but was used directly as a channel
150+ // buffer size. Since we send before blocking on the channel and unblock the
151+ // channel as soon as the receiver is done with the earliest outstanding
152+ // request, even an unbuffered channel (buffer=0) allows one request to be
153+ // sent while waiting for the previous one (i.e. 2 inflight). So the old
154+ // buffer actually allowed 130 RPCs to be inflight at once.
155+ MaxRPCsInFlight int
156+
111157 // Timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply
112158 // the timeout by (SnapshotSize / TimeoutScale).
113159 Timeout time.Duration
@@ -162,11 +208,17 @@ func NewNetworkTransportWithConfig(
162208 Level : hclog .DefaultLevel ,
163209 })
164210 }
211+ maxInFlight := config .MaxRPCsInFlight
212+ if maxInFlight == 0 {
213+ // Default zero value
214+ maxInFlight = DefaultMaxRPCsInFlight
215+ }
165216 trans := & NetworkTransport {
166217 connPool : make (map [ServerAddress ][]* netConn ),
167218 consumeCh : make (chan RPC ),
168219 logger : config .Logger ,
169220 maxPool : config .MaxPool ,
221+ maxInFlight : maxInFlight ,
170222 shutdownCh : make (chan struct {}),
171223 stream : config .Stream ,
172224 timeout : config .Timeout ,
@@ -379,14 +431,20 @@ func (n *NetworkTransport) returnConn(conn *netConn) {
379431// AppendEntriesPipeline returns an interface that can be used to pipeline
380432// AppendEntries requests.
381433func (n * NetworkTransport ) AppendEntriesPipeline (id ServerID , target ServerAddress ) (AppendPipeline , error ) {
434+ if n .maxInFlight < minInFlightForPipelining {
435+ // Pipelining is disabled since no more than one request can be outstanding
436+ // at once. Skip the whole code path and use synchronous requests.
437+ return nil , ErrPipelineReplicationNotSupported
438+ }
439+
382440 // Get a connection
383441 conn , err := n .getConnFromAddressProvider (id , target )
384442 if err != nil {
385443 return nil , err
386444 }
387445
388446 // Create the pipeline
389- return newNetPipeline (n , conn ), nil
447+ return newNetPipeline (n , conn , n . maxInFlight ), nil
390448}
391449
392450// AppendEntries implements the Transport interface.
@@ -720,14 +778,25 @@ func sendRPC(conn *netConn, rpcType uint8, args interface{}) error {
720778 return nil
721779}
722780
723- // newNetPipeline is used to construct a netPipeline from a given
724- // transport and connection.
725- func newNetPipeline (trans * NetworkTransport , conn * netConn ) * netPipeline {
781+ // newNetPipeline is used to construct a netPipeline from a given transport and
782+ // connection. It is a bug to ever call this with maxInFlight less than 2
783+ // (minInFlightForPipelining) and will cause a panic.
784+ func newNetPipeline (trans * NetworkTransport , conn * netConn , maxInFlight int ) * netPipeline {
785+ if maxInFlight < minInFlightForPipelining {
786+ // Shouldn't happen (tm) since we validate this in the one call site and
787+ // skip pipelining if it's lower.
788+ panic ("pipelining makes no sense if maxInFlight < 2" )
789+ }
726790 n := & netPipeline {
727- conn : conn ,
728- trans : trans ,
729- doneCh : make (chan AppendFuture , rpcMaxPipeline ),
730- inprogressCh : make (chan * appendFuture , rpcMaxPipeline ),
791+ conn : conn ,
792+ trans : trans ,
793+ // The buffer size is 2 less than the configured max because we send before
794+ // waiting on the channel and the decode routine unblocks the channel as
795+ // soon as it's waiting on the first request. So a zero-buffered channel
796+ // still allows 1 request to be sent even while decode is still waiting for
797+ // a response from the previous one. i.e. two are inflight at the same time.
798+ inprogressCh : make (chan * appendFuture , maxInFlight - 2 ),
799+ doneCh : make (chan AppendFuture , maxInFlight - 2 ),
731800 shutdownCh : make (chan struct {}),
732801 }
733802 go n .decodeResponses ()
0 commit comments