Commit d86e0aa
GODRIVER-2464 Add timeout for RTT monitor hello operations. (#994)
1 parent 6f2489e commit d86e0aa

8 files changed: +210 −100 lines changed

data/server-discovery-and-monitoring/integration/hello-command-error.json
Lines changed: 1 addition & 17 deletions

@@ -117,7 +117,7 @@
       "failPoint": {
         "configureFailPoint": "failCommand",
         "mode": {
-          "times": 2
+          "times": 4
         },
         "data": {
           "failCommands": [
@@ -162,22 +162,6 @@
             }
           ]
         }
-      },
-      {
-        "name": "assertEventCount",
-        "object": "testRunner",
-        "arguments": {
-          "event": "ServerMarkedUnknownEvent",
-          "count": 1
-        }
-      },
-      {
-        "name": "assertEventCount",
-        "object": "testRunner",
-        "arguments": {
-          "event": "PoolClearedEvent",
-          "count": 1
-        }
       }
     ],
     "expectations": [

data/server-discovery-and-monitoring/integration/hello-command-error.yml
Lines changed: 18 additions & 15 deletions

@@ -84,15 +84,16 @@ tests:
           documents:
             - _id: 1
             - _id: 2
-      # Configure the next streaming hello check to fail with a command
-      # error.
-      # Use times: 2 so that the RTT hello is blocked as well.
+      # Configure the next streaming hello check to fail with a command error.
+      # Use "times: 4" to increase the probability that the Monitor check fails
+      # since the RTT hello may trigger this failpoint one or many times as
+      # well.
       - name: configureFailPoint
         object: testRunner
         arguments:
           failPoint:
             configureFailPoint: failCommand
-            mode: { times: 2 }
+            mode: { times: 4 }
             data:
               failCommands: ["hello", "isMaster"]
               appName: commandErrorCheckTest
@@ -119,17 +120,19 @@ tests:
           documents:
             - _id: 3
             - _id: 4
-      # Assert the server was marked Unknown and pool was cleared exactly once.
-      - name: assertEventCount
-        object: testRunner
-        arguments:
-          event: ServerMarkedUnknownEvent
-          count: 1
-      - name: assertEventCount
-        object: testRunner
-        arguments:
-          event: PoolClearedEvent
-          count: 1
+      # We cannot assert the server was marked Unknown and pool was cleared an
+      # exact number of times because the RTT hello may have triggered this
+      # failpoint one or many times as well.
+      # - name: assertEventCount
+      #   object: testRunner
+      #   arguments:
+      #     event: ServerMarkedUnknownEvent
+      #     count: 1
+      # - name: assertEventCount
+      #   object: testRunner
+      #   arguments:
+      #     event: PoolClearedEvent
+      #     count: 1
 
     expectations:
       - command_started_event:
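For context, the failCommand failpoint these tests configure can also be set directly against a test-enabled mongod (one started with enableTestCommands=1). Below is a minimal Go sketch using the driver's RunCommand. The URI is a placeholder and the errorCode shown (91, ShutdownInProgress) is illustrative, since the test's exact error response is not visible in this diff; the appName and "times: 4" mode mirror the test above.

package main

import (
	"context"
	"log"
	"time"

	"go.mongodb.org/mongo-driver/bson"
	"go.mongodb.org/mongo-driver/mongo"
	"go.mongodb.org/mongo-driver/mongo/options"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Placeholder URI; point this at a mongod started with enableTestCommands=1.
	client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://localhost:27017"))
	if err != nil {
		log.Fatal(err)
	}
	defer func() { _ = client.Disconnect(context.Background()) }()

	// Fail the next 4 "hello"/"isMaster" commands on connections whose handshake used the
	// given appName. The errorCode is illustrative, not necessarily the test's exact value.
	failpoint := bson.D{
		{Key: "configureFailPoint", Value: "failCommand"},
		{Key: "mode", Value: bson.D{{Key: "times", Value: 4}}},
		{Key: "data", Value: bson.D{
			{Key: "failCommands", Value: bson.A{"hello", "isMaster"}},
			{Key: "errorCode", Value: 91},
			{Key: "appName", Value: "commandErrorCheckTest"},
		}},
	}
	if err := client.Database("admin").RunCommand(ctx, failpoint).Err(); err != nil {
		log.Fatal(err)
	}
}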

data/server-discovery-and-monitoring/integration/hello-network-error.json
Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@
       "failPoint": {
         "configureFailPoint": "failCommand",
         "mode": {
-          "times": 2
+          "times": 4
        },
        "data": {
          "failCommands": [

data/server-discovery-and-monitoring/integration/hello-network-error.yml
Lines changed: 6 additions & 5 deletions

@@ -84,14 +84,15 @@ tests:
            - _id: 1
            - _id: 2
       # Configure the next streaming hello check to fail with a non-timeout
-      # network error. Use times: 2 to ensure that the the Monitor check fails
-      # since the RTT hello may trigger this failpoint as well.
+      # network error. Use "times: 4" to increase the probability that the
+      # Monitor check fails since the RTT hello may trigger this failpoint one
+      # or many times as well.
       - name: configureFailPoint
         object: testRunner
         arguments:
           failPoint:
             configureFailPoint: failCommand
-            mode: { times: 2 }
+            mode: { times: 4 }
             data:
               failCommands: ["hello", "isMaster"]
               appName: networkErrorCheckTest
@@ -116,8 +117,8 @@ tests:
            - _id: 3
            - _id: 4
       # We cannot assert the server was marked Unknown and pool was cleared an
-      # exact number of times because the RTT hello may or may not have
-      # triggered this failpoint as well.
+      # exact number of times because the RTT hello may have triggered this
+      # failpoint one or many times as well.
       # - name: assertEventCount
       #   object: testRunner
       #   arguments:

data/server-discovery-and-monitoring/integration/hello-timeout.json
Lines changed: 1 addition & 17 deletions

@@ -117,7 +117,7 @@
       "failPoint": {
         "configureFailPoint": "failCommand",
         "mode": {
-          "times": 2
+          "times": 4
         },
         "data": {
           "failCommands": [
@@ -160,22 +160,6 @@
             }
           ]
         }
-      },
-      {
-        "name": "assertEventCount",
-        "object": "testRunner",
-        "arguments": {
-          "event": "ServerMarkedUnknownEvent",
-          "count": 1
-        }
-      },
-      {
-        "name": "assertEventCount",
-        "object": "testRunner",
-        "arguments": {
-          "event": "PoolClearedEvent",
-          "count": 1
-        }
       }
     ],
     "expectations": [

data/server-discovery-and-monitoring/integration/hello-timeout.yml
Lines changed: 18 additions & 14 deletions

@@ -84,14 +84,16 @@ tests:
           documents:
             - _id: 1
             - _id: 2
-      # Configure the next streaming hello check to fail with a timeout
-      # Use times: 2 so that the RTT hello is blocked as well.
+      # Configure the next streaming hello check to fail with a timeout.
+      # Use "times: 4" to increase the probability that the Monitor check times
+      # out since the RTT hello may trigger this failpoint one or many times as
+      # well.
       - name: configureFailPoint
         object: testRunner
         arguments:
           failPoint:
             configureFailPoint: failCommand
-            mode: { times: 2 }
+            mode: { times: 4 }
             data:
               failCommands: ["hello", "isMaster"]
               appName: timeoutMonitorCheckTest
@@ -119,17 +121,19 @@ tests:
           documents:
             - _id: 3
             - _id: 4
-      # Assert the server was marked Unknown and pool was cleared exactly once.
-      - name: assertEventCount
-        object: testRunner
-        arguments:
-          event: ServerMarkedUnknownEvent
-          count: 1
-      - name: assertEventCount
-        object: testRunner
-        arguments:
-          event: PoolClearedEvent
-          count: 1
+      # We cannot assert the server was marked Unknown and pool was cleared an
+      # exact number of times because the RTT hello may have triggered this
+      # failpoint one or many times as well.
+      # - name: assertEventCount
+      #   object: testRunner
+      #   arguments:
+      #     event: ServerMarkedUnknownEvent
+      #     count: 1
+      # - name: assertEventCount
+      #   object: testRunner
+      #   arguments:
+      #     event: PoolClearedEvent
+      #     count: 1
 
     expectations:
       - command_started_event:
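The exact "data" document for this timeout variant falls outside the visible hunks. One common way to simulate a hello check that hangs (an assumption here, not confirmed by this diff) is failCommand's blockConnection/blockTimeMS parameters, available on MongoDB 4.2.9+. A hedged sketch, reusing the client setup and imports from the earlier example:

// simulateHungHello blocks the next 4 "hello"/"isMaster" commands for blockTimeMS milliseconds,
// which exercises the monitor's timeout handling if the block outlasts its deadline. This is a
// sketch only; the test file's actual failpoint "data" document is not shown in this diff.
func simulateHungHello(ctx context.Context, client *mongo.Client) error {
	failpoint := bson.D{
		{Key: "configureFailPoint", Value: "failCommand"},
		{Key: "mode", Value: bson.D{{Key: "times", Value: 4}}},
		{Key: "data", Value: bson.D{
			{Key: "failCommands", Value: bson.A{"hello", "isMaster"}},
			{Key: "blockConnection", Value: true},
			{Key: "blockTimeMS", Value: 1500}, // longer than the check's timeout
			{Key: "appName", Value: "timeoutMonitorCheckTest"},
		}},
	}
	return client.Database("admin").RunCommand(ctx, failpoint).Err()
}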

x/mongo/driver/topology/rtt_monitor.go
Lines changed: 62 additions & 31 deletions

@@ -25,8 +25,15 @@ const (
 )
 
 type rttConfig struct {
-	interval           time.Duration
-	minRTTWindow       time.Duration // Window size to calculate minimum RTT over.
+	// The minimum interval between RTT measurements. The actual interval may be greater if running
+	// the operation takes longer than the interval.
+	interval time.Duration
+
+	// The timeout applied to running the "hello" operation. If the timeout is reached while running
+	// the operation, the RTT sample is discarded. The default is 1 minute.
+	timeout time.Duration
+
+	minRTTWindow       time.Duration
 	createConnectionFn func() *connection
 	createOperationFn  func(driver.Connection) *operation.Hello
 }
@@ -77,8 +84,6 @@ func (r *rttMonitor) disconnect() {
 
 func (r *rttMonitor) start() {
 	defer r.closeWg.Done()
-	ticker := time.NewTicker(r.cfg.interval)
-	defer ticker.Stop()
 
 	var conn *connection
 	defer func() {
@@ -93,9 +98,27 @@ func (r *rttMonitor) start() {
 		}
 	}()
 
+	ticker := time.NewTicker(r.cfg.interval)
+	defer ticker.Stop()
+
 	for {
-		conn = r.runHello(conn)
+		conn := r.cfg.createConnectionFn()
+		err := conn.connect(r.ctx)
+
+		// Add an RTT sample from the new connection handshake and start a runHellos() loop if we
+		// successfully established the new connection. Otherwise, close the connection and try to
+		// create another new connection.
+		if err == nil {
+			r.addSample(conn.helloRTT)
+			r.runHellos(conn)
+		}
+
+		// Close any connection here because we're either about to try to create another new
+		// connection or we're about to exit the loop.
+		_ = conn.close()
 
+		// If a connection error happens quickly, always wait for the monitoring interval to try
+		// to create a new connection to prevent creating connections too quickly.
 		select {
 		case <-ticker.C:
 		case <-r.ctx.Done():
@@ -104,37 +127,45 @@
 	}
 }
 
-// runHello runs a "hello" operation using the provided connection, measures the duration, and adds
-// the duration as an RTT sample and returns the connection used. If the provided connection is nil
-// or closed, runHello tries to establish a new connection. If the "hello" operation returns an
-// error, runHello closes the connection.
-func (r *rttMonitor) runHello(conn *connection) *connection {
-	if conn == nil || conn.closed() {
-		conn := r.cfg.createConnectionFn()
+// runHellos runs "hello" operations in a loop using the provided connection, measuring and
+// recording the operation durations as RTT samples. If it encounters any errors, it returns.
+func (r *rttMonitor) runHellos(conn *connection) {
+	ticker := time.NewTicker(r.cfg.interval)
+	defer ticker.Stop()
 
-		err := conn.connect(r.ctx)
-		if err != nil {
-			return nil
+	for {
+		// Assume that the connection establishment recorded the first RTT sample, so wait for the
+		// first tick before trying to record another RTT sample.
+		select {
+		case <-ticker.C:
+		case <-r.ctx.Done():
+			return
 		}
 
-		// If we just created a new connection, record the "hello" RTT from the new connection and
-		// return the new connection. Don't run another "hello" command this interval because it's
-		// now unnecessary.
-		r.addSample(conn.helloRTT)
-		return conn
-	}
+		// Create a Context with the operation timeout specified in the RTT monitor config. If a
+		// timeout is not set in the RTT monitor config, default to the connection's
+		// "connectTimeoutMS". The purpose of the timeout is to allow the RTT monitor to continue
+		// monitoring server RTTs after an operation gets stuck. An operation can get stuck if the
+		// server or a proxy stops responding to requests on the RTT connection but does not close
+		// the TCP socket, effectively creating an operation that will never complete. We expect
+		// that "connectTimeoutMS" provides at least enough time for a single round trip.
+		timeout := r.cfg.timeout
+		if timeout <= 0 {
+			timeout = conn.config.connectTimeout
+		}
+		ctx, cancel := context.WithTimeout(r.ctx, timeout)
 
-	start := time.Now()
-	err := r.cfg.createOperationFn(initConnection{conn}).Execute(r.ctx)
-	if err != nil {
-		// Errors from the RTT monitor do not reset the RTTs or update the topology, so we close the
-		// existing connection and recreate it on the next check.
-		_ = conn.close()
-		return nil
+		start := time.Now()
+		err := r.cfg.createOperationFn(initConnection{conn}).Execute(ctx)
+		cancel()
+		if err != nil {
+			return
+		}
+		// Only record a sample if the "hello" operation was successful. If it was not successful,
+		// the operation may not have actually performed a complete round trip, so the duration may
+		// be artificially short.
+		r.addSample(time.Since(start))
 	}
-	r.addSample(time.Since(start))
-
-	return conn
 }
 
 // reset sets the average and min RTT to 0. This should only be called from the server monitor when an error