Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
731 changes: 731 additions & 0 deletions source/client-side-operations-timeout/tests/pending-response.json

Large diffs are not rendered by default.

375 changes: 375 additions & 0 deletions source/client-side-operations-timeout/tests/pending-response.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,375 @@
description: "Connection churn is prevented by reading pending responses during connection checkout"
schemaVersion: "1.28"
runOnRequirements:
- minServerVersion: "4.4"
# TODO(SERVER-96344): When using failpoints, mongos returns MaxTimeMSExpired
# after maxTimeMS, whereas mongod returns it after
# max(blockTimeMS, maxTimeMS). Until this ticket is resolved, these tests
# will not pass on sharded clusters.
topologies: ["single", "replicaset"]
createEntities:
- client:
id: &failPointClient failPointClient
useMultipleMongoses: false
- client:
id: &client client
uriOptions:
maxPoolSize: 1
minPoolSize: 1
awaitMinPoolSizeMS: 10000
useMultipleMongoses: false
observeEvents:
- commandFailedEvent
- commandSucceededEvent
- connectionCheckOutStartedEvent
- connectionCheckedOutEvent
- connectionCheckOutFailedEvent
- connectionCheckedInEvent
- connectionCreatedEvent
- connectionReadyEvent
- connectionClosedEvent
- connectionPendingResponseSucceededEvent
- connectionPendingResponseStartedEvent
- connectionPendingResponseFailedEvent
- database:
id: &database test
client: *client
databaseName: *database
- collection:
id: &collection coll
database: *database
collectionName: *collection
initialData:
- collectionName: *collection
databaseName: *database
documents: []
tests:
# Attempting a pending response read on a non-timeout operation that can
# immediately read from the TCP buffer should complete the pending read and
# the connection should be checked out.
- description: "non-timeout op with response and no error"
operations:
# Create a failpoint to block the first operation.
- name: failPoint
object: testRunner
arguments:
client: *failPointClient
failPoint:
configureFailPoint: failCommand
mode: {times: 1}
data:
failCommands: ["insert"]
blockConnection: true
blockTimeMS: 100
# Execute operation with timeout less than block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 75
document: {_id: 3, x: 1}
expectError:
isTimeoutError: true
# Execute a subsequent operation to complete the read.
- name: find
object: *collection
arguments:
filter: {_id: 1}
expectEvents:
- client: *client
events:
- commandFailedEvent:
commandName: insert
- commandSucceededEvent:
commandName: find
- client: *client
eventType: cmap
events:
- connectionCheckOutStartedEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Insert fails.
- connectionCheckOutStartedEvent: {}
- connectionPendingResponseStartedEvent: {}
- connectionPendingResponseSucceededEvent: {} # Find operation drains connection.
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Find succeeds.
# Attempting a pending response read on a non-timeout operation that gets no
# response from the server after 3s should close the connection.
- description: "non-timeout op with no response"
operations:
# Create a failpoint to block the first operation.
- name: failPoint
object: testRunner
arguments:
client: *failPointClient
failPoint:
configureFailPoint: failCommand
mode: {times: 1}
data:
failCommands: ["insert"]
blockConnection: true
blockTimeMS: 3100
# Execute operation with timeout less than block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 50
document: {_id: 3, x: 1}
expectError:
isTimeoutError: true
# Execute a subsequent operation to complete the read. We expect failure
# in the pending read, resulting in a closed connection. However, the
# find should retry and succeed.
- name: find
object: *collection
arguments:
filter: {_id: 1}
expectEvents:
- client: *client
events:
- commandFailedEvent:
commandName: insert
- commandSucceededEvent:
commandName: find
- client: *client
eventType: cmap
events:
- connectionCheckOutStartedEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Insert fails.
- connectionCheckOutStartedEvent: {}
- connectionPendingResponseStartedEvent: {}
- connectionPendingResponseFailedEvent:
reason: timeout
- connectionClosedEvent:
reason: error
- connectionCheckOutFailedEvent: {}
- connectionCheckOutStartedEvent: {} # Find's retry starts and succeeds
- connectionCreatedEvent: {}
- connectionReadyEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's minimal overhead to including more events than just these events, so I suggest adding other CMAP events, just to be explicit. I'd expect it looks something like:

          - connectionCheckedOutEvent: {}
          - connectionCheckedInEvent: {} # Insert fails.
          - connectionCheckOutStarted: {} # NEW
          - connectionPendingResponseStarted: {}
          - connectionPendingResponseFailed:
              reason: timeout
          - connectionClosedEvent:
              reason: error
          - connectionCreatedEvent: {} # NEW
          - connectionReadyEvent: {} # NEW
          - connectionCheckedOutEvent: {} # Find's retry starts and succeeds
          - connectionCheckedInEvent: {}

Preston's original PR relied on throwing a retryable error after failing to read a pending response. If we're explicitly not allowing that approach, including extra events enforce that the retry happens inside a single checkout request.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I don't see a reason not to apply this to all tests in the file too)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests we updated to include the following events:

  • connectionCheckOutStarted
  • connectionReadyEvent
  • connectionCreatedEvent
  • connectionReadyEvent

# Attempting a pending response read on a realistic timeout operation that can
# immediately read from the TCP buffer should complete the pending read and
# the connection should be checked out.
- description: "timeout op with response and no error"
operations:
# Create a failpoint to block the first operation.
- name: failPoint
object: testRunner
arguments:
client: *failPointClient
failPoint:
configureFailPoint: failCommand
mode: {times: 1}
data:
failCommands: ["insert"]
blockConnection: true
blockTimeMS: 250
# Execute operation with timeout less than block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 75
document: {_id: 3, x: 1}
expectError:
isTimeoutError: true
# Execute a subsequent operation to complete the read.
- name: find
object: *collection
arguments:
timeoutMS: 200
filter: {_id: 1}
expectEvents:
- client: *client
events:
- commandFailedEvent:
commandName: insert
- commandSucceededEvent:
commandName: find
- client: *client
eventType: cmap
events:
- connectionCheckOutStartedEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Insert fails.
- connectionCheckOutStartedEvent: {}
- connectionPendingResponseStartedEvent: {}
- connectionPendingResponseSucceededEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Find succeeds.
# It may take multiple calls to the pending response handler to drain the
# inbound buffer.
- description: "multiple calls to drain buffer"
operations:
# Create a failpoint to block the first and second operation.
- name: failPoint
object: testRunner
arguments:
client: *failPointClient
failPoint:
configureFailPoint: failCommand
mode: {times: 1}
data:
failCommands: ["insert"]
blockConnection: true
blockTimeMS: 500
# Execute operation with timeout less than block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 50
document: {_id: 3, x: 1}
expectError:
isTimeoutError: true
# Execute a subsequent operation with a timeout less than the block time.
- name: find
object: *collection
arguments:
timeoutMS: 50
filter: {_id: 1}
expectError:
isTimeoutError: true
# Execute a final operation to drain the buffer.
- name: find
object: *collection
arguments:
filter: {_id: 1}
Comment on lines +232 to +236
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we missing an explicit timeoutMS value here?

expectEvents:
- client: *client
events:
- commandFailedEvent:
commandName: insert
- commandSucceededEvent:
commandName: find
- client: *client
eventType: cmap
events:
- connectionCheckOutStartedEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Insert fails.
- connectionCheckOutStartedEvent: {}
- connectionPendingResponseStartedEvent: {} # First find fails
- connectionPendingResponseFailedEvent:
reason: timeout
- connectionCheckOutFailedEvent: {}
- connectionCheckOutStartedEvent: {}
- connectionPendingResponseStartedEvent: {} # Second find drains the buffer.
- connectionPendingResponseSucceededEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Second find succeeds.
# If the connection is closed server-side while draining the response, the
# driver must retry with a different connection.
- description: "read op retries when connection closes server-side while draining response"
operations:
# Create a failpoint to block the first and second operation.
- name: failPoint
object: testRunner
arguments:
client: *failPointClient
failPoint:
configureFailPoint: failCommand
mode: {times: 1}
data:
failCommands: ["insert"]
blockConnection: true
blockTimeMS: 500
closeConnection: true
# Execute operation with timeout less than block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 50
document: {_id: 3, x: 1}
expectError:
isTimeoutError: true
- name: wait
object: testRunner
arguments:
ms: 500
# Execute a subsequent operation with a timeout less than the block time.
- name: find
object: *collection
arguments:
timeoutMS: 50
filter: {_id: 1}
Comment on lines +289 to +294
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've seen flakiness with establishing connections and this test will require a new connection to be established. Should we up the timeout on the find just in case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should not establish any new connection in this test. We have minPoolSize set to 1 together with awaitMinPoolSizeMS parameter, which means the connection should be established before the test start. Also we have maxPoolSize set to 1 too. So the only connection should be marked as "pending response" after the timeout, then we wait "long enough" for the server to reply... and then in scope of the last find operation it should be simple discarding of already buffered data.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I'm confused by how closeConnection: true fits into the picture here. I imagined that:

  • failCommand is triggered, write blocks for 500ms
  • client-side, insert times out
  • failCommand expires, and connection is closed
  • the second insert is executed, checks the connection in the pool, attempts to read the bytes from the socket only to find that the socket has been closed. the connection is discarded, the checkout request immediately then attempts to open a new connection, which succeeds and the second insert completes successfully

Am I misunderstanding how this test works?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, yes! You are right! I've overlooked closeConnection: true. Thank you for spotting the problem!

expectEvents:
- client: *client
events:
- commandFailedEvent:
commandName: insert
- commandSucceededEvent:
commandName: find
- client: *client
eventType: cmap
events:
- connectionCheckOutStartedEvent: {}
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Insert fails.
- connectionCheckOutStartedEvent: {}
- connectionPendingResponseStartedEvent: {} # Pending read fails on first find
- connectionPendingResponseFailedEvent:
reason: error
- connectionClosedEvent:
reason: error
- connectionCreatedEvent: { }
- connectionReadyEvent: { }
- connectionCheckedOutEvent: {}
- connectionCheckedInEvent: {} # Find finishes.
# If the connection is closed server-side while draining the response, the
# driver must retry with a different connection (write operation)
- description: "write op retries when connection closes server-side while draining response"
operations:
# Create a failpoint to block the first and second operation.
- name: failPoint
object: testRunner
arguments:
client: *failPointClient
failPoint:
configureFailPoint: failCommand
mode: {times: 1}
data:
failCommands: ["insert"]
blockConnection: true
blockTimeMS: 500
closeConnection: true
# Execute operation with timeout less than block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 50
document: {_id: 3, x: 1}
expectError:
isTimeoutError: true
- name: wait
object: testRunner
arguments:
ms: 500
# Execute a subsequent operation with a timeout less than the block time.
- name: insertOne
object: *collection
arguments:
timeoutMS: 50
document: {_id: 3, x: 1}
Comment on lines +348 to +352
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as on above test.

expectEvents:
- client: *client
events:
- commandFailedEvent:
commandName: insert
- commandSucceededEvent:
commandName: insert
- client: *client
eventType: cmap
events:
- connectionCheckOutStartedEvent: { }
- connectionCheckedOutEvent: { }
- connectionCheckedInEvent: { } # Insert fails.
- connectionCheckOutStartedEvent: { }
- connectionPendingResponseStartedEvent: { } # Pending read fails on first find
- connectionPendingResponseFailedEvent:
reason: error
- connectionClosedEvent:
reason: error
- connectionCreatedEvent: { }
- connectionReadyEvent: { }
- connectionCheckedOutEvent: { }
- connectionCheckedInEvent: { } # Find finishes.
Loading
Loading