@@ -78,8 +78,8 @@ public final class MainDriver<R extends StateTreeNode> implements MongoDriver {
7878 final Formatter formatter ;
7979
8080 final long flushTimeout ;
81- final long initialRootTimeout ;
82- final long driverReinitializeTimeout ;
81+ final long initializeTimeout ;
82+ final long reinitializationTimeout ;
8383
8484 /**
8585 * {@link MongoClient#close()} throws if called more than once.
@@ -123,26 +123,77 @@ public MainDriver(
123123 this .bsonSerializer = bsonSerializer ;
124124 this .downstream = downstream ;
125125
126- MongoClientSettings .Builder builder = MongoClientSettings
127- .builder (clientSettings );
126+ // Flushes work by waiting for the latest version to arrive on the change stream.
127+ // If we wait for two heartbeats and don't see the update, something has gone wrong.
128+ //
129+ // (Note that flush does a retry, so it will actually take
130+ // twice as long as this before throwing a FlushFailureException.)
131+ flushTimeout = 2L * driverSettings .timescaleMS ();
132+
133+ // Initialization must wait for a heartbeat to succeed, so we wait twice that long,
134+ // plus one more for the network connection and initial query.
135+ initializeTimeout = 3L * driverSettings .timescaleMS ();
136+
137+ // The sum of the steps required to reinitialize after a disconnect,
138+ // plus a little extra to make sure we don't cut it off when it's about to succeed.
139+ reinitializationTimeout =
140+ 2L * driverSettings .timescaleMS () // ChangeStream reconnect
141+ + initializeTimeout // Initialize after reconnecting
142+ + driverSettings .timescaleMS () // Extra buffer
143+ ;
144+
145+ MongoClientSettings .Builder commonSettingsBuilder = MongoClientSettings
146+ .builder (clientSettings )
147+ .applyToServerSettings (s ->
148+ // If timescaleMS is shorter than the default min heartbeat,
149+ // then we need to reduce this setting to prevent the client
150+ // from using a stale view of the server state for too long.
151+ // If timescaleMS is longer, then the user has told us
152+ // they don't mind longer delays and want the increased
153+ // efficiency of fewer heartbeats.
154+ // Either way, timescaleMS is the right value for this setting.
155+ //
156+ // Note that this doesn't set the heartbeat frequency itself.
157+ // That is left at the default value, since it is only used
158+ // to "notice" connectivity problems when the driver is quiescent,
159+ // which is not time-critical and is not governed by timescaleMS:
160+ // the actual behaviour of the bosk during a network partition
161+ // is that its contents remain fixed, and it doesn't matter much
162+ // whether that is achieved by formally disconnecting or simply
163+ // by doing nothing.
164+ s .minHeartbeatFrequency (driverSettings .timescaleMS (), MILLISECONDS ))
165+ ;
128166
129167 // By default, we deal only with durable data that won't get rolled back.
130168 // In some circumstances, we need the very latest possible data for correctness,
131169 // so we override the ReadConcern in those cases.
132- builder
170+ commonSettingsBuilder
133171 .readConcern (ReadConcern .MAJORITY )
134172 .writeConcern (WriteConcern .MAJORITY );
135173
174+ var changeStreamSettingsBuilder = MongoClientSettings .builder (commonSettingsBuilder .build ())
175+ .applyToClusterSettings (c ->
176+ c .serverSelectionTimeout (initializeTimeout , MILLISECONDS ))
177+ .applyToSocketSettings (s ->
178+ s .connectTimeout (initializeTimeout , MILLISECONDS )
179+ // No read timeout for change streams; they can be idle indefinitely
180+ .readTimeout (0 , MILLISECONDS ))
181+ ;
182+
183+ var changeStreamClient = MongoClients .create (changeStreamSettingsBuilder .build ());
184+ closeables .addFirst (changeStreamClient );
185+
136186 // Override timeouts to make them compatible with driverSettings.timescaleMS()
137- builder
138- .timeout (2L * driverSettings .timescaleMS (), MILLISECONDS );
187+ var querySettingsBuilder = MongoClientSettings .builder (commonSettingsBuilder .build ());
188+ querySettingsBuilder
189+ .timeout (flushTimeout , MILLISECONDS );
190+
191+ var queryClient = MongoClients .create (querySettingsBuilder .build ());
192+ closeables .addFirst (queryClient );
139193
140- var mongoClient = MongoClients .create (builder .build ());
141- closeables .addFirst (mongoClient );
142- MongoCollection <BsonDocument > changeStreamCollection = mongoClient
194+ this .queryCollection = TransactionalCollection .of (queryClient
143195 .getDatabase (driverSettings .database ())
144- .getCollection (COLLECTION_NAME , BsonDocument .class );
145- this .queryCollection = TransactionalCollection .of (changeStreamCollection , mongoClient );
196+ .getCollection (COLLECTION_NAME , BsonDocument .class ), queryClient );
146197 LOGGER .debug ("Using database \" {}\" collection \" {}\" " , driverSettings .database (), COLLECTION_NAME );
147198
148199 this .formatter = new Formatter (boskInfo , bsonSerializer );
@@ -153,20 +204,13 @@ public MainDriver(
153204 if (factory != null ) {
154205 listener = factory .apply (listener );
155206 }
207+
208+ MongoCollection <BsonDocument > changeStreamCollection = changeStreamClient
209+ .getDatabase (driverSettings .database ())
210+ .getCollection (COLLECTION_NAME , BsonDocument .class );
156211 this .receiver = new ChangeReceiver (boskInfo .name (), boskInfo .instanceID (), listener , driverSettings , changeStreamCollection );
157212 }
158213
159- // Flushes work by waiting for the latest version to arrive on the change stream.
160- // If we wait twice as long as that takes, and we don't see the update, something
161- // has gone wrong.
162- flushTimeout = 2L * driverSettings .timescaleMS ();
163-
164- // TODO: Justify this calculation.
165- initialRootTimeout = 5L * driverSettings .timescaleMS ();
166-
167- // The ChangeStream resets itself after timescaleMS, so it needs
168- // several times that long to restore itself and publish a new driver.
169- driverReinitializeTimeout = 5L * driverSettings .timescaleMS ();
170214 }
171215
172216 @ Override
@@ -478,7 +522,7 @@ public void onConnectionSucceeded() throws
478522 private void runInitialRootAction (FutureTask <R > initialRootAction ) throws InterruptedException , TimeoutException , InitialRootActionException {
479523 initialRootAction .run ();
480524 try {
481- initialRootAction .get (initialRootTimeout , MILLISECONDS );
525+ initialRootAction .get (initializeTimeout , MILLISECONDS );
482526 LOGGER .debug ("initialRoot action completed successfully" );
483527 } catch (ExecutionException e ) {
484528 LOGGER .debug ("initialRoot action failed" , e );
@@ -544,6 +588,7 @@ private FormatDriver<R> newPreferredFormatDriver() {
544588 }
545589
546590 private FormatDriver <R > detectFormat () throws UninitializedCollectionException , UnrecognizedFormatException {
591+ LOGGER .debug ("Detecting format" );
547592 Manifest manifest = loadManifest ();
548593 DatabaseFormat format = manifest .pando ().isPresent ()? manifest .pando ().get () : SEQUOIA ;
549594 BsonString documentId = (format == SEQUOIA )
@@ -621,7 +666,7 @@ private MDCScope beginDriverOperation(String description, Object... args) {
621666 throw new IllegalStateException ("Driver is closed" );
622667 }
623668 MDCScope ex = setupMDC (boskInfo .name (), boskInfo .instanceID ());
624- LOGGER .debug (description , args );
669+ LOGGER .debug (description + " w/" + this . formatDriver . getClass (). getSimpleName () , args );
625670 if (driverSettings .testing ().eventDelayMS () < 0 ) {
626671 LOGGER .debug ("| eventDelayMS {}ms " , driverSettings .testing ().eventDelayMS ());
627672 try {
@@ -655,8 +700,9 @@ private <X extends Exception, Y extends Exception> void doRetryableDriverOperati
655700 throw new DisconnectedException (e );
656701 }
657702 } else {
658- LOGGER .debug ("MongoException is not recoverable; rethrowing" , e );
659- throw e ;
703+ LOGGER .debug ("MongoException is not recoverable; disconnecting" , e );
704+ setDisconnectedDriver (e );
705+ throw new DisconnectedException (e );
660706 }
661707 }
662708 break ;
@@ -680,8 +726,8 @@ private <X extends Exception, Y extends Exception> void doRetryableDriverOperati
680726 private <X extends Exception , Y extends Exception > void waitAndRetry (RetryableOperation <X , Y > operation , String description , Object ... args ) throws X , Y {
681727 try {
682728 formatDriverLock .lock ();
683- LOGGER .debug ("Waiting for new FormatDriver for {} ms" , driverReinitializeTimeout );
684- boolean success = formatDriverChanged .await (driverReinitializeTimeout , MILLISECONDS );
729+ LOGGER .debug ("Waiting for new FormatDriver for {} ms" , reinitializationTimeout );
730+ boolean success = formatDriverChanged .await (reinitializationTimeout , MILLISECONDS );
685731 if (!success ) {
686732 LOGGER .warn ("Timed out waiting for MongoDB to recover; will retry anyway, but the operation may fail" );
687733 }
@@ -705,7 +751,7 @@ private <X extends Exception, Y extends Exception> void waitAndRetry(RetryableOp
705751 } finally {
706752 formatDriverLock .unlock ();
707753 }
708- LOGGER .debug ("Retrying " + description , args );
754+ LOGGER .debug ("Retrying " + description + " w/" + this . formatDriver . getClass (). getSimpleName () , args );
709755 operation .run ();
710756 }
711757
@@ -716,14 +762,22 @@ private <X extends Exception, Y extends Exception> void waitAndRetry(RetryableOp
716762 * better driver to arrive instead.
717763 */
718764 void setDisconnectedDriver (Throwable reason ) {
719- LOGGER .debug ("quietlySetDisconnectedDriver({}) (previously {})" , reason .getClass ().getSimpleName (), formatDriver .getClass ().getSimpleName ());
765+ LOGGER .debug ("setDisconnectedDriver({}) (previously {})" , reason .getClass ().getSimpleName (), formatDriver .getClass ().getSimpleName ());
766+ FormatDriver <R > oldDriver ;
720767 try {
721768 formatDriverLock .lock ();
722- formatDriver .close ();
769+ oldDriver = formatDriver ;
770+ oldDriver .close ();
723771 formatDriver = new DisconnectedDriver <>(reason );
724772 } finally {
725773 formatDriverLock .unlock ();
726774 }
775+
776+ if (!(oldDriver instanceof DisconnectedDriver <?>)) {
777+ // The receiver is what reconnects us. Poke it to make sure it knows things
778+ // have gone south, and we need to try to reconnect.
779+ receiver .interrupt ();
780+ }
727781 }
728782
729783 /**
0 commit comments