88use std:: {
99 collections:: VecDeque ,
1010 sync:: {
11+ atomic:: {
12+ self ,
13+ AtomicBool ,
14+ } ,
1115 Arc ,
1216 LazyLock ,
1317 Weak ,
@@ -78,6 +82,7 @@ use crate::metrics::{
7882 connection_lifetime_timer,
7983 get_connection_timer,
8084 log_execute,
85+ log_poisoned_connection,
8186 log_query,
8287 log_query_result,
8388 log_transaction,
@@ -87,6 +92,10 @@ use crate::metrics::{
8792static POSTGRES_TIMEOUT : LazyLock < u64 > =
8893 LazyLock :: new ( || env_config ( "POSTGRES_TIMEOUT_SECONDS" , 30 ) ) ;
8994
95+ #[ derive( Debug , thiserror:: Error ) ]
96+ #[ error( "Postgres timeout" ) ]
97+ pub struct PostgresTimeout ;
98+
9099// We have observed postgres connections hanging during bootstrapping --
91100// which means backends can't start -- and during commit -- which means all
92101// future commits fail with OCC errors.
@@ -107,7 +116,9 @@ where
107116 Err ( e) => Err ( e. into( ) )
108117 }
109118 } ,
110- _ = sleep( Duration :: from_secs( * POSTGRES_TIMEOUT ) ) . fuse( ) => Err ( anyhow:: anyhow!( "Postgres timeout" ) ) ,
119+ _ = sleep( Duration :: from_secs( * POSTGRES_TIMEOUT ) ) . fuse( ) => {
120+ Err ( anyhow:: anyhow!( PostgresTimeout ) )
121+ } ,
111122 }
112123}
113124
@@ -169,17 +180,31 @@ impl PooledConnection {
169180
170181/// An active Postgres connection from a [`ConvexPgPool`].
171182///
172- /// Returns the underlying connection to the pool when dropped.
183+ /// Returns the underlying connection to the pool when dropped (unless
184+ /// `self.poisoned` is true).
173185pub ( crate ) struct PostgresConnection < ' a > {
174186 pool : & ' a ConvexPgPool ,
175187 _permit : SemaphorePermit < ' a > ,
176188 conn : Option < PooledConnection > ,
189+ poisoned : AtomicBool ,
177190 schema : & ' a SchemaName ,
178191 labels : Vec < StaticMetricLabel > ,
179192 _tracker : ConnectionTracker ,
180193 _timer : Timer < VMHistogramVec > ,
181194}
182195
196+ fn handle_error ( poisoned : & AtomicBool , e : impl Into < anyhow:: Error > ) -> anyhow:: Error {
197+ let e: anyhow:: Error = e. into ( ) ;
198+ if e. downcast_ref :: < tokio_postgres:: Error > ( )
199+ . is_some_and ( |e| e. is_closed ( ) || e. to_string ( ) . contains ( "unexpected message from server" ) )
200+ || e. downcast_ref :: < PostgresTimeout > ( ) . is_some ( )
201+ {
202+ tracing:: error!( "Not reusing connection after error: {e:#}" ) ;
203+ poisoned. store ( true , atomic:: Ordering :: Relaxed ) ;
204+ }
205+ e
206+ }
207+
183208impl PostgresConnection < ' _ > {
184209 fn substitute_db_name ( & self , query : & ' static str ) -> String {
185210 query. replace ( "@db_name" , & self . schema . escaped )
@@ -193,11 +218,12 @@ impl PostgresConnection<'_> {
193218
194219 pub async fn batch_execute ( & self , query : & ' static str ) -> anyhow:: Result < ( ) > {
195220 log_execute ( self . labels . clone ( ) ) ;
196- Ok ( self
197- . conn ( )
221+ let query = self . substitute_db_name ( query ) ;
222+ self . conn ( )
198223 . client
199- . batch_execute ( & self . substitute_db_name ( query) )
200- . await ?)
224+ . batch_execute ( & query)
225+ . await
226+ . map_err ( |e| handle_error ( & self . poisoned , e) )
201227 }
202228
203229 pub async fn query_opt (
@@ -206,12 +232,10 @@ impl PostgresConnection<'_> {
206232 params : & [ & ( dyn ToSql + Sync ) ] ,
207233 ) -> anyhow:: Result < Option < Row > > {
208234 log_query ( self . labels . clone ( ) ) ;
209- let row = with_timeout (
210- self . conn ( )
211- . client
212- . query_opt ( & self . substitute_db_name ( statement) , params) ,
213- )
214- . await ?;
235+ let query = self . substitute_db_name ( statement) ;
236+ let row = with_timeout ( self . conn ( ) . client . query_opt ( & query, params) )
237+ . await
238+ . map_err ( |e| handle_error ( & self . poisoned , e) ) ?;
215239 if let Some ( row) = & row {
216240 log_query_result ( row, self . labels . clone ( ) ) ;
217241 }
@@ -226,6 +250,7 @@ impl PostgresConnection<'_> {
226250 self . substitute_db_name ( query) ,
227251 ) )
228252 . await
253+ . map_err ( |e| handle_error ( & self . poisoned , e) )
229254 }
230255
231256 pub async fn query_raw < P , I > (
@@ -240,7 +265,9 @@ impl PostgresConnection<'_> {
240265 {
241266 let labels = self . labels . clone ( ) ;
242267 log_query ( labels. clone ( ) ) ;
243- let stream = with_timeout ( self . conn ( ) . client . query_raw ( statement, params) ) . await ?;
268+ let stream = with_timeout ( self . conn ( ) . client . query_raw ( statement, params) )
269+ . await
270+ . map_err ( |e| handle_error ( & self . poisoned , e) ) ?;
244271 Ok ( Self :: wrap_query_stream ( stream, labels) )
245272 }
246273
@@ -263,7 +290,9 @@ impl PostgresConnection<'_> {
263290 params : & [ & ( dyn ToSql + Sync ) ] ,
264291 ) -> anyhow:: Result < u64 > {
265292 log_execute ( self . labels . clone ( ) ) ;
266- with_timeout ( self . conn ( ) . client . execute ( statement, params) ) . await
293+ with_timeout ( self . conn ( ) . client . execute ( statement, params) )
294+ . await
295+ . map_err ( |e| handle_error ( & self . poisoned , e) )
267296 }
268297
269298 pub async fn transaction ( & mut self ) -> anyhow:: Result < PostgresTransaction < ' _ > > {
@@ -272,17 +301,27 @@ impl PostgresConnection<'_> {
272301 . conn
273302 . as_mut ( )
274303 . expect ( "connection is only taken in Drop" ) ;
275- let inner = with_timeout ( conn. client . transaction ( ) ) . await ?;
304+ let inner = match with_timeout ( conn. client . transaction ( ) ) . await {
305+ Ok ( t) => t,
306+ Err ( e) => return Err ( handle_error ( & self . poisoned , e) ) ,
307+ } ;
276308 Ok ( PostgresTransaction {
277309 inner,
278310 statement_cache : & conn. statement_cache ,
311+ poisoned : & self . poisoned ,
279312 schema : self . schema ,
280313 } )
281314 }
282315}
283316
284317impl Drop for PostgresConnection < ' _ > {
285318 fn drop ( & mut self ) {
319+ if * self . poisoned . get_mut ( ) {
320+ // We log here (not at poison time) in case the same connection is
321+ // poisoned more than once.
322+ log_poisoned_connection ( ) ;
323+ return ;
324+ }
286325 let mut conn = self . conn . take ( ) . expect ( "connection is only taken in Drop" ) ;
287326 conn. last_used = Instant :: now ( ) ;
288327 let mut idle_conns = self . pool . connections . lock ( ) ;
@@ -298,6 +337,7 @@ pub struct PostgresTransaction<'a> {
298337 inner : Transaction < ' a > ,
299338 statement_cache : & ' a Mutex < StatementCache > ,
300339 schema : & ' a SchemaName ,
340+ poisoned : & ' a AtomicBool ,
301341}
302342
303343impl PostgresTransaction < ' _ > {
@@ -312,14 +352,17 @@ impl PostgresTransaction<'_> {
312352 self . substitute_db_name ( query) ,
313353 ) )
314354 . await
355+ . map_err ( |e| handle_error ( self . poisoned , e) )
315356 }
316357
317358 pub async fn query (
318359 & self ,
319360 statement : & Statement ,
320361 params : & [ & ( dyn ToSql + Sync ) ] ,
321362 ) -> anyhow:: Result < Vec < Row > > {
322- with_timeout ( self . inner . query ( statement, params) ) . await
363+ with_timeout ( self . inner . query ( statement, params) )
364+ . await
365+ . map_err ( |e| handle_error ( self . poisoned , e) )
323366 }
324367
325368 pub async fn execute_str (
@@ -332,6 +375,7 @@ impl PostgresTransaction<'_> {
332375 . execute ( & self . substitute_db_name ( statement) , params) ,
333376 )
334377 . await
378+ . map_err ( |e| handle_error ( self . poisoned , e) )
335379 }
336380
337381 pub async fn execute_raw < P , I > ( & self , statement : & Statement , params : I ) -> anyhow:: Result < u64 >
@@ -340,11 +384,15 @@ impl PostgresTransaction<'_> {
340384 I : IntoIterator < Item = P > ,
341385 I :: IntoIter : ExactSizeIterator ,
342386 {
343- with_timeout ( self . inner . execute_raw ( statement, params) ) . await
387+ with_timeout ( self . inner . execute_raw ( statement, params) )
388+ . await
389+ . map_err ( |e| handle_error ( self . poisoned , e) )
344390 }
345391
346392 pub async fn commit ( self ) -> anyhow:: Result < ( ) > {
347- with_timeout ( self . inner . commit ( ) ) . await
393+ with_timeout ( self . inner . commit ( ) )
394+ . await
395+ . map_err ( |e| handle_error ( self . poisoned , e) )
348396 }
349397}
350398
@@ -395,6 +443,27 @@ impl ConvexPgPool {
395443 self . pg_config . get_target_session_attrs ( ) == TargetSessionAttrs :: ReadWrite
396444 }
397445
446+ /// Assumes that we already have a semaphore permit
447+ async fn get_connection_internal ( & self ) -> anyhow:: Result < PooledConnection > {
448+ {
449+ let mut conns = self . connections . lock ( ) ;
450+ // Always reuse the newest connection
451+ while let Some ( conn) = conns. pop_back ( ) {
452+ if conn. client . is_closed ( ) {
453+ continue ;
454+ }
455+ return Ok ( conn) ;
456+ }
457+ }
458+ let ( client, conn) = self
459+ . pg_config
460+ . connect ( self . tls_connect . clone ( ) )
461+ . in_span ( Span :: enter_with_local_parent ( "postgres_connect" ) )
462+ . await ?;
463+ common:: runtime:: tokio_spawn ( "postgres_connection" , conn) ;
464+ Ok ( PooledConnection :: new ( client) )
465+ }
466+
398467 pub ( crate ) async fn get_connection < ' a > (
399468 & ' a self ,
400469 name : & ' static str ,
@@ -408,23 +477,8 @@ impl ConvexPgPool {
408477 . trace_if_pending ( "postgres_semaphore_acquire" )
409478 . await
410479 . context ( "ConvexPgPool has been shut down" ) ?;
411- {
412- let mut conns = self . connections . lock ( ) ;
413- // Always reuse the newest connection
414- while let Some ( conn) = conns. pop_back ( ) {
415- if conn. client . is_closed ( ) {
416- continue ;
417- }
418- return Ok ( ( permit, conn) ) ;
419- }
420- }
421- let ( client, conn) = self
422- . pg_config
423- . connect ( self . tls_connect . clone ( ) )
424- . in_span ( Span :: enter_with_local_parent ( "postgres_connect" ) )
425- . await ?;
426- common:: runtime:: tokio_spawn ( "postgres_connection" , conn) ;
427- anyhow:: Ok ( ( permit, PooledConnection :: new ( client) ) )
480+ let conn = self . get_connection_internal ( ) . await ?;
481+ anyhow:: Ok ( ( permit, conn) )
428482 } )
429483 . await ;
430484 pool_get_timer. finish ( conn. is_ok ( ) ) ;
@@ -433,6 +487,7 @@ impl ConvexPgPool {
433487 pool : self ,
434488 _permit : permit,
435489 conn : Some ( conn) ,
490+ poisoned : AtomicBool :: new ( false ) ,
436491 schema,
437492 labels : vec ! [ StaticMetricLabel :: new( "name" , name) ] ,
438493 _tracker : ConnectionTracker :: new ( & self . stats ) ,
0 commit comments