errors: re-introduce RequestError

muzarski · muzarski · commit 7a11683751ac · 2025-01-23T00:47:46.000+01:00
Introduced "new" error type and adjusted session.rs, speculative_execution
module and iterator module to this type.

This error represents a definite request failure (after potential retries).
diff --git a/scylla/src/client/pager.rs b/scylla/src/client/pager.rs
@@ -29,7 +29,7 @@ use crate::cluster::{ClusterState, NodeRef};
 #[allow(deprecated)]
 use crate::cql_to_rust::{FromRow, FromRowError};
 use crate::deserialize::DeserializeOwnedRow;
-use crate::errors::ProtocolError;
+use crate::errors::{ProtocolError, RequestError};
 use crate::errors::{QueryError, RequestAttemptError};
 use crate::frame::response::result;
 use crate::network::Connection;
@@ -168,7 +168,7 @@ where
         let query_plan =
             load_balancing::Plan::new(load_balancer.as_ref(), &statement_info, &cluster_data);
 
-        let mut last_error: QueryError = QueryError::EmptyPlan;
+        let mut last_error: RequestError = RequestError::EmptyPlan;
         let mut current_consistency: Consistency = self.query_consistency;
 
         self.log_query_start();
@@ -235,8 +235,12 @@ where
                     retry_decision = ?retry_decision
                 );
 
-                last_error = request_error.into_query_error();
-                self.log_attempt_error(&last_error, &retry_decision);
+                // TODO: This conversion is a temporary measure. It can be removed later
+                // in this PR, once the error type in the history module is narrowed.
+                let q_error: QueryError = request_error.clone().into_query_error();
+                self.log_attempt_error(&q_error, &retry_decision);
+
+                last_error = request_error.into();
 
                 match retry_decision {
                     RetryDecision::RetrySameNode(cl) => {
@@ -266,8 +270,9 @@ where
         }
 
         // Send last_error to QueryPager - query failed fully
-        self.log_query_error(&last_error);
-        let (proof, _) = self.sender.send(Err(last_error)).await;
+        let q_error = last_error.into_query_error();
+        self.log_query_error(&q_error);
+        let (proof, _) = self.sender.send(Err(q_error)).await;
         proof
     }
 
diff --git a/scylla/src/client/session.rs b/scylla/src/client/session.rs
@@ -15,7 +15,8 @@ use crate::cluster::node::CloudEndpoint;
 use crate::cluster::node::{InternalKnownNode, KnownNode, NodeRef};
 use crate::cluster::{Cluster, ClusterNeatDebug, ClusterState};
 use crate::errors::{
-    BadQuery, NewSessionError, ProtocolError, QueryError, RequestAttemptError, TracingProtocolError,
+    BadQuery, NewSessionError, ProtocolError, QueryError, RequestAttemptError, RequestError,
+    TracingProtocolError,
 };
 use crate::frame::response::result;
 #[cfg(feature = "ssl")]
@@ -1966,7 +1967,7 @@ where
                         },
                     )
                     .await
-                    .unwrap_or(Err(QueryError::EmptyPlan))
+                    .unwrap_or(Err(RequestError::EmptyPlan))
                 }
             }
         };
@@ -1977,14 +1978,15 @@ where
         let result = match effective_timeout {
             Some(timeout) => tokio::time::timeout(timeout, runner)
                 .await
+                .map(|res| res.map_err(RequestError::into_query_error))
                 .unwrap_or_else(|e| {
                     Err(QueryError::RequestTimeout(format!(
                         "Request took longer than {}ms: {}",
                         timeout.as_millis(),
                         e
                     )))
                 }),
-            None => runner.await,
+            None => runner.await.map_err(RequestError::into_query_error),
         };
 
         if let Some((history_listener, query_id)) = history_listener_and_id {
@@ -2008,12 +2010,12 @@ where
         run_request_once: impl Fn(Arc<Connection>, Consistency, &ExecutionProfileInner) -> QueryFut,
         execution_profile: &ExecutionProfileInner,
         mut context: ExecuteRequestContext<'a>,
-    ) -> Option<Result<RunRequestResult<ResT>, QueryError>>
+    ) -> Option<Result<RunRequestResult<ResT>, RequestError>>
     where
         QueryFut: Future<Output = Result<ResT, RequestAttemptError>>,
         ResT: AllowedRunRequestResTType,
     {
-        let mut last_error: Option<QueryError> = None;
+        let mut last_error: Option<RequestError> = None;
         let mut current_consistency: Consistency = context
             .consistency_set_on_statement
             .unwrap_or(execution_profile.consistency);
@@ -2097,12 +2099,12 @@ where
                     retry_decision = ?retry_decision
                 );
 
-                last_error = Some(request_error.into_query_error());
-                context.log_attempt_error(
-                    &attempt_id,
-                    last_error.as_ref().unwrap(),
-                    &retry_decision,
-                );
+                // TODO: This conversion is a temporary measure. It can be removed later
+                // in this PR, once the error type in the history module is narrowed.
+                let q_error: QueryError = request_error.clone().into_query_error();
+                context.log_attempt_error(&attempt_id, &q_error, &retry_decision);
+
+                last_error = Some(request_error.into());
 
                 match retry_decision {
                     RetryDecision::RetrySameNode(new_cl) => {
diff --git a/scylla/src/errors.rs b/scylla/src/errors.rs
@@ -871,7 +871,47 @@ pub enum CqlEventHandlingError {
     SendError,
 }
 
-/// An error type that occurred during single attempt of:
+/// An error that occurred during execution of
+/// - `QUERY`
+/// - `PREPARE`
+/// - `EXECUTE`
+/// - `BATCH`
+///
+/// request. This error represents a definite request failure, unlike
+/// [`RequestAttemptError`] which represents a failure of a single
+/// attempt.
+#[derive(Error, Debug, Clone)]
+#[non_exhaustive]
+pub enum RequestError {
+    /// Load balancing policy returned an empty plan.
+    #[error(
+            "Load balancing policy returned an empty plan. \
+            First thing to investigate should be the logic of custom LBP implementation. \
+            If you think that your LBP implementation is correct, or you make use of `DefaultPolicy`, \
+            then this is most probably a driver bug!"
+        )]
+    EmptyPlan,
+
+    /// Selected node's connection pool is in invalid state.
+    #[error("No connections in the pool: {0}")]
+    ConnectionPoolError(#[from] ConnectionPoolError),
+
+    /// Failed to execute request.
+    #[error(transparent)]
+    LastAttemptError(#[from] RequestAttemptError),
+}
+
+impl RequestError {
+    pub fn into_query_error(self) -> QueryError {
+        match self {
+            RequestError::EmptyPlan => QueryError::EmptyPlan,
+            RequestError::ConnectionPoolError(e) => e.into(),
+            RequestError::LastAttemptError(e) => e.into_query_error(),
+        }
+    }
+}
+
+/// An error that occurred during a single attempt of:
 /// - `QUERY`
 /// - `PREPARE`
 /// - `EXECUTE`
diff --git a/scylla/src/policies/speculative_execution.rs b/scylla/src/policies/speculative_execution.rs
@@ -6,7 +6,7 @@ use scylla_cql::frame::response::error::DbError;
 use std::{future::Future, sync::Arc, time::Duration};
 use tracing::{trace_span, warn, Instrument};
 
-use crate::errors::QueryError;
+use crate::errors::{RequestAttemptError, RequestError};
 use crate::observability::metrics::Metrics;
 
 /// Context is passed as an argument to `SpeculativeExecutionPolicy` methods
@@ -85,94 +85,95 @@ impl SpeculativeExecutionPolicy for PercentileSpeculativeExecutionPolicy {
 ///
 /// We should ignore errors such that their presence when executing the request
 /// on one node, does not imply that the same error will appear during retry on some other node.
-fn can_be_ignored<ResT>(result: &Result<ResT, QueryError>) -> bool {
+fn can_be_ignored<ResT>(result: &Result<ResT, RequestError>) -> bool {
     match result {
         Ok(_) => false,
         // Do not remove this lint!
         // It's there for a reason - we don't want new variants
         // automatically fall under `_` pattern when they are introduced.
         #[deny(clippy::wildcard_enum_match_arm)]
         Err(e) => match e {
-            // Errors that will almost certainly appear for other nodes as well
-            QueryError::BadQuery(_)
-            | QueryError::CqlRequestSerialization(_)
-            | QueryError::BodyExtensionsParseError(_)
-            | QueryError::CqlResultParseError(_)
-            | QueryError::CqlErrorParseError(_)
-            | QueryError::ProtocolError(_) => false,
-
-            // EmptyPlan is not returned by `Session::execute_query`.
-            // It is represented by None, which is then transformed
-            // to QueryError::EmptyPlan by the caller
-            // (either here is speculative_execution module, or for non-speculative execution).
-            // I believe this should not be ignored, since we do not expect it here.
-            QueryError::EmptyPlan => false,
-
-            // Errors that should not appear here, thus should not be ignored
-            #[allow(deprecated)]
-            QueryError::NextRowError(_)
-            | QueryError::IntoLegacyQueryResultError(_)
-            | QueryError::TimeoutError
-            | QueryError::RequestTimeout(_)
-            | QueryError::MetadataError(_) => false,
-
-            // Errors that can be ignored
-            QueryError::BrokenConnection(_)
-            | QueryError::UnableToAllocStreamId
-            | QueryError::ConnectionPoolError(_) => true,
-
-            // Handle DbErrors
-            QueryError::DbError(db_error, _) => {
+            // This error should not appear here. Anyway, if it possibly could
+            // in the future, it should not be ignored.
+            RequestError::EmptyPlan => false,
+
+            // Can try on another node.
+            RequestError::ConnectionPoolError { .. } => true,
+
+            RequestError::LastAttemptError(e) => {
                 // Do not remove this lint!
                 // It's there for a reason - we don't want new variants
                 // automatically fall under `_` pattern when they are introduced.
                 #[deny(clippy::wildcard_enum_match_arm)]
-                match db_error {
-                        // Errors that will almost certainly appear on other nodes as well
-                        DbError::SyntaxError
-                        | DbError::Invalid
-                        | DbError::AlreadyExists { .. }
-                        | DbError::Unauthorized
-                        | DbError::ProtocolError => false,
-
-                        // Errors that should not appear there - thus, should not be ignored.
-                        DbError::AuthenticationError | DbError::Other(_) => false,
-
-                        // For now, let's assume that UDF failure is not transient - don't ignore it
-                        // TODO: investigate
-                        DbError::FunctionFailure { .. } => false,
-
-                        // Not sure when these can appear - don't ignore them
-                        // TODO: Investigate these errors
-                        DbError::ConfigError | DbError::TruncateError => false,
-
-                        // Errors that we can ignore and perform a retry on some other node
-                        DbError::Unavailable { .. }
-                        | DbError::Overloaded
-                        | DbError::IsBootstrapping
-                        | DbError::ReadTimeout { .. }
-                        | DbError::WriteTimeout { .. }
-                        | DbError::ReadFailure { .. }
-                        | DbError::WriteFailure { .. }
-                        // Preparation may succeed on some other node.
-                        | DbError::Unprepared { .. }
-                        | DbError::ServerError
-                        | DbError::RateLimitReached { .. } => true,
+                match e {
+                    // Errors that will almost certainly appear for other nodes as well
+                    RequestAttemptError::SerializationError(_)
+                    | RequestAttemptError::CqlRequestSerialization(_)
+                    | RequestAttemptError::BodyExtensionsParseError(_)
+                    | RequestAttemptError::CqlResultParseError(_)
+                    | RequestAttemptError::CqlErrorParseError(_)
+                    | RequestAttemptError::UnexpectedResponse(_)
+                    | RequestAttemptError::RepreparedIdChanged { .. }
+                    | RequestAttemptError::RepreparedIdMissingInBatch => false,
+
+                    // Errors that can be ignored
+                    RequestAttemptError::BrokenConnectionError(_)
+                    | RequestAttemptError::UnableToAllocStreamId => true,
+
+                    // Handle DbErrors
+                    RequestAttemptError::DbError(db_error, _) => {
+                        // Do not remove this lint!
+                        // It's there for a reason - we don't want new variants
+                        // automatically fall under `_` pattern when they are introduced.
+                        #[deny(clippy::wildcard_enum_match_arm)]
+                        match db_error {
+                            // Errors that will almost certainly appear on other nodes as well
+                            DbError::SyntaxError
+                            | DbError::Invalid
+                            | DbError::AlreadyExists { .. }
+                            | DbError::Unauthorized
+                            | DbError::ProtocolError => false,
+
+                            // Errors that should not appear there - thus, should not be ignored.
+                            DbError::AuthenticationError | DbError::Other(_) => false,
+
+                            // For now, let's assume that UDF failure is not transient - don't ignore it
+                            // TODO: investigate
+                            DbError::FunctionFailure { .. } => false,
+
+                            // Not sure when these can appear - don't ignore them
+                            // TODO: Investigate these errors
+                            DbError::ConfigError | DbError::TruncateError => false,
+
+                            // Errors that we can ignore and perform a retry on some other node
+                            DbError::Unavailable { .. }
+                            | DbError::Overloaded
+                            | DbError::IsBootstrapping
+                            | DbError::ReadTimeout { .. }
+                            | DbError::WriteTimeout { .. }
+                            | DbError::ReadFailure { .. }
+                            | DbError::WriteFailure { .. }
+                            // Preparation may succeed on some other node.
+                            | DbError::Unprepared { .. }
+                            | DbError::ServerError
+                            | DbError::RateLimitReached { .. } => true,
+                        }
                     }
+                }
             }
         },
     }
 }
 
-const EMPTY_PLAN_ERROR: QueryError = QueryError::EmptyPlan;
+const EMPTY_PLAN_ERROR: RequestError = RequestError::EmptyPlan;
 
 pub(crate) async fn execute<QueryFut, ResT>(
     policy: &dyn SpeculativeExecutionPolicy,
     context: &Context,
     query_runner_generator: impl Fn(bool) -> QueryFut,
-) -> Result<ResT, QueryError>
+) -> Result<ResT, RequestError>
 where
-    QueryFut: Future<Output = Option<Result<ResT, QueryError>>>,
+    QueryFut: Future<Output = Option<Result<ResT, RequestError>>>,
 {
     let mut retries_remaining = policy.max_retry_count(context);
     let retry_interval = policy.retry_interval(context);