|
12 | 12 | // See the License for the specific language governing permissions and
|
13 | 13 | // limitations under the License.
|
14 | 14 |
|
| 15 | +use std::cmp::min; |
15 | 16 | use std::sync::atomic::AtomicBool;
|
16 | 17 | use std::sync::atomic::Ordering;
|
17 | 18 | use std::sync::Arc;
|
@@ -46,6 +47,7 @@ use futures_util::future::join_all;
|
46 | 47 | use futures_util::TryStreamExt;
|
47 | 48 | use log::error;
|
48 | 49 | use log::info;
|
| 50 | +use log::warn; |
49 | 51 | use opendal::raw::normalize_root;
|
50 | 52 | use parking_lot::Mutex;
|
51 | 53 | use rand::random;
|
@@ -161,30 +163,46 @@ impl GlobalHistoryLog {
|
161 | 163 | let meta_key = format!("{}/history_log_transform", self.tenant_id).clone();
|
162 | 164 | let log = GlobalHistoryLog::instance();
|
163 | 165 | let handle = spawn(async move {
|
164 |
| - let mut consecutive_error = 0; |
| 166 | + let mut persistent_error_cnt = 0; |
| 167 | + let mut temp_error_cnt = 0; |
165 | 168 | loop {
|
166 | 169 | match log.transform(&table_clone, &meta_key).await {
|
167 | 170 | Ok(acquired_lock) => {
|
168 | 171 | if acquired_lock {
|
169 |
| - consecutive_error = 0; |
| 172 | + persistent_error_cnt = 0; |
| 173 | + temp_error_cnt = 0; |
170 | 174 | }
|
| 175 | + sleep(sleep_time).await; |
171 | 176 | }
|
172 | 177 | Err(e) => {
|
173 |
| - error!( |
174 |
| - "[HISTORY-TABLES] {} log transform failed due to {}, retry {}", |
175 |
| - table_clone.name, e, consecutive_error |
176 |
| - ); |
177 |
| - consecutive_error += 1; |
178 |
| - if consecutive_error > 3 { |
| 178 | + if is_temp_error(&e) { |
| 179 | + // If the error is temporary, we will retry with exponential backoff |
| 180 | + // The max backoff time is 10 minutes |
| 181 | + let backoff_second = |
| 182 | + min(2u64.saturating_pow(temp_error_cnt), 10 * 60); |
| 183 | + temp_error_cnt += 1; |
| 184 | + warn!( |
| 185 | + "[HISTORY-TABLES] {} log transform failed with temporary error {}, count {}, next retry in {} seconds", |
| 186 | + table_clone.name, e, temp_error_cnt, backoff_second |
| 187 | + ); |
| 188 | + sleep(Duration::from_secs(backoff_second)).await; |
| 189 | + } else { |
179 | 190 | error!(
|
180 |
| - "[HISTORY-TABLES] {} log transform failed too many times, exit", |
181 |
| - table_clone.name |
| 191 | + "[HISTORY-TABLES] {} log transform failed with persistent error {}, retry count {}", |
| 192 | + table_clone.name, e, persistent_error_cnt |
182 | 193 | );
|
183 |
| - break; |
| 194 | + persistent_error_cnt += 1; |
| 195 | + if persistent_error_cnt > 3 { |
| 196 | + error!( |
| 197 | + "[HISTORY-TABLES] {} log transform failed too many times, giving up", |
| 198 | + table_clone.name |
| 199 | + ); |
| 200 | + return; |
| 201 | + } |
| 202 | + sleep(sleep_time).await; |
184 | 203 | }
|
185 | 204 | }
|
186 | 205 | }
|
187 |
| - sleep(sleep_time).await; |
188 | 206 | }
|
189 | 207 | });
|
190 | 208 | handles.push(handle);
|
@@ -433,3 +451,22 @@ pub async fn setup_operator(params: &Option<StorageParams>) -> Result<()> {
|
433 | 451 | GlobalLogger::instance().set_operator(op).await;
|
434 | 452 | Ok(())
|
435 | 453 | }
|
| 454 | + |
| 455 | +/// Check if the error is a temporary error, |
| 456 | +/// We will use this to determine if we should retry the operation. |
| 457 | +fn is_temp_error(e: &ErrorCode) -> bool { |
| 458 | + let code = e.code(); |
| 459 | + let message = e.message(); |
| 460 | + // Storage and I/O errors are considered temporary errors |
| 461 | + let storage = code == ErrorCode::STORAGE_NOT_FOUND |
| 462 | + || code == ErrorCode::STORAGE_PERMISSION_DENIED |
| 463 | + || code == ErrorCode::STORAGE_UNAVAILABLE |
| 464 | + || code == ErrorCode::STORAGE_UNSUPPORTED |
| 465 | + || code == ErrorCode::STORAGE_INSECURE |
| 466 | + || code == ErrorCode::INVALID_OPERATION |
| 467 | + || code == ErrorCode::STORAGE_OTHER; |
| 468 | + // If acquire semaphore failed, we consider it a temporary error |
| 469 | + let meta = code == ErrorCode::INTERNAL && message.contains("acquire semaphore failed"); |
| 470 | + let transaction = code == ErrorCode::UNRESOLVABLE_CONFLICT; |
| 471 | + storage || transaction || meta |
| 472 | +} |
0 commit comments