|
12 | 12 | // See the License for the specific language governing permissions and |
13 | 13 | // limitations under the License. |
14 | 14 |
|
| 15 | +use std::cmp::min; |
15 | 16 | use std::sync::atomic::AtomicBool; |
16 | 17 | use std::sync::atomic::Ordering; |
17 | 18 | use std::sync::Arc; |
@@ -46,6 +47,7 @@ use futures_util::future::join_all; |
46 | 47 | use futures_util::TryStreamExt; |
47 | 48 | use log::error; |
48 | 49 | use log::info; |
| 50 | +use log::warn; |
49 | 51 | use opendal::raw::normalize_root; |
50 | 52 | use parking_lot::Mutex; |
51 | 53 | use rand::random; |
@@ -161,30 +163,46 @@ impl GlobalHistoryLog { |
161 | 163 | let meta_key = format!("{}/history_log_transform", self.tenant_id).clone(); |
162 | 164 | let log = GlobalHistoryLog::instance(); |
163 | 165 | let handle = spawn(async move { |
164 | | - let mut consecutive_error = 0; |
| 166 | + let mut persistent_error_cnt = 0; |
| 167 | + let mut temp_error_cnt = 0; |
165 | 168 | loop { |
166 | 169 | match log.transform(&table_clone, &meta_key).await { |
167 | 170 | Ok(acquired_lock) => { |
168 | 171 | if acquired_lock { |
169 | | - consecutive_error = 0; |
| 172 | + persistent_error_cnt = 0; |
| 173 | + temp_error_cnt = 0; |
170 | 174 | } |
| 175 | + sleep(sleep_time).await; |
171 | 176 | } |
172 | 177 | Err(e) => { |
173 | | - error!( |
174 | | - "[HISTORY-TABLES] {} log transform failed due to {}, retry {}", |
175 | | - table_clone.name, e, consecutive_error |
176 | | - ); |
177 | | - consecutive_error += 1; |
178 | | - if consecutive_error > 3 { |
| 178 | + if is_temp_error(&e) { |
| 179 | + // If the error is temporary, we will retry with exponential backoff |
| 180 | + // The max backoff time is 10 minutes |
| 181 | + let backoff_second = |
| 182 | + min(2u64.saturating_pow(temp_error_cnt), 10 * 60); |
| 183 | + temp_error_cnt += 1; |
| 184 | + warn!( |
| 185 | + "[HISTORY-TABLES] {} log transform failed with temporary error {}, count {}, next retry in {} seconds", |
| 186 | + table_clone.name, e, temp_error_cnt, backoff_second |
| 187 | + ); |
| 188 | + sleep(Duration::from_secs(backoff_second)).await; |
| 189 | + } else { |
179 | 190 | error!( |
180 | | - "[HISTORY-TABLES] {} log transform failed too many times, exit", |
181 | | - table_clone.name |
| 191 | + "[HISTORY-TABLES] {} log transform failed with persistent error {}, retry count {}", |
| 192 | + table_clone.name, e, persistent_error_cnt |
182 | 193 | ); |
183 | | - break; |
| 194 | + persistent_error_cnt += 1; |
| 195 | + if persistent_error_cnt > 3 { |
| 196 | + error!( |
| 197 | + "[HISTORY-TABLES] {} log transform failed too many times, giving up", |
| 198 | + table_clone.name |
| 199 | + ); |
| 200 | + return; |
| 201 | + } |
| 202 | + sleep(sleep_time).await; |
184 | 203 | } |
185 | 204 | } |
186 | 205 | } |
187 | | - sleep(sleep_time).await; |
188 | 206 | } |
189 | 207 | }); |
190 | 208 | handles.push(handle); |
@@ -433,3 +451,22 @@ pub async fn setup_operator(params: &Option<StorageParams>) -> Result<()> { |
433 | 451 | GlobalLogger::instance().set_operator(op).await; |
434 | 452 | Ok(()) |
435 | 453 | } |
| 454 | + |
| 455 | +/// Check if the error is a temporary error, |
| 456 | +/// We will use this to determine if we should retry the operation. |
| 457 | +fn is_temp_error(e: &ErrorCode) -> bool { |
| 458 | + let code = e.code(); |
| 459 | + let message = e.message(); |
| 460 | + // Storage and I/O errors are considered temporary errors |
| 461 | + let storage = code == ErrorCode::STORAGE_NOT_FOUND |
| 462 | + || code == ErrorCode::STORAGE_PERMISSION_DENIED |
| 463 | + || code == ErrorCode::STORAGE_UNAVAILABLE |
| 464 | + || code == ErrorCode::STORAGE_UNSUPPORTED |
| 465 | + || code == ErrorCode::STORAGE_INSECURE |
| 466 | + || code == ErrorCode::INVALID_OPERATION |
| 467 | + || code == ErrorCode::STORAGE_OTHER; |
| 468 | + // If acquire semaphore failed, we consider it a temporary error |
| 469 | + let meta = code == ErrorCode::INTERNAL && message.contains("acquire semaphore failed"); |
| 470 | + let transaction = code == ErrorCode::UNRESOLVABLE_CONFLICT; |
| 471 | + storage || transaction || meta |
| 472 | +} |
0 commit comments