Skip to content

Commit 935a937

Browse files
committed
fix: improve error handling in history table
1 parent fd3ad03 commit 935a937

File tree

3 files changed

+45
-13
lines changed

3 files changed

+45
-13
lines changed

src/common/tracing/src/predefined_tables/history_tables.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
name = "log_history"
44
target = ""
55
create = "CREATE TABLE IF NOT EXISTS system_history.log_history (timestamp TIMESTAMP NULL, path STRING NULL, target STRING NULL, log_level STRING NULL, cluster_id STRING NULL, node_id STRING NULL, warehouse_id STRING NULL, query_id STRING NULL, message STRING NULL, fields VARIANT NULL, batch_number Int64) CLUSTER BY LINEAR(timestamp, query_id)"
6-
transform = "settings (timezone='Etc/UTC') COPY INTO system_history.log_history FROM (SELECT timestamp, path, target, log_level, cluster_id,node_id, warehouse_id, query_id, message, fields, {batch_number} FROM @{stage_name}) file_format = (TYPE = PARQUET) PURGE = TRUE"
6+
transform = "settings (timezone='Etc/UTC') COPY INTO system_history.log_history FROM (SELECT timestamp, path, target, log_level, cluster_id,node_id, warehouse_id, query_id, message, fields, {batch_number} FROM @{stage_name}) file_format = (TYPE = PARQUET) PURGE = TRUE MAX_FILES = 5000"
77
delete = "DELETE FROM system_history.log_history WHERE timestamp < subtract_hours(NOW(), {retention_hours})"
88

99
[[tables]]

src/query/service/src/history_tables/global_history_log.rs

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ use futures_util::future::join_all;
4646
use futures_util::TryStreamExt;
4747
use log::error;
4848
use log::info;
49+
use log::warn;
4950
use opendal::raw::normalize_root;
5051
use parking_lot::Mutex;
5152
use rand::random;
@@ -161,30 +162,42 @@ impl GlobalHistoryLog {
161162
let meta_key = format!("{}/history_log_transform", self.tenant_id).clone();
162163
let log = GlobalHistoryLog::instance();
163164
let handle = spawn(async move {
164-
let mut consecutive_error = 0;
165+
let mut persistent_error_cnt = 0;
166+
let mut temp_error_cnt = 0;
165167
loop {
166168
match log.transform(&table_clone, &meta_key).await {
167169
Ok(acquired_lock) => {
168170
if acquired_lock {
169-
consecutive_error = 0;
171+
persistent_error_cnt = 0;
172+
temp_error_cnt = 0;
170173
}
174+
sleep(sleep_time).await;
171175
}
172176
Err(e) => {
173177
error!(
174-
"[HISTORY-TABLES] {} log transform failed due to {}, retry {}",
175-
table_clone.name, e, consecutive_error
178+
"[HISTORY-TABLES] {} log transform failed with persistent error {}, retry count {}",
179+
table_clone.name, e, persistent_error_cnt
176180
);
177-
consecutive_error += 1;
178-
if consecutive_error > 3 {
179-
error!(
180-
"[HISTORY-TABLES] {} log transform failed too many times, exit",
181-
table_clone.name
181+
if is_temp_error(&e) {
182+
let backoff_second = 2u64.pow(temp_error_cnt);
183+
temp_error_cnt += 1;
184+
warn!(
185+
"[HISTORY-TABLES] {} log transform failed with temporary error {}, next retry in {} seconds",
186+
table_clone.name, e, temp_error_cnt
182187
);
183-
break;
188+
sleep(Duration::from_secs(backoff_second)).await;
189+
} else {
190+
persistent_error_cnt += 1;
191+
if persistent_error_cnt > 3 {
192+
error!(
193+
"[HISTORY-TABLES] {} log transform failed too many times, giving up",
194+
table_clone.name
195+
);
196+
return;
197+
}
184198
}
185199
}
186200
}
187-
sleep(sleep_time).await;
188201
}
189202
});
190203
handles.push(handle);
@@ -433,3 +446,22 @@ pub async fn setup_operator(params: &Option<StorageParams>) -> Result<()> {
433446
GlobalLogger::instance().set_operator(op).await;
434447
Ok(())
435448
}
449+
450+
/// Check if the error is a temporary error,
451+
/// We will use this to determine if we should retry the operation.
452+
fn is_temp_error(e: &ErrorCode) -> bool {
453+
let code = e.code();
454+
let message = e.message();
455+
// Storage and I/O errors are considered temporary errors
456+
let storage = code == ErrorCode::STORAGE_NOT_FOUND
457+
|| code == ErrorCode::STORAGE_PERMISSION_DENIED
458+
|| code == ErrorCode::STORAGE_UNAVAILABLE
459+
|| code == ErrorCode::STORAGE_UNSUPPORTED
460+
|| code == ErrorCode::STORAGE_INSECURE
461+
|| code == ErrorCode::INVALID_OPERATION
462+
|| code == ErrorCode::STORAGE_OTHER;
463+
// If acquire semaphore failed, we consider it a temporary error
464+
let meta = code == ErrorCode::INTERNAL && message.contains("acquire semaphore failed");
465+
let transaction = code == ErrorCode::UNRESOLVABLE_CONFLICT;
466+
storage || transaction || meta
467+
}

src/query/service/src/history_tables/meta.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ impl HistoryMetaHandle {
9090
Duration::from_secs(3),
9191
))
9292
.await
93-
.map_err(|_e| "acquire semaphore failed from GlobalHistoryLog")?;
93+
.map_err(|e| format!("acquire semaphore failed from GlobalHistoryLog {}", e))?;
9494
if interval == 0 {
9595
return Ok(Some(acquired_guard));
9696
}

0 commit comments

Comments
 (0)