Skip to content

Commit f6e6e68

Browse files
authored
Merge pull request #14 from monoscope-tech/update-deps-and-test
Upgrade to DataFusion 52 with Utf8View support and fix WAL metadata limits
2 parents 071322e + a91b3bf commit f6e6e68

20 files changed

+1359
-568
lines changed

Cargo.lock

Lines changed: 265 additions & 304 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ edition = "2024"
55

66
[dependencies]
77
tokio = { version = "1.48", features = ["full"] }
8-
datafusion = "51.0.0"
9-
datafusion-datasource = "51.0.0"
8+
datafusion = "52.1.0"
9+
datafusion-datasource = "52.1.0"
1010
arrow = "57.1.0"
1111
arrow-json = "57.1.0"
1212
uuid = { version = "1.17", features = ["v4", "serde"] }
@@ -16,17 +16,16 @@ serde_json = "1.0.141"
1616
serde_with = "3.14"
1717
serde_yaml = "0.9"
1818
async-trait = "0.1.86"
19-
env_logger = "0.11.6"
2019
log = "0.4.27"
2120
color-eyre = "0.6.5"
2221
arrow-schema = "57.1.0"
2322
regex = "1.11.1"
24-
# Updated to latest delta-rs with datafusion 51 and arrow 57 support
25-
deltalake = { git = "https://github.com/delta-io/delta-rs.git", rev = "cacb6c668f535bccfee182cd4ff3b6375b1a4e25", features = [
23+
# Updated to delta-rs with datafusion 52 Utf8View fixes (includes commits 987e535f, ffb794ba)
24+
deltalake = { git = "https://github.com/delta-io/delta-rs.git", rev = "ffb794ba0745394fc4b747a4ef2e11c2d4ec086a", features = [
2625
"datafusion",
2726
"s3",
2827
] }
29-
delta_kernel = { version = "0.19.0", features = [
28+
delta_kernel = { version = "0.19.1", features = [
3029
"arrow-conversion",
3130
"default-engine-rustls",
3231
"arrow-57",
@@ -42,8 +41,8 @@ sqlx = { version = "0.8", features = [
4241
futures = { version = "0.3.31", features = ["alloc"] }
4342
bytes = "1.4"
4443
tokio-rustls = "0.26.1"
45-
datafusion-postgres = "0.13.0"
46-
datafusion-functions-json = "0.51.0"
44+
datafusion-postgres = "0.14.0"
45+
datafusion-functions-json = "0.52.0"
4746
anyhow = "1.0.100"
4847
tokio-util = "0.7.17"
4948
tokio-stream = { version = "0.1.17", features = ["net"] }
@@ -53,8 +52,8 @@ tracing-opentelemetry = "0.32"
5352
opentelemetry = "0.31"
5453
opentelemetry-otlp = { version = "0.31", features = ["grpc-tonic"] }
5554
opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"] }
56-
datafusion-tracing = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git" }
57-
instrumented-object-store = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git" }
55+
datafusion-tracing = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git", rev = "43734ac7a87eacb599d1d855a21c8c157d71acbb" }
56+
instrumented-object-store = { git = "https://github.com/datafusion-contrib/datafusion-tracing.git", rev = "43734ac7a87eacb599d1d855a21c8c157d71acbb" }
5857
dotenv = "0.15.0"
5958
include_dir = "0.7"
6059
aws-config = { version = "1.6.0", features = ["behavior-version-latest"] }
@@ -63,12 +62,13 @@ aws-sdk-s3 = "1.3.0"
6362
aws-sdk-dynamodb = "1.3.0"
6463
url = "2.5.4"
6564
tokio-cron-scheduler = "0.15"
66-
object_store = "0.12.3"
65+
object_store = "0.12.4"
6766
foyer = { version = "0.21.1", features = ["serde"] }
6867
ahash = "0.8"
6968
lru = "0.16.1"
7069
serde_bytes = "0.11.19"
7170
dashmap = "6.1"
71+
parking_lot = "0.12"
7272
envy = "0.4"
7373
tdigests = "1.0"
7474
bincode = { version = "2.0", features = ["serde"] }
@@ -79,11 +79,12 @@ strum = { version = "0.27", features = ["derive"] }
7979
[dev-dependencies]
8080
sqllogictest = { git = "https://github.com/risinglightdb/sqllogictest-rs.git" }
8181
serial_test = "3.2.0"
82-
datafusion-common = "51.0.0"
82+
datafusion-common = "52.1.0"
8383
tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4"] }
8484
scopeguard = "1.2.0"
8585
rand = "0.9.2"
8686
tempfile = "3"
87+
test-case = "3.3"
8788

8889
[features]
8990
default = []

src/buffered_write_layer.rs

Lines changed: 87 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ use tracing::{debug, error, info, instrument, warn};
1414
// 20% overhead accounts for DashMap internal structures, RwLock wrappers,
1515
// Arc<Schema> refs, and Arrow buffer alignment padding
1616
const MEMORY_OVERHEAD_MULTIPLIER: f64 = 1.2;
17+
/// Divisor used to derive the hard limit: hard limit = max_bytes + max_bytes/5 (i.e. 120% of the soft limit), giving headroom for in-flight writes while preventing OOM
18+
const HARD_LIMIT_MULTIPLIER: usize = 5; // max_bytes + max_bytes/5 = 120%
19+
/// Maximum CAS retry attempts before failing
20+
const MAX_CAS_RETRIES: u32 = 100;
21+
/// Base backoff delay in microseconds for CAS retries
22+
const CAS_BACKOFF_BASE_MICROS: u64 = 1;
23+
/// Maximum backoff exponent (caps delay at ~1ms)
24+
const CAS_BACKOFF_MAX_EXPONENT: u32 = 10;
1725

1826
#[derive(Debug, Default)]
1927
pub struct RecoveryStats {
@@ -25,6 +33,13 @@ pub struct RecoveryStats {
2533
pub corrupted_entries_skipped: u64,
2634
}
2735

36+
#[derive(Debug, Default)]
37+
pub struct FlushStats {
38+
pub buckets_flushed: u64,
39+
pub buckets_failed: u64,
40+
pub total_rows: u64,
41+
}
42+
2843
/// Callback for writing batches to Delta Lake. The callback MUST:
2944
/// - Complete the Delta commit (including S3 upload) before returning Ok
3045
/// - Return Err if the commit fails for any reason
@@ -93,16 +108,15 @@ impl BufferedWriteLayer {
93108

94109
/// Try to reserve memory atomically before a write.
95110
/// Returns estimated batch size on success, or error if hard limit exceeded.
96-
/// Callers MUST implement retry logic - hard failures may cause data loss.
111+
/// Uses exponential backoff to reduce CPU thrashing under contention.
97112
fn try_reserve_memory(&self, batches: &[RecordBatch]) -> anyhow::Result<usize> {
98113
let batch_size: usize = batches.iter().map(estimate_batch_size).sum();
99114
let estimated_size = (batch_size as f64 * MEMORY_OVERHEAD_MULTIPLIER) as usize;
100115

101116
let max_bytes = self.max_memory_bytes();
102-
// Hard limit at 120% provides headroom for in-flight writes while preventing OOM
103-
let hard_limit = max_bytes.saturating_add(max_bytes / 5);
117+
let hard_limit = max_bytes.saturating_add(max_bytes / HARD_LIMIT_MULTIPLIER);
104118

105-
for _ in 0..100 {
119+
for attempt in 0..MAX_CAS_RETRIES {
106120
let current_reserved = self.reserved_bytes.load(Ordering::Acquire);
107121
let current_mem = self.mem_buffer.estimated_memory_bytes();
108122
let new_total = current_mem + current_reserved + estimated_size;
@@ -123,8 +137,20 @@ impl BufferedWriteLayer {
123137
{
124138
return Ok(estimated_size);
125139
}
140+
141+
// Exponential backoff: spin_loop for first few attempts, then brief sleep.
142+
// Note: Using std::thread::sleep in this sync function called from async context.
143+
// This is acceptable because: (1) max sleep is ~1ms, (2) only under high contention,
144+
// (3) converting to async would require spawn_blocking which adds more overhead.
145+
if attempt < 5 {
146+
std::hint::spin_loop();
147+
} else {
148+
// Max backoff = 1μs << 10 = 1024μs ≈ 1ms
149+
let backoff_micros = CAS_BACKOFF_BASE_MICROS << attempt.min(CAS_BACKOFF_MAX_EXPONENT);
150+
std::thread::sleep(std::time::Duration::from_micros(backoff_micros));
151+
}
126152
}
127-
anyhow::bail!("Failed to reserve memory after 100 retries due to contention")
153+
anyhow::bail!("Failed to reserve memory after {} retries due to contention", MAX_CAS_RETRIES)
128154
}
129155

130156
fn release_reservation(&self, size: usize) {
@@ -169,6 +195,12 @@ impl BufferedWriteLayer {
169195
self.release_reservation(reserved_size);
170196

171197
result?;
198+
199+
// Immediate flush mode: flush after every insert
200+
if self.config.buffer.flush_immediately() {
201+
self.flush_all_now().await?;
202+
}
203+
172204
debug!("BufferedWriteLayer insert complete: project={}, table={}", project_id, table_name);
173205
Ok(())
174206
}
@@ -202,7 +234,7 @@ impl BufferedWriteLayer {
202234

203235
for entry in entries {
204236
match entry.operation {
205-
WalOperation::Insert => match WalManager::deserialize_batch(&entry.data) {
237+
WalOperation::Insert => match WalManager::deserialize_batch(&entry.data, &entry.table_name) {
206238
Ok(batch) => {
207239
self.mem_buffer.insert(&entry.project_id, &entry.table_name, batch, entry.timestamp_micros)?;
208240
entries_replayed += 1;
@@ -332,7 +364,7 @@ impl BufferedWriteLayer {
332364
return Ok(());
333365
}
334366

335-
info!("Flushing {} buckets to Delta", flushable.len());
367+
debug!("Flushing {} buckets to Delta", flushable.len());
336368

337369
// Flush buckets in parallel with bounded concurrency
338370
let parallelism = self.config.buffer.flush_parallelism();
@@ -442,6 +474,35 @@ impl BufferedWriteLayer {
442474
Ok(())
443475
}
444476

477+
/// Force flush all buffered data to Delta immediately.
478+
pub async fn flush_all_now(&self) -> anyhow::Result<FlushStats> {
479+
let _flush_guard = self.flush_lock.lock().await;
480+
let all_buckets = self.mem_buffer.get_all_buckets();
481+
let mut stats = FlushStats {
482+
total_rows: all_buckets.iter().map(|b| b.row_count as u64).sum(),
483+
..Default::default()
484+
};
485+
486+
for bucket in all_buckets {
487+
match self.flush_bucket(&bucket).await {
488+
Ok(()) => {
489+
self.checkpoint_and_drain(&bucket);
490+
stats.buckets_flushed += 1;
491+
}
492+
Err(e) => {
493+
error!("flush_all_now: failed bucket {}: {}", bucket.bucket_id, e);
494+
stats.buckets_failed += 1;
495+
}
496+
}
497+
}
498+
Ok(stats)
499+
}
500+
501+
/// Check if buffer is empty (all data flushed).
502+
pub fn is_empty(&self) -> bool {
503+
self.mem_buffer.get_stats().total_rows == 0
504+
}
505+
445506
pub fn get_stats(&self) -> MemBufferStats {
446507
self.mem_buffer.get_stats()
447508
}
@@ -503,8 +564,8 @@ impl BufferedWriteLayer {
503564
#[cfg(test)]
504565
mod tests {
505566
use super::*;
506-
use arrow::array::{Int64Array, StringArray};
507-
use arrow::datatypes::{DataType, Field, Schema};
567+
use crate::test_utils::test_helpers::{json_to_batch, test_span};
568+
use serial_test::serial;
508569
use std::path::PathBuf;
509570
use tempfile::tempdir;
510571

@@ -514,14 +575,14 @@ mod tests {
514575
Arc::new(cfg)
515576
}
516577

517-
fn create_test_batch() -> RecordBatch {
518-
let schema = Arc::new(Schema::new(vec![
519-
Field::new("id", DataType::Int64, false),
520-
Field::new("name", DataType::Utf8, false),
521-
]));
522-
let id_array = Int64Array::from(vec![1, 2, 3]);
523-
let name_array = StringArray::from(vec!["a", "b", "c"]);
524-
RecordBatch::try_new(schema, vec![Arc::new(id_array), Arc::new(name_array)]).unwrap()
578+
fn create_test_batch(project_id: &str) -> RecordBatch {
579+
// Use test_span helper which creates data matching the default schema
580+
json_to_batch(vec![
581+
test_span("test1", "span1", project_id),
582+
test_span("test2", "span2", project_id),
583+
test_span("test3", "span3", project_id),
584+
])
585+
.unwrap()
525586
}
526587

527588
#[tokio::test]
@@ -535,7 +596,7 @@ mod tests {
535596
let table = format!("t{}", test_id);
536597

537598
let layer = BufferedWriteLayer::with_config(cfg).unwrap();
538-
let batch = create_test_batch();
599+
let batch = create_test_batch(&project);
539600

540601
layer.insert(&project, &table, vec![batch.clone()]).await.unwrap();
541602

@@ -544,15 +605,16 @@ mod tests {
544605
assert_eq!(results[0].num_rows(), 3);
545606
}
546607

547-
// NOTE: This test is ignored because walrus-rust creates new files for each instance
548-
// rather than discovering existing files from previous instances in the same directory.
549-
// This is a limitation of the walrus library, not our code.
550-
#[ignore]
608+
#[serial]
551609
#[tokio::test]
552610
async fn test_recovery() {
553611
let dir = tempdir().unwrap();
554612
let cfg = create_test_config(dir.path().to_path_buf());
555613

614+
// SAFETY: walrus-rust reads WALRUS_DATA_DIR from environment. We use #[serial]
615+
// to prevent concurrent access to this process-global state.
616+
unsafe { std::env::set_var("WALRUS_DATA_DIR", &cfg.core.walrus_data_dir) };
617+
556618
// Use unique but short project/table names (walrus has metadata size limit)
557619
let test_id = &uuid::Uuid::new_v4().to_string()[..4];
558620
let project = format!("r{}", test_id);
@@ -561,10 +623,9 @@ mod tests {
561623
// First instance - write data
562624
{
563625
let layer = BufferedWriteLayer::with_config(Arc::clone(&cfg)).unwrap();
564-
let batch = create_test_batch();
626+
let batch = create_test_batch(&project);
565627
layer.insert(&project, &table, vec![batch]).await.unwrap();
566-
// Shutdown to ensure WAL is synced
567-
layer.shutdown().await.unwrap();
628+
// Layer drops here - WAL data should be persisted
568629
}
569630

570631
// Second instance - recover from WAL
@@ -591,7 +652,7 @@ mod tests {
591652
let layer = BufferedWriteLayer::with_config(cfg).unwrap();
592653

593654
// First insert should succeed
594-
let batch = create_test_batch();
655+
let batch = create_test_batch(&project);
595656
layer.insert(&project, &table, vec![batch]).await.unwrap();
596657

597658
// Verify reservation is released (should be 0 after successful insert)

src/config.rs

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -198,18 +198,19 @@ impl AwsConfig {
198198
}
199199

200200
let mut opts = HashMap::new();
201-
insert_opt!(opts, "aws_access_key_id", self.aws_access_key_id);
202-
insert_opt!(opts, "aws_secret_access_key", self.aws_secret_access_key);
203-
insert_opt!(opts, "aws_region", self.aws_default_region);
204-
opts.insert("aws_endpoint".into(), endpoint_override.unwrap_or(&self.aws_s3_endpoint).to_string());
201+
insert_opt!(opts, "AWS_ACCESS_KEY_ID", self.aws_access_key_id);
202+
insert_opt!(opts, "AWS_SECRET_ACCESS_KEY", self.aws_secret_access_key);
203+
insert_opt!(opts, "AWS_REGION", self.aws_default_region);
204+
insert_opt!(opts, "AWS_ALLOW_HTTP", self.aws_allow_http);
205+
opts.insert("AWS_ENDPOINT_URL".into(), endpoint_override.unwrap_or(&self.aws_s3_endpoint).to_string());
205206

206207
if self.is_dynamodb_locking_enabled() {
207-
opts.insert("aws_s3_locking_provider".into(), "dynamodb".into());
208-
insert_opt!(opts, "delta_dynamo_table_name", self.dynamodb.delta_dynamo_table_name);
209-
insert_opt!(opts, "aws_access_key_id_dynamodb", self.dynamodb.aws_access_key_id_dynamodb);
210-
insert_opt!(opts, "aws_secret_access_key_dynamodb", self.dynamodb.aws_secret_access_key_dynamodb);
211-
insert_opt!(opts, "aws_region_dynamodb", self.dynamodb.aws_region_dynamodb);
212-
insert_opt!(opts, "aws_endpoint_url_dynamodb", self.dynamodb.aws_endpoint_url_dynamodb);
208+
opts.insert("AWS_S3_LOCKING_PROVIDER".into(), "dynamodb".into());
209+
insert_opt!(opts, "DELTA_DYNAMO_TABLE_NAME", self.dynamodb.delta_dynamo_table_name);
210+
insert_opt!(opts, "AWS_ACCESS_KEY_ID_DYNAMODB", self.dynamodb.aws_access_key_id_dynamodb);
211+
insert_opt!(opts, "AWS_SECRET_ACCESS_KEY_DYNAMODB", self.dynamodb.aws_secret_access_key_dynamodb);
212+
insert_opt!(opts, "AWS_REGION_DYNAMODB", self.dynamodb.aws_region_dynamodb);
213+
insert_opt!(opts, "AWS_ENDPOINT_URL_DYNAMODB", self.dynamodb.aws_endpoint_url_dynamodb);
213214
}
214215
opts
215216
}
@@ -247,6 +248,8 @@ pub struct BufferConfig {
247248
pub timefusion_wal_corruption_threshold: usize,
248249
#[serde(default = "d_flush_parallelism")]
249250
pub timefusion_flush_parallelism: usize,
251+
#[serde(default)]
252+
pub timefusion_flush_immediately: bool,
250253
}
251254

252255
impl BufferConfig {
@@ -268,6 +271,9 @@ impl BufferConfig {
268271
pub fn flush_parallelism(&self) -> usize {
269272
self.timefusion_flush_parallelism.max(1)
270273
}
274+
pub fn flush_immediately(&self) -> bool {
275+
self.timefusion_flush_immediately
276+
}
271277

272278
pub fn compute_shutdown_timeout(&self, current_memory_mb: usize) -> Duration {
273279
Duration::from_secs((self.timefusion_shutdown_timeout_secs.max(1) + (current_memory_mb / 100) as u64).min(300))

0 commit comments

Comments
 (0)