From eeebe58c4af4bc032755a5ebf6b61c922883aa20 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 24 Sep 2025 08:17:58 +0800 Subject: [PATCH 01/46] RowGroupWriter --- src/query/service/src/spillers/serialize.rs | 47 +++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/query/service/src/spillers/serialize.rs b/src/query/service/src/spillers/serialize.rs index 7edf72ab90d2d..928e25ce1ce88 100644 --- a/src/query/service/src/spillers/serialize.rs +++ b/src/query/service/src/spillers/serialize.rs @@ -35,6 +35,9 @@ use databend_common_expression::DataSchema; use databend_common_expression::Value; use opendal::Buffer; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; +use parquet::arrow::arrow_writer::compute_leaves; +use parquet::arrow::arrow_writer::get_column_writers; +use parquet::arrow::arrow_writer::ArrowColumnWriter; use parquet::arrow::ArrowWriter; use parquet::basic::Compression; use parquet::errors; @@ -42,7 +45,9 @@ use parquet::file::properties::EnabledStatistics; use parquet::file::properties::WriterProperties; use parquet::file::reader::ChunkReader; use parquet::file::reader::Length; +use parquet::file::writer::SerializedRowGroupWriter; use parquet::format::FileMetaData; +use parquet::schema::types::SchemaDescriptor; #[derive(Debug, Clone)] pub enum Layout { @@ -236,6 +241,48 @@ impl ChunkReader for Reader { } } +pub struct RowGroupWriter { + schema: Arc, + props: Arc, + writers: Vec, +} + +impl RowGroupWriter { + fn new(props: Arc, arrow: Arc, parquet: &SchemaDescriptor) -> Self { + let col_writers = get_column_writers(parquet, &props, &arrow).unwrap(); + Self { + schema: arrow, + props, + writers: col_writers, + } + } + + pub fn write(&mut self, block: DataBlock) -> errors::Result<()> { + let mut writer_iter = self.writers.iter_mut(); + for (field, entry) in self.schema.fields().iter().zip(block.take_columns()) { + let array = (&entry.to_column()).into(); + for col in compute_leaves(field, &array).unwrap() { + 
writer_iter.next().unwrap().write(&col)?; + } + } + Ok(()) + } + + pub fn close( + self, + writer: &mut SerializedRowGroupWriter<'_, W>, + ) -> errors::Result<()> { + for w in self.writers { + w.close()?.append_to_row_group(writer)? + } + Ok(()) + } + + pub fn memory_size(&self) -> usize { + self.writers.iter().map(|w| w.memory_size()).sum() + } +} + #[cfg(test)] mod tests { use bytes::Bytes; From 2ee598a53355add04bba3a2d580e342799ba5fdf Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 24 Sep 2025 09:30:01 +0800 Subject: [PATCH 02/46] FileWriter --- src/query/service/src/spillers/serialize.rs | 59 ++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/src/query/service/src/spillers/serialize.rs b/src/query/service/src/spillers/serialize.rs index 928e25ce1ce88..825bca266b447 100644 --- a/src/query/service/src/spillers/serialize.rs +++ b/src/query/service/src/spillers/serialize.rs @@ -32,19 +32,24 @@ use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataField; use databend_common_expression::DataSchema; +use databend_common_expression::TableSchema; use databend_common_expression::Value; use opendal::Buffer; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; use parquet::arrow::arrow_writer::compute_leaves; use parquet::arrow::arrow_writer::get_column_writers; use parquet::arrow::arrow_writer::ArrowColumnWriter; +use parquet::arrow::ArrowSchemaConverter; use parquet::arrow::ArrowWriter; use parquet::basic::Compression; use parquet::errors; +use parquet::file::metadata::RowGroupMetaDataPtr; use parquet::file::properties::EnabledStatistics; use parquet::file::properties::WriterProperties; +use parquet::file::properties::WriterPropertiesPtr; use parquet::file::reader::ChunkReader; use parquet::file::reader::Length; +use parquet::file::writer::SerializedFileWriter; use parquet::file::writer::SerializedRowGroupWriter; use parquet::format::FileMetaData; use 
parquet::schema::types::SchemaDescriptor; @@ -243,16 +248,14 @@ impl ChunkReader for Reader { pub struct RowGroupWriter { schema: Arc, - props: Arc, writers: Vec, } impl RowGroupWriter { - fn new(props: Arc, arrow: Arc, parquet: &SchemaDescriptor) -> Self { - let col_writers = get_column_writers(parquet, &props, &arrow).unwrap(); + fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: &SchemaDescriptor) -> Self { + let col_writers = get_column_writers(parquet, props, &schema).unwrap(); Self { - schema: arrow, - props, + schema, writers: col_writers, } } @@ -268,7 +271,7 @@ impl RowGroupWriter { Ok(()) } - pub fn close( + fn close( self, writer: &mut SerializedRowGroupWriter<'_, W>, ) -> errors::Result<()> { @@ -283,6 +286,50 @@ impl RowGroupWriter { } } +pub struct FileWriter { + props: Arc, + schema: Arc, + writer: SerializedFileWriter, +} + +impl FileWriter { + pub fn new( + props: Arc, + table_schema: &TableSchema, + w: W, + ) -> errors::Result { + let schema = Arc::new(Schema::from(table_schema)); + + let parquet = ArrowSchemaConverter::new() + .with_coerce_types(props.coerce_types()) + .convert(&schema)?; + + let writer = SerializedFileWriter::new(w, parquet.root_schema_ptr(), props.clone())?; + Ok(Self { + props, + schema, + writer, + }) + } + + pub fn new_row_group(&self) -> RowGroupWriter { + RowGroupWriter::new(&self.props, self.schema.clone(), self.writer.schema_descr()) + } + + pub fn flush_row_group( + &mut self, + row_group: RowGroupWriter, + ) -> errors::Result { + let mut row_group_writer = self.writer.next_row_group()?; + row_group.close(&mut row_group_writer)?; + row_group_writer.close() + } + + pub fn close(self) -> errors::Result { + self.writer.close() + } +} + #[cfg(test)] mod tests { use bytes::Bytes; From 155ca77545bb4c06160597f55b69ca6c7e326b96 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 24 Sep 2025 15:23:24 +0800 Subject: [PATCH 03/46] grow_size --- src/common/base/src/base/dma.rs | 54 +++++++++++++++---- 
src/query/service/src/spillers/inner.rs | 7 ++- src/query/service/src/spillers/serialize.rs | 30 +++++------ .../storages/common/cache/src/temp_dir.rs | 25 ++++++++- 4 files changed, 85 insertions(+), 31 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index a45c2e8cadd1d..8d2d55fb62282 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -347,7 +347,7 @@ where pub struct DmaWriteBuf { allocator: DmaAllocator, - data: Vec>, + data: Vec, chunk: usize, } @@ -360,6 +360,10 @@ impl DmaWriteBuf { } } + pub fn chunk(&self) -> usize { + self.chunk + } + pub fn size(&self) -> usize { if self.data.is_empty() { return 0; @@ -407,6 +411,42 @@ impl DmaWriteBuf { pub fn into_data(self) -> Vec { self.data } + + pub fn write_last<'a>(&mut self, buf: &'a [u8]) -> &'a [u8] { + let Some(dst) = self.data.last_mut() else { + return buf; + }; + if dst.len() == dst.capacity() { + return buf; + } + + let remain = dst.capacity() - dst.len(); + Self::full_buffer(buf, dst, remain) + } + + fn full_buffer<'a>(buf: &'a [u8], dst: &mut DmaBuffer, remain: usize) -> &'a [u8] { + if buf.len() <= remain { + dst.extend_from_slice(buf); + &buf[buf.len()..] + } else { + let (left, right) = buf.split_at(remain); + dst.extend_from_slice(left); + right + } + } + + pub fn need_alloc(&self) -> bool { + self.data + .last() + .map(|dst| dst.len() == dst.capacity()) + .unwrap_or(true) + } + + pub fn alloc_buffer(&mut self) { + debug_assert!(self.data.iter().all(|buf| buf.len() == self.chunk)); + self.data + .push(Vec::with_capacity_in(self.chunk, self.allocator)); + } } impl Write for DmaWriteBuf { @@ -419,20 +459,12 @@ impl Write for DmaWriteBuf { (dst, remain) } _ => { - self.data - .push(Vec::with_capacity_in(self.chunk, self.allocator)); + self.alloc_buffer(); (self.data.last_mut().unwrap(), self.chunk) } }; - if buf.len() <= remain { - dst.extend_from_slice(buf); - buf = &buf[buf.len()..] 
- } else { - let (left, right) = buf.split_at(remain); - dst.extend_from_slice(left); - buf = right - } + buf = Self::full_buffer(buf, dst, remain); } Ok(n) } diff --git a/src/query/service/src/spillers/inner.rs b/src/query/service/src/spillers/inner.rs index 2edd9e967b912..65915dce2d8cf 100644 --- a/src/query/service/src/spillers/inner.rs +++ b/src/query/service/src/spillers/inner.rs @@ -224,12 +224,17 @@ impl SpillerInner { deserialize_block(columns_layout, data) } - pub(super) async fn write_encodes(&self, size: usize, buf: DmaWriteBuf) -> Result { + pub(super) fn new_location(&self, size: usize) -> Result { let location = match &self.temp_dir { None => None, Some(disk) => disk.new_file_with_size(size)?.map(Location::Local), } .unwrap_or(Location::Remote(self.create_unique_location())); + Ok(location) + } + + pub(super) async fn write_encodes(&self, size: usize, buf: DmaWriteBuf) -> Result { + let location = self.new_location(size)?; let mut writer = match (&location, &self.local_operator) { (Location::Local(path), None) => { diff --git a/src/query/service/src/spillers/serialize.rs b/src/query/service/src/spillers/serialize.rs index 825bca266b447..42aa2a25a6024 100644 --- a/src/query/service/src/spillers/serialize.rs +++ b/src/query/service/src/spillers/serialize.rs @@ -253,14 +253,11 @@ pub struct RowGroupWriter { impl RowGroupWriter { fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: &SchemaDescriptor) -> Self { - let col_writers = get_column_writers(parquet, props, &schema).unwrap(); - Self { - schema, - writers: col_writers, - } + let writers = get_column_writers(parquet, props, &schema).unwrap(); + Self { schema, writers } } - pub fn write(&mut self, block: DataBlock) -> errors::Result<()> { + pub(super) fn write(&mut self, block: DataBlock) -> errors::Result<()> { let mut writer_iter = self.writers.iter_mut(); for (field, entry) in self.schema.fields().iter().zip(block.take_columns()) { let array = (&entry.to_column()).into(); @@ -287,13 +284,12 @@ 
impl RowGroupWriter { } pub struct FileWriter { - props: Arc, schema: Arc, writer: SerializedFileWriter, } impl FileWriter { - pub fn new( + pub(super) fn new( props: Arc, table_schema: &TableSchema, w: W, @@ -305,18 +301,18 @@ impl FileWriter { .convert(&schema)?; let writer = SerializedFileWriter::new(w, parquet.root_schema_ptr(), props.clone())?; - Ok(Self { - props, - schema, - writer, - }) + Ok(Self { schema, writer }) } - pub fn new_row_group(&self) -> RowGroupWriter { - RowGroupWriter::new(&self.props, self.schema.clone(), self.writer.schema_descr()) + pub(super) fn new_row_group(&self) -> RowGroupWriter { + RowGroupWriter::new( + self.writer.properties(), + self.schema.clone(), + self.writer.schema_descr(), + ) } - pub fn flush_row_group( + pub(super) fn flush_row_group( &mut self, row_group: RowGroupWriter, ) -> errors::Result { @@ -325,7 +321,7 @@ impl FileWriter { row_group_writer.close() } - pub fn close(self) -> errors::Result { + pub(super) fn close(self) -> errors::Result { self.writer.close() } } diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index c345595e099ba..1c7cb714080e4 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -215,8 +215,7 @@ impl TempDir { pub fn new_file_with_size(&self, size: usize) -> Result> { let path = self.path.join(GlobalUniqName::unique()).into_boxed_path(); - if self.dir_info.limit < *self.dir_info.size.lock().unwrap() + size - || self.manager.global_limit < self.manager.group.lock().unwrap().size() + size + if self.manager.global_limit < self.manager.group.lock().unwrap().size() + size || self.manager.insufficient_disk(size as u64)? 
{ return Ok(None); @@ -242,6 +241,28 @@ impl TempDir { })))) } + pub fn grow_size(&self, path: &mut TempPath, grow: usize) -> Result { + let Some(path) = Arc::get_mut(&mut path.0) else { + return Err(ErrorCode::Internal("can't set size after share")); + }; + + if self.manager.global_limit < self.manager.group.lock().unwrap().size() + grow + || self.manager.insufficient_disk(grow as u64)? + { + return Ok(false); + } + + let mut dir_size = self.dir_info.size.lock().unwrap(); + if self.dir_info.limit < *dir_size + grow { + return Ok(false); + } + + *dir_size += grow; + path.size += grow; + + Ok(true) + } + fn init_dir(&self) -> Result<()> { let mut rt = Ok(()); self.dir_info.inited.call_once(|| { From e6a850608f344e331a0c4751b9b9dd1d4bebd81d Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 24 Sep 2025 16:07:25 +0800 Subject: [PATCH 04/46] x --- src/common/base/src/base/dma.rs | 30 +++++----- src/query/service/src/spillers/adapter.rs | 32 +++++++++- .../service/src/spillers/async_buffer.rs | 59 +++++++++++++++++-- .../storages/common/cache/src/temp_dir.rs | 18 ++++-- 4 files changed, 112 insertions(+), 27 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 8d2d55fb62282..3f56bbacaa09f 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -194,13 +194,13 @@ pub fn dma_buffer_to_bytes(buf: DmaBuffer) -> Bytes { /// A `DmaFile` is similar to a `File`, but it is opened with the `O_DIRECT` file in order to /// perform direct IO. -struct DmaFile { +struct AsyncDmaFile { fd: File, alignment: Alignment, buf: Option, } -impl DmaFile { +impl AsyncDmaFile { async fn open_raw(path: impl AsRef, #[allow(unused)] dio: bool) -> io::Result { #[allow(unused_mut)] let mut flags = 0; @@ -234,14 +234,14 @@ impl DmaFile { } /// Attempts to open a file in read-only mode. 
- async fn open(path: impl AsRef, dio: bool) -> io::Result { - let file = DmaFile::open_raw(path, dio).await?; + async fn open(path: impl AsRef, dio: bool) -> io::Result { + let file = AsyncDmaFile::open_raw(path, dio).await?; open_dma(file).await } /// Opens a file in write-only mode. - async fn create(path: impl AsRef, dio: bool) -> io::Result { - let file = DmaFile::create_raw(path, dio).await?; + async fn create(path: impl AsRef, dio: bool) -> io::Result { + let file = AsyncDmaFile::create_raw(path, dio).await?; open_dma(file).await } @@ -314,11 +314,11 @@ impl DmaFile { } } -async fn open_dma(file: File) -> io::Result { +async fn open_dma(file: File) -> io::Result { let stat = fstatvfs(&file).await?; let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); - Ok(DmaFile { + Ok(AsyncDmaFile { fd: file, alignment, buf: None, @@ -373,8 +373,8 @@ impl DmaWriteBuf { } pub async fn into_file(mut self, path: impl AsRef, dio: bool) -> io::Result { - let mut file = DmaFile { - fd: DmaFile::create_raw(path, dio).await?, + let mut file = AsyncDmaFile { + fd: AsyncDmaFile::create_raw(path, dio).await?, alignment: self.allocator.0, buf: None, }; @@ -478,7 +478,7 @@ pub async fn dma_write_file_vectored<'a>( path: impl AsRef, bufs: &'a [IoSlice<'a>], ) -> io::Result { - let mut file = DmaFile::create(path.as_ref(), true).await?; + let mut file = AsyncDmaFile::create(path.as_ref(), true).await?; let file_length = bufs.iter().map(|buf| buf.len()).sum(); if file_length == 0 { @@ -536,7 +536,7 @@ pub async fn dma_read_file( mut writer: impl io::Write, ) -> io::Result { const BUFFER_SIZE: usize = 1024 * 1024; - let mut file = DmaFile::open(path.as_ref(), true).await?; + let mut file = AsyncDmaFile::open(path.as_ref(), true).await?; let buf = Vec::with_capacity_in( file.align_up(BUFFER_SIZE), DmaAllocator::new(file.alignment), @@ -571,7 +571,7 @@ pub async fn dma_read_file_range( path: impl AsRef, range: Range, ) -> io::Result<(DmaBuffer, Range)> { - let mut file = 
DmaFile::open(path.as_ref(), true).await?; + let mut file = AsyncDmaFile::open(path.as_ref(), true).await?; let align_start = file.align_down(range.start as usize); let align_end = file.align_up(range.end as usize); @@ -661,7 +661,7 @@ mod tests { assert_eq!(length, want.len()); assert_eq!(got, want); - let file = DmaFile::open(filename, dio).await?; + let file = AsyncDmaFile::open(filename, dio).await?; let align = file.alignment; drop(file); @@ -731,7 +731,7 @@ mod tests { let bufs = vec![IoSlice::new(&want)]; dma_write_file_vectored(filename, &bufs).await.unwrap(); - let mut file = DmaFile::open(filename, true).await.unwrap(); + let mut file = AsyncDmaFile::open(filename, true).await.unwrap(); let buf = Vec::with_capacity_in(file_size, DmaAllocator::new(file.alignment)); file.set_buffer(buf); diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index e70043766abef..078b1f738e2ac 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -31,6 +31,8 @@ use databend_common_pipeline_transforms::traits::DataBlockSpill; use databend_storages_common_cache::TempPath; use opendal::Buffer; use opendal::Operator; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::format::FileMetaData; use super::inner::*; use super::serialize::*; @@ -95,8 +97,8 @@ impl Spiller { #[async_backtrace::framed] /// Read spilled data with partition id - pub async fn read_spilled_partition(&mut self, procedure_id: &usize) -> Result> { - if let Some(locs) = self.adapter.partition_location.get(procedure_id) { + pub async fn read_spilled_partition(&mut self, partition_id: &usize) -> Result> { + if let Some(locs) = self.adapter.partition_location.get(partition_id) { let mut spilled_data = Vec::with_capacity(locs.len()); for (loc, _data_size, _blocks_num) in locs.iter() { let block = self.read_spilled_file(loc).await?; @@ -354,6 +356,32 @@ pub struct Chunk { pub layout: Layout, } +pub struct 
SpillWriter { + file: FileWriter>, +} + +impl SpillWriter { + pub fn spill(&mut self, blocks: Vec) -> Result { + let mut row_group = self.file.new_row_group(); + for block in blocks { + row_group.write(block)?; + } + Ok(self.file.flush_row_group(row_group)?) + } + + pub fn close(self) -> Result { + Ok(self.file.close()?) + } +} + +pub struct SpillReader {} + +impl SpillReader { + pub fn restore(&self, _ordinal: i16) { + todo!() + } +} + impl SpillAdapter for Arc { fn add_spill_file(&self, location: Location, layout: Layout, size: usize) { self.as_ref().add_spill_file(location, layout, size); diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index dc8f8524f33c3..2e19715a42dad 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -13,7 +13,7 @@ // limitations under the License. use std::collections::VecDeque; -use std::io::Write; +use std::io; use std::sync::Arc; use std::sync::Condvar; use std::sync::Mutex; @@ -21,11 +21,16 @@ use std::sync::PoisonError; use bytes::Bytes; use bytes::BytesMut; +use databend_common_base::base::DmaWriteBuf; use databend_common_base::runtime::Runtime; use databend_common_base::runtime::TrySpawn; +use databend_storages_common_cache::TempDir; +use databend_storages_common_cache::TempPath; use opendal::Metadata; use opendal::Writer; +use super::Location; + const CHUNK_SIZE: usize = 4 * 1024 * 1024; /// Buffer Pool Workflow for Spill Operations: @@ -119,6 +124,7 @@ impl BufferPool { available_write_buffers_tx: buffers_tx, }) } + pub fn try_alloc_buffer(&self) -> Option { self.available_write_buffers.try_recv().ok() } @@ -264,8 +270,8 @@ impl BufferWriter { } } -impl std::io::Write for BufferWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { +impl io::Write for BufferWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { if buf.is_empty() { return Ok(0); } @@ -315,7 +321,7 @@ impl std::io::Write for 
BufferWriter { Ok(written) } - fn flush(&mut self) -> std::io::Result<()> { + fn flush(&mut self) -> io::Result<()> { if matches!(&self.current_bytes, Some(current_bytes) if !current_bytes.is_empty()) { if let Some(current_bytes) = self.current_bytes.take() { self.pending_buffers.push_back(current_bytes.freeze()); @@ -467,6 +473,51 @@ impl Background { } } +struct LocalDst { + dir: Arc, + path: TempPath, + buf: Option, +} + +struct RemoteDst { + path: String, + buf: BufferWriter, +} + +pub struct XXX { + local: Option, + remote: Option, +} + +impl io::Write for XXX { + fn write(&mut self, mut buf: &[u8]) -> io::Result { + match &mut self.local { + Some(LocalDst { + dir, + path, + buf: Some(dma), + }) => { + if dma.need_alloc() { + if dir.grow_size(path, dma.chunk())? { + dma.alloc_buffer(); + buf = dma.write_last(buf); + } + } else { + buf = dma.write_last(buf); + } + } + _ => todo!(), + } + + // self.local.unwrap().set_size(size) + todo!() + } + + fn flush(&mut self) -> io::Result<()> { + todo!() + } +} + #[cfg(test)] mod tests { use std::io::Write; diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index 1c7cb714080e4..2c098e81bd05b 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::fmt::Debug; use std::fs; use std::hash::Hash; +use std::io; use std::io::ErrorKind; use std::ops::Deref; use std::ops::Drop; @@ -182,9 +183,8 @@ impl TempDirManager { self.alignment } - fn insufficient_disk(&self, size: u64) -> Result { - let stat = statvfs(self.root.as_ref().unwrap().as_ref()) - .map_err(|e| ErrorCode::Internal(e.to_string()))?; + fn insufficient_disk(&self, size: u64) -> io::Result { + let stat = statvfs(self.root.as_ref().unwrap().as_ref()); debug_assert_eq!(stat.f_frsize, self.alignment.as_usize() as u64); let n = self.alignment.align_up_count(size as usize) as u64; @@ -216,7 +216,10 
@@ impl TempDir { let path = self.path.join(GlobalUniqName::unique()).into_boxed_path(); if self.manager.global_limit < self.manager.group.lock().unwrap().size() + size - || self.manager.insufficient_disk(size as u64)? + || self + .manager + .insufficient_disk(size as u64) + .map_err(|e| ErrorCode::Internal(format!("insufficient_disk fail {e}")))? { return Ok(None); } @@ -241,9 +244,12 @@ impl TempDir { })))) } - pub fn grow_size(&self, path: &mut TempPath, grow: usize) -> Result { + pub fn grow_size(&self, path: &mut TempPath, grow: usize) -> io::Result { let Some(path) = Arc::get_mut(&mut path.0) else { - return Err(ErrorCode::Internal("can't set size after share")); + return Err(io::const_error!( + io::ErrorKind::InvalidInput, + "can't set size after share" + )); }; if self.manager.global_limit < self.manager.group.lock().unwrap().size() + grow From c06050ac0828cbf418b8b192e4277cd299dad0e0 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Sep 2025 09:05:38 +0800 Subject: [PATCH 05/46] x --- src/common/base/src/base/dma.rs | 105 ++++++++++++++++---------------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 3f56bbacaa09f..b0b7d1134ea85 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -22,6 +22,7 @@ use std::io::IoSlice; use std::io::SeekFrom; use std::io::Write; use std::ops::Range; +use std::os::fd::AsFd; use std::os::fd::BorrowedFd; use std::os::unix::io::AsRawFd; use std::path::Path; @@ -30,7 +31,7 @@ use std::ptr::NonNull; use bytes::Bytes; use rustix::fs::OFlags; -use tokio::fs::File; +use tokio::fs::File as AsyncFile; use tokio::io::AsyncSeekExt; use crate::runtime::spawn_blocking; @@ -192,59 +193,15 @@ pub fn dma_buffer_to_bytes(buf: DmaBuffer) -> Bytes { Bytes::from(data) } -/// A `DmaFile` is similar to a `File`, but it is opened with the `O_DIRECT` file in order to +/// A `AsyncDmaFile` is similar to a `File`, but it is 
opened with the `O_DIRECT` file in order to /// perform direct IO. -struct AsyncDmaFile { - fd: File, +struct DmaFile { + fd: F, alignment: Alignment, buf: Option, } -impl AsyncDmaFile { - async fn open_raw(path: impl AsRef, #[allow(unused)] dio: bool) -> io::Result { - #[allow(unused_mut)] - let mut flags = 0; - #[cfg(target_os = "linux")] - if dio { - flags = OFlags::DIRECT.bits() as i32 - } - - File::options() - .read(true) - .custom_flags(flags) - .open(path) - .await - } - - async fn create_raw(path: impl AsRef, #[allow(unused)] dio: bool) -> io::Result { - #[allow(unused_mut)] - let mut flags = OFlags::EXCL; - #[cfg(target_os = "linux")] - if dio { - flags |= OFlags::DIRECT; - } - - File::options() - .write(true) - .create(true) - .truncate(true) - .custom_flags(flags.bits() as i32) - .open(path) - .await - } - - /// Attempts to open a file in read-only mode. - async fn open(path: impl AsRef, dio: bool) -> io::Result { - let file = AsyncDmaFile::open_raw(path, dio).await?; - open_dma(file).await - } - - /// Opens a file in write-only mode. 
- async fn create(path: impl AsRef, dio: bool) -> io::Result { - let file = AsyncDmaFile::create_raw(path, dio).await?; - open_dma(file).await - } - +impl DmaFile { fn set_buffer(&mut self, buf: DmaBuffer) { self.buf = Some(buf) } @@ -308,13 +265,59 @@ impl AsyncDmaFile { fn truncate(&self, length: usize) -> io::Result<()> { rustix::fs::ftruncate(&self.fd, length as u64).map_err(|e| e.into()) } +} + +type AsyncDmaFile = DmaFile; + +impl AsyncDmaFile { + async fn open_raw(path: impl AsRef, dio: bool) -> io::Result { + let flags = if cfg!(target_os = "linux") && dio { + OFlags::DIRECT.bits() as i32 + } else { + 0 + }; + + AsyncFile::options() + .read(true) + .custom_flags(flags) + .open(path) + .await + } + + async fn create_raw(path: impl AsRef, dio: bool) -> io::Result { + let flags = if cfg!(target_os = "linux") && dio { + OFlags::EXCL | OFlags::DIRECT + } else { + OFlags::EXCL + }; + + AsyncFile::options() + .write(true) + .create(true) + .truncate(true) + .custom_flags(flags.bits() as i32) + .open(path) + .await + } + + /// Attempts to open a file in read-only mode. + async fn open(path: impl AsRef, dio: bool) -> io::Result { + let file = AsyncDmaFile::open_raw(path, dio).await?; + open_dma(file).await + } + + /// Opens a file in write-only mode. 
+ async fn create(path: impl AsRef, dio: bool) -> io::Result { + let file = AsyncDmaFile::create_raw(path, dio).await?; + open_dma(file).await + } async fn seek(&mut self, pos: SeekFrom) -> io::Result { self.fd.seek(pos).await } } -async fn open_dma(file: File) -> io::Result { +async fn open_dma(file: AsyncFile) -> io::Result { let stat = fstatvfs(&file).await?; let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); @@ -325,7 +328,7 @@ async fn open_dma(file: File) -> io::Result { }) } -async fn fstatvfs(file: &File) -> io::Result { +async fn fstatvfs(file: &AsyncFile) -> io::Result { let fd = file.as_raw_fd(); asyncify(move || { let fd = unsafe { BorrowedFd::borrow_raw(fd) }; From 4d64ca0be56f580b063ac72bca612dbc6312c9fc Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Sep 2025 13:35:51 +0800 Subject: [PATCH 06/46] x --- src/common/base/src/base/dma.rs | 222 ++++++++++++++---- src/common/base/src/base/mod.rs | 8 +- .../service/src/spillers/async_buffer.rs | 40 ++-- src/query/storages/common/cache/src/lib.rs | 1 + .../storages/common/cache/src/temp_dir.rs | 25 +- 5 files changed, 223 insertions(+), 73 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index b0b7d1134ea85..741a37640e850 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -20,10 +20,10 @@ use std::fmt; use std::io; use std::io::IoSlice; use std::io::SeekFrom; -use std::io::Write; use std::ops::Range; use std::os::fd::AsFd; use std::os::fd::BorrowedFd; +use std::os::fd::OwnedFd; use std::os::unix::io::AsRawFd; use std::path::Path; use std::ptr; @@ -195,7 +195,7 @@ pub fn dma_buffer_to_bytes(buf: DmaBuffer) -> Bytes { /// A `AsyncDmaFile` is similar to a `File`, but it is opened with the `O_DIRECT` file in order to /// perform direct IO. -struct DmaFile { +pub struct DmaFile { fd: F, alignment: Alignment, buf: Option, @@ -218,7 +218,6 @@ impl DmaFile { /// Return the alignment requirement for this file. 
The returned alignment value can be used /// to allocate a buffer to use with this file: - #[expect(dead_code)] pub fn alignment(&self) -> Alignment { self.alignment } @@ -265,12 +264,16 @@ impl DmaFile { fn truncate(&self, length: usize) -> io::Result<()> { rustix::fs::ftruncate(&self.fd, length as u64).map_err(|e| e.into()) } + + fn size(&self) -> io::Result { + Ok(rustix::fs::fstat(&self.fd)?.st_size as _) + } } type AsyncDmaFile = DmaFile; impl AsyncDmaFile { - async fn open_raw(path: impl AsRef, dio: bool) -> io::Result { + async fn open_fd(path: impl AsRef, dio: bool) -> io::Result { let flags = if cfg!(target_os = "linux") && dio { OFlags::DIRECT.bits() as i32 } else { @@ -284,7 +287,7 @@ impl AsyncDmaFile { .await } - async fn create_raw(path: impl AsRef, dio: bool) -> io::Result { + async fn create_fd(path: impl AsRef, dio: bool) -> io::Result { let flags = if cfg!(target_os = "linux") && dio { OFlags::EXCL | OFlags::DIRECT } else { @@ -302,14 +305,29 @@ impl AsyncDmaFile { /// Attempts to open a file in read-only mode. async fn open(path: impl AsRef, dio: bool) -> io::Result { - let file = AsyncDmaFile::open_raw(path, dio).await?; - open_dma(file).await + let file = AsyncDmaFile::open_fd(path, dio).await?; + Self::open_dma(file).await } /// Opens a file in write-only mode. 
async fn create(path: impl AsRef, dio: bool) -> io::Result { - let file = AsyncDmaFile::create_raw(path, dio).await?; - open_dma(file).await + let file = AsyncDmaFile::create_fd(path, dio).await?; + Self::open_dma(file).await + } + + async fn open_dma(file: AsyncFile) -> io::Result { + let fd = file.as_raw_fd(); + let stat = asyncify(move || { + rustix::fs::fstatvfs(unsafe { BorrowedFd::borrow_raw(fd) }).map_err(|e| e.into()) + }) + .await?; + let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); + + Ok(AsyncDmaFile { + fd: file, + alignment, + buf: None, + }) } async fn seek(&mut self, pos: SeekFrom) -> io::Result { @@ -317,24 +335,48 @@ impl AsyncDmaFile { } } -async fn open_dma(file: AsyncFile) -> io::Result { - let stat = fstatvfs(&file).await?; - let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); +pub type SyncDmaFile = DmaFile; - Ok(AsyncDmaFile { - fd: file, - alignment, - buf: None, - }) -} +impl SyncDmaFile { + fn open_fd(path: impl rustix::path::Arg, dio: bool) -> io::Result { + let flags = if cfg!(target_os = "linux") && dio { + OFlags::RDONLY | OFlags::DIRECT + } else { + OFlags::RDONLY + }; + rustix::fs::open(path, flags, rustix::fs::Mode::empty()).map_err(|e| e.into()) + } + + fn create_fd(path: impl rustix::path::Arg, dio: bool) -> io::Result { + let flags = if cfg!(target_os = "linux") && dio { + OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::DIRECT + } else { + OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC + }; + + rustix::fs::open(path, flags, rustix::fs::Mode::empty()).map_err(|e| e.into()) + } -async fn fstatvfs(file: &AsyncFile) -> io::Result { - let fd = file.as_raw_fd(); - asyncify(move || { - let fd = unsafe { BorrowedFd::borrow_raw(fd) }; - rustix::fs::fstatvfs(fd).map_err(|e| e.into()) - }) - .await + fn open_dma(fd: OwnedFd) -> io::Result> { + let stat = rustix::fs::fstatvfs(&fd)?; + let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); + + Ok(Self { + fd, + alignment, + 
buf: None, + }) + } + + pub fn open(path: impl AsRef, dio: bool) -> io::Result { + let fd = Self::open_fd(path.as_ref(), dio)?; + Self::open_dma(fd) + } + + pub fn create(path: impl AsRef, dio: bool) -> io::Result { + let fd = Self::create_fd(path.as_ref(), dio)?; + Self::open_dma(fd) + } } async fn asyncify(f: F) -> io::Result @@ -377,7 +419,7 @@ impl DmaWriteBuf { pub async fn into_file(mut self, path: impl AsRef, dio: bool) -> io::Result { let mut file = AsyncDmaFile { - fd: AsyncDmaFile::create_raw(path, dio).await?, + fd: AsyncDmaFile::create_fd(path, dio).await?, alignment: self.allocator.0, buf: None, }; @@ -415,44 +457,134 @@ impl DmaWriteBuf { self.data } - pub fn write_last<'a>(&mut self, buf: &'a [u8]) -> &'a [u8] { - let Some(dst) = self.data.last_mut() else { - return buf; - }; - if dst.len() == dst.capacity() { - return buf; - } + // fn write_last<'a>(&mut self, buf: &'a [u8]) -> &'a [u8] { + // let Some(dst) = self.data.last_mut() else { + // return buf; + // }; + // if dst.len() == dst.capacity() { + // return buf; + // } - let remain = dst.capacity() - dst.len(); - Self::full_buffer(buf, dst, remain) - } + // let remain = dst.capacity() - dst.len(); + // Self::copy(buf, dst, remain) + // } - fn full_buffer<'a>(buf: &'a [u8], dst: &mut DmaBuffer, remain: usize) -> &'a [u8] { - if buf.len() <= remain { - dst.extend_from_slice(buf); - &buf[buf.len()..] + fn copy<'a>(src: &'a [u8], dst: &mut DmaBuffer, remain: usize) -> &'a [u8] { + if src.len() <= remain { + dst.extend_from_slice(src); + &src[src.len()..] 
} else { - let (left, right) = buf.split_at(remain); + let (left, right) = src.split_at(remain); dst.extend_from_slice(left); right } } - pub fn need_alloc(&self) -> bool { + fn is_last_full(&self) -> bool { self.data .last() .map(|dst| dst.len() == dst.capacity()) .unwrap_or(true) } + pub fn fast_write(&mut self, buf: &[u8]) -> bool { + let Some(dst) = self.data.last_mut() else { + return false; + }; + + if buf.len() > dst.capacity() - dst.len() { + return false; + } + dst.extend_from_slice(buf); + true + } + pub fn alloc_buffer(&mut self) { debug_assert!(self.data.iter().all(|buf| buf.len() == self.chunk)); self.data .push(Vec::with_capacity_in(self.chunk, self.allocator)); } + + pub fn flush_full_buffer(&mut self, file: &mut SyncDmaFile) -> io::Result { + debug_assert_eq!(self.allocator.0, file.alignment); + + if self.size() < self.chunk { + return Ok(0); + } + + let data = if self.is_last_full() { + &self.data + } else { + &self.data[..self.data.len() - 1] + }; + + let len = data.len() * self.chunk; + + let bufs = data.iter().map(|buf| IoSlice::new(buf)).collect::>(); + let writen = rustix::io::writev(&file.fd, &bufs)?; + + let last = self.data.pop(); + self.data.clear(); + match last { + Some(last) if last.len() != last.capacity() => { + self.data.push(last); + } + _ => (), + } + + if writen != len { + Err(io::Error::other("short write")) + } else { + Ok(writen) + } + } + + pub fn flush_and_close(&mut self, mut file: SyncDmaFile) -> io::Result { + debug_assert_eq!(self.allocator.0, file.alignment); + + if self.is_last_full() { + return self.flush_full_buffer(&mut file); + } + + let (diff, to_truncate) = match self.data.last_mut() { + Some(last) if last.is_empty() => { + self.data.pop(); + (0, 0) + } + Some(last) => { + let n = last.len(); + let align_up = file.align_up(n); + if align_up == n { + (self.chunk - n, 0) + } else { + unsafe { last.set_len(align_up) }; + (self.chunk - align_up, align_up - n) + } + } + None => unreachable!(), + }; + let len = 
self.data.len() * self.chunk - diff; + let bufs = self + .data + .iter() + .map(|buf| IoSlice::new(buf)) + .collect::>(); + + let writen = rustix::io::writev(&file.fd, &bufs)?; + if writen != len { + return Err(io::Error::other("short write")); + } + + if to_truncate == 0 { + return Ok(writen); + } + + file.truncate(file.size()? - to_truncate)?; + Ok(writen - to_truncate) + } } -impl Write for DmaWriteBuf { +impl io::Write for DmaWriteBuf { fn write(&mut self, mut buf: &[u8]) -> io::Result { let n = buf.len(); while !buf.is_empty() { @@ -467,7 +599,7 @@ impl Write for DmaWriteBuf { } }; - buf = Self::full_buffer(buf, dst, remain); + buf = Self::copy(buf, dst, remain); } Ok(n) } @@ -614,6 +746,8 @@ pub async fn dma_read_file_range( #[cfg(test)] mod tests { + use std::io::Write; + use super::*; #[test] diff --git a/src/common/base/src/base/mod.rs b/src/common/base/src/base/mod.rs index 4f98ac033fa04..f97911ead9c00 100644 --- a/src/common/base/src/base/mod.rs +++ b/src/common/base/src/base/mod.rs @@ -31,13 +31,7 @@ mod uniq_id; mod watch_notify; pub use build_info::*; -pub use dma::dma_buffer_to_bytes; -pub use dma::dma_read_file; -pub use dma::dma_read_file_range; -pub use dma::dma_write_file_vectored; -pub use dma::Alignment; -pub use dma::DmaAllocator; -pub use dma::DmaWriteBuf; +pub use dma::*; pub use drop_callback::DropCallback; pub use net::get_free_tcp_port; pub use net::get_free_udp_port; diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index 2e19715a42dad..774b19aaf7608 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -14,6 +14,7 @@ use std::collections::VecDeque; use std::io; +use std::io::Write; use std::sync::Arc; use std::sync::Condvar; use std::sync::Mutex; @@ -22,6 +23,7 @@ use std::sync::PoisonError; use bytes::Bytes; use bytes::BytesMut; use databend_common_base::base::DmaWriteBuf; +use 
databend_common_base::base::SyncDmaFile; use databend_common_base::runtime::Runtime; use databend_common_base::runtime::TrySpawn; use databend_storages_common_cache::TempDir; @@ -29,8 +31,6 @@ use databend_storages_common_cache::TempPath; use opendal::Metadata; use opendal::Writer; -use super::Location; - const CHUNK_SIZE: usize = 4 * 1024 * 1024; /// Buffer Pool Workflow for Spill Operations: @@ -476,6 +476,7 @@ impl Background { struct LocalDst { dir: Arc, path: TempPath, + file: Option, buf: Option, } @@ -490,24 +491,33 @@ pub struct XXX { } impl io::Write for XXX { - fn write(&mut self, mut buf: &[u8]) -> io::Result { + fn write(&mut self, buf: &[u8]) -> io::Result { + let n = buf.len(); + // while !buf.is_empty() { match &mut self.local { - Some(LocalDst { - dir, - path, - buf: Some(dma), - }) => { - if dma.need_alloc() { - if dir.grow_size(path, dma.chunk())? { - dma.alloc_buffer(); - buf = dma.write_last(buf); - } - } else { - buf = dma.write_last(buf); + Some( + local @ LocalDst { + file: Some(_), + buf: Some(_), + .. + }, + ) => { + let dma = local.buf.as_mut().unwrap(); + if dma.fast_write(buf) { + return Ok(n); + } + + if local.dir.grow_size(&mut local.path, buf.len(), false)? 
{ + dma.write(buf)?; + dma.flush_full_buffer(local.file.as_mut().unwrap())?; + return Ok(n); } + dma.flush_and_close(local.file.take().unwrap())?; } _ => todo!(), } + //} + // Ok(()) // self.local.unwrap().set_size(size) todo!() diff --git a/src/query/storages/common/cache/src/lib.rs b/src/query/storages/common/cache/src/lib.rs index 979751e1f6229..d978777843232 100644 --- a/src/query/storages/common/cache/src/lib.rs +++ b/src/query/storages/common/cache/src/lib.rs @@ -15,6 +15,7 @@ #![feature(write_all_vectored)] #![feature(associated_type_defaults)] #![feature(assert_matches)] +#![feature(io_const_error)] mod cache; mod cache_items; diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index 2c098e81bd05b..d69dbcbfa5c07 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -183,11 +183,11 @@ impl TempDirManager { self.alignment } - fn insufficient_disk(&self, size: u64) -> io::Result { - let stat = statvfs(self.root.as_ref().unwrap().as_ref()); + fn insufficient_disk(&self, grow: u64) -> io::Result { + let stat = statvfs(self.root.as_ref().unwrap().as_ref())?; debug_assert_eq!(stat.f_frsize, self.alignment.as_usize() as u64); - let n = self.alignment.align_up_count(size as usize) as u64; + let n = self.alignment.align_up_count(grow as usize) as u64; Ok(stat.f_bavail < self.reserved + n) } } @@ -244,7 +244,12 @@ impl TempDir { })))) } - pub fn grow_size(&self, path: &mut TempPath, grow: usize) -> io::Result { + pub fn grow_size( + &self, + path: &mut TempPath, + grow: usize, + check_disk: bool, + ) -> io::Result { let Some(path) = Arc::get_mut(&mut path.0) else { return Err(io::const_error!( io::ErrorKind::InvalidInput, @@ -252,9 +257,11 @@ impl TempDir { )); }; - if self.manager.global_limit < self.manager.group.lock().unwrap().size() + grow - || self.manager.insufficient_disk(grow as u64)? 
- { + if self.manager.global_limit < self.manager.group.lock().unwrap().size() + grow { + return Ok(false); + } + + if check_disk && self.manager.insufficient_disk(grow as u64)? { return Ok(false); } @@ -288,6 +295,10 @@ impl TempDir { pub fn path(&self) -> &Path { &self.path } + + pub fn insufficient_disk(&self, grow: usize) -> io::Result { + self.manager.insufficient_disk(grow as _) + } } struct DirInfo { From 9e3c385fc3fa2e127d21232025c0e3e7616445ed Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Sep 2025 14:55:50 +0800 Subject: [PATCH 07/46] x --- src/common/base/src/base/dma.rs | 2 +- .../service/src/spillers/async_buffer.rs | 33 ++++++++++++------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 741a37640e850..6284da1881d00 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -265,7 +265,7 @@ impl DmaFile { rustix::fs::ftruncate(&self.fd, length as u64).map_err(|e| e.into()) } - fn size(&self) -> io::Result { + pub fn size(&self) -> io::Result { Ok(rustix::fs::fstat(&self.fd)?.st_size as _) } } diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index 774b19aaf7608..03c2c356b36f5 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -493,8 +493,7 @@ pub struct XXX { impl io::Write for XXX { fn write(&mut self, buf: &[u8]) -> io::Result { let n = buf.len(); - // while !buf.is_empty() { - match &mut self.local { + let dma_buf = match &mut self.local { Some( local @ LocalDst { file: Some(_), @@ -509,22 +508,34 @@ impl io::Write for XXX { if local.dir.grow_size(&mut local.path, buf.len(), false)? 
{ dma.write(buf)?; - dma.flush_full_buffer(local.file.as_mut().unwrap())?; + let file = local.file.as_mut().unwrap(); + dma.flush_full_buffer(file)?; + local.path.set_size(file.size()?).unwrap(); return Ok(n); } - dma.flush_and_close(local.file.take().unwrap())?; + + let mut file = local.file.take().unwrap(); + dma.flush_full_buffer(&mut file)?; + local.path.set_size(file.size()?).unwrap(); + drop(file); + local.buf.take().unwrap().into_data() } - _ => todo!(), - } - //} - // Ok(()) + _ => vec![], + }; - // self.local.unwrap().set_size(size) - todo!() + let Some(remote) = &mut self.remote else { + unreachable!() + }; + + for buf in dma_buf { + remote.buf.write(&buf)?; + } + remote.buf.write(buf) } fn flush(&mut self) -> io::Result<()> { - todo!() + // todo close + Ok(()) } } From 110540802960b5c2b6b2225ec62a41ec283ca624 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Sep 2025 16:19:33 +0800 Subject: [PATCH 08/46] x --- src/common/base/src/base/dma.rs | 27 +++--- .../service/src/spillers/async_buffer.rs | 91 +++++++++++++++++-- 2 files changed, 96 insertions(+), 22 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 6284da1881d00..0c4ecc9f14a4b 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -199,6 +199,7 @@ pub struct DmaFile { fd: F, alignment: Alignment, buf: Option, + length: usize, } impl DmaFile { @@ -234,6 +235,7 @@ impl DmaFile { let buf = self.buffer(); match rustix::io::write(&self.fd, buf) { Ok(n) => { + self.length += n; if n != buf.len() { return Err(io::Error::other("short write")); } @@ -268,6 +270,10 @@ impl DmaFile { pub fn size(&self) -> io::Result { Ok(rustix::fs::fstat(&self.fd)?.st_size as _) } + + pub fn len(&self) -> usize { + self.length + } } type AsyncDmaFile = DmaFile; @@ -327,6 +333,7 @@ impl AsyncDmaFile { fd: file, alignment, buf: None, + length: 0, }) } @@ -365,6 +372,7 @@ impl SyncDmaFile { fd, alignment, buf: None, + length: 0, }) } @@ -422,6 
+430,7 @@ impl DmaWriteBuf { fd: AsyncDmaFile::create_fd(path, dio).await?, alignment: self.allocator.0, buf: None, + length: 0, }; let file_length = self.size(); @@ -457,18 +466,6 @@ impl DmaWriteBuf { self.data } - // fn write_last<'a>(&mut self, buf: &'a [u8]) -> &'a [u8] { - // let Some(dst) = self.data.last_mut() else { - // return buf; - // }; - // if dst.len() == dst.capacity() { - // return buf; - // } - - // let remain = dst.capacity() - dst.len(); - // Self::copy(buf, dst, remain) - // } - fn copy<'a>(src: &'a [u8], dst: &mut DmaBuffer, remain: usize) -> &'a [u8] { if src.len() <= remain { dst.extend_from_slice(src); @@ -532,6 +529,8 @@ impl DmaWriteBuf { _ => (), } + file.length += writen; + if writen != len { Err(io::Error::other("short write")) } else { @@ -576,10 +575,12 @@ impl DmaWriteBuf { } if to_truncate == 0 { + file.length += writen; return Ok(writen); } - file.truncate(file.size()? - to_truncate)?; + file.length -= to_truncate; + file.truncate(file.length)?; Ok(writen - to_truncate) } } diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index 03c2c356b36f5..2b51c86a4b26f 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -481,19 +481,65 @@ struct LocalDst { } struct RemoteDst { + offset: usize, path: String, buf: BufferWriter, } -pub struct XXX { +pub struct MixFileWriter { local: Option, remote: Option, } -impl io::Write for XXX { +impl MixFileWriter { + pub fn open() {} + + pub fn finish(self) -> Result { + let local_path = match self.local { + Some( + local @ LocalDst { + file: Some(_), + buf: Some(_), + .. 
+ }, + ) => { + let dma = local.buf.as_mut().unwrap(); + if dma.fast_write(buf) { + return Ok(n); + } + + let mut file = local.file.take().unwrap(); + let file_size = file.len() + dma.size(); + dma.flush_and_close(file)?; + + local.path.set_size(file_size).unwrap(); + + return Ok(MixFile { + local_path: local.path, + remote: None, + remote_offset: 0, + }); + } + Some(LocalDst { path, .. }) => Some(path), + _ => None, + }; + + let Some(remote) = &mut self.remote else { + unreachable!() + }; + + Ok(MixFile { + local_path, + remote: Some(remote.path), + remote_offset: remote.offset, + }) + } +} + +impl io::Write for MixFileWriter { fn write(&mut self, buf: &[u8]) -> io::Result { let n = buf.len(); - let dma_buf = match &mut self.local { + let (dma_buf, offset) = match &mut self.local { Some( local @ LocalDst { file: Some(_), @@ -510,22 +556,28 @@ impl io::Write for XXX { dma.write(buf)?; let file = local.file.as_mut().unwrap(); dma.flush_full_buffer(file)?; - local.path.set_size(file.size()?).unwrap(); + local.path.set_size(file.len()).unwrap(); return Ok(n); } let mut file = local.file.take().unwrap(); dma.flush_full_buffer(&mut file)?; - local.path.set_size(file.size()?).unwrap(); + + let file_size = file.len(); + local.path.set_size(file_size).unwrap(); drop(file); - local.buf.take().unwrap().into_data() + + (local.buf.take().unwrap().into_data(), file_size) } - _ => vec![], + _ => (vec![], 0), }; let Some(remote) = &mut self.remote else { unreachable!() }; + if offset != 0 { + remote.offset = offset; + } for buf in dma_buf { remote.buf.write(&buf)?; @@ -534,11 +586,32 @@ impl io::Write for XXX { } fn flush(&mut self) -> io::Result<()> { - // todo close - Ok(()) + match &mut self.local { + Some(LocalDst { + file: Some(file), + buf: Some(dma), + .. 
+ }) => { + // warning: not completely flushed, data may be lost + dma.flush_full_buffer(file)?; + } + _ => (), + } + + let Some(remote) = &mut self.remote else { + unreachable!() + }; + + remote.buf.flush() } } +pub struct MixFile { + local_path: TempPath, + remote: Option, + remote_offset: usize, +} + #[cfg(test)] mod tests { use std::io::Write; From aeaa9245dd4721cc31c0f58b4edd3f7b5a274fd9 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Sep 2025 18:26:42 +0800 Subject: [PATCH 09/46] x --- src/common/base/src/base/dma.rs | 5 +- .../service/src/spillers/async_buffer.rs | 90 ++++++++----------- 2 files changed, 38 insertions(+), 57 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 0c4ecc9f14a4b..5a42b5f82a238 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -233,10 +233,11 @@ impl DmaFile { fn write_direct(&mut self) -> io::Result { let buf = self.buffer(); + let buf_size = buf.len(); match rustix::io::write(&self.fd, buf) { Ok(n) => { self.length += n; - if n != buf.len() { + if n != buf_size { return Err(io::Error::other("short write")); } self.mut_buffer().clear(); @@ -271,7 +272,7 @@ impl DmaFile { Ok(rustix::fs::fstat(&self.fd)?.st_size as _) } - pub fn len(&self) -> usize { + pub fn length(&self) -> usize { self.length } } diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index 2b51c86a4b26f..2d49a672d1468 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -480,73 +480,60 @@ struct LocalDst { buf: Option, } -struct RemoteDst { - offset: usize, - path: String, - buf: BufferWriter, -} - pub struct MixFileWriter { - local: Option, - remote: Option, + local: LocalDst, + remote: BufferWriter, + remote_offset: usize, } impl MixFileWriter { - pub fn open() {} + pub fn new(local: LocalDst, remote: BufferWriter) -> Self { + MixFileWriter { + local, + 
remote, + remote_offset: 0, + } + } - pub fn finish(self) -> Result { + pub fn finish(self) -> io::Result { let local_path = match self.local { - Some( - local @ LocalDst { - file: Some(_), - buf: Some(_), - .. - }, - ) => { + mut local @ LocalDst { + file: Some(_), + buf: Some(_), + .. + } => { let dma = local.buf.as_mut().unwrap(); - if dma.fast_write(buf) { - return Ok(n); - } - let mut file = local.file.take().unwrap(); - let file_size = file.len() + dma.size(); + let file = local.file.take().unwrap(); + let file_size = file.length() + dma.size(); dma.flush_and_close(file)?; local.path.set_size(file_size).unwrap(); return Ok(MixFile { local_path: local.path, - remote: None, remote_offset: 0, }); } - Some(LocalDst { path, .. }) => Some(path), - _ => None, - }; - - let Some(remote) = &mut self.remote else { - unreachable!() + LocalDst { path, .. } => path, }; Ok(MixFile { local_path, - remote: Some(remote.path), - remote_offset: remote.offset, + remote_offset: self.remote_offset, }) } } impl io::Write for MixFileWriter { fn write(&mut self, buf: &[u8]) -> io::Result { - let n = buf.len(); let (dma_buf, offset) = match &mut self.local { - Some( - local @ LocalDst { - file: Some(_), - buf: Some(_), - .. - }, - ) => { + local @ LocalDst { + file: Some(_), + buf: Some(_), + .. 
+ } => { + let n = buf.len(); let dma = local.buf.as_mut().unwrap(); if dma.fast_write(buf) { return Ok(n); @@ -556,14 +543,14 @@ impl io::Write for MixFileWriter { dma.write(buf)?; let file = local.file.as_mut().unwrap(); dma.flush_full_buffer(file)?; - local.path.set_size(file.len()).unwrap(); + local.path.set_size(file.length()).unwrap(); return Ok(n); } let mut file = local.file.take().unwrap(); dma.flush_full_buffer(&mut file)?; - let file_size = file.len(); + let file_size = file.length(); local.path.set_size(file_size).unwrap(); drop(file); @@ -572,43 +559,36 @@ impl io::Write for MixFileWriter { _ => (vec![], 0), }; - let Some(remote) = &mut self.remote else { - unreachable!() - }; if offset != 0 { - remote.offset = offset; + self.remote_offset = offset; } for buf in dma_buf { - remote.buf.write(&buf)?; + self.remote.write(&buf)?; } - remote.buf.write(buf) + self.remote.write(buf) } fn flush(&mut self) -> io::Result<()> { match &mut self.local { - Some(LocalDst { + LocalDst { file: Some(file), buf: Some(dma), .. 
- }) => { + } => { // warning: not completely flushed, data may be lost dma.flush_full_buffer(file)?; + return Ok(()); } _ => (), } - let Some(remote) = &mut self.remote else { - unreachable!() - }; - - remote.buf.flush() + self.remote.flush() } } pub struct MixFile { local_path: TempPath, - remote: Option, remote_offset: usize, } From fca0282096efb1bbb2c0b91298ad5aa34354837b Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 25 Sep 2025 18:55:10 +0800 Subject: [PATCH 10/46] x --- src/query/service/src/spillers/adapter.rs | 1 + .../service/src/spillers/async_buffer.rs | 123 --------- src/query/service/src/spillers/mod.rs | 1 + src/query/service/src/spillers/serialize.rs | 90 ------- src/query/service/src/spillers/union_file.rs | 249 ++++++++++++++++++ 5 files changed, 251 insertions(+), 213 deletions(-) create mode 100644 src/query/service/src/spillers/union_file.rs diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 078b1f738e2ac..5b03670a676d5 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -36,6 +36,7 @@ use parquet::format::FileMetaData; use super::inner::*; use super::serialize::*; +use super::union_file::FileWriter; use super::Location; use crate::sessions::QueryContext; use crate::spillers::block_reader::BlocksReader; diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index 2d49a672d1468..1b6facbce230c 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -22,12 +22,8 @@ use std::sync::PoisonError; use bytes::Bytes; use bytes::BytesMut; -use databend_common_base::base::DmaWriteBuf; -use databend_common_base::base::SyncDmaFile; use databend_common_base::runtime::Runtime; use databend_common_base::runtime::TrySpawn; -use databend_storages_common_cache::TempDir; -use databend_storages_common_cache::TempPath; use opendal::Metadata; 
use opendal::Writer; @@ -473,125 +469,6 @@ impl Background { } } -struct LocalDst { - dir: Arc, - path: TempPath, - file: Option, - buf: Option, -} - -pub struct MixFileWriter { - local: LocalDst, - remote: BufferWriter, - remote_offset: usize, -} - -impl MixFileWriter { - pub fn new(local: LocalDst, remote: BufferWriter) -> Self { - MixFileWriter { - local, - remote, - remote_offset: 0, - } - } - - pub fn finish(self) -> io::Result { - let local_path = match self.local { - mut local @ LocalDst { - file: Some(_), - buf: Some(_), - .. - } => { - let dma = local.buf.as_mut().unwrap(); - - let file = local.file.take().unwrap(); - let file_size = file.length() + dma.size(); - dma.flush_and_close(file)?; - - local.path.set_size(file_size).unwrap(); - - return Ok(MixFile { - local_path: local.path, - remote_offset: 0, - }); - } - LocalDst { path, .. } => path, - }; - - Ok(MixFile { - local_path, - remote_offset: self.remote_offset, - }) - } -} - -impl io::Write for MixFileWriter { - fn write(&mut self, buf: &[u8]) -> io::Result { - let (dma_buf, offset) = match &mut self.local { - local @ LocalDst { - file: Some(_), - buf: Some(_), - .. - } => { - let n = buf.len(); - let dma = local.buf.as_mut().unwrap(); - if dma.fast_write(buf) { - return Ok(n); - } - - if local.dir.grow_size(&mut local.path, buf.len(), false)? 
{ - dma.write(buf)?; - let file = local.file.as_mut().unwrap(); - dma.flush_full_buffer(file)?; - local.path.set_size(file.length()).unwrap(); - return Ok(n); - } - - let mut file = local.file.take().unwrap(); - dma.flush_full_buffer(&mut file)?; - - let file_size = file.length(); - local.path.set_size(file_size).unwrap(); - drop(file); - - (local.buf.take().unwrap().into_data(), file_size) - } - _ => (vec![], 0), - }; - - if offset != 0 { - self.remote_offset = offset; - } - - for buf in dma_buf { - self.remote.write(&buf)?; - } - self.remote.write(buf) - } - - fn flush(&mut self) -> io::Result<()> { - match &mut self.local { - LocalDst { - file: Some(file), - buf: Some(dma), - .. - } => { - // warning: not completely flushed, data may be lost - dma.flush_full_buffer(file)?; - return Ok(()); - } - _ => (), - } - - self.remote.flush() - } -} - -pub struct MixFile { - local_path: TempPath, - remote_offset: usize, -} - #[cfg(test)] mod tests { use std::io::Write; diff --git a/src/query/service/src/spillers/mod.rs b/src/query/service/src/spillers/mod.rs index 5f624bd9b54d0..bf148c7117bba 100644 --- a/src/query/service/src/spillers/mod.rs +++ b/src/query/service/src/spillers/mod.rs @@ -21,6 +21,7 @@ mod partition_buffer; mod serialize; #[cfg(test)] mod test_memory; +mod union_file; pub use adapter::*; pub use block_writer::*; diff --git a/src/query/service/src/spillers/serialize.rs b/src/query/service/src/spillers/serialize.rs index 42aa2a25a6024..7edf72ab90d2d 100644 --- a/src/query/service/src/spillers/serialize.rs +++ b/src/query/service/src/spillers/serialize.rs @@ -32,27 +32,17 @@ use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataField; use databend_common_expression::DataSchema; -use databend_common_expression::TableSchema; use databend_common_expression::Value; use opendal::Buffer; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; -use parquet::arrow::arrow_writer::compute_leaves; 
-use parquet::arrow::arrow_writer::get_column_writers; -use parquet::arrow::arrow_writer::ArrowColumnWriter; -use parquet::arrow::ArrowSchemaConverter; use parquet::arrow::ArrowWriter; use parquet::basic::Compression; use parquet::errors; -use parquet::file::metadata::RowGroupMetaDataPtr; use parquet::file::properties::EnabledStatistics; use parquet::file::properties::WriterProperties; -use parquet::file::properties::WriterPropertiesPtr; use parquet::file::reader::ChunkReader; use parquet::file::reader::Length; -use parquet::file::writer::SerializedFileWriter; -use parquet::file::writer::SerializedRowGroupWriter; use parquet::format::FileMetaData; -use parquet::schema::types::SchemaDescriptor; #[derive(Debug, Clone)] pub enum Layout { @@ -246,86 +236,6 @@ impl ChunkReader for Reader { } } -pub struct RowGroupWriter { - schema: Arc, - writers: Vec, -} - -impl RowGroupWriter { - fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: &SchemaDescriptor) -> Self { - let writers = get_column_writers(parquet, props, &schema).unwrap(); - Self { schema, writers } - } - - pub(super) fn write(&mut self, block: DataBlock) -> errors::Result<()> { - let mut writer_iter = self.writers.iter_mut(); - for (field, entry) in self.schema.fields().iter().zip(block.take_columns()) { - let array = (&entry.to_column()).into(); - for col in compute_leaves(field, &array).unwrap() { - writer_iter.next().unwrap().write(&col)?; - } - } - Ok(()) - } - - fn close( - self, - writer: &mut SerializedRowGroupWriter<'_, W>, - ) -> errors::Result<()> { - for w in self.writers { - w.close()?.append_to_row_group(writer)? 
- } - Ok(()) - } - - pub fn memory_size(&self) -> usize { - self.writers.iter().map(|w| w.memory_size()).sum() - } -} - -pub struct FileWriter { - schema: Arc, - writer: SerializedFileWriter, -} - -impl FileWriter { - pub(super) fn new( - props: Arc, - table_schema: &TableSchema, - w: W, - ) -> errors::Result { - let schema = Arc::new(Schema::from(table_schema)); - - let parquet = ArrowSchemaConverter::new() - .with_coerce_types(props.coerce_types()) - .convert(&schema)?; - - let writer = SerializedFileWriter::new(w, parquet.root_schema_ptr(), props.clone())?; - Ok(Self { schema, writer }) - } - - pub(super) fn new_row_group(&self) -> RowGroupWriter { - RowGroupWriter::new( - self.writer.properties(), - self.schema.clone(), - self.writer.schema_descr(), - ) - } - - pub(super) fn flush_row_group( - &mut self, - row_group: RowGroupWriter, - ) -> errors::Result { - let mut row_group_writer = self.writer.next_row_group()?; - row_group.close(&mut row_group_writer)?; - row_group_writer.close() - } - - pub(super) fn close(self) -> errors::Result { - self.writer.close() - } -} - #[cfg(test)] mod tests { use bytes::Bytes; diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs new file mode 100644 index 0000000000000..33b573bd6743d --- /dev/null +++ b/src/query/service/src/spillers/union_file.rs @@ -0,0 +1,249 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::io; +use std::io::Write; +use std::sync::Arc; + +use arrow_schema::Schema; +use databend_common_base::base::DmaWriteBuf; +use databend_common_base::base::SyncDmaFile; +use databend_common_expression::DataBlock; +use databend_common_expression::TableSchema; +use databend_storages_common_cache::TempDir; +use databend_storages_common_cache::TempPath; +use parquet::arrow::arrow_writer::compute_leaves; +use parquet::arrow::arrow_writer::get_column_writers; +use parquet::arrow::arrow_writer::ArrowColumnWriter; +use parquet::arrow::ArrowSchemaConverter; +use parquet::errors; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::WriterProperties; +use parquet::file::properties::WriterPropertiesPtr; +use parquet::file::writer::SerializedFileWriter; +use parquet::file::writer::SerializedRowGroupWriter; +use parquet::format::FileMetaData; +use parquet::schema::types::SchemaDescriptor; + +use super::async_buffer::BufferWriter; + +pub struct RowGroupWriter { + schema: Arc, + writers: Vec, +} + +impl RowGroupWriter { + fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: &SchemaDescriptor) -> Self { + let writers = get_column_writers(parquet, props, &schema).unwrap(); + Self { schema, writers } + } + + pub(super) fn write(&mut self, block: DataBlock) -> errors::Result<()> { + let mut writer_iter = self.writers.iter_mut(); + for (field, entry) in self.schema.fields().iter().zip(block.take_columns()) { + let array = (&entry.to_column()).into(); + for col in compute_leaves(field, &array).unwrap() { + writer_iter.next().unwrap().write(&col)?; + } + } + Ok(()) + } + + fn close( + self, + writer: &mut SerializedRowGroupWriter<'_, W>, + ) -> errors::Result<()> { + for w in self.writers { + w.close()?.append_to_row_group(writer)? 
+ } + Ok(()) + } + + pub fn memory_size(&self) -> usize { + self.writers.iter().map(|w| w.memory_size()).sum() + } +} + +pub struct FileWriter { + schema: Arc, + writer: SerializedFileWriter, +} + +impl FileWriter { + pub(super) fn new( + props: Arc, + table_schema: &TableSchema, + w: W, + ) -> errors::Result { + let schema = Arc::new(Schema::from(table_schema)); + + let parquet = ArrowSchemaConverter::new() + .with_coerce_types(props.coerce_types()) + .convert(&schema)?; + + let writer = SerializedFileWriter::new(w, parquet.root_schema_ptr(), props.clone())?; + Ok(Self { schema, writer }) + } + + pub(super) fn new_row_group(&self) -> RowGroupWriter { + RowGroupWriter::new( + self.writer.properties(), + self.schema.clone(), + self.writer.schema_descr(), + ) + } + + pub(super) fn flush_row_group( + &mut self, + row_group: RowGroupWriter, + ) -> errors::Result { + let mut row_group_writer = self.writer.next_row_group()?; + row_group.close(&mut row_group_writer)?; + row_group_writer.close() + } + + pub(super) fn close(self) -> errors::Result { + self.writer.close() + } +} + +struct LocalDst { + dir: Arc, + path: TempPath, + file: Option, + buf: Option, +} + +pub struct MixFileWriter { + local: LocalDst, + remote: BufferWriter, + remote_offset: usize, +} + +impl MixFileWriter { + pub fn new( + dir: Arc, + path: TempPath, + file: SyncDmaFile, + buf: DmaWriteBuf, + remote: BufferWriter, + ) -> Self { + MixFileWriter { + local: LocalDst { + dir, + path, + file: Some(file), + buf: Some(buf), + }, + remote, + remote_offset: 0, + } + } + + pub fn finish(self) -> io::Result { + let local_path = match self.local { + mut local @ LocalDst { + file: Some(_), + buf: Some(_), + .. 
+ } => { + let dma = local.buf.as_mut().unwrap(); + + let file = local.file.take().unwrap(); + let file_size = file.length() + dma.size(); + dma.flush_and_close(file)?; + + local.path.set_size(file_size).unwrap(); + + return Ok(MixFile { + local_path: local.path, + remote_offset: 0, + }); + } + LocalDst { path, .. } => path, + }; + + Ok(MixFile { + local_path, + remote_offset: self.remote_offset, + }) + } +} + +impl io::Write for MixFileWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let (dma_buf, offset) = if let local @ LocalDst { + file: Some(_), + buf: Some(_), + .. + } = &mut self.local + { + let n = buf.len(); + let dma = local.buf.as_mut().unwrap(); + if dma.fast_write(buf) { + return Ok(n); + } + + if local.dir.grow_size(&mut local.path, buf.len(), false)? { + dma.write(buf)?; + let file = local.file.as_mut().unwrap(); + dma.flush_full_buffer(file)?; + local.path.set_size(file.length()).unwrap(); + return Ok(n); + } + + let mut file = local.file.take().unwrap(); + dma.flush_full_buffer(&mut file)?; + + let file_size = file.length(); + local.path.set_size(file_size).unwrap(); + drop(file); + + (local.buf.take().unwrap().into_data(), file_size) + } else { + (vec![], 0) + }; + + if offset != 0 { + self.remote_offset = offset; + } + + for buf in dma_buf { + self.remote.write(&buf)?; + } + self.remote.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + match &mut self.local { + LocalDst { + file: Some(file), + buf: Some(dma), + .. 
+ } => { + // warning: not completely flushed, data may be lost + dma.flush_full_buffer(file)?; + return Ok(()); + } + _ => (), + } + + self.remote.flush() + } +} + +pub struct MixFile { + local_path: TempPath, + remote_offset: usize, +} From 7e7868bd016b246c7379de6075329f5b81cc39bb Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 11:34:17 +0800 Subject: [PATCH 11/46] x --- src/query/service/src/spillers/inner.rs | 2 +- src/query/service/src/spillers/union_file.rs | 159 +++++++++++++++---- 2 files changed, 127 insertions(+), 34 deletions(-) diff --git a/src/query/service/src/spillers/inner.rs b/src/query/service/src/spillers/inner.rs index 65915dce2d8cf..fcb89e3fe7c87 100644 --- a/src/query/service/src/spillers/inner.rs +++ b/src/query/service/src/spillers/inner.rs @@ -104,7 +104,7 @@ pub struct SpillerInner { pub(super) adapter: A, pub(super) operator: Operator, location_prefix: String, - temp_dir: Option>, + pub(super) temp_dir: Option>, // for dio disabled pub(super) local_operator: Option, use_parquet: bool, diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 33b573bd6743d..b8f1cfd372bee 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use arrow_schema::Schema; use databend_common_base::base::DmaWriteBuf; use databend_common_base::base::SyncDmaFile; +use databend_common_base::runtime::Runtime; use databend_common_expression::DataBlock; use databend_common_expression::TableSchema; use databend_storages_common_cache::TempDir; @@ -36,7 +37,10 @@ use parquet::file::writer::SerializedRowGroupWriter; use parquet::format::FileMetaData; use parquet::schema::types::SchemaDescriptor; +use super::async_buffer::BufferPool; use super::async_buffer::BufferWriter; +use super::SpillAdapter; +use super::SpillerInner; pub struct RowGroupWriter { schema: Arc, @@ -112,9 +116,12 @@ impl FileWriter { 
row_group.close(&mut row_group_writer)?; row_group_writer.close() } +} - pub(super) fn close(self) -> errors::Result { - self.writer.close() +impl FileWriter { + pub(super) fn finish(self) -> errors::Result { + let writer = self.writer.into_inner()?; + writer.finish() } } @@ -125,13 +132,13 @@ struct LocalDst { buf: Option, } -pub struct MixFileWriter { - local: LocalDst, +pub struct UnionFileWriter { + local: Option, remote: BufferWriter, remote_offset: usize, } -impl MixFileWriter { +impl UnionFileWriter { pub fn new( dir: Arc, path: TempPath, @@ -139,13 +146,21 @@ impl MixFileWriter { buf: DmaWriteBuf, remote: BufferWriter, ) -> Self { - MixFileWriter { - local: LocalDst { + UnionFileWriter { + local: Some(LocalDst { dir, path, file: Some(file), buf: Some(buf), - }, + }), + remote, + remote_offset: 0, + } + } + + pub fn without_local(remote: BufferWriter) -> Self { + UnionFileWriter { + local: None, remote, remote_offset: 0, } @@ -153,11 +168,13 @@ impl MixFileWriter { pub fn finish(self) -> io::Result { let local_path = match self.local { - mut local @ LocalDst { - file: Some(_), - buf: Some(_), - .. - } => { + Some( + mut local @ LocalDst { + file: Some(_), + buf: Some(_), + .. + }, + ) => { let dma = local.buf.as_mut().unwrap(); let file = local.file.take().unwrap(); @@ -167,11 +184,12 @@ impl MixFileWriter { local.path.set_size(file_size).unwrap(); return Ok(MixFile { - local_path: local.path, + local_path: Some(local.path), remote_offset: 0, }); } - LocalDst { path, .. } => path, + Some(LocalDst { path, .. }) => Some(path), + None => None, }; Ok(MixFile { @@ -181,13 +199,15 @@ impl MixFileWriter { } } -impl io::Write for MixFileWriter { +impl io::Write for UnionFileWriter { fn write(&mut self, buf: &[u8]) -> io::Result { - let (dma_buf, offset) = if let local @ LocalDst { - file: Some(_), - buf: Some(_), - .. - } = &mut self.local + let (dma_buf, offset) = if let Some( + local @ LocalDst { + file: Some(_), + buf: Some(_), + .. 
+ }, + ) = &mut self.local { let n = buf.len(); let dma = local.buf.as_mut().unwrap(); @@ -226,17 +246,15 @@ impl io::Write for MixFileWriter { } fn flush(&mut self) -> io::Result<()> { - match &mut self.local { - LocalDst { - file: Some(file), - buf: Some(dma), - .. - } => { - // warning: not completely flushed, data may be lost - dma.flush_full_buffer(file)?; - return Ok(()); - } - _ => (), + if let Some(LocalDst { + file: Some(file), + buf: Some(dma), + .. + }) = &mut self.local + { + // warning: not completely flushed, data may be lost + dma.flush_full_buffer(file)?; + return Ok(()); } self.remote.flush() @@ -244,6 +262,81 @@ impl io::Write for MixFileWriter { } pub struct MixFile { - local_path: TempPath, + local_path: Option, remote_offset: usize, } + +impl SpillerInner { + async fn new_file_writer( + &self, + schema: &TableSchema, + executor: Arc, + max_buffer: usize, + ) -> databend_common_exception::Result> { + let pool = BufferPool::create(executor, max_buffer, 3); + + let op = self.local_operator.as_ref().unwrap_or(&self.operator); + + let remote_location = self.create_unique_location(); + let remote_writer = op.writer(&remote_location).await?; + let remote = pool.buffer_write(remote_writer); + + let union = if let Some(disk) = &self.temp_dir { + if let Some(path) = disk.new_file_with_size(0)? { + let file = SyncDmaFile::create(&path, true)?; + let align = disk.block_alignment(); + let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); + UnionFileWriter::new(disk.clone(), path, file, buf, remote) + } else { + UnionFileWriter::without_local(remote) + } + } else { + UnionFileWriter::without_local(remote) + }; + + let props = WriterProperties::default().into(); + Ok(FileWriter::new(props, schema, union)?) 
+ } +} + +// #[cfg(test)] +// mod tests { +// use databend_common_exception::Result; +// use opendal::Builder; +// use opendal::Operator; +// use parquet::file::properties::WriterProperties; + +// use super::*; +// use crate::spillers::async_buffer::BufferPool; + +// async fn xxx() -> Result<()> { +// let props = WriterProperties::default().into(); + +// let table_schema = todo!(); +// let executor = todo!(); +// let memory = 1024 * 1024 * 100; + +// let pool = BufferPool::create(executor, memory, 3); + +// let builder = opendal::services::Fs::default().root("/tmp"); +// let op = Operator::new(builder)?.finish(); + +// let writer = op.writer("path").await?; +// let remote = pool.buffer_write(writer); + +// let dir = todo!(); +// let path = todo!(); + +// let file = SyncDmaFile::create(path, true)?; +// let align = todo!(); +// let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); + +// let mix_file = MixFileWriter::new(dir, path, file, buf, remote); + +// let file_writer = FileWriter::new(props, table_schema, &mut mix_file)?; + +// let file_meta = file_writer.close()?; + +// let xx = mix_file.finish()?; +// } +// } From f86b11ad4ed407864b2f26c83e8d265d3edb05c5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 13:49:04 +0800 Subject: [PATCH 12/46] x --- src/query/service/src/spillers/adapter.rs | 5 - src/query/service/src/spillers/union_file.rs | 149 +++++++++++-------- 2 files changed, 91 insertions(+), 63 deletions(-) diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 5b03670a676d5..163d2213b4d73 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -32,7 +32,6 @@ use databend_storages_common_cache::TempPath; use opendal::Buffer; use opendal::Operator; use parquet::file::metadata::RowGroupMetaDataPtr; -use parquet::format::FileMetaData; use super::inner::*; use super::serialize::*; @@ -369,10 +368,6 @@ impl SpillWriter { } 
Ok(self.file.flush_row_group(row_group)?) } - - pub fn close(self) -> Result { - Ok(self.file.close()?) - } } pub struct SpillReader {} diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index b8f1cfd372bee..52b6ed89d0a33 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -20,6 +20,7 @@ use arrow_schema::Schema; use databend_common_base::base::DmaWriteBuf; use databend_common_base::base::SyncDmaFile; use databend_common_base::runtime::Runtime; +use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_expression::TableSchema; use databend_storages_common_cache::TempDir; @@ -34,7 +35,6 @@ use parquet::file::properties::WriterProperties; use parquet::file::properties::WriterPropertiesPtr; use parquet::file::writer::SerializedFileWriter; use parquet::file::writer::SerializedRowGroupWriter; -use parquet::format::FileMetaData; use parquet::schema::types::SchemaDescriptor; use super::async_buffer::BufferPool; @@ -116,12 +116,20 @@ impl FileWriter { row_group.close(&mut row_group_writer)?; row_group_writer.close() } + + pub fn spill(&mut self, blocks: Vec) -> Result { + let mut row_group = self.new_row_group(); + for block in blocks { + row_group.write(block)?; + } + Ok(self.flush_row_group(row_group)?) + } } impl FileWriter { - pub(super) fn finish(self) -> errors::Result { + pub(super) fn finish(self) -> errors::Result { let writer = self.writer.into_inner()?; - writer.finish() + Ok(writer.finish()?) 
} } @@ -134,7 +142,8 @@ struct LocalDst { pub struct UnionFileWriter { local: Option, - remote: BufferWriter, + remote: String, + remote_writer: BufferWriter, remote_offset: usize, } @@ -144,7 +153,8 @@ impl UnionFileWriter { path: TempPath, file: SyncDmaFile, buf: DmaWriteBuf, - remote: BufferWriter, + remote: String, + remote_writer: BufferWriter, ) -> Self { UnionFileWriter { local: Some(LocalDst { @@ -154,20 +164,22 @@ impl UnionFileWriter { buf: Some(buf), }), remote, + remote_writer, remote_offset: 0, } } - pub fn without_local(remote: BufferWriter) -> Self { + pub fn without_local(remote: String, remote_writer: BufferWriter) -> Self { UnionFileWriter { local: None, remote, + remote_writer, remote_offset: 0, } } - pub fn finish(self) -> io::Result { - let local_path = match self.local { + pub fn finish(self) -> io::Result { + match self.local { Some( mut local @ LocalDst { file: Some(_), @@ -183,19 +195,23 @@ impl UnionFileWriter { local.path.set_size(file_size).unwrap(); - return Ok(MixFile { + Ok(UnionFile { local_path: Some(local.path), - remote_offset: 0, - }); + remote_path: self.remote, + remote_offset: None, + }) } - Some(LocalDst { path, .. }) => Some(path), - None => None, - }; - - Ok(MixFile { - local_path, - remote_offset: self.remote_offset, - }) + Some(LocalDst { path, .. 
}) => Ok(UnionFile { + local_path: Some(path), + remote_path: self.remote, + remote_offset: Some(self.remote_offset), + }), + None => Ok(UnionFile { + local_path: None, + remote_path: self.remote, + remote_offset: Some(0), + }), + } } } @@ -240,9 +256,9 @@ impl io::Write for UnionFileWriter { } for buf in dma_buf { - self.remote.write(&buf)?; + self.remote_writer.write(&buf)?; } - self.remote.write(buf) + self.remote_writer.write(buf) } fn flush(&mut self) -> io::Result<()> { @@ -257,22 +273,24 @@ impl io::Write for UnionFileWriter { return Ok(()); } - self.remote.flush() + self.remote_writer.flush() } } -pub struct MixFile { +#[derive(Debug)] +pub struct UnionFile { local_path: Option, - remote_offset: usize, + remote_path: String, + remote_offset: Option, } impl SpillerInner { - async fn new_file_writer( + pub(super) async fn new_file_writer( &self, schema: &TableSchema, executor: Arc, max_buffer: usize, - ) -> databend_common_exception::Result> { + ) -> Result> { let pool = BufferPool::create(executor, max_buffer, 3); let op = self.local_operator.as_ref().unwrap_or(&self.operator); @@ -286,12 +304,12 @@ impl SpillerInner { let file = SyncDmaFile::create(&path, true)?; let align = disk.block_alignment(); let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); - UnionFileWriter::new(disk.clone(), path, file, buf, remote) + UnionFileWriter::new(disk.clone(), path, file, buf, remote_location, remote) } else { - UnionFileWriter::without_local(remote) + UnionFileWriter::without_local(remote_location, remote) } } else { - UnionFileWriter::without_local(remote) + UnionFileWriter::without_local(remote_location, remote) }; let props = WriterProperties::default().into(); @@ -299,44 +317,59 @@ impl SpillerInner { } } -// #[cfg(test)] -// mod tests { -// use databend_common_exception::Result; -// use opendal::Builder; -// use opendal::Operator; -// use parquet::file::properties::WriterProperties; +#[cfg(test)] +mod tests { + use databend_common_base::runtime::GlobalIORuntime; + 
use databend_common_exception::Result; + use databend_common_expression::infer_table_schema; + use databend_common_expression::types::UInt64Type; + use databend_common_expression::FromData; + use opendal::Operator; + use parquet::file::properties::WriterProperties; + + use super::*; + use crate::spillers::async_buffer::BufferPool; + use crate::test_kits::ConfigBuilder; + use crate::test_kits::TestFixture; + + #[tokio::test(flavor = "multi_thread")] + async fn test_xxx() -> Result<()> { + let config = ConfigBuilder::create().build(); + let fixture = TestFixture::setup_with_config(&config).await?; + let _ctx = fixture.new_query_ctx().await?; -// use super::*; -// use crate::spillers::async_buffer::BufferPool; + let props = WriterProperties::default().into(); -// async fn xxx() -> Result<()> { -// let props = WriterProperties::default().into(); + let block = DataBlock::new_from_columns(vec![UInt64Type::from_data(vec![7, 8, 9])]); -// let table_schema = todo!(); -// let executor = todo!(); -// let memory = 1024 * 1024 * 100; + let table_schema = infer_table_schema(&block.infer_schema())?; + let executor = GlobalIORuntime::instance(); + let memory = 1024 * 1024 * 100; -// let pool = BufferPool::create(executor, memory, 3); + let pool = BufferPool::create(executor, memory, 3); -// let builder = opendal::services::Fs::default().root("/tmp"); -// let op = Operator::new(builder)?.finish(); + let builder = opendal::services::Fs::default().root("/tmp"); + let op = Operator::new(builder)?.finish(); -// let writer = op.writer("path").await?; -// let remote = pool.buffer_write(writer); + let writer = op.writer("path").await?; + let remote = pool.buffer_write(writer); -// let dir = todo!(); -// let path = todo!(); + // let dir = todo!(); + // let path = todo!(); -// let file = SyncDmaFile::create(path, true)?; -// let align = todo!(); -// let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); + // let file = SyncDmaFile::create(path, true)?; + // let align = todo!(); + // let buf = 
DmaWriteBuf::new(align, 4 * 1024 * 1024); -// let mix_file = MixFileWriter::new(dir, path, file, buf, remote); + let file = UnionFileWriter::without_local("path".to_string(), remote); + let mut file_writer = FileWriter::new(props, &table_schema, file)?; -// let file_writer = FileWriter::new(props, table_schema, &mut mix_file)?; + let x = file_writer.spill(vec![block])?; + println!("{x:#?}"); -// let file_meta = file_writer.close()?; + let x = file_writer.finish()?; + println!("{x:#?}"); -// let xx = mix_file.finish()?; -// } -// } + Ok(()) + } +} From f06fe1eeb8d74224c5cc634e55d24344c2a77feb Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 15:15:51 +0800 Subject: [PATCH 13/46] x --- src/query/service/src/spillers/union_file.rs | 79 +++++++++++++++++--- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 52b6ed89d0a33..7e933e0970230 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -21,13 +21,21 @@ use databend_common_base::base::DmaWriteBuf; use databend_common_base::base::SyncDmaFile; use databend_common_base::runtime::Runtime; use databend_common_exception::Result; +use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; +use databend_common_expression::DataSchema; use databend_common_expression::TableSchema; +use databend_common_expression::Value; +use databend_storages_common_cache::ParquetMetaData; use databend_storages_common_cache::TempDir; use databend_storages_common_cache::TempPath; +use futures::future::BoxFuture; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::arrow_writer::compute_leaves; use parquet::arrow::arrow_writer::get_column_writers; use parquet::arrow::arrow_writer::ArrowColumnWriter; +use parquet::arrow::async_reader::AsyncFileReader; use 
parquet::arrow::ArrowSchemaConverter; use parquet::errors; use parquet::file::metadata::RowGroupMetaDataPtr; @@ -35,6 +43,7 @@ use parquet::file::properties::WriterProperties; use parquet::file::properties::WriterPropertiesPtr; use parquet::file::writer::SerializedFileWriter; use parquet::file::writer::SerializedRowGroupWriter; +use parquet::format::FileMetaData; use parquet::schema::types::SchemaDescriptor; use super::async_buffer::BufferPool; @@ -127,9 +136,10 @@ impl FileWriter { } impl FileWriter { - pub(super) fn finish(self) -> errors::Result { - let writer = self.writer.into_inner()?; - Ok(writer.finish()?) + pub(super) fn finish(mut self) -> errors::Result<(FileMetaData, UnionFile)> { + let file_meta = self.writer.finish()?; + let file = self.writer.inner_mut().finish()?; + Ok((file_meta, file)) } } @@ -148,7 +158,7 @@ pub struct UnionFileWriter { } impl UnionFileWriter { - pub fn new( + fn new( dir: Arc, path: TempPath, file: SyncDmaFile, @@ -169,7 +179,7 @@ impl UnionFileWriter { } } - pub fn without_local(remote: String, remote_writer: BufferWriter) -> Self { + fn without_local(remote: String, remote_writer: BufferWriter) -> Self { UnionFileWriter { local: None, remote, @@ -178,8 +188,8 @@ impl UnionFileWriter { } } - pub fn finish(self) -> io::Result { - match self.local { + fn finish(&mut self) -> io::Result { + match self.local.take() { Some( mut local @ LocalDst { file: Some(_), @@ -197,18 +207,18 @@ impl UnionFileWriter { Ok(UnionFile { local_path: Some(local.path), - remote_path: self.remote, + remote_path: std::mem::take(&mut self.remote), remote_offset: None, }) } Some(LocalDst { path, .. 
}) => Ok(UnionFile { local_path: Some(path), - remote_path: self.remote, + remote_path: std::mem::take(&mut self.remote), remote_offset: Some(self.remote_offset), }), None => Ok(UnionFile { local_path: None, - remote_path: self.remote, + remote_path: std::mem::take(&mut self.remote), remote_offset: Some(0), }), } @@ -277,6 +287,55 @@ impl io::Write for UnionFileWriter { } } +async fn xxxx(schema: &DataSchema, row_groups: Vec) -> Result> { + let input = Reader; + + let builder = ArrowReaderBuilder::new(input).await?; + let mut stream = builder + .with_row_groups(row_groups) + .with_batch_size(usize::MAX) + .build()?; + + let mut blocks = Vec::new(); + + while let Some(reader) = stream.next_row_group().await? { + for record in reader { + let record = record?; + let num_rows = record.num_rows(); + let mut columns = Vec::with_capacity(record.num_columns()); + for (array, field) in record.columns().iter().zip(schema.fields()) { + let data_type = field.data_type(); + columns.push(BlockEntry::new( + Value::from_arrow_rs(array.clone(), data_type)?, + || (data_type.clone(), num_rows), + )) + } + let block = DataBlock::new(columns, num_rows); + blocks.push(block); + } + } + + Ok(blocks) +} + +struct Reader; + +impl AsyncFileReader for Reader { + fn get_bytes( + &mut self, + range: std::ops::Range, + ) -> BoxFuture<'_, errors::Result> { + todo!() + } + + fn get_metadata<'a>( + &'a mut self, + options: Option<&'a ArrowReaderOptions>, + ) -> BoxFuture<'a, errors::Result>> { + todo!() + } +} + #[derive(Debug)] pub struct UnionFile { local_path: Option, From d843a4a50cc0937422a8cd9d7d84be82a2d00af5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 16:47:08 +0800 Subject: [PATCH 14/46] x --- src/query/service/src/spillers/union_file.rs | 113 +++++++++++-------- 1 file changed, 67 insertions(+), 46 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 7e933e0970230..f1ce57b4042ff 100644 --- 
a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::future; use std::io; use std::io::Write; use std::sync::Arc; @@ -30,6 +31,8 @@ use databend_storages_common_cache::ParquetMetaData; use databend_storages_common_cache::TempDir; use databend_storages_common_cache::TempPath; use futures::future::BoxFuture; +use futures::future::FutureExt; +use opendal::Reader; use parquet::arrow::arrow_reader::ArrowReaderBuilder; use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::arrow_writer::compute_leaves; @@ -94,11 +97,7 @@ pub struct FileWriter { } impl FileWriter { - pub(super) fn new( - props: Arc, - table_schema: &TableSchema, - w: W, - ) -> errors::Result { + fn new(props: Arc, table_schema: &TableSchema, w: W) -> errors::Result { let schema = Arc::new(Schema::from(table_schema)); let parquet = ArrowSchemaConverter::new() @@ -131,6 +130,7 @@ impl FileWriter { for block in blocks { row_group.write(block)?; } + Ok(self.flush_row_group(row_group)?) } } @@ -287,62 +287,42 @@ impl io::Write for UnionFileWriter { } } -async fn xxxx(schema: &DataSchema, row_groups: Vec) -> Result> { - let input = Reader; - - let builder = ArrowReaderBuilder::new(input).await?; - let mut stream = builder - .with_row_groups(row_groups) - .with_batch_size(usize::MAX) - .build()?; - - let mut blocks = Vec::new(); - - while let Some(reader) = stream.next_row_group().await? 
{ - for record in reader { - let record = record?; - let num_rows = record.num_rows(); - let mut columns = Vec::with_capacity(record.num_columns()); - for (array, field) in record.columns().iter().zip(schema.fields()) { - let data_type = field.data_type(); - columns.push(BlockEntry::new( - Value::from_arrow_rs(array.clone(), data_type)?, - || (data_type.clone(), num_rows), - )) - } - let block = DataBlock::new(columns, num_rows); - blocks.push(block); - } - } - - Ok(blocks) +#[derive(Debug)] +pub struct UnionFile { + local_path: Option, + remote_path: String, + remote_offset: Option, } -struct Reader; +struct FileReader { + meta: Arc, + reader: Reader, +} -impl AsyncFileReader for Reader { +impl AsyncFileReader for FileReader { fn get_bytes( &mut self, range: std::ops::Range, ) -> BoxFuture<'_, errors::Result> { - todo!() + async move { + let buf = self + .reader + .read(range) + .await + .map_err(|err| errors::ParquetError::External(Box::new(err)))?; + Ok(buf.to_bytes()) + } + .boxed() } fn get_metadata<'a>( &'a mut self, - options: Option<&'a ArrowReaderOptions>, + _options: Option<&'a ArrowReaderOptions>, ) -> BoxFuture<'a, errors::Result>> { - todo!() + future::ready(Ok(self.meta.clone())).boxed() } } -#[derive(Debug)] -pub struct UnionFile { - local_path: Option, - remote_path: String, - remote_offset: Option, -} - impl SpillerInner { pub(super) async fn new_file_writer( &self, @@ -374,6 +354,47 @@ impl SpillerInner { let props = WriterProperties::default().into(); Ok(FileWriter::new(props, schema, union)?) 
} + + pub(super) async fn xxxx( + &self, + file: UnionFile, + meta: Arc, + schema: &DataSchema, + row_groups: Vec, + ) -> Result> { + let op = self.local_operator.as_ref().unwrap_or(&self.operator); + let input = FileReader { + meta, + reader: op.reader(&file.remote_path).await?, + }; + + let builder = ArrowReaderBuilder::new(input).await?; + let mut stream = builder + .with_row_groups(row_groups) + .with_batch_size(usize::MAX) + .build()?; + + let mut blocks = Vec::new(); + + while let Some(reader) = stream.next_row_group().await? { + for record in reader { + let record = record?; + let num_rows = record.num_rows(); + let mut columns = Vec::with_capacity(record.num_columns()); + for (array, field) in record.columns().iter().zip(schema.fields()) { + let data_type = field.data_type(); + columns.push(BlockEntry::new( + Value::from_arrow_rs(array.clone(), data_type)?, + || (data_type.clone(), num_rows), + )) + } + let block = DataBlock::new(columns, num_rows); + blocks.push(block); + } + } + + Ok(blocks) + } } #[cfg(test)] From cd50d87e85b718b795a007c8ebb1f58632875eef Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 17:07:12 +0800 Subject: [PATCH 15/46] x --- src/query/service/src/spillers/union_file.rs | 22 +++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index f1ce57b4042ff..525ddf4a7cd10 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -154,7 +154,7 @@ pub struct UnionFileWriter { local: Option, remote: String, remote_writer: BufferWriter, - remote_offset: usize, + remote_offset: u64, } impl UnionFileWriter { @@ -262,7 +262,7 @@ impl io::Write for UnionFileWriter { }; if offset != 0 { - self.remote_offset = offset; + self.remote_offset = offset as _; } for buf in dma_buf { @@ -291,12 +291,14 @@ impl io::Write for UnionFileWriter { pub struct UnionFile { local_path: 
Option, remote_path: String, - remote_offset: Option, + remote_offset: Option, } struct FileReader { meta: Arc, - reader: Reader, + local_path: Option, + remote_reader: Reader, + remote_offset: Option, } impl AsyncFileReader for FileReader { @@ -305,8 +307,12 @@ impl AsyncFileReader for FileReader { range: std::ops::Range, ) -> BoxFuture<'_, errors::Result> { async move { + let range = match self.remote_offset { + Some(offset) => (range.start + offset)..(range.end + offset), + None => range, + }; let buf = self - .reader + .remote_reader .read(range) .await .map_err(|err| errors::ParquetError::External(Box::new(err)))?; @@ -355,7 +361,7 @@ impl SpillerInner { Ok(FileWriter::new(props, schema, union)?) } - pub(super) async fn xxxx( + pub(super) async fn load_row_groups( &self, file: UnionFile, meta: Arc, @@ -365,7 +371,9 @@ impl SpillerInner { let op = self.local_operator.as_ref().unwrap_or(&self.operator); let input = FileReader { meta, - reader: op.reader(&file.remote_path).await?, + local_path: file.local_path, + remote_offset: file.remote_offset, + remote_reader: op.reader(&file.remote_path).await?, }; let builder = ArrowReaderBuilder::new(input).await?; From 28b84bb2cf4cb654f73015f715025eb86757718c Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 18:50:32 +0800 Subject: [PATCH 16/46] x --- src/query/service/src/spillers/union_file.rs | 52 +++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 525ddf4a7cd10..0cdfb5c4f2b40 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -307,16 +307,56 @@ impl AsyncFileReader for FileReader { range: std::ops::Range, ) -> BoxFuture<'_, errors::Result> { async move { - let range = match self.remote_offset { - Some(offset) => (range.start + offset)..(range.end + offset), - None => range, + let local_bytes = if let Some(local_path) 
= &self.local_path { + let local_range = self + .remote_offset + .map(|offset| { + if range.end <= offset { + return range.clone(); + } + if range.start < offset { + range.start..offset + } else { + offset..offset + } + }) + .unwrap_or(range); + + let (dma_buf, rt_range) = databend_common_base::base::dma_read_file_range( + local_path, + local_range.clone(), + ) + .await?; + + let bytes = + databend_common_base::base::dma_buffer_to_bytes(dma_buf).slice(rt_range); + if local_range == range { + return Ok(bytes); + } + Some(bytes) + } else { + None }; - let buf = self + + let remote_range = self + .remote_offset + .map(|offset| (range.start - offset)..(range.end - offset)) + .unwrap_or(range.clone()); + + let remote_bytes = self .remote_reader - .read(range) + .read(remote_range) .await .map_err(|err| errors::ParquetError::External(Box::new(err)))?; - Ok(buf.to_bytes()) + + if local_bytes.is_some() { + Ok(opendal::Buffer::from_iter( + local_bytes.into_iter().chain(remote_bytes.into_iter()), + ) + .to_bytes()) + } else { + Ok(remote_bytes.to_bytes()) + } } .boxed() } From 1edc9629758e6dd4cd115c24eb8c828083688433 Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 28 Sep 2025 21:31:37 +0800 Subject: [PATCH 17/46] x --- src/common/base/src/base/dma.rs | 124 +++++++++++-------- src/query/service/src/spillers/union_file.rs | 35 +++--- 2 files changed, 91 insertions(+), 68 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 5a42b5f82a238..25a63df5bb2c8 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -277,7 +277,7 @@ impl DmaFile { } } -type AsyncDmaFile = DmaFile; +pub type AsyncDmaFile = DmaFile; impl AsyncDmaFile { async fn open_fd(path: impl AsRef, dio: bool) -> io::Result { @@ -311,24 +311,34 @@ impl AsyncDmaFile { } /// Attempts to open a file in read-only mode. 
- async fn open(path: impl AsRef, dio: bool) -> io::Result { + pub async fn open( + path: impl AsRef, + dio: bool, + align: Option, + ) -> io::Result { let file = AsyncDmaFile::open_fd(path, dio).await?; - Self::open_dma(file).await + Self::new(file, align).await } /// Opens a file in write-only mode. async fn create(path: impl AsRef, dio: bool) -> io::Result { let file = AsyncDmaFile::create_fd(path, dio).await?; - Self::open_dma(file).await - } - - async fn open_dma(file: AsyncFile) -> io::Result { - let fd = file.as_raw_fd(); - let stat = asyncify(move || { - rustix::fs::fstatvfs(unsafe { BorrowedFd::borrow_raw(fd) }).map_err(|e| e.into()) - }) - .await?; - let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); + Self::new(file, None).await + } + + async fn new(file: AsyncFile, align: Option) -> io::Result { + let alignment = match align { + Some(align) => align, + None => { + let fd = file.as_raw_fd(); + let stat = asyncify(move || { + rustix::fs::fstatvfs(unsafe { BorrowedFd::borrow_raw(fd) }) + .map_err(|e| e.into()) + }) + .await?; + Alignment::new(stat.f_bsize.max(512) as usize).unwrap() + } + }; Ok(AsyncDmaFile { fd: file, @@ -341,6 +351,50 @@ impl AsyncDmaFile { async fn seek(&mut self, pos: SeekFrom) -> io::Result { self.fd.seek(pos).await } + + pub async fn read_range(&mut self, range: Range) -> io::Result<(DmaBuffer, Range)> { + let align_start = self.align_down(range.start as usize); + let align_end = self.align_up(range.end as usize); + + let buf = Vec::with_capacity_in(align_end - align_start, DmaAllocator::new(self.alignment)); + self.set_buffer(buf); + + if align_start != 0 { + let offset = self.seek(SeekFrom::Start(align_start as u64)).await?; + if offset as usize != align_start { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "range out of range", + )); + } + } + + let fd = self.fd.as_raw_fd(); + let mut buf = self.buf.take().unwrap(); + let alignment = self.alignment; + let mut n; + loop { + (buf, n) = 
asyncify(move || { + let remain = buf.capacity() - buf.len(); + let mut file = DmaFile { + fd: unsafe { BorrowedFd::borrow_raw(fd) }, + alignment, + buf: Some(buf), + length: 0, + }; + file.read_direct(remain).map(|n| (file.buf.unwrap(), n)) + }) + .await?; + if align_start + buf.len() >= range.end as usize { + break; + } + if n == 0 { + return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "")); + } + } + let rt_range = range.start as usize - align_start..range.end as usize - align_start; + Ok((buf, rt_range)) + } } pub type SyncDmaFile = DmaFile; @@ -673,7 +727,7 @@ pub async fn dma_read_file( mut writer: impl io::Write, ) -> io::Result { const BUFFER_SIZE: usize = 1024 * 1024; - let mut file = AsyncDmaFile::open(path.as_ref(), true).await?; + let mut file = AsyncDmaFile::open(path.as_ref(), true, None).await?; let buf = Vec::with_capacity_in( file.align_up(BUFFER_SIZE), DmaAllocator::new(file.alignment), @@ -708,42 +762,8 @@ pub async fn dma_read_file_range( path: impl AsRef, range: Range, ) -> io::Result<(DmaBuffer, Range)> { - let mut file = AsyncDmaFile::open(path.as_ref(), true).await?; - - let align_start = file.align_down(range.start as usize); - let align_end = file.align_up(range.end as usize); - - let buf = Vec::with_capacity_in(align_end - align_start, DmaAllocator::new(file.alignment)); - file.set_buffer(buf); - - if align_start != 0 { - let offset = file.seek(SeekFrom::Start(align_start as u64)).await?; - if offset as usize != align_start { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "range out of range", - )); - } - } - - let mut n; - loop { - (file, n) = asyncify(move || { - let buf = file.buffer(); - let remain = buf.capacity() - buf.len(); - file.read_direct(remain).map(|n| (file, n)) - }) - .await?; - if align_start + file.buffer().len() >= range.end as usize { - break; - } - if n == 0 { - return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "")); - } - } - - let rt_range = range.start as usize - align_start..range.end as 
usize - align_start; - Ok((file.buf.unwrap(), rt_range)) + let mut file = AsyncDmaFile::open(path.as_ref(), true, None).await?; + file.read_range(range).await } #[cfg(test)] @@ -800,7 +820,7 @@ mod tests { assert_eq!(length, want.len()); assert_eq!(got, want); - let file = AsyncDmaFile::open(filename, dio).await?; + let file = AsyncDmaFile::open(filename, dio, None).await?; let align = file.alignment; drop(file); @@ -870,7 +890,7 @@ mod tests { let bufs = vec![IoSlice::new(&want)]; dma_write_file_vectored(filename, &bufs).await.unwrap(); - let mut file = AsyncDmaFile::open(filename, true).await.unwrap(); + let mut file = AsyncDmaFile::open(filename, true, None).await.unwrap(); let buf = Vec::with_capacity_in(file_size, DmaAllocator::new(file.alignment)); file.set_buffer(buf); diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 0cdfb5c4f2b40..e6477ed2ae397 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -18,6 +18,8 @@ use std::io::Write; use std::sync::Arc; use arrow_schema::Schema; +use databend_common_base::base::dma_buffer_to_bytes; +use databend_common_base::base::AsyncDmaFile; use databend_common_base::base::DmaWriteBuf; use databend_common_base::base::SyncDmaFile; use databend_common_base::runtime::Runtime; @@ -296,7 +298,7 @@ pub struct UnionFile { struct FileReader { meta: Arc, - local_path: Option, + local: Option<(TempPath, AsyncDmaFile)>, remote_reader: Reader, remote_offset: Option, } @@ -307,7 +309,7 @@ impl AsyncFileReader for FileReader { range: std::ops::Range, ) -> BoxFuture<'_, errors::Result> { async move { - let local_bytes = if let Some(local_path) = &self.local_path { + let local_bytes = if let Some((_, file)) = &mut self.local { let local_range = self .remote_offset .map(|offset| { @@ -320,16 +322,10 @@ impl AsyncFileReader for FileReader { offset..offset } }) - .unwrap_or(range); - - let (dma_buf, rt_range) = 
databend_common_base::base::dma_read_file_range( - local_path, - local_range.clone(), - ) - .await?; + .unwrap_or(range.clone()); - let bytes = - databend_common_base::base::dma_buffer_to_bytes(dma_buf).slice(rt_range); + let (dma_buf, rt_range) = file.read_range(local_range.clone()).await?; + let bytes = dma_buffer_to_bytes(dma_buf).slice(rt_range); if local_range == range { return Ok(bytes); } @@ -341,7 +337,7 @@ impl AsyncFileReader for FileReader { let remote_range = self .remote_offset .map(|offset| (range.start - offset)..(range.end - offset)) - .unwrap_or(range.clone()); + .unwrap_or(range); let remote_bytes = self .remote_reader @@ -350,10 +346,10 @@ impl AsyncFileReader for FileReader { .map_err(|err| errors::ParquetError::External(Box::new(err)))?; if local_bytes.is_some() { - Ok(opendal::Buffer::from_iter( - local_bytes.into_iter().chain(remote_bytes.into_iter()), + Ok( + opendal::Buffer::from_iter(local_bytes.into_iter().chain(remote_bytes)) + .to_bytes(), ) - .to_bytes()) } else { Ok(remote_bytes.to_bytes()) } @@ -409,9 +405,16 @@ impl SpillerInner { row_groups: Vec, ) -> Result> { let op = self.local_operator.as_ref().unwrap_or(&self.operator); + let input = FileReader { meta, - local_path: file.local_path, + local: if let Some(path) = file.local_path { + let alignment = Some(self.temp_dir.as_ref().unwrap().block_alignment()); + let file = AsyncDmaFile::open(&path, true, alignment).await?; + Some((path, file)) + } else { + None + }, remote_offset: file.remote_offset, remote_reader: op.reader(&file.remote_path).await?, }; From 3ef9edf4dc16cf7277c9dfc00c849913a1367a1c Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 12:15:00 +0800 Subject: [PATCH 18/46] x --- src/query/service/src/spillers/union_file.rs | 77 +++++++++++++++++--- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index e6477ed2ae397..6a749d2ca9e19 100644 --- 
a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -135,6 +135,11 @@ impl FileWriter { Ok(self.flush_row_group(row_group)?) } + + fn schema_descr(&self) -> SchemaDescriptor { + let tp = self.writer.schema_descr().root_schema_ptr(); + SchemaDescriptor::new(tp) + } } impl FileWriter { @@ -453,9 +458,10 @@ mod tests { use databend_common_base::runtime::GlobalIORuntime; use databend_common_exception::Result; use databend_common_expression::infer_table_schema; + use databend_common_expression::types::StringType; use databend_common_expression::types::UInt64Type; use databend_common_expression::FromData; - use opendal::Operator; + use databend_common_storage::DataOperator; use parquet::file::properties::WriterProperties; use super::*; @@ -471,18 +477,21 @@ mod tests { let props = WriterProperties::default().into(); - let block = DataBlock::new_from_columns(vec![UInt64Type::from_data(vec![7, 8, 9])]); + let block = DataBlock::new_from_columns(vec![ + UInt64Type::from_data(vec![7, 8, 9]), + StringType::from_data(vec!["c", "d", "e"]), + ]); - let table_schema = infer_table_schema(&block.infer_schema())?; + let data_schema = block.infer_schema(); + let table_schema = infer_table_schema(&data_schema)?; let executor = GlobalIORuntime::instance(); let memory = 1024 * 1024 * 100; let pool = BufferPool::create(executor, memory, 3); + let op = DataOperator::instance().operator(); - let builder = opendal::services::Fs::default().root("/tmp"); - let op = Operator::new(builder)?.finish(); - - let writer = op.writer("path").await?; + let path = "path"; + let writer = op.writer(path).await?; let remote = pool.buffer_write(writer); // let dir = todo!(); @@ -492,14 +501,58 @@ mod tests { // let align = todo!(); // let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); - let file = UnionFileWriter::without_local("path".to_string(), remote); + let file = UnionFileWriter::without_local(path.to_string(), remote); let mut file_writer = 
FileWriter::new(props, &table_schema, file)?; - let x = file_writer.spill(vec![block])?; - println!("{x:#?}"); + let mut row_groups = vec![]; + let row_group = file_writer.spill(vec![block])?; + row_groups.push((*row_group).clone()); + + let schema_descr = file_writer.schema_descr().into(); + + let (file_metadata, file) = file_writer.finish()?; + + let metadata = parquet::file::metadata::FileMetaData::new( + file_metadata.version, + file_metadata.num_rows, + file_metadata.created_by.clone(), + file_metadata.key_value_metadata.clone(), + schema_descr, + None, + ); + + let meta = ParquetMetaData::new(metadata, row_groups).into(); + + let input = FileReader { + meta, + local: None, + remote_reader: op.reader(&file.remote_path).await?, + remote_offset: None, + }; + + let builder = ArrowReaderBuilder::new(input).await?; + let mut stream = builder.with_batch_size(usize::MAX).build()?; + + let mut blocks = Vec::new(); + + while let Some(reader) = stream.next_row_group().await? { + for record in reader { + let record = record?; + let num_rows = record.num_rows(); + let mut columns = Vec::with_capacity(record.num_columns()); + for (array, field) in record.columns().iter().zip(data_schema.fields()) { + let data_type = field.data_type(); + columns.push(BlockEntry::new( + Value::from_arrow_rs(array.clone(), data_type)?, + || (data_type.clone(), num_rows), + )) + } + let block = DataBlock::new(columns, num_rows); + blocks.push(block); + } + } - let x = file_writer.finish()?; - println!("{x:#?}"); + println!("{:?}", blocks); Ok(()) } From 339afaab765b392f0f9b62324ed2d306ad3084f5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 14:16:04 +0800 Subject: [PATCH 19/46] x --- src/query/service/src/spillers/union_file.rs | 138 +++++++++---------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 6a749d2ca9e19..81e69122a9b28 100644 --- 
a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -41,6 +41,7 @@ use parquet::arrow::arrow_writer::compute_leaves; use parquet::arrow::arrow_writer::get_column_writers; use parquet::arrow::arrow_writer::ArrowColumnWriter; use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::async_reader::ParquetRecordBatchStream; use parquet::arrow::ArrowSchemaConverter; use parquet::errors; use parquet::file::metadata::RowGroupMetaDataPtr; @@ -95,6 +96,7 @@ impl RowGroupWriter { pub struct FileWriter { schema: Arc, + row_groups: Vec, writer: SerializedFileWriter, } @@ -107,7 +109,11 @@ impl FileWriter { .convert(&schema)?; let writer = SerializedFileWriter::new(w, parquet.root_schema_ptr(), props.clone())?; - Ok(Self { schema, writer }) + Ok(Self { + schema, + writer, + row_groups: vec![], + }) } pub(super) fn new_row_group(&self) -> RowGroupWriter { @@ -124,7 +130,9 @@ impl FileWriter { ) -> errors::Result { let mut row_group_writer = self.writer.next_row_group()?; row_group.close(&mut row_group_writer)?; - row_group_writer.close() + let meta = row_group_writer.close()?; + self.row_groups.push(meta.clone()); + Ok(meta) } pub fn spill(&mut self, blocks: Vec) -> Result { @@ -135,18 +143,27 @@ impl FileWriter { Ok(self.flush_row_group(row_group)?) 
} - - fn schema_descr(&self) -> SchemaDescriptor { - let tp = self.writer.schema_descr().root_schema_ptr(); - SchemaDescriptor::new(tp) - } } impl FileWriter { - pub(super) fn finish(mut self) -> errors::Result<(FileMetaData, UnionFile)> { - let file_meta = self.writer.finish()?; + pub(super) fn finish(mut self) -> errors::Result<(ParquetMetaData, UnionFile)> { + let file_metadata = self.writer.finish()?; + let tp = self.writer.schema_descr().root_schema_ptr(); + let schema_descr = Arc::new(SchemaDescriptor::new(tp)); + + let metadata = parquet::file::metadata::FileMetaData::new( + file_metadata.version, + file_metadata.num_rows, + file_metadata.created_by.clone(), + file_metadata.key_value_metadata.clone(), + schema_descr, + None, + ); let file = self.writer.inner_mut().finish()?; - Ok((file_meta, file)) + let row_groups = std::mem::take(&mut self.row_groups); + drop(self); + let row_groups = row_groups.into_iter().map(Arc::unwrap_or_clone).collect(); + Ok((ParquetMetaData::new(metadata, row_groups), file)) } } @@ -160,7 +177,7 @@ struct LocalDst { pub struct UnionFileWriter { local: Option, remote: String, - remote_writer: BufferWriter, + remote_writer: Option, remote_offset: u64, } @@ -181,7 +198,7 @@ impl UnionFileWriter { buf: Some(buf), }), remote, - remote_writer, + remote_writer: Some(remote_writer), remote_offset: 0, } } @@ -190,12 +207,13 @@ impl UnionFileWriter { UnionFileWriter { local: None, remote, - remote_writer, + remote_writer: Some(remote_writer), remote_offset: 0, } } fn finish(&mut self) -> io::Result { + self.remote_writer.take().unwrap().close()?; match self.local.take() { Some( mut local @ LocalDst { @@ -273,9 +291,9 @@ impl io::Write for UnionFileWriter { } for buf in dma_buf { - self.remote_writer.write(&buf)?; + self.remote_writer.as_mut().unwrap().write(&buf)?; } - self.remote_writer.write(buf) + self.remote_writer.as_mut().unwrap().write(buf) } fn flush(&mut self) -> io::Result<()> { @@ -290,7 +308,7 @@ impl io::Write for 
UnionFileWriter { return Ok(()); } - self.remote_writer.flush() + self.remote_writer.as_mut().unwrap().flush() } } @@ -425,32 +443,41 @@ impl SpillerInner { }; let builder = ArrowReaderBuilder::new(input).await?; - let mut stream = builder + let stream = builder .with_row_groups(row_groups) .with_batch_size(usize::MAX) .build()?; - let mut blocks = Vec::new(); - - while let Some(reader) = stream.next_row_group().await? { - for record in reader { - let record = record?; - let num_rows = record.num_rows(); - let mut columns = Vec::with_capacity(record.num_columns()); - for (array, field) in record.columns().iter().zip(schema.fields()) { - let data_type = field.data_type(); - columns.push(BlockEntry::new( - Value::from_arrow_rs(array.clone(), data_type)?, - || (data_type.clone(), num_rows), - )) - } - let block = DataBlock::new(columns, num_rows); - blocks.push(block); + load_blocks_from_stream(schema, stream).await + } +} + +async fn load_blocks_from_stream( + schema: &DataSchema, + mut stream: ParquetRecordBatchStream, +) -> Result> +where + T: AsyncFileReader + Unpin + Send + 'static, +{ + let mut blocks = Vec::new(); + while let Some(reader) = stream.next_row_group().await? 
{ + for record in reader { + let record = record?; + let num_rows = record.num_rows(); + let mut columns = Vec::with_capacity(record.num_columns()); + for (array, field) in record.columns().iter().zip(schema.fields()) { + let data_type = field.data_type(); + columns.push(BlockEntry::new( + Value::from_arrow_rs(array.clone(), data_type)?, + || (data_type.clone(), num_rows), + )) } + let block = DataBlock::new(columns, num_rows); + blocks.push(block); } - - Ok(blocks) } + + Ok(blocks) } #[cfg(test)] @@ -471,7 +498,7 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_xxx() -> Result<()> { - let config = ConfigBuilder::create().build(); + let config = ConfigBuilder::create().off_log().build(); let fixture = TestFixture::setup_with_config(&config).await?; let _ctx = fixture.new_query_ctx().await?; @@ -508,50 +535,19 @@ mod tests { let row_group = file_writer.spill(vec![block])?; row_groups.push((*row_group).clone()); - let schema_descr = file_writer.schema_descr().into(); - - let (file_metadata, file) = file_writer.finish()?; - - let metadata = parquet::file::metadata::FileMetaData::new( - file_metadata.version, - file_metadata.num_rows, - file_metadata.created_by.clone(), - file_metadata.key_value_metadata.clone(), - schema_descr, - None, - ); - - let meta = ParquetMetaData::new(metadata, row_groups).into(); + let (metadata, file) = file_writer.finish()?; let input = FileReader { - meta, + meta: metadata.into(), local: None, remote_reader: op.reader(&file.remote_path).await?, remote_offset: None, }; let builder = ArrowReaderBuilder::new(input).await?; - let mut stream = builder.with_batch_size(usize::MAX).build()?; - - let mut blocks = Vec::new(); - - while let Some(reader) = stream.next_row_group().await? 
{ - for record in reader { - let record = record?; - let num_rows = record.num_rows(); - let mut columns = Vec::with_capacity(record.num_columns()); - for (array, field) in record.columns().iter().zip(data_schema.fields()) { - let data_type = field.data_type(); - columns.push(BlockEntry::new( - Value::from_arrow_rs(array.clone(), data_type)?, - || (data_type.clone(), num_rows), - )) - } - let block = DataBlock::new(columns, num_rows); - blocks.push(block); - } - } + let stream = builder.with_batch_size(usize::MAX).build()?; + let blocks = load_blocks_from_stream(&data_schema, stream).await?; println!("{:?}", blocks); Ok(()) From 0499fd37033014f89f2bbea8be872a10931a91b0 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 14:36:48 +0800 Subject: [PATCH 20/46] x --- src/query/service/src/spillers/union_file.rs | 39 ++++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 81e69122a9b28..dea8f168048dd 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -22,12 +22,10 @@ use databend_common_base::base::dma_buffer_to_bytes; use databend_common_base::base::AsyncDmaFile; use databend_common_base::base::DmaWriteBuf; use databend_common_base::base::SyncDmaFile; -use databend_common_base::runtime::Runtime; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataSchema; -use databend_common_expression::TableSchema; use databend_common_expression::Value; use databend_storages_common_cache::ParquetMetaData; use databend_storages_common_cache::TempDir; @@ -49,7 +47,6 @@ use parquet::file::properties::WriterProperties; use parquet::file::properties::WriterPropertiesPtr; use parquet::file::writer::SerializedFileWriter; use parquet::file::writer::SerializedRowGroupWriter; -use 
parquet::format::FileMetaData; use parquet::schema::types::SchemaDescriptor; use super::async_buffer::BufferPool; @@ -101,8 +98,8 @@ pub struct FileWriter { } impl FileWriter { - fn new(props: Arc, table_schema: &TableSchema, w: W) -> errors::Result { - let schema = Arc::new(Schema::from(table_schema)); + fn new(props: Arc, data_schema: &DataSchema, w: W) -> errors::Result { + let schema = Arc::new(Schema::from(data_schema)); let parquet = ArrowSchemaConverter::new() .with_coerce_types(props.coerce_types()) @@ -391,12 +388,11 @@ impl AsyncFileReader for FileReader { impl SpillerInner { pub(super) async fn new_file_writer( &self, - schema: &TableSchema, - executor: Arc, - max_buffer: usize, + schema: &DataSchema, + pool: &Arc, + dio: bool, + chunk: usize, ) -> Result> { - let pool = BufferPool::create(executor, max_buffer, 3); - let op = self.local_operator.as_ref().unwrap_or(&self.operator); let remote_location = self.create_unique_location(); @@ -405,9 +401,9 @@ impl SpillerInner { let union = if let Some(disk) = &self.temp_dir { if let Some(path) = disk.new_file_with_size(0)? 
{ - let file = SyncDmaFile::create(&path, true)?; + let file = SyncDmaFile::create(&path, dio)?; let align = disk.block_alignment(); - let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); + let buf = DmaWriteBuf::new(align, chunk); UnionFileWriter::new(disk.clone(), path, file, buf, remote_location, remote) } else { UnionFileWriter::without_local(remote_location, remote) @@ -422,24 +418,29 @@ impl SpillerInner { pub(super) async fn load_row_groups( &self, - file: UnionFile, + UnionFile { + local_path, + remote_path, + remote_offset, + }: UnionFile, meta: Arc, schema: &DataSchema, row_groups: Vec, + dio: bool, ) -> Result> { let op = self.local_operator.as_ref().unwrap_or(&self.operator); let input = FileReader { meta, - local: if let Some(path) = file.local_path { + local: if let Some(path) = local_path { let alignment = Some(self.temp_dir.as_ref().unwrap().block_alignment()); - let file = AsyncDmaFile::open(&path, true, alignment).await?; + let file = AsyncDmaFile::open(&path, dio, alignment).await?; Some((path, file)) } else { None }, - remote_offset: file.remote_offset, - remote_reader: op.reader(&file.remote_path).await?, + remote_offset, + remote_reader: op.reader(&remote_path).await?, }; let builder = ArrowReaderBuilder::new(input).await?; @@ -484,7 +485,6 @@ where mod tests { use databend_common_base::runtime::GlobalIORuntime; use databend_common_exception::Result; - use databend_common_expression::infer_table_schema; use databend_common_expression::types::StringType; use databend_common_expression::types::UInt64Type; use databend_common_expression::FromData; @@ -510,7 +510,6 @@ mod tests { ]); let data_schema = block.infer_schema(); - let table_schema = infer_table_schema(&data_schema)?; let executor = GlobalIORuntime::instance(); let memory = 1024 * 1024 * 100; @@ -529,7 +528,7 @@ mod tests { // let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); let file = UnionFileWriter::without_local(path.to_string(), remote); - let mut file_writer = 
FileWriter::new(props, &table_schema, file)?; + let mut file_writer = FileWriter::new(props, &data_schema, file)?; let mut row_groups = vec![]; let row_group = file_writer.spill(vec![block])?; From 9ad5e6549228a89eab46d39c0648e35585ded0d9 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 16:54:51 +0800 Subject: [PATCH 21/46] x --- .../transform_window_partition_collect.rs | 71 ++++++++++++++++--- .../partition/window_partition_buffer.rs | 5 ++ 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 3051a2f0f018c..348413d347dcc 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -40,6 +40,65 @@ use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; use crate::spillers::SpillerType; +// Local enum to wrap WindowPartitionBuffer as a variant without modifying the original. 
+#[derive(Debug)] +pub enum WindowBuffer { + WindowPartitionBuffer(Box), +} + +impl WindowBuffer { + pub fn new( + spiller: Spiller, + num_partitions: usize, + sort_block_size: usize, + memory_settings: MemorySettings, + ) -> Result { + let inner = + WindowPartitionBuffer::new(spiller, num_partitions, sort_block_size, memory_settings)?; + Ok(Self::WindowPartitionBuffer(Box::new(inner))) + } + + pub fn need_spill(&mut self) -> bool { + match self { + WindowBuffer::WindowPartitionBuffer(inner) => inner.need_spill(), + } + } + + pub fn out_of_memory_limit(&mut self) -> bool { + match self { + WindowBuffer::WindowPartitionBuffer(inner) => inner.out_of_memory_limit(), + } + } + + pub fn is_empty(&self) -> bool { + match self { + WindowBuffer::WindowPartitionBuffer(inner) => inner.is_empty(), + } + } + + pub fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { + if let WindowBuffer::WindowPartitionBuffer(inner) = self { + inner.add_data_block(partition_id, data_block); + } + } + + pub async fn spill(&mut self) -> Result<()> { + if let WindowBuffer::WindowPartitionBuffer(inner) = self { + inner.spill().await + } else { + Ok(()) + } + } + + pub async fn restore(&mut self) -> Result> { + if let WindowBuffer::WindowPartitionBuffer(inner) = self { + inner.restore().await + } else { + Ok(vec![]) + } + } +} + #[derive(Debug, Clone, Copy)] pub enum Step { Sync(SyncStep), @@ -69,7 +128,7 @@ pub struct TransformWindowPartitionCollect { // The partition id is used to map the partition id to the new partition id. partition_id: Vec, // The buffer is used to control the memory usage of the window operator. - buffer: WindowPartitionBuffer, + buffer: WindowBuffer, strategy: S, @@ -116,12 +175,8 @@ impl TransformWindowPartitionCollect { // Create the window partition buffer. let sort_block_size = settings.get_window_partition_sort_block_size()? 
as usize; - let buffer = WindowPartitionBuffer::new( - spiller, - partitions.len(), - sort_block_size, - memory_settings, - )?; + let buffer = + WindowBuffer::new(spiller, partitions.len(), sort_block_size, memory_settings)?; Ok(Self { input, @@ -275,7 +330,7 @@ impl TransformWindowPartitionCollect { fn collect_data_block( data_block: DataBlock, partition_ids: &[usize], - buffer: &mut WindowPartitionBuffer, + buffer: &mut WindowBuffer, ) { if let Some(meta) = data_block .get_owned_meta() diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index bf01acedc586c..81bdadbab0b61 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -238,3 +238,8 @@ impl WindowPartitionBuffer { Ok(result) } } + +trait Spill { + async fn spill(blocks: Vec) -> Result; + async fn restore(ordinal: Vec) -> Result>; +} From 02efee6a7a10f96fbdbe2ac7af2e5d2e2cc5ddde Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 18:45:19 +0800 Subject: [PATCH 22/46] x --- .../transforms/window/partition/mod.rs | 2 + .../transform_window_partition_collect.rs | 20 +- .../partition/window_partition_buffer_v2.rs | 197 ++++++++++++++++++ 3 files changed, 205 insertions(+), 14 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index 5aa4562c98865..4d4eda4b641af 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ 
-16,6 +16,7 @@ mod data_processor_strategy; mod hilbert_partition_exchange; mod transform_window_partition_collect; mod window_partition_buffer; +mod window_partition_buffer_v2; mod window_partition_exchange; mod window_partition_meta; mod window_partition_partial_top_n_exchange; @@ -24,6 +25,7 @@ pub use data_processor_strategy::*; pub use hilbert_partition_exchange::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; +pub use window_partition_buffer_v2::*; pub use window_partition_exchange::*; pub use window_partition_meta::*; pub use window_partition_partial_top_n_exchange::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 348413d347dcc..b2a48d0fb7e1b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -41,7 +41,6 @@ use crate::spillers::SpillerDiskConfig; use crate::spillers::SpillerType; // Local enum to wrap WindowPartitionBuffer as a variant without modifying the original. 
-#[derive(Debug)] pub enum WindowBuffer { WindowPartitionBuffer(Box), } @@ -77,25 +76,18 @@ impl WindowBuffer { } pub fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { - if let WindowBuffer::WindowPartitionBuffer(inner) = self { - inner.add_data_block(partition_id, data_block); - } + let WindowBuffer::WindowPartitionBuffer(inner) = self; + inner.add_data_block(partition_id, data_block); } pub async fn spill(&mut self) -> Result<()> { - if let WindowBuffer::WindowPartitionBuffer(inner) = self { - inner.spill().await - } else { - Ok(()) - } + let WindowBuffer::WindowPartitionBuffer(inner) = self; + inner.spill().await } pub async fn restore(&mut self) -> Result> { - if let WindowBuffer::WindowPartitionBuffer(inner) = self { - inner.restore().await - } else { - Ok(vec![]) - } + let WindowBuffer::WindowPartitionBuffer(inner) = self; + inner.restore().await } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs new file mode 100644 index 0000000000000..5343f87ac9f91 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -0,0 +1,197 @@ +use async_trait::async_trait; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_pipeline_transforms::MemorySettings; + +use crate::spillers::PartitionBuffer; +use crate::spillers::PartitionBufferFetchOption; + +#[async_trait] +pub trait Spill: Send + Sync { + async fn spill(&mut self, blocks: Vec) -> Result; + async fn restore(&mut self, ordinals: Vec) -> Result>; +} + +/// Alternate window partition buffer that delegates spilling through a `Spill` +/// abstraction and tracks spilled partitions by ordinal. 
+pub struct WindowPartitionBufferV2 { + spill: S, + memory_settings: MemorySettings, + partition_buffer: PartitionBuffer, + num_partitions: usize, + sort_block_size: usize, + can_spill: bool, + next_to_restore_partition_id: isize, + spilled_partition_ordinals: Vec>, +} + +impl WindowPartitionBufferV2 { + pub fn new( + spill: S, + num_partitions: usize, + sort_block_size: usize, + memory_settings: MemorySettings, + ) -> Result { + let partition_buffer = PartitionBuffer::create(num_partitions); + Ok(Self { + spill, + memory_settings, + partition_buffer, + num_partitions, + sort_block_size, + can_spill: false, + next_to_restore_partition_id: -1, + spilled_partition_ordinals: vec![Vec::new(); num_partitions], + }) + } + + pub fn need_spill(&mut self) -> bool { + self.can_spill && self.memory_settings.check_spill() + } + + pub fn out_of_memory_limit(&mut self) -> bool { + self.memory_settings.check_spill() + } + + pub fn is_empty(&self) -> bool { + self.next_to_restore_partition_id + 1 >= self.num_partitions as isize + } + + pub fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { + if data_block.is_empty() { + return; + } + self.partition_buffer + .add_data_block(partition_id, data_block); + self.can_spill = true; + } + + pub async fn spill(&mut self) -> Result<()> { + let spill_unit_size = self.memory_settings.spill_unit_size; + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + let next_to_restore_partition_id = (self.next_to_restore_partition_id + 1).max(0) as usize; + + let mut preferred_partition: Option<(usize, usize)> = None; + for partition_id in (next_to_restore_partition_id..self.num_partitions).rev() { + if self.partition_buffer.is_partition_empty(partition_id) { + continue; + } + let partition_size = self.partition_buffer.partition_memory_size(partition_id); + if partition_size > spill_unit_size + && preferred_partition + .as_ref() + .map(|(_, size)| partition_size > *size) + .unwrap_or(true) + { + 
preferred_partition = Some((partition_id, partition_size)); + } + } + + if let Some((partition_id, _)) = preferred_partition { + if let Some(blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? + { + let ordinal = self.spill.spill(blocks).await?; + self.spilled_partition_ordinals[partition_id].push(ordinal); + return Ok(()); + } + } + + let mut partitions: Vec<(usize, usize)> = (next_to_restore_partition_id + ..self.num_partitions) + .filter_map(|partition_id| { + if self.partition_buffer.is_partition_empty(partition_id) { + None + } else { + Some(( + partition_id, + self.partition_buffer.partition_memory_size(partition_id), + )) + } + }) + .collect(); + + if partitions.is_empty() { + self.can_spill = false; + return Ok(()); + } + + partitions.sort_by(|a, b| b.1.cmp(&a.1)); + + let mut spilled_any = false; + let mut spilled_bytes = 0; + for (partition_id, partition_size) in partitions { + if let Some(blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? + { + let ordinal = self.spill.spill(blocks).await?; + self.spilled_partition_ordinals[partition_id].push(ordinal); + spilled_any = true; + spilled_bytes += partition_size; + } + + if spilled_bytes >= spill_unit_size { + break; + } + } + + if !spilled_any { + self.can_spill = false; + } + Ok(()) + } + + pub async fn restore(&mut self) -> Result> { + while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { + self.next_to_restore_partition_id += 1; + let partition_id = self.next_to_restore_partition_id as usize; + + let ordinals = std::mem::take(&mut self.spilled_partition_ordinals[partition_id]); + let mut result = if ordinals.is_empty() { + Vec::new() + } else { + self.spill.restore(ordinals).await? + }; + + if !self.partition_buffer.is_partition_empty(partition_id) { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? 
+ { + result.extend(self.concat_data_blocks(blocks)?); + } + } + + if !result.is_empty() { + return Ok(result); + } + } + + Ok(vec![]) + } + + fn concat_data_blocks(&self, data_blocks: Vec) -> Result> { + let mut num_rows = 0; + let mut result = Vec::new(); + let mut current_blocks = Vec::new(); + + for data_block in data_blocks.into_iter() { + num_rows += data_block.num_rows(); + current_blocks.push(data_block); + if num_rows >= self.sort_block_size { + result.push(DataBlock::concat(¤t_blocks)?); + num_rows = 0; + current_blocks.clear(); + } + } + + if !current_blocks.is_empty() { + result.push(DataBlock::concat(¤t_blocks)?); + } + + Ok(result) + } +} From 646bf5a15eecc5d5ff7e4e05c45e49cd6e9612c8 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 19:07:45 +0800 Subject: [PATCH 23/46] x --- src/query/service/src/spillers/union_file.rs | 72 +++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index dea8f168048dd..76bfb173ad515 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -14,6 +14,7 @@ use std::future; use std::io; +use std::io::Cursor; use std::io::Write; use std::sync::Arc; @@ -35,6 +36,7 @@ use futures::future::FutureExt; use opendal::Reader; use parquet::arrow::arrow_reader::ArrowReaderBuilder; use parquet::arrow::arrow_reader::ArrowReaderOptions; +use parquet::arrow::arrow_reader::ParquetRecordBatchReader; use parquet::arrow::arrow_writer::compute_leaves; use parquet::arrow::arrow_writer::get_column_writers; use parquet::arrow::arrow_writer::ArrowColumnWriter; @@ -56,23 +58,33 @@ use super::SpillerInner; pub struct RowGroupWriter { schema: Arc, + props: WriterPropertiesPtr, writers: Vec, + num_rows: usize, } impl RowGroupWriter { fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: &SchemaDescriptor) -> Self { let writers = get_column_writers(parquet, props, 
&schema).unwrap(); - Self { schema, writers } + Self { + schema, + props: props.clone(), + writers, + num_rows: 0, + } } pub(super) fn write(&mut self, block: DataBlock) -> errors::Result<()> { + let num_rows = block.num_rows(); + let columns = block.take_columns(); let mut writer_iter = self.writers.iter_mut(); - for (field, entry) in self.schema.fields().iter().zip(block.take_columns()) { + for (field, entry) in self.schema.fields().iter().zip(columns) { let array = (&entry.to_column()).into(); for col in compute_leaves(field, &array).unwrap() { writer_iter.next().unwrap().write(&col)?; } } + self.num_rows += num_rows; Ok(()) } @@ -89,6 +101,62 @@ impl RowGroupWriter { pub fn memory_size(&self) -> usize { self.writers.iter().map(|w| w.memory_size()).sum() } + + pub(super) fn into_block(self) -> Result { + let RowGroupWriter { + schema, + props, + writers, + num_rows, + } = self; + + let data_schema = DataSchema::try_from(schema.as_ref())?; + if num_rows == 0 { + return Ok(DataBlock::empty_with_schema(Arc::new(data_schema))); + } + + let parquet_schema = ArrowSchemaConverter::new() + .with_coerce_types(props.coerce_types()) + .convert(&schema)?; + + let mut file_writer = SerializedFileWriter::new( + Cursor::new(Vec::new()), + parquet_schema.root_schema_ptr(), + props.clone(), + )?; + + { + let mut row_group_writer = file_writer.next_row_group()?; + for writer in writers { + writer.close()?.append_to_row_group(&mut row_group_writer)?; + } + row_group_writer.close()?; + } + + let cursor = file_writer.into_inner()?; + let parquet_bytes = bytes::Bytes::from(cursor.into_inner()); + + let mut reader = ParquetRecordBatchReader::try_new(parquet_bytes, usize::MAX)?; + let mut blocks = Vec::new(); + while let Some(batch) = reader.next() { + let batch = batch?; + let (block, _) = DataBlock::from_record_batch(&data_schema, &batch)?; + blocks.push(block); + } + + if blocks.is_empty() { + return Ok(DataBlock::empty_with_schema(Arc::new(data_schema))); + } + + let block = if 
blocks.len() == 1 { + blocks.into_iter().next().unwrap() + } else { + DataBlock::concat(&blocks)? + }; + + debug_assert_eq!(block.num_rows(), num_rows); + Ok(block) + } } pub struct FileWriter { From cd58dcb5348059c77800a2c6187117180f4aebbd Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 20:57:55 +0800 Subject: [PATCH 24/46] x --- src/query/expression/src/types/array.rs | 2 +- src/query/service/src/spillers/union_file.rs | 87 ++++++++++++++------ 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/src/query/expression/src/types/array.rs b/src/query/expression/src/types/array.rs index 6586103755b14..cdf459dee9897 100755 --- a/src/query/expression/src/types/array.rs +++ b/src/query/expression/src/types/array.rs @@ -324,7 +324,7 @@ impl ArrayColumn { impl ArrayColumn { pub fn upcast(self, data_type: &DataType) -> ArrayColumn { - let values_type = data_type.as_array().unwrap(); + let values_type = data_type.as_array().expect("must array type"); ArrayColumn { values: T::upcast_column_with_type(self.values, values_type), offsets: self.offsets, diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 76bfb173ad515..084cf1a9aba57 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -60,7 +60,6 @@ pub struct RowGroupWriter { schema: Arc, props: WriterPropertiesPtr, writers: Vec, - num_rows: usize, } impl RowGroupWriter { @@ -70,12 +69,10 @@ impl RowGroupWriter { schema, props: props.clone(), writers, - num_rows: 0, } } pub(super) fn write(&mut self, block: DataBlock) -> errors::Result<()> { - let num_rows = block.num_rows(); let columns = block.take_columns(); let mut writer_iter = self.writers.iter_mut(); for (field, entry) in self.schema.fields().iter().zip(columns) { @@ -84,7 +81,6 @@ impl RowGroupWriter { writer_iter.next().unwrap().write(&col)?; } } - self.num_rows += num_rows; Ok(()) } @@ -102,47 +98,40 @@ impl RowGroupWriter { 
self.writers.iter().map(|w| w.memory_size()).sum() } - pub(super) fn into_block(self) -> Result { + pub fn into_block(self) -> Result { let RowGroupWriter { schema, props, writers, - num_rows, } = self; let data_schema = DataSchema::try_from(schema.as_ref())?; - if num_rows == 0 { - return Ok(DataBlock::empty_with_schema(Arc::new(data_schema))); - } - let parquet_schema = ArrowSchemaConverter::new() .with_coerce_types(props.coerce_types()) .convert(&schema)?; let mut file_writer = SerializedFileWriter::new( + // todo: find a nocopy way Cursor::new(Vec::new()), parquet_schema.root_schema_ptr(), props.clone(), )?; - { - let mut row_group_writer = file_writer.next_row_group()?; - for writer in writers { - writer.close()?.append_to_row_group(&mut row_group_writer)?; - } - row_group_writer.close()?; + let mut row_group_writer = file_writer.next_row_group()?; + for writer in writers { + writer.close()?.append_to_row_group(&mut row_group_writer)?; } + row_group_writer.close()?; let cursor = file_writer.into_inner()?; let parquet_bytes = bytes::Bytes::from(cursor.into_inner()); let mut reader = ParquetRecordBatchReader::try_new(parquet_bytes, usize::MAX)?; - let mut blocks = Vec::new(); - while let Some(batch) = reader.next() { - let batch = batch?; - let (block, _) = DataBlock::from_record_batch(&data_schema, &batch)?; - blocks.push(block); - } + + let blocks = reader + .into_iter() + .map(|batch| DataBlock::from_record_batch(&data_schema, &batch?)?.0) + .collect::>>()?; if blocks.is_empty() { return Ok(DataBlock::empty_with_schema(Arc::new(data_schema))); @@ -154,7 +143,6 @@ impl RowGroupWriter { DataBlock::concat(&blocks)? 
}; - debug_assert_eq!(block.num_rows(), num_rows); Ok(block) } } @@ -553,11 +541,17 @@ where mod tests { use databend_common_base::runtime::GlobalIORuntime; use databend_common_exception::Result; + use databend_common_expression::types::array::ArrayColumnBuilder; + use databend_common_expression::types::number::Int32Type; + use databend_common_expression::types::ArgType; + use databend_common_expression::types::DataType; use databend_common_expression::types::StringType; use databend_common_expression::types::UInt64Type; + use databend_common_expression::Column; use databend_common_expression::FromData; use databend_common_storage::DataOperator; use parquet::file::properties::WriterProperties; + use parquet::file::properties::WriterPropertiesPtr; use super::*; use crate::spillers::async_buffer::BufferPool; @@ -619,4 +613,51 @@ mod tests { Ok(()) } + + #[test] + fn test_row_group_writer_restores() -> Result<()> { + let mut array_builder = ArrayColumnBuilder::::with_capacity(3, 3, &[]); + { + let mut arrays = array_builder.as_mut(); + arrays.put_item(1); + arrays.put_item(2); + arrays.commit_row(); + + arrays.put_item(3); + arrays.commit_row(); + + arrays.push_default(); + } + let array_column = Column::Array(Box::new( + array_builder + .build() + .upcast(&DataType::Array(Int32Type::data_type().into())), + )); + + let block = DataBlock::new_from_columns(vec![ + StringType::from_data(vec!["alpha", "beta", "gamma"]), + array_column, + StringType::from_opt_data(vec![Some("nullable"), None, Some("value")]), + ]); + + let data_schema = block.infer_schema(); + + let props: WriterPropertiesPtr = WriterProperties::default().into(); + let file_writer = FileWriter::new(props.clone(), &data_schema, Vec::::new())?; + let mut row_group = file_writer.new_row_group(); + + row_group.write(block.clone())?; + row_group.write(block.clone())?; + let restored = row_group.into_block()?; + + for (a, b) in DataBlock::concat(&[block.clone(), block])? 
+ .columns() + .iter() + .zip(restored.columns()) + { + assert_eq!(a, b); + } + + Ok(()) + } } From 8882d130ece65eca8c821c560e63f44e1611cde0 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 29 Sep 2025 21:31:23 +0800 Subject: [PATCH 25/46] x --- .../transforms/hash_join/hash_join_spiller.rs | 7 +- .../partition/window_partition_buffer.rs | 8 +- .../partition/window_partition_buffer_v2.rs | 87 +++++-------------- .../service/src/spillers/partition_buffer.rs | 7 +- 4 files changed, 35 insertions(+), 74 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs index 37dc06bde0233..13cfa1ed55a53 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs @@ -270,7 +270,7 @@ impl HashJoinSpiller { .add_data_block(partition_id, data_block); if let Some(data_blocks) = self .partition_buffer - .fetch_data_blocks(partition_id, &fetch_option)? 
+ .fetch_data_blocks(partition_id, &fetch_option) { self.spiller .spill_with_partition(partition_id, data_blocks) @@ -341,8 +341,9 @@ impl HashJoinSpiller { PartitionBufferFetchOption::ReadPartition }; - self.partition_buffer - .fetch_data_blocks(partition_id, &option) + Ok(self + .partition_buffer + .fetch_data_blocks(partition_id, &option)) } fn partition_data_block( diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index 81bdadbab0b61..4cdcb8df93307 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -93,7 +93,7 @@ impl WindowPartitionBuffer { { if let Some(data_blocks) = self .partition_buffer - .fetch_data_blocks(partition_id, &option)? + .fetch_data_blocks(partition_id, &option) { return self .spiller @@ -112,7 +112,7 @@ impl WindowPartitionBuffer { self.partition_buffer.partition_memory_size(partition_id); if let Some(data_blocks) = self .partition_buffer - .fetch_data_blocks(partition_id, &option)? + .fetch_data_blocks(partition_id, &option) { partitions_to_spill.push((partition_id, data_blocks)); accumulated_bytes += partition_memory_size; @@ -190,7 +190,7 @@ impl WindowPartitionBuffer { let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); if let Some(data_blocks) = self .partition_buffer - .fetch_data_blocks(partition_id, &option)? + .fetch_data_blocks(partition_id, &option) { result.extend(self.concat_data_blocks(data_blocks)?); } @@ -203,7 +203,7 @@ impl WindowPartitionBuffer { let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); if let Some(data_blocks) = self .restored_partition_buffer - .fetch_data_blocks(partition_id, &option)? 
+ .fetch_data_blocks(partition_id, &option) { result.extend(self.concat_data_blocks(data_blocks)?); } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 5343f87ac9f91..bae00eb1534cc 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -68,76 +68,40 @@ impl WindowPartitionBufferV2 { pub async fn spill(&mut self) -> Result<()> { let spill_unit_size = self.memory_settings.spill_unit_size; - let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); - let next_to_restore_partition_id = (self.next_to_restore_partition_id + 1).max(0) as usize; + let next_to_restore_partition_id = (self.next_to_restore_partition_id + 1) as usize; let mut preferred_partition: Option<(usize, usize)> = None; for partition_id in (next_to_restore_partition_id..self.num_partitions).rev() { if self.partition_buffer.is_partition_empty(partition_id) { continue; } - let partition_size = self.partition_buffer.partition_memory_size(partition_id); - if partition_size > spill_unit_size - && preferred_partition - .as_ref() - .map(|(_, size)| partition_size > *size) - .unwrap_or(true) - { - preferred_partition = Some((partition_id, partition_size)); - } - } - - if let Some((partition_id, _)) = preferred_partition { - if let Some(blocks) = self - .partition_buffer - .fetch_data_blocks(partition_id, &option)? 
- { + if let Some(blocks) = self.partition_buffer.fetch_data_blocks( + partition_id, + &PartitionBufferFetchOption::PickPartitionWithThreshold(spill_unit_size), + ) { let ordinal = self.spill.spill(blocks).await?; self.spilled_partition_ordinals[partition_id].push(ordinal); return Ok(()); } - } - - let mut partitions: Vec<(usize, usize)> = (next_to_restore_partition_id - ..self.num_partitions) - .filter_map(|partition_id| { - if self.partition_buffer.is_partition_empty(partition_id) { - None - } else { - Some(( - partition_id, - self.partition_buffer.partition_memory_size(partition_id), - )) - } - }) - .collect(); - - if partitions.is_empty() { - self.can_spill = false; - return Ok(()); - } - partitions.sort_by(|a, b| b.1.cmp(&a.1)); - - let mut spilled_any = false; - let mut spilled_bytes = 0; - for (partition_id, partition_size) in partitions { - if let Some(blocks) = self - .partition_buffer - .fetch_data_blocks(partition_id, &option)? + let partition_size = self.partition_buffer.partition_memory_size(partition_id); + if preferred_partition + .as_ref() + .map(|(_, size)| partition_size > *size) + .unwrap_or(true) { - let ordinal = self.spill.spill(blocks).await?; - self.spilled_partition_ordinals[partition_id].push(ordinal); - spilled_any = true; - spilled_bytes += partition_size; - } - - if spilled_bytes >= spill_unit_size { - break; + preferred_partition = Some((partition_id, partition_size)); } } - if !spilled_any { + if let Some((partition_id, _)) = preferred_partition { + let blocks = self + .partition_buffer + .fetch_data_blocks(partition_id, &PartitionBufferFetchOption::ReadPartition) + .unwrap(); + let ordinal = self.spill.spill(blocks).await?; + self.spilled_partition_ordinals[partition_id].push(ordinal); + } else { self.can_spill = false; } Ok(()) @@ -155,14 +119,11 @@ impl WindowPartitionBufferV2 { self.spill.restore(ordinals).await? 
}; - if !self.partition_buffer.is_partition_empty(partition_id) { - let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); - if let Some(blocks) = self - .partition_buffer - .fetch_data_blocks(partition_id, &option)? - { - result.extend(self.concat_data_blocks(blocks)?); - } + if let Some(blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &PartitionBufferFetchOption::ReadPartition) + { + result.extend(self.concat_data_blocks(blocks)?); } if !result.is_empty() { diff --git a/src/query/service/src/spillers/partition_buffer.rs b/src/query/service/src/spillers/partition_buffer.rs index 2ff44869d64c9..8fbb4869a402d 100644 --- a/src/query/service/src/spillers/partition_buffer.rs +++ b/src/query/service/src/spillers/partition_buffer.rs @@ -52,8 +52,8 @@ impl PartitionBuffer { &mut self, partition_id: usize, option: &PartitionBufferFetchOption, - ) -> Result>> { - let data_blocks = match option { + ) -> Option> { + match option { PartitionBufferFetchOption::ReadPartition => { if !self.partition_data[partition_id].is_empty() { Some(self.partition_data[partition_id].clone()) @@ -71,8 +71,7 @@ impl PartitionBuffer { None } } - }; - Ok(data_blocks) + } } pub fn memory_size(&self) -> usize { From 7289d46b567c6ca0d34feb19ab34134ff7733d2a Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 08:57:17 +0800 Subject: [PATCH 26/46] x --- .../partition/window_partition_buffer.rs | 44 +++++++++---------- .../partition/window_partition_buffer_v2.rs | 42 ++++++------------ 2 files changed, 34 insertions(+), 52 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index 4cdcb8df93307..a38a26dbc5cea 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -192,7 +192,7 @@ impl WindowPartitionBuffer { .partition_buffer .fetch_data_blocks(partition_id, &option) { - result.extend(self.concat_data_blocks(data_blocks)?); + result.extend(concat_data_blocks(data_blocks, self.sort_block_size)?); } } @@ -205,7 +205,7 @@ impl WindowPartitionBuffer { .restored_partition_buffer .fetch_data_blocks(partition_id, &option) { - result.extend(self.concat_data_blocks(data_blocks)?); + result.extend(concat_data_blocks(data_blocks, self.sort_block_size)?); } } @@ -215,31 +215,29 @@ impl WindowPartitionBuffer { } Ok(vec![]) } +} - fn concat_data_blocks(&self, data_blocks: Vec) -> Result> { - let mut num_rows = 0; - let mut result = Vec::new(); - let mut current_blocks = Vec::new(); - - for data_block in data_blocks.into_iter() { - num_rows += data_block.num_rows(); - current_blocks.push(data_block); - if num_rows >= self.sort_block_size { - result.push(DataBlock::concat(¤t_blocks)?); - num_rows = 0; - current_blocks.clear(); - } - } - - if !current_blocks.is_empty() { +pub(super) fn concat_data_blocks( + data_blocks: Vec, + target_size: usize, +) -> Result> { + let mut num_rows = 0; + let mut result = Vec::new(); + let mut current_blocks = Vec::new(); + + for data_block in data_blocks.into_iter() { + num_rows += data_block.num_rows(); + current_blocks.push(data_block); + if num_rows >= target_size { result.push(DataBlock::concat(¤t_blocks)?); + num_rows = 0; + current_blocks.clear(); } + } - Ok(result) + if !current_blocks.is_empty() { + result.push(DataBlock::concat(¤t_blocks)?); } -} -trait Spill { - async fn spill(blocks: Vec) -> Result; - async fn restore(ordinal: Vec) -> Result>; + Ok(result) } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 
bae00eb1534cc..3b448af9f935d 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -1,22 +1,21 @@ -use async_trait::async_trait; use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_pipeline_transforms::MemorySettings; +use super::concat_data_blocks; use crate::spillers::PartitionBuffer; use crate::spillers::PartitionBufferFetchOption; -#[async_trait] +#[async_trait::async_trait] pub trait Spill: Send + Sync { async fn spill(&mut self, blocks: Vec) -> Result; async fn restore(&mut self, ordinals: Vec) -> Result>; } -/// Alternate window partition buffer that delegates spilling through a `Spill` -/// abstraction and tracks spilled partitions by ordinal. pub struct WindowPartitionBufferV2 { spill: S, memory_settings: MemorySettings, + min_spill_size: usize, partition_buffer: PartitionBuffer, num_partitions: usize, sort_block_size: usize, @@ -36,6 +35,7 @@ impl WindowPartitionBufferV2 { Ok(Self { spill, memory_settings, + min_spill_size: 1024 * 1024, partition_buffer, num_partitions, sort_block_size, @@ -63,7 +63,11 @@ impl WindowPartitionBufferV2 { } self.partition_buffer .add_data_block(partition_id, data_block); - self.can_spill = true; + if !self.can_spill + && self.partition_buffer.partition_memory_size(partition_id) >= self.min_spill_size + { + self.can_spill = true; + } } pub async fn spill(&mut self) -> Result<()> { @@ -94,7 +98,9 @@ impl WindowPartitionBufferV2 { } } - if let Some((partition_id, _)) = preferred_partition { + if let Some((partition_id, size)) = preferred_partition + && size >= self.min_spill_size + { let blocks = self .partition_buffer .fetch_data_blocks(partition_id, &PartitionBufferFetchOption::ReadPartition) @@ -123,7 +129,7 @@ impl WindowPartitionBufferV2 { .partition_buffer .fetch_data_blocks(partition_id, 
&PartitionBufferFetchOption::ReadPartition) { - result.extend(self.concat_data_blocks(blocks)?); + result.extend(concat_data_blocks(blocks, self.sort_block_size)?); } if !result.is_empty() { @@ -133,26 +139,4 @@ impl WindowPartitionBufferV2 { Ok(vec![]) } - - fn concat_data_blocks(&self, data_blocks: Vec) -> Result> { - let mut num_rows = 0; - let mut result = Vec::new(); - let mut current_blocks = Vec::new(); - - for data_block in data_blocks.into_iter() { - num_rows += data_block.num_rows(); - current_blocks.push(data_block); - if num_rows >= self.sort_block_size { - result.push(DataBlock::concat(¤t_blocks)?); - num_rows = 0; - current_blocks.clear(); - } - } - - if !current_blocks.is_empty() { - result.push(DataBlock::concat(¤t_blocks)?); - } - - Ok(result) - } } From 2a611d31e00d0e94e0354524824cdcbcafa4adee Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 09:06:00 +0800 Subject: [PATCH 27/46] x --- src/query/service/src/spillers/adapter.rs | 2 +- src/query/service/src/spillers/union_file.rs | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 163d2213b4d73..d149cad78ee54 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -364,7 +364,7 @@ impl SpillWriter { pub fn spill(&mut self, blocks: Vec) -> Result { let mut row_group = self.file.new_row_group(); for block in blocks { - row_group.write(block)?; + row_group.add(block)?; } Ok(self.file.flush_row_group(row_group)?) 
} diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 084cf1a9aba57..46c6c6f4dc29e 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -56,13 +56,13 @@ use super::async_buffer::BufferWriter; use super::SpillAdapter; use super::SpillerInner; -pub struct RowGroupWriter { +pub struct RowGroupEncoder { schema: Arc, props: WriterPropertiesPtr, writers: Vec, } -impl RowGroupWriter { +impl RowGroupEncoder { fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: &SchemaDescriptor) -> Self { let writers = get_column_writers(parquet, props, &schema).unwrap(); Self { @@ -72,7 +72,7 @@ impl RowGroupWriter { } } - pub(super) fn write(&mut self, block: DataBlock) -> errors::Result<()> { + pub fn add(&mut self, block: DataBlock) -> errors::Result<()> { let columns = block.take_columns(); let mut writer_iter = self.writers.iter_mut(); for (field, entry) in self.schema.fields().iter().zip(columns) { @@ -99,7 +99,7 @@ impl RowGroupWriter { } pub fn into_block(self) -> Result { - let RowGroupWriter { + let RowGroupEncoder { schema, props, writers, @@ -169,8 +169,8 @@ impl FileWriter { }) } - pub(super) fn new_row_group(&self) -> RowGroupWriter { - RowGroupWriter::new( + pub(super) fn new_row_group(&self) -> RowGroupEncoder { + RowGroupEncoder::new( self.writer.properties(), self.schema.clone(), self.writer.schema_descr(), @@ -179,7 +179,7 @@ impl FileWriter { pub(super) fn flush_row_group( &mut self, - row_group: RowGroupWriter, + row_group: RowGroupEncoder, ) -> errors::Result { let mut row_group_writer = self.writer.next_row_group()?; row_group.close(&mut row_group_writer)?; @@ -191,7 +191,7 @@ impl FileWriter { pub fn spill(&mut self, blocks: Vec) -> Result { let mut row_group = self.new_row_group(); for block in blocks { - row_group.write(block)?; + row_group.add(block)?; } Ok(self.flush_row_group(row_group)?) 
@@ -646,8 +646,8 @@ mod tests { let file_writer = FileWriter::new(props.clone(), &data_schema, Vec::::new())?; let mut row_group = file_writer.new_row_group(); - row_group.write(block.clone())?; - row_group.write(block.clone())?; + row_group.add(block.clone())?; + row_group.add(block.clone())?; let restored = row_group.into_block()?; for (a, b) in DataBlock::concat(&[block.clone(), block])? From 9b87c960db9e4d320f7632e301ad5ae2cf5f466c Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 10:01:03 +0800 Subject: [PATCH 28/46] x --- .../partition/window_partition_buffer_v2.rs | 144 ++++++++++++++++-- .../service/src/spillers/partition_buffer.rs | 1 - src/query/service/src/spillers/union_file.rs | 6 +- 3 files changed, 136 insertions(+), 15 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 3b448af9f935d..4644730dc143c 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -1,3 +1,5 @@ +use std::future::Future; + use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_pipeline_transforms::MemorySettings; @@ -7,13 +9,52 @@ use crate::spillers::PartitionBuffer; use crate::spillers::PartitionBufferFetchOption; #[async_trait::async_trait] -pub trait Spill: Send + Sync { - async fn spill(&mut self, blocks: Vec) -> Result; +pub trait SpillReader: Send { async fn restore(&mut self, ordinals: Vec) -> Result>; } -pub struct WindowPartitionBufferV2 { - spill: S, +#[async_trait::async_trait] +pub trait SpillWriter: Send { + type Reader: SpillReader; + + async fn spill(&mut self, blocks: Vec) -> Result; + + async fn close(self) -> Result; +} + +#[async_trait::async_trait] +pub 
trait SpillBuilder: Send + Sync { + async fn create(&self, partition_id: usize) -> Result; +} + +#[async_trait::async_trait] +impl SpillBuilder for F +where + W: SpillWriter, + F: Fn(usize) -> Fut + Send + Sync, + Fut: Future> + Send, +{ + async fn create(&self, partition_id: usize) -> Result { + (self)(partition_id).await + } +} + +#[derive(Default)] +enum PartitionSpillState { + #[default] + Empty, + Writing(W), + Reading(R), +} + +pub struct WindowPartitionBufferV2 +where + W: SpillWriter, + W::Reader: SpillReader, + B: SpillBuilder, +{ + spill_builder: B, + partition_spills: Vec>, memory_settings: MemorySettings, min_spill_size: usize, partition_buffer: PartitionBuffer, @@ -24,16 +65,25 @@ pub struct WindowPartitionBufferV2 { spilled_partition_ordinals: Vec>, } -impl WindowPartitionBufferV2 { +impl WindowPartitionBufferV2 +where + W: SpillWriter, + W::Reader: SpillReader, + B: SpillBuilder, +{ pub fn new( - spill: S, + spill_builder: B, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, ) -> Result { let partition_buffer = PartitionBuffer::create(num_partitions); + let partition_spills = (0..num_partitions) + .map(|_| PartitionSpillState::default) + .collect(); Ok(Self { - spill, + spill_builder, + partition_spills, memory_settings, min_spill_size: 1024 * 1024, partition_buffer, @@ -83,7 +133,11 @@ impl WindowPartitionBufferV2 { partition_id, &PartitionBufferFetchOption::PickPartitionWithThreshold(spill_unit_size), ) { - let ordinal = self.spill.spill(blocks).await?; + self.ensure_partition_writer(partition_id).await?; + let writer = self + .partition_writer_mut(partition_id) + .expect("partition writer must exist"); + let ordinal = writer.spill(blocks).await?; self.spilled_partition_ordinals[partition_id].push(ordinal); return Ok(()); } @@ -105,7 +159,11 @@ impl WindowPartitionBufferV2 { .partition_buffer .fetch_data_blocks(partition_id, &PartitionBufferFetchOption::ReadPartition) .unwrap(); - let ordinal = 
self.spill.spill(blocks).await?; + self.ensure_partition_writer(partition_id).await?; + let writer = self + .partition_writer_mut(partition_id) + .expect("partition writer must exist"); + let ordinal = writer.spill(blocks).await?; self.spilled_partition_ordinals[partition_id].push(ordinal); } else { self.can_spill = false; @@ -122,7 +180,11 @@ impl WindowPartitionBufferV2 { let mut result = if ordinals.is_empty() { Vec::new() } else { - self.spill.restore(ordinals).await? + self.close_partition_writer(partition_id).await?; + let reader = self + .partition_reader_mut(partition_id) + .expect("partition reader must exist after closing writer"); + reader.restore(ordinals).await? }; if let Some(blocks) = self @@ -139,4 +201,66 @@ impl WindowPartitionBufferV2 { Ok(vec![]) } + + async fn ensure_partition_writer(&mut self, partition_id: usize) -> Result<()> { + if matches!( + self.partition_spills.get(partition_id), + Some(PartitionSpillState::Empty) + ) { + let writer = self.spill_builder.create(partition_id).await?; + self.partition_spills[partition_id] = PartitionSpillState::Writing(writer); + return Ok(()); + } + + if matches!( + self.partition_spills.get(partition_id), + Some(PartitionSpillState::Reading(_)) + ) { + debug_assert!( + false, + "partition {} spill already closed before new writes", + partition_id + ); + } + Ok(()) + } + + async fn close_partition_writer(&mut self, partition_id: usize) -> Result<()> { + let state = std::mem::replace( + &mut self.partition_spills[partition_id], + PartitionSpillState::Empty, + ); + match state { + PartitionSpillState::Empty => { + debug_assert!( + false, + "closing partition {} without spill writer", + partition_id + ); + self.partition_spills[partition_id] = PartitionSpillState::Empty; + } + PartitionSpillState::Writing(writer) => { + let reader = writer.close().await?; + self.partition_spills[partition_id] = PartitionSpillState::Reading(reader); + } + PartitionSpillState::Reading(reader) => { + 
self.partition_spills[partition_id] = PartitionSpillState::Reading(reader); + } + } + Ok(()) + } + + fn partition_writer_mut(&mut self, partition_id: usize) -> Option<&mut W> { + match self.partition_spills.get_mut(partition_id) { + Some(PartitionSpillState::Writing(writer)) => Some(writer), + _ => None, + } + } + + fn partition_reader_mut(&mut self, partition_id: usize) -> Option<&mut W::Reader> { + match self.partition_spills.get_mut(partition_id) { + Some(PartitionSpillState::Reading(reader)) => Some(reader), + _ => None, + } + } } diff --git a/src/query/service/src/spillers/partition_buffer.rs b/src/query/service/src/spillers/partition_buffer.rs index 8fbb4869a402d..d3c0bc7705339 100644 --- a/src/query/service/src/spillers/partition_buffer.rs +++ b/src/query/service/src/spillers/partition_buffer.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use databend_common_exception::Result; use databend_common_expression::DataBlock; pub enum PartitionBufferFetchOption { diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 46c6c6f4dc29e..2ca50be2ee24b 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -126,11 +126,9 @@ impl RowGroupEncoder { let cursor = file_writer.into_inner()?; let parquet_bytes = bytes::Bytes::from(cursor.into_inner()); - let mut reader = ParquetRecordBatchReader::try_new(parquet_bytes, usize::MAX)?; - + let reader = ParquetRecordBatchReader::try_new(parquet_bytes, usize::MAX)?; let blocks = reader - .into_iter() - .map(|batch| DataBlock::from_record_batch(&data_schema, &batch?)?.0) + .map(|batch| Ok(DataBlock::from_record_batch(&data_schema, &batch?)?.0)) .collect::>>()?; if blocks.is_empty() { From eb8c2b0e20347ecea4717a6c42a42c99651731fc Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 12:12:07 +0800 Subject: [PATCH 29/46] x --- 
.../partition/window_partition_buffer_v2.rs | 280 +++++++++--------- 1 file changed, 148 insertions(+), 132 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 4644730dc143c..e101a48c5fe4b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -1,12 +1,22 @@ -use std::future::Future; +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_pipeline_transforms::MemorySettings; use super::concat_data_blocks; -use crate::spillers::PartitionBuffer; -use crate::spillers::PartitionBufferFetchOption; #[async_trait::async_trait] pub trait SpillReader: Send { @@ -19,57 +29,140 @@ pub trait SpillWriter: Send { async fn spill(&mut self, blocks: Vec) -> Result; - async fn close(self) -> Result; + async fn close(self) -> Result + where Self: Sized; } #[async_trait::async_trait] -pub trait SpillBuilder: Send + Sync { - async fn create(&self, partition_id: usize) -> Result; +pub trait SpillBuilder: Send + Sync { + type Writer: SpillWriter; + + async fn create(&self) -> Result; } -#[async_trait::async_trait] -impl SpillBuilder for F +#[derive(Default)] +enum PartitionSpillState where W: SpillWriter, - F: Fn(usize) -> Fut + Send + Sync, - Fut: Future> + Send, + W::Reader: SpillReader, { - async fn create(&self, partition_id: usize) -> Result { - (self)(partition_id).await - } -} - -#[derive(Default)] -enum PartitionSpillState { #[default] Empty, Writing(W), - Reading(R), + Reading(W::Reader), +} + +struct PartitionSlot +where + W: SpillWriter, + W::Reader: SpillReader, +{ + state: PartitionSpillState, + spilled_ordinals: Vec, + buffered_blocks: Vec, + buffered_size: usize, +} + +impl Default for PartitionSlot +where + W: SpillWriter, + W::Reader: SpillReader, +{ + fn default() -> Self { + Self { + state: PartitionSpillState::Empty, + spilled_ordinals: Vec::new(), + buffered_blocks: Vec::new(), + buffered_size: 0, + } + } } -pub struct WindowPartitionBufferV2 +impl PartitionSlot where W: SpillWriter, W::Reader: SpillReader, - B: SpillBuilder, +{ + fn add_block(&mut self, block: DataBlock) { + self.buffered_size += block.memory_size(); + self.buffered_blocks.push(block); + } + + fn memory_size(&self) -> usize { + self.buffered_size + } + + fn is_empty(&self) -> bool { + self.buffered_blocks.is_empty() + } + + fn 
fetch_blocks(&mut self, threshold: Option) -> Option> { + match threshold { + None => { + if self.buffered_blocks.is_empty() { + None + } else { + Some(self.buffered_blocks.clone()) + } + } + Some(threshold) => { + if self.buffered_size >= threshold { + self.buffered_size = 0; + Some(std::mem::take(&mut self.buffered_blocks)) + } else { + None + } + } + } + } + + async fn writer_mut<'a, B>(&'a mut self, builder: &B) -> Result<&'a mut W> + where B: SpillBuilder { + match &mut self.state { + state @ PartitionSpillState::Empty => { + let writer = builder.create().await?; + let _ = std::mem::replace(state, PartitionSpillState::Writing(writer)); + let PartitionSpillState::Writing(writer) = state else { + unreachable!() + }; + Ok(writer) + } + PartitionSpillState::Writing(writer) => Ok(writer), + PartitionSpillState::Reading(_) => unreachable!("partition already closed"), + } + } + + async fn close_writer(&mut self) -> Result<&mut W::Reader> { + let PartitionSpillState::Writing(writer) = std::mem::take(&mut self.state) else { + unreachable!() + }; + self.state = PartitionSpillState::Reading(writer.close().await?); + let PartitionSpillState::Reading(reader) = &mut self.state else { + unreachable!() + }; + Ok(reader) + } +} + +pub struct WindowPartitionBufferV2 +where + B: SpillBuilder, + ::Reader: SpillReader, { spill_builder: B, - partition_spills: Vec>, + partitions: Vec>, memory_settings: MemorySettings, min_spill_size: usize, - partition_buffer: PartitionBuffer, num_partitions: usize, sort_block_size: usize, can_spill: bool, next_to_restore_partition_id: isize, - spilled_partition_ordinals: Vec>, } -impl WindowPartitionBufferV2 +impl WindowPartitionBufferV2 where - W: SpillWriter, - W::Reader: SpillReader, - B: SpillBuilder, + B: SpillBuilder, + ::Reader: SpillReader, { pub fn new( spill_builder: B, @@ -77,21 +170,19 @@ where sort_block_size: usize, memory_settings: MemorySettings, ) -> Result { - let partition_buffer = PartitionBuffer::create(num_partitions); - let 
partition_spills = (0..num_partitions) - .map(|_| PartitionSpillState::default) - .collect(); + let mut partitions = Vec::with_capacity(num_partitions); + for _ in 0..num_partitions { + partitions.push(PartitionSlot::::default()); + } Ok(Self { spill_builder, - partition_spills, + partitions, memory_settings, min_spill_size: 1024 * 1024, - partition_buffer, num_partitions, sort_block_size, can_spill: false, next_to_restore_partition_id: -1, - spilled_partition_ordinals: vec![Vec::new(); num_partitions], }) } @@ -111,11 +202,9 @@ where if data_block.is_empty() { return; } - self.partition_buffer - .add_data_block(partition_id, data_block); - if !self.can_spill - && self.partition_buffer.partition_memory_size(partition_id) >= self.min_spill_size - { + let partition = &mut self.partitions[partition_id]; + partition.add_block(data_block); + if !self.can_spill && partition.memory_size() >= self.min_spill_size { self.can_spill = true; } } @@ -126,23 +215,20 @@ where let mut preferred_partition: Option<(usize, usize)> = None; for partition_id in (next_to_restore_partition_id..self.num_partitions).rev() { - if self.partition_buffer.is_partition_empty(partition_id) { + let partition = &mut self.partitions[partition_id]; + if partition.is_empty() { continue; } - if let Some(blocks) = self.partition_buffer.fetch_data_blocks( - partition_id, - &PartitionBufferFetchOption::PickPartitionWithThreshold(spill_unit_size), - ) { - self.ensure_partition_writer(partition_id).await?; - let writer = self - .partition_writer_mut(partition_id) - .expect("partition writer must exist"); - let ordinal = writer.spill(blocks).await?; - self.spilled_partition_ordinals[partition_id].push(ordinal); + if let Some(blocks) = partition.fetch_blocks(Some(spill_unit_size)) { + let ordinal = { + let writer = partition.writer_mut(&self.spill_builder).await?; + writer.spill(blocks).await? 
+ }; + partition.spilled_ordinals.push(ordinal); return Ok(()); } - let partition_size = self.partition_buffer.partition_memory_size(partition_id); + let partition_size = partition.memory_size(); if preferred_partition .as_ref() .map(|(_, size)| partition_size > *size) @@ -155,16 +241,13 @@ where if let Some((partition_id, size)) = preferred_partition && size >= self.min_spill_size { - let blocks = self - .partition_buffer - .fetch_data_blocks(partition_id, &PartitionBufferFetchOption::ReadPartition) - .unwrap(); - self.ensure_partition_writer(partition_id).await?; - let writer = self - .partition_writer_mut(partition_id) - .expect("partition writer must exist"); - let ordinal = writer.spill(blocks).await?; - self.spilled_partition_ordinals[partition_id].push(ordinal); + let partition = &mut self.partitions[partition_id]; + let blocks = partition.fetch_blocks(None).unwrap(); + let ordinal = { + let writer = partition.writer_mut(&self.spill_builder).await?; + writer.spill(blocks).await? + }; + partition.spilled_ordinals.push(ordinal); } else { self.can_spill = false; } @@ -175,22 +258,17 @@ where while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { self.next_to_restore_partition_id += 1; let partition_id = self.next_to_restore_partition_id as usize; + let partition = &mut self.partitions[partition_id]; - let ordinals = std::mem::take(&mut self.spilled_partition_ordinals[partition_id]); + let ordinals = std::mem::take(&mut partition.spilled_ordinals); let mut result = if ordinals.is_empty() { Vec::new() } else { - self.close_partition_writer(partition_id).await?; - let reader = self - .partition_reader_mut(partition_id) - .expect("partition reader must exist after closing writer"); + let reader = partition.close_writer().await?; reader.restore(ordinals).await? 
}; - if let Some(blocks) = self - .partition_buffer - .fetch_data_blocks(partition_id, &PartitionBufferFetchOption::ReadPartition) - { + if let Some(blocks) = partition.fetch_blocks(None) { result.extend(concat_data_blocks(blocks, self.sort_block_size)?); } @@ -201,66 +279,4 @@ where Ok(vec![]) } - - async fn ensure_partition_writer(&mut self, partition_id: usize) -> Result<()> { - if matches!( - self.partition_spills.get(partition_id), - Some(PartitionSpillState::Empty) - ) { - let writer = self.spill_builder.create(partition_id).await?; - self.partition_spills[partition_id] = PartitionSpillState::Writing(writer); - return Ok(()); - } - - if matches!( - self.partition_spills.get(partition_id), - Some(PartitionSpillState::Reading(_)) - ) { - debug_assert!( - false, - "partition {} spill already closed before new writes", - partition_id - ); - } - Ok(()) - } - - async fn close_partition_writer(&mut self, partition_id: usize) -> Result<()> { - let state = std::mem::replace( - &mut self.partition_spills[partition_id], - PartitionSpillState::Empty, - ); - match state { - PartitionSpillState::Empty => { - debug_assert!( - false, - "closing partition {} without spill writer", - partition_id - ); - self.partition_spills[partition_id] = PartitionSpillState::Empty; - } - PartitionSpillState::Writing(writer) => { - let reader = writer.close().await?; - self.partition_spills[partition_id] = PartitionSpillState::Reading(reader); - } - PartitionSpillState::Reading(reader) => { - self.partition_spills[partition_id] = PartitionSpillState::Reading(reader); - } - } - Ok(()) - } - - fn partition_writer_mut(&mut self, partition_id: usize) -> Option<&mut W> { - match self.partition_spills.get_mut(partition_id) { - Some(PartitionSpillState::Writing(writer)) => Some(writer), - _ => None, - } - } - - fn partition_reader_mut(&mut self, partition_id: usize) -> Option<&mut W::Reader> { - match self.partition_spills.get_mut(partition_id) { - Some(PartitionSpillState::Reading(reader)) => 
Some(reader), - _ => None, - } - } } From 3619501f9f956d34323c8f00cac326ad3f4b4af1 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 12:43:13 +0800 Subject: [PATCH 30/46] x --- .../window/partition/window_partition_buffer_v2.rs | 3 +-- src/query/service/src/spillers/adapter.rs | 8 +++++++- src/query/service/src/spillers/union_file.rs | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index e101a48c5fe4b..3ec4d59ef496c 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -29,8 +29,7 @@ pub trait SpillWriter: Send { async fn spill(&mut self, blocks: Vec) -> Result; - async fn close(self) -> Result - where Self: Sized; + async fn close(self) -> Result; } #[async_trait::async_trait] diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index d149cad78ee54..1d2831c374398 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -40,6 +40,8 @@ use super::Location; use crate::sessions::QueryContext; use crate::spillers::block_reader::BlocksReader; use crate::spillers::block_writer::BlocksWriter; +use crate::spillers::union_file::FileReader; +use crate::spillers::union_file::UnionFileWriter; pub struct PartitionAdapter { ctx: Arc, @@ -357,7 +359,7 @@ pub struct Chunk { } pub struct SpillWriter { - file: FileWriter>, + file: FileWriter, } impl SpillWriter { @@ -368,6 +370,10 @@ impl SpillWriter { } Ok(self.file.flush_row_group(row_group)?) 
} + + async fn close(self) -> Result<()> { + todo!() + } } pub struct SpillReader {} diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 2ca50be2ee24b..c14aef6734dc5 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -370,7 +370,7 @@ pub struct UnionFile { remote_offset: Option, } -struct FileReader { +pub(super) struct FileReader { meta: Arc, local: Option<(TempPath, AsyncDmaFile)>, remote_reader: Reader, From 6163c172d5906b6b02bc6db758f447406fd0ea96 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 13:39:50 +0800 Subject: [PATCH 31/46] x --- src/query/service/src/spillers/adapter.rs | 208 +++++++++++++++++-- src/query/service/src/spillers/inner.rs | 2 +- src/query/service/src/spillers/union_file.rs | 6 + 3 files changed, 198 insertions(+), 18 deletions(-) diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 1d2831c374398..f169f2cd2beea 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -15,6 +15,7 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::HashSet; +use std::convert::TryFrom; use std::ops::DerefMut; use std::ops::Range; use std::sync::Arc; @@ -24,25 +25,33 @@ use std::time::Instant; use databend_common_base::base::dma_buffer_to_bytes; use databend_common_base::base::dma_read_file_range; use databend_common_base::base::ProgressValues; +use databend_common_base::runtime::GlobalIORuntime; use databend_common_catalog::table_context::TableContext; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::DataSchema; use databend_common_pipeline_transforms::traits::DataBlockSpill; +use databend_storages_common_cache::ParquetMetaData; use databend_storages_common_cache::TempPath; use 
opendal::Buffer; use opendal::Operator; -use parquet::file::metadata::RowGroupMetaDataPtr; +use super::async_buffer::BufferPool; use super::inner::*; use super::serialize::*; use super::union_file::FileWriter; +use super::union_file::UnionFile; +use super::union_file::UnionFileWriter; use super::Location; +use crate::pipelines::processors::transforms::SpillBuilder as WindowSpillBuilder; +use crate::pipelines::processors::transforms::SpillReader as WindowSpillReader; +use crate::pipelines::processors::transforms::SpillWriter as WindowSpillWriter; use crate::sessions::QueryContext; use crate::spillers::block_reader::BlocksReader; use crate::spillers::block_writer::BlocksWriter; -use crate::spillers::union_file::FileReader; -use crate::spillers::union_file::UnionFileWriter; +#[derive(Clone)] pub struct PartitionAdapter { ctx: Arc, // Stores the spilled files that controlled by current spiller @@ -358,29 +367,194 @@ pub struct Chunk { pub layout: Layout, } -pub struct SpillWriter { - file: FileWriter, +const WINDOW_SPILL_BUFFER_MEMORY_BYTES: usize = 64 * 1024 * 1024; +const WINDOW_SPILL_BUFFER_WORKERS: usize = 2; +const WINDOW_SPILL_CHUNK_SIZE: usize = 8 * 1024 * 1024; + +pub struct WindowPartitionSpillWriter { + spiller: Spiller, + buffer_pool: Arc, + dio: bool, + chunk_size: usize, + schema: Option>, + file_writer: Option>, + row_group_ordinals: Vec, +} + +pub struct WindowPartitionSpillReader { + spiller: Spiller, + schema: Arc, + parquet_metadata: Arc, + union_file: Option, + row_group_ordinals: Vec, + dio: bool, } -impl SpillWriter { - pub fn spill(&mut self, blocks: Vec) -> Result { - let mut row_group = self.file.new_row_group(); - for block in blocks { - row_group.add(block)?; +impl WindowPartitionSpillWriter { + async fn ensure_file_writer(&mut self, blocks: &[DataBlock]) -> Result<()> { + if self.file_writer.is_some() { + return Ok(()); } - Ok(self.file.flush_row_group(row_group)?) 
+ + if blocks.is_empty() { + return Err(ErrorCode::Internal( + "window spill writer received empty block batch".to_string(), + )); + } + + if !self.spiller.use_parquet { + return Err(ErrorCode::Internal( + "window spill requires Parquet spill format".to_string(), + )); + } + + let schema = Arc::new(blocks[0].infer_schema()); + let file_writer = self + .spiller + .new_file_writer(&schema, &self.buffer_pool, self.dio, self.chunk_size) + .await?; + self.schema = Some(schema); + self.file_writer = Some(file_writer); + Ok(()) + } +} + +#[async_trait::async_trait] +impl WindowSpillWriter for WindowPartitionSpillWriter { + type Reader = WindowPartitionSpillReader; + + async fn spill(&mut self, blocks: Vec) -> Result { + self.ensure_file_writer(&blocks).await?; + + let file_writer = self.file_writer.as_mut().ok_or_else(|| { + ErrorCode::Internal("window spill writer not initialized".to_string()) + })?; + + let row_group_meta = file_writer.spill(blocks)?; + let row_group_index = row_group_meta + .ordinal() + .map(|value| value as usize) + .unwrap_or(self.row_group_ordinals.len()); + + let ordinal_index = self.row_group_ordinals.len(); + let ordinal = i16::try_from(ordinal_index).map_err(|_| { + ErrorCode::Internal("too many spilled batches for window partition".to_string()) + })?; + + self.row_group_ordinals.push(row_group_index); + Ok(ordinal) + } + + async fn close(self) -> Result<::Reader> { + let Some(file_writer) = self.file_writer else { + return Err(ErrorCode::Internal( + "attempted to close window spill writer without data".to_string(), + )); + }; + + let file_writer = file_writer; + + let schema = self + .schema + .ok_or_else(|| ErrorCode::Internal("missing schema for window spill".to_string()))?; + + let (metadata, union_file) = file_writer.finish()?; + let remote_path = union_file.remote_path().to_string(); + let parquet_metadata = Arc::new(metadata); + + let total_size = parquet_metadata + .row_groups() + .iter() + .map(|rg| rg.compressed_size().max(0) as usize) 
+ .sum(); + + self.spiller.adapter.add_spill_file( + Location::Remote(remote_path), + Layout::Parquet, + total_size, + ); + + Ok(WindowPartitionSpillReader { + spiller: self.spiller, + schema, + parquet_metadata, + union_file: Some(union_file), + row_group_ordinals: self.row_group_ordinals, + dio: self.dio, + }) } +} + +#[async_trait::async_trait] +impl WindowSpillReader for WindowPartitionSpillReader { + async fn restore(&mut self, ordinals: Vec) -> Result> { + if ordinals.is_empty() { + return Ok(Vec::new()); + } - async fn close(self) -> Result<()> { - todo!() + let row_groups = ordinals + .into_iter() + .map(|ordinal| { + if ordinal < 0 { + return Err(ErrorCode::Internal(format!( + "invalid spill ordinal {} for window partition", + ordinal + ))); + } + let index = ordinal as usize; + self.row_group_ordinals.get(index).copied().ok_or_else(|| { + ErrorCode::Internal(format!( + "spill ordinal {} not found for window partition", + ordinal + )) + }) + }) + .collect::>>()?; + + let union_file = self.union_file.take().ok_or_else(|| { + ErrorCode::Internal("window spill reader already consumed".to_string()) + })?; + + self.spiller + .load_row_groups( + union_file, + self.parquet_metadata.clone(), + &self.schema, + row_groups, + self.dio, + ) + .await } } -pub struct SpillReader {} +#[async_trait::async_trait] +impl WindowSpillBuilder for Spiller { + type Writer = WindowPartitionSpillWriter; + + async fn create(&self) -> Result<::Writer> { + if !self.use_parquet { + return Err(ErrorCode::Internal( + "window spill requires Parquet spill format, please set `set global spilling_file_format='parquet'`" + .to_string(), + )); + } -impl SpillReader { - pub fn restore(&self, _ordinal: i16) { - todo!() + let runtime = GlobalIORuntime::instance(); + let buffer_pool = BufferPool::create( + runtime, + WINDOW_SPILL_BUFFER_MEMORY_BYTES, + WINDOW_SPILL_BUFFER_WORKERS, + ); + + Ok(WindowPartitionSpillWriter { + spiller: self.clone(), + buffer_pool, + dio: self.temp_dir.is_some(), + 
chunk_size: WINDOW_SPILL_CHUNK_SIZE, + schema: None, + file_writer: None, + row_group_ordinals: Vec::new(), + }) } } diff --git a/src/query/service/src/spillers/inner.rs b/src/query/service/src/spillers/inner.rs index fcb89e3fe7c87..de180afbbf666 100644 --- a/src/query/service/src/spillers/inner.rs +++ b/src/query/service/src/spillers/inner.rs @@ -107,7 +107,7 @@ pub struct SpillerInner { pub(super) temp_dir: Option>, // for dio disabled pub(super) local_operator: Option, - use_parquet: bool, + pub(super) use_parquet: bool, _spiller_type: SpillerType, } diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index c14aef6734dc5..4bad1aaa3738b 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -370,6 +370,12 @@ pub struct UnionFile { remote_offset: Option, } +impl UnionFile { + pub fn remote_path(&self) -> &str { + &self.remote_path + } +} + pub(super) struct FileReader { meta: Arc, local: Option<(TempPath, AsyncDmaFile)>, From 8310aab2b9ca26a6a686ca67cc9adf963a343e04 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 14:12:29 +0800 Subject: [PATCH 32/46] x --- .../partition/window_partition_buffer_v2.rs | 23 ++-- src/query/service/src/spillers/adapter.rs | 107 ++++-------------- 2 files changed, 38 insertions(+), 92 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 3ec4d59ef496c..a53bea70fb750 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -12,22 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::Arc; + use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::DataSchema; use databend_common_pipeline_transforms::MemorySettings; use super::concat_data_blocks; #[async_trait::async_trait] pub trait SpillReader: Send { - async fn restore(&mut self, ordinals: Vec) -> Result>; + async fn restore(&mut self, ordinals: Vec) -> Result>; } #[async_trait::async_trait] pub trait SpillWriter: Send { type Reader: SpillReader; - async fn spill(&mut self, blocks: Vec) -> Result; + async fn spill(&mut self, blocks: Vec) -> Result; async fn close(self) -> Result; } @@ -36,7 +39,7 @@ pub trait SpillWriter: Send { pub trait SpillBuilder: Send + Sync { type Writer: SpillWriter; - async fn create(&self) -> Result; + async fn create(&self, schema: Arc) -> Result; } #[derive(Default)] @@ -57,7 +60,7 @@ where W::Reader: SpillReader, { state: PartitionSpillState, - spilled_ordinals: Vec, + spilled_ordinals: Vec, buffered_blocks: Vec, buffered_size: usize, } @@ -115,11 +118,11 @@ where } } - async fn writer_mut<'a, B>(&'a mut self, builder: &B) -> Result<&'a mut W> + async fn writer_mut<'a, B>(&'a mut self, builder: &B, block: &DataBlock) -> Result<&'a mut W> where B: SpillBuilder { match &mut self.state { state @ PartitionSpillState::Empty => { - let writer = builder.create().await?; + let writer = builder.create(block.infer_schema().into()).await?; let _ = std::mem::replace(state, PartitionSpillState::Writing(writer)); let PartitionSpillState::Writing(writer) = state else { unreachable!() @@ -220,7 +223,9 @@ where } if let Some(blocks) = partition.fetch_blocks(Some(spill_unit_size)) { let ordinal = { - let writer = partition.writer_mut(&self.spill_builder).await?; + let writer = partition + .writer_mut(&self.spill_builder, &blocks[0]) + .await?; writer.spill(blocks).await? 
}; partition.spilled_ordinals.push(ordinal); @@ -243,7 +248,9 @@ where let partition = &mut self.partitions[partition_id]; let blocks = partition.fetch_blocks(None).unwrap(); let ordinal = { - let writer = partition.writer_mut(&self.spill_builder).await?; + let writer = partition + .writer_mut(&self.spill_builder, &blocks[0]) + .await?; writer.spill(blocks).await? }; partition.spilled_ordinals.push(ordinal); diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index f169f2cd2beea..82bcfcfadeafa 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -15,7 +15,6 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::collections::HashSet; -use std::convert::TryFrom; use std::ops::DerefMut; use std::ops::Range; use std::sync::Arc; @@ -376,9 +375,8 @@ pub struct WindowPartitionSpillWriter { buffer_pool: Arc, dio: bool, chunk_size: usize, - schema: Option>, + schema: Arc, file_writer: Option>, - row_group_ordinals: Vec, } pub struct WindowPartitionSpillReader { @@ -386,78 +384,37 @@ pub struct WindowPartitionSpillReader { schema: Arc, parquet_metadata: Arc, union_file: Option, - row_group_ordinals: Vec, dio: bool, } -impl WindowPartitionSpillWriter { - async fn ensure_file_writer(&mut self, blocks: &[DataBlock]) -> Result<()> { - if self.file_writer.is_some() { - return Ok(()); - } - - if blocks.is_empty() { - return Err(ErrorCode::Internal( - "window spill writer received empty block batch".to_string(), - )); - } - - if !self.spiller.use_parquet { - return Err(ErrorCode::Internal( - "window spill requires Parquet spill format".to_string(), - )); - } - - let schema = Arc::new(blocks[0].infer_schema()); - let file_writer = self - .spiller - .new_file_writer(&schema, &self.buffer_pool, self.dio, self.chunk_size) - .await?; - self.schema = Some(schema); - self.file_writer = Some(file_writer); - Ok(()) - } -} - #[async_trait::async_trait] impl 
WindowSpillWriter for WindowPartitionSpillWriter { type Reader = WindowPartitionSpillReader; - async fn spill(&mut self, blocks: Vec) -> Result { - self.ensure_file_writer(&blocks).await?; - - let file_writer = self.file_writer.as_mut().ok_or_else(|| { - ErrorCode::Internal("window spill writer not initialized".to_string()) - })?; + async fn spill(&mut self, blocks: Vec) -> Result { + let file_writer = match &mut self.file_writer { + Some(file_writer) => file_writer, + file_writer @ None => { + let writer = self + .spiller + .new_file_writer(&self.schema, &self.buffer_pool, self.dio, self.chunk_size) + .await?; + file_writer.insert(writer) + } + }; let row_group_meta = file_writer.spill(blocks)?; - let row_group_index = row_group_meta - .ordinal() - .map(|value| value as usize) - .unwrap_or(self.row_group_ordinals.len()); - - let ordinal_index = self.row_group_ordinals.len(); - let ordinal = i16::try_from(ordinal_index).map_err(|_| { - ErrorCode::Internal("too many spilled batches for window partition".to_string()) - })?; - - self.row_group_ordinals.push(row_group_index); - Ok(ordinal) + let ordinal = row_group_meta.ordinal().unwrap(); + Ok(ordinal as _) } - async fn close(self) -> Result<::Reader> { + async fn close(self) -> Result { let Some(file_writer) = self.file_writer else { return Err(ErrorCode::Internal( "attempted to close window spill writer without data".to_string(), )); }; - let file_writer = file_writer; - - let schema = self - .schema - .ok_or_else(|| ErrorCode::Internal("missing schema for window spill".to_string()))?; - let (metadata, union_file) = file_writer.finish()?; let remote_path = union_file.remote_path().to_string(); let parquet_metadata = Arc::new(metadata); @@ -476,10 +433,9 @@ impl WindowSpillWriter for WindowPartitionSpillWriter { Ok(WindowPartitionSpillReader { spiller: self.spiller, - schema, + schema: self.schema, parquet_metadata, union_file: Some(union_file), - row_group_ordinals: self.row_group_ordinals, dio: self.dio, }) } @@ 
-487,30 +443,11 @@ impl WindowSpillWriter for WindowPartitionSpillWriter { #[async_trait::async_trait] impl WindowSpillReader for WindowPartitionSpillReader { - async fn restore(&mut self, ordinals: Vec) -> Result> { + async fn restore(&mut self, ordinals: Vec) -> Result> { if ordinals.is_empty() { return Ok(Vec::new()); } - let row_groups = ordinals - .into_iter() - .map(|ordinal| { - if ordinal < 0 { - return Err(ErrorCode::Internal(format!( - "invalid spill ordinal {} for window partition", - ordinal - ))); - } - let index = ordinal as usize; - self.row_group_ordinals.get(index).copied().ok_or_else(|| { - ErrorCode::Internal(format!( - "spill ordinal {} not found for window partition", - ordinal - )) - }) - }) - .collect::>>()?; - let union_file = self.union_file.take().ok_or_else(|| { ErrorCode::Internal("window spill reader already consumed".to_string()) })?; @@ -520,7 +457,7 @@ impl WindowSpillReader for WindowPartitionSpillReader { union_file, self.parquet_metadata.clone(), &self.schema, - row_groups, + ordinals, self.dio, ) .await @@ -531,7 +468,10 @@ impl WindowSpillReader for WindowPartitionSpillReader { impl WindowSpillBuilder for Spiller { type Writer = WindowPartitionSpillWriter; - async fn create(&self) -> Result<::Writer> { + async fn create( + &self, + schema: Arc, + ) -> Result<::Writer> { if !self.use_parquet { return Err(ErrorCode::Internal( "window spill requires Parquet spill format, please set `set global spilling_file_format='parquet'`" @@ -551,9 +491,8 @@ impl WindowSpillBuilder for Spiller { buffer_pool, dio: self.temp_dir.is_some(), chunk_size: WINDOW_SPILL_CHUNK_SIZE, - schema: None, + schema, file_writer: None, - row_group_ordinals: Vec::new(), }) } } From b38fb3f9737590272f3dfb54d032ed07edfdc183 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 14:38:53 +0800 Subject: [PATCH 33/46] x --- .../partition/window_partition_buffer_v2.rs | 88 +++++++++++------ src/query/service/src/spillers/adapter.rs | 95 ++++++++----------- 2 
files changed, 97 insertions(+), 86 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index a53bea70fb750..c2303355723eb 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -20,15 +20,18 @@ use databend_common_expression::DataSchema; use databend_common_pipeline_transforms::MemorySettings; use super::concat_data_blocks; +use crate::spillers::SpillReader; +use crate::spillers::SpillWriter; +use crate::spillers::Spiller; #[async_trait::async_trait] -pub trait SpillReader: Send { +trait Reader: Send { async fn restore(&mut self, ordinals: Vec) -> Result>; } #[async_trait::async_trait] -pub trait SpillWriter: Send { - type Reader: SpillReader; +trait Writer: Send { + type Reader: Reader; async fn spill(&mut self, blocks: Vec) -> Result; @@ -36,17 +39,46 @@ pub trait SpillWriter: Send { } #[async_trait::async_trait] -pub trait SpillBuilder: Send + Sync { - type Writer: SpillWriter; +trait Builder: Send + Sync { + type Writer: Writer; async fn create(&self, schema: Arc) -> Result; } +#[async_trait::async_trait] +impl Builder for Spiller { + type Writer = SpillWriter; + + async fn create(&self, schema: Arc) -> Result { + self.new_spill_writer(schema) + } +} + +#[async_trait::async_trait] +impl Writer for SpillWriter { + type Reader = SpillReader; + + async fn spill(&mut self, blocks: Vec) -> Result { + self.spill(blocks).await + } + + async fn close(self) -> Result { + self.close() + } +} + +#[async_trait::async_trait] +impl Reader for SpillReader { + async fn restore(&mut self, ordinals: Vec) -> Result> { + self.restore(ordinals).await + } +} + #[derive(Default)] enum PartitionSpillState where - W: SpillWriter, - W::Reader: 
SpillReader, + W: Writer, + W::Reader: Reader, { #[default] Empty, @@ -56,8 +88,8 @@ where struct PartitionSlot where - W: SpillWriter, - W::Reader: SpillReader, + W: Writer, + W::Reader: Reader, { state: PartitionSpillState, spilled_ordinals: Vec, @@ -67,8 +99,8 @@ where impl Default for PartitionSlot where - W: SpillWriter, - W::Reader: SpillReader, + W: Writer, + W::Reader: Reader, { fn default() -> Self { Self { @@ -82,8 +114,8 @@ where impl PartitionSlot where - W: SpillWriter, - W::Reader: SpillReader, + W: Writer, + W::Reader: Reader, { fn add_block(&mut self, block: DataBlock) { self.buffered_size += block.memory_size(); @@ -119,7 +151,7 @@ where } async fn writer_mut<'a, B>(&'a mut self, builder: &B, block: &DataBlock) -> Result<&'a mut W> - where B: SpillBuilder { + where B: Builder { match &mut self.state { state @ PartitionSpillState::Empty => { let writer = builder.create(block.infer_schema().into()).await?; @@ -147,11 +179,9 @@ where } pub struct WindowPartitionBufferV2 -where - B: SpillBuilder, - ::Reader: SpillReader, +where B: Builder { - spill_builder: B, + spiller: B, partitions: Vec>, memory_settings: MemorySettings, min_spill_size: usize, @@ -161,13 +191,9 @@ where next_to_restore_partition_id: isize, } -impl WindowPartitionBufferV2 -where - B: SpillBuilder, - ::Reader: SpillReader, -{ +impl WindowPartitionBufferV2 { pub fn new( - spill_builder: B, + spiller: Spiller, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, @@ -177,7 +203,7 @@ where partitions.push(PartitionSlot::::default()); } Ok(Self { - spill_builder, + spiller, partitions, memory_settings, min_spill_size: 1024 * 1024, @@ -187,7 +213,11 @@ where next_to_restore_partition_id: -1, }) } +} +impl WindowPartitionBufferV2 +where B: Builder +{ pub fn need_spill(&mut self) -> bool { self.can_spill && self.memory_settings.check_spill() } @@ -223,9 +253,7 @@ where } if let Some(blocks) = partition.fetch_blocks(Some(spill_unit_size)) { let ordinal = { - let 
writer = partition - .writer_mut(&self.spill_builder, &blocks[0]) - .await?; + let writer = partition.writer_mut(&self.spiller, &blocks[0]).await?; writer.spill(blocks).await? }; partition.spilled_ordinals.push(ordinal); @@ -248,9 +276,7 @@ where let partition = &mut self.partitions[partition_id]; let blocks = partition.fetch_blocks(None).unwrap(); let ordinal = { - let writer = partition - .writer_mut(&self.spill_builder, &blocks[0]) - .await?; + let writer = partition.writer_mut(&self.spiller, &blocks[0]).await?; writer.spill(blocks).await? }; partition.spilled_ordinals.push(ordinal); diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 82bcfcfadeafa..bf40213434ce5 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -43,9 +43,6 @@ use super::union_file::FileWriter; use super::union_file::UnionFile; use super::union_file::UnionFileWriter; use super::Location; -use crate::pipelines::processors::transforms::SpillBuilder as WindowSpillBuilder; -use crate::pipelines::processors::transforms::SpillReader as WindowSpillReader; -use crate::pipelines::processors::transforms::SpillWriter as WindowSpillWriter; use crate::sessions::QueryContext; use crate::spillers::block_reader::BlocksReader; use crate::spillers::block_writer::BlocksWriter; @@ -354,6 +351,31 @@ impl Spiller { .cloned() .collect() } + + pub fn new_spill_writer(&self, schema: Arc) -> Result { + if !self.use_parquet { + return Err(ErrorCode::Internal( + "window spill requires Parquet spill format, please set `set global spilling_file_format='parquet'`" + .to_string(), + )); + } + + let runtime = GlobalIORuntime::instance(); + let buffer_pool = BufferPool::create( + runtime, + WINDOW_SPILL_BUFFER_MEMORY_BYTES, + WINDOW_SPILL_BUFFER_WORKERS, + ); + + Ok(SpillWriter { + spiller: self.clone(), + buffer_pool, + dio: self.temp_dir.is_some(), + chunk_size: WINDOW_SPILL_CHUNK_SIZE, + schema, + file_writer: None, + 
}) + } } pub struct MergedPartition { @@ -370,7 +392,7 @@ const WINDOW_SPILL_BUFFER_MEMORY_BYTES: usize = 64 * 1024 * 1024; const WINDOW_SPILL_BUFFER_WORKERS: usize = 2; const WINDOW_SPILL_CHUNK_SIZE: usize = 8 * 1024 * 1024; -pub struct WindowPartitionSpillWriter { +pub struct SpillWriter { spiller: Spiller, buffer_pool: Arc, dio: bool, @@ -379,19 +401,8 @@ pub struct WindowPartitionSpillWriter { file_writer: Option>, } -pub struct WindowPartitionSpillReader { - spiller: Spiller, - schema: Arc, - parquet_metadata: Arc, - union_file: Option, - dio: bool, -} - -#[async_trait::async_trait] -impl WindowSpillWriter for WindowPartitionSpillWriter { - type Reader = WindowPartitionSpillReader; - - async fn spill(&mut self, blocks: Vec) -> Result { +impl SpillWriter { + pub async fn spill(&mut self, blocks: Vec) -> Result { let file_writer = match &mut self.file_writer { Some(file_writer) => file_writer, file_writer @ None => { @@ -408,7 +419,7 @@ impl WindowSpillWriter for WindowPartitionSpillWriter { Ok(ordinal as _) } - async fn close(self) -> Result { + pub fn close(self) -> Result { let Some(file_writer) = self.file_writer else { return Err(ErrorCode::Internal( "attempted to close window spill writer without data".to_string(), @@ -431,7 +442,7 @@ impl WindowSpillWriter for WindowPartitionSpillWriter { total_size, ); - Ok(WindowPartitionSpillReader { + Ok(SpillReader { spiller: self.spiller, schema: self.schema, parquet_metadata, @@ -441,9 +452,16 @@ impl WindowSpillWriter for WindowPartitionSpillWriter { } } -#[async_trait::async_trait] -impl WindowSpillReader for WindowPartitionSpillReader { - async fn restore(&mut self, ordinals: Vec) -> Result> { +pub struct SpillReader { + spiller: Spiller, + schema: Arc, + parquet_metadata: Arc, + union_file: Option, + dio: bool, +} + +impl SpillReader { + pub async fn restore(&mut self, ordinals: Vec) -> Result> { if ordinals.is_empty() { return Ok(Vec::new()); } @@ -464,39 +482,6 @@ impl WindowSpillReader for 
WindowPartitionSpillReader { } } -#[async_trait::async_trait] -impl WindowSpillBuilder for Spiller { - type Writer = WindowPartitionSpillWriter; - - async fn create( - &self, - schema: Arc, - ) -> Result<::Writer> { - if !self.use_parquet { - return Err(ErrorCode::Internal( - "window spill requires Parquet spill format, please set `set global spilling_file_format='parquet'`" - .to_string(), - )); - } - - let runtime = GlobalIORuntime::instance(); - let buffer_pool = BufferPool::create( - runtime, - WINDOW_SPILL_BUFFER_MEMORY_BYTES, - WINDOW_SPILL_BUFFER_WORKERS, - ); - - Ok(WindowPartitionSpillWriter { - spiller: self.clone(), - buffer_pool, - dio: self.temp_dir.is_some(), - chunk_size: WINDOW_SPILL_CHUNK_SIZE, - schema, - file_writer: None, - }) - } -} - impl SpillAdapter for Arc { fn add_spill_file(&self, location: Location, layout: Layout, size: usize) { self.as_ref().add_spill_file(location, layout, size); From 4377101176ca2ffe200e31d61209423429d7ae9c Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 15:44:03 +0800 Subject: [PATCH 34/46] x --- .../transforms/window/partition/mod.rs | 1 - .../transform_window_partition_collect.rs | 53 ++++++++++--------- .../partition/window_partition_buffer.rs | 2 +- .../partition/window_partition_buffer_v2.rs | 30 +++++------ src/query/service/src/spillers/adapter.rs | 53 +++++++++++++------ 5 files changed, 82 insertions(+), 57 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index 4d4eda4b641af..692511aa01a94 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -25,7 +25,6 @@ pub use data_processor_strategy::*; pub use hilbert_partition_exchange::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; -pub use 
window_partition_buffer_v2::*; pub use window_partition_exchange::*; pub use window_partition_meta::*; pub use window_partition_partial_top_n_exchange::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index b2a48d0fb7e1b..72a8e1f676bb2 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -31,6 +31,7 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_settings::Settings; use databend_common_storage::DataOperator; +use super::window_partition_buffer_v2::WindowPartitionBufferV2; use super::WindowPartitionBuffer; use super::WindowPartitionMeta; use crate::pipelines::processors::transforms::DataProcessorStrategy; @@ -40,13 +41,13 @@ use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; use crate::spillers::SpillerType; -// Local enum to wrap WindowPartitionBuffer as a variant without modifying the original. 
-pub enum WindowBuffer { - WindowPartitionBuffer(Box), +enum WindowBuffer { + V1(WindowPartitionBuffer), + V2(WindowPartitionBufferV2), } impl WindowBuffer { - pub fn new( + fn new( spiller: Spiller, num_partitions: usize, sort_block_size: usize, @@ -54,58 +55,60 @@ impl WindowBuffer { ) -> Result { let inner = WindowPartitionBuffer::new(spiller, num_partitions, sort_block_size, memory_settings)?; - Ok(Self::WindowPartitionBuffer(Box::new(inner))) + Ok(Self::V1(inner)) } - pub fn need_spill(&mut self) -> bool { + fn need_spill(&mut self) -> bool { match self { - WindowBuffer::WindowPartitionBuffer(inner) => inner.need_spill(), + WindowBuffer::V1(inner) => inner.need_spill(), + WindowBuffer::V2(inner) => inner.need_spill(), } } - pub fn out_of_memory_limit(&mut self) -> bool { + fn is_empty(&self) -> bool { match self { - WindowBuffer::WindowPartitionBuffer(inner) => inner.out_of_memory_limit(), + WindowBuffer::V1(inner) => inner.is_empty(), + WindowBuffer::V2(inner) => inner.is_empty(), } } - pub fn is_empty(&self) -> bool { + fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { match self { - WindowBuffer::WindowPartitionBuffer(inner) => inner.is_empty(), + WindowBuffer::V1(inner) => inner.add_data_block(partition_id, data_block), + WindowBuffer::V2(inner) => inner.add_data_block(partition_id, data_block), } } - pub fn add_data_block(&mut self, partition_id: usize, data_block: DataBlock) { - let WindowBuffer::WindowPartitionBuffer(inner) = self; - inner.add_data_block(partition_id, data_block); - } - - pub async fn spill(&mut self) -> Result<()> { - let WindowBuffer::WindowPartitionBuffer(inner) = self; - inner.spill().await + async fn spill(&mut self) -> Result<()> { + match self { + WindowBuffer::V1(inner) => inner.spill().await, + WindowBuffer::V2(inner) => inner.spill().await, + } } - pub async fn restore(&mut self) -> Result> { - let WindowBuffer::WindowPartitionBuffer(inner) = self; - inner.restore().await + async fn restore(&mut self) 
-> Result> { + match self { + WindowBuffer::V1(inner) => inner.restore().await, + WindowBuffer::V2(inner) => inner.restore().await, + } } } #[derive(Debug, Clone, Copy)] -pub enum Step { +enum Step { Sync(SyncStep), Async(AsyncStep), Finish, } #[derive(Debug, Clone, Copy)] -pub enum SyncStep { +enum SyncStep { Collect, Process, } #[derive(Debug, Clone, Copy)] -pub enum AsyncStep { +enum AsyncStep { Spill, Restore, } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index a38a26dbc5cea..f5ee18c7567ac 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -63,7 +63,7 @@ impl WindowPartitionBuffer { self.can_spill && self.memory_settings.check_spill() } - pub fn out_of_memory_limit(&mut self) -> bool { + fn out_of_memory_limit(&mut self) -> bool { self.memory_settings.check_spill() } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index c2303355723eb..3a86ab9769ea2 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -25,12 +25,12 @@ use crate::spillers::SpillWriter; use crate::spillers::Spiller; #[async_trait::async_trait] -trait Reader: Send { +pub trait Reader: Send { async fn restore(&mut self, ordinals: Vec) -> Result>; } #[async_trait::async_trait] -trait Writer: Send { +pub trait Writer: Send { type Reader: Reader; async fn spill(&mut self, blocks: Vec) -> Result; @@ -39,7 +39,7 @@ trait Writer: Send { } 
#[async_trait::async_trait] -trait Builder: Send + Sync { +pub trait Builder: Send + Sync { type Writer: Writer; async fn create(&self, schema: Arc) -> Result; @@ -59,7 +59,10 @@ impl Writer for SpillWriter { type Reader = SpillReader; async fn spill(&mut self, blocks: Vec) -> Result { - self.spill(blocks).await + if !self.is_opened() { + self.open().await?; + } + self.add_row_group(blocks) } async fn close(self) -> Result { @@ -178,7 +181,9 @@ where } } -pub struct WindowPartitionBufferV2 +pub(super) type WindowPartitionBufferV2 = PartitionBuffer; + +pub(super) struct PartitionBuffer where B: Builder { spiller: B, @@ -191,17 +196,16 @@ where B: Builder next_to_restore_partition_id: isize, } -impl WindowPartitionBufferV2 { +impl PartitionBuffer { pub fn new( spiller: Spiller, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, ) -> Result { - let mut partitions = Vec::with_capacity(num_partitions); - for _ in 0..num_partitions { - partitions.push(PartitionSlot::::default()); - } + let partitions = (0..num_partitions) + .map(|_| PartitionSlot::default()) + .collect(); Ok(Self { spiller, partitions, @@ -215,17 +219,13 @@ impl WindowPartitionBufferV2 { } } -impl WindowPartitionBufferV2 +impl PartitionBuffer where B: Builder { pub fn need_spill(&mut self) -> bool { self.can_spill && self.memory_settings.check_spill() } - pub fn out_of_memory_limit(&mut self) -> bool { - self.memory_settings.check_spill() - } - pub fn is_empty(&self) -> bool { self.next_to_restore_partition_id + 1 >= self.num_partitions as isize } diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index bf40213434ce5..24d773233405e 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -35,17 +35,16 @@ use databend_storages_common_cache::ParquetMetaData; use databend_storages_common_cache::TempPath; use opendal::Buffer; use opendal::Operator; +use 
parquet::file::metadata::RowGroupMetaDataPtr; use super::async_buffer::BufferPool; +use super::block_reader::BlocksReader; +use super::block_writer::BlocksWriter; use super::inner::*; use super::serialize::*; -use super::union_file::FileWriter; -use super::union_file::UnionFile; -use super::union_file::UnionFileWriter; +use super::union_file::*; use super::Location; use crate::sessions::QueryContext; -use crate::spillers::block_reader::BlocksReader; -use crate::spillers::block_writer::BlocksWriter; #[derive(Clone)] pub struct PartitionAdapter { @@ -402,16 +401,26 @@ pub struct SpillWriter { } impl SpillWriter { - pub async fn spill(&mut self, blocks: Vec) -> Result { - let file_writer = match &mut self.file_writer { - Some(file_writer) => file_writer, - file_writer @ None => { - let writer = self - .spiller - .new_file_writer(&self.schema, &self.buffer_pool, self.dio, self.chunk_size) - .await?; - file_writer.insert(writer) - } + pub async fn open(&mut self) -> Result<()> { + if self.file_writer.is_some() { + return Err(ErrorCode::Internal("SpillWriter already opened")); + } + + let writer = self + .spiller + .new_file_writer(&self.schema, &self.buffer_pool, self.dio, self.chunk_size) + .await?; + self.file_writer = Some(writer); + Ok(()) + } + + pub fn is_opened(&self) -> bool { + self.file_writer.is_some() + } + + pub fn add_row_group(&mut self, blocks: Vec) -> Result { + let Some(file_writer) = self.file_writer.as_mut() else { + return Err(ErrorCode::Internal("SpillWriter should open first")); }; let row_group_meta = file_writer.spill(blocks)?; @@ -419,6 +428,20 @@ impl SpillWriter { Ok(ordinal as _) } + pub fn new_row_group_encoder(&self) -> Option { + self.file_writer.as_ref().map(|w| w.new_row_group()) + } + + pub fn add_row_group_encoded( + &mut self, + row_group: RowGroupEncoder, + ) -> Result { + let Some(file_writer) = self.file_writer.as_mut() else { + return Err(ErrorCode::Internal("SpillWriter should open first")); + }; + 
Ok(file_writer.flush_row_group(row_group)?) + } + pub fn close(self) -> Result { let Some(file_writer) = self.file_writer else { return Err(ErrorCode::Internal( From 6081070ef4c8500ae2218ade14515de1149ef920 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 30 Sep 2025 15:58:40 +0800 Subject: [PATCH 35/46] x --- src/common/base/src/base/dma.rs | 18 +++++------ .../transform_window_partition_collect.rs | 30 +++++++++++++++---- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 25a63df5bb2c8..b7b76da73e997 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -573,7 +573,7 @@ impl DmaWriteBuf { let len = data.len() * self.chunk; let bufs = data.iter().map(|buf| IoSlice::new(buf)).collect::>(); - let writen = rustix::io::writev(&file.fd, &bufs)?; + let written = rustix::io::writev(&file.fd, &bufs)?; let last = self.data.pop(); self.data.clear(); @@ -584,12 +584,12 @@ impl DmaWriteBuf { _ => (), } - file.length += writen; + file.length += written; - if writen != len { + if written != len { Err(io::Error::other("short write")) } else { - Ok(writen) + Ok(written) } } @@ -624,19 +624,19 @@ impl DmaWriteBuf { .map(|buf| IoSlice::new(buf)) .collect::>(); - let writen = rustix::io::writev(&file.fd, &bufs)?; - if writen != len { + let written = rustix::io::writev(&file.fd, &bufs)?; + if written != len { return Err(io::Error::other("short write")); } if to_truncate == 0 { - file.length += writen; - return Ok(writen); + file.length += written; + return Ok(written); } file.length -= to_truncate; file.truncate(file.length)?; - Ok(writen - to_truncate) + Ok(written - to_truncate) } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 72a8e1f676bb2..8d5430420b55d 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -48,14 +48,29 @@ enum WindowBuffer { impl WindowBuffer { fn new( + is_v2: bool, spiller: Spiller, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, ) -> Result { - let inner = - WindowPartitionBuffer::new(spiller, num_partitions, sort_block_size, memory_settings)?; - Ok(Self::V1(inner)) + if is_v2 { + let inner = WindowPartitionBufferV2::new( + spiller, + num_partitions, + sort_block_size, + memory_settings, + )?; + Ok(Self::V2(inner)) + } else { + let inner = WindowPartitionBuffer::new( + spiller, + num_partitions, + sort_block_size, + memory_settings, + )?; + Ok(Self::V1(inner)) + } } fn need_spill(&mut self) -> bool { @@ -170,8 +185,13 @@ impl TransformWindowPartitionCollect { // Create the window partition buffer. let sort_block_size = settings.get_window_partition_sort_block_size()? 
as usize; - let buffer = - WindowBuffer::new(spiller, partitions.len(), sort_block_size, memory_settings)?; + let buffer = WindowBuffer::new( + true, + spiller, + partitions.len(), + sort_block_size, + memory_settings, + )?; Ok(Self { input, From 257504bfc37a40181abfd3bc282e7ac19fa9a912 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 9 Oct 2025 09:39:37 +0800 Subject: [PATCH 36/46] x --- .../transforms/src/processors/traits/spill.rs | 10 +++ src/query/service/src/sessions/query_ctx.rs | 34 ++++------ src/query/service/src/spillers/adapter.rs | 64 +++++++++++-------- src/query/service/src/spillers/inner.rs | 38 +++++------ src/query/service/src/spillers/union_file.rs | 32 ++++++---- 5 files changed, 97 insertions(+), 81 deletions(-) diff --git a/src/query/pipeline/transforms/src/processors/traits/spill.rs b/src/query/pipeline/transforms/src/processors/traits/spill.rs index 5ce4444528e54..407605cf3adf8 100644 --- a/src/query/pipeline/transforms/src/processors/traits/spill.rs +++ b/src/query/pipeline/transforms/src/processors/traits/spill.rs @@ -22,6 +22,16 @@ pub enum Location { Local(TempPath), } +impl Location { + pub fn is_local(&self) -> bool { + matches!(self, Location::Local(_)) + } + + pub fn is_remote(&self) -> bool { + matches!(self, Location::Remote(_)) + } +} + #[async_trait::async_trait] pub trait DataBlockSpill: Clone + Send + Sync + 'static { async fn spill(&self, data_block: DataBlock) -> Result { diff --git a/src/query/service/src/sessions/query_ctx.rs b/src/query/service/src/sessions/query_ctx.rs index c35e12f03021b..7cbfe7a062cf2 100644 --- a/src/query/service/src/sessions/query_ctx.rs +++ b/src/query/service/src/sessions/query_ctx.rs @@ -411,26 +411,20 @@ impl QueryContext { self.shared.clear_tables_cache() } - pub fn add_spill_file( - &self, - location: spillers::Location, - layout: spillers::Layout, - data_size: usize, - ) { - if matches!(location, spillers::Location::Remote(_)) { - let current_id = self.get_cluster().local_id(); - let mut w 
= self.shared.cluster_spill_progress.write(); - let p = SpillProgress::new(1, data_size); - w.entry(current_id) - .and_modify(|stats| { - stats.incr(&p); - }) - .or_insert(p); - } - { - let mut w = self.shared.spilled_files.write(); - w.insert(location, layout); - } + pub fn incr_spill_progress(&self, file_nums: usize, data_size: usize) { + let current_id = self.get_cluster().local_id(); + let mut w = self.shared.cluster_spill_progress.write(); + let p = SpillProgress::new(file_nums, data_size); + w.entry(current_id) + .and_modify(|stats| { + stats.incr(&p); + }) + .or_insert(p); + } + + pub fn add_spill_file(&self, location: spillers::Location, layout: spillers::Layout) { + let mut w = self.shared.spilled_files.write(); + w.insert(location, layout); } pub fn set_cluster_spill_progress(&self, source_target: &str, stats: SpillProgress) { diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 24d773233405e..67447c93af267 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -61,7 +61,11 @@ impl SpillAdapter for PartitionAdapter { .write() .unwrap() .insert(location.clone(), layout.clone()); - self.ctx.as_ref().add_spill_file(location, layout, size); + + if location.is_remote() { + self.ctx.as_ref().incr_spill_progress(1, size); + } + self.ctx.as_ref().add_spill_file(location, layout); } fn get_spill_layout(&self, location: &Location) -> Option { @@ -229,7 +233,7 @@ impl Spiller { let instant = Instant::now(); let location = self.write_encodes(write_bytes, buf).await?; // Record statistics. 
- record_write_profile(&location, &instant, write_bytes); + record_write_profile(location.is_local(), &instant, write_bytes); self.adapter .add_spill_file(location.clone(), layout, write_bytes); @@ -352,13 +356,6 @@ impl Spiller { } pub fn new_spill_writer(&self, schema: Arc) -> Result { - if !self.use_parquet { - return Err(ErrorCode::Internal( - "window spill requires Parquet spill format, please set `set global spilling_file_format='parquet'`" - .to_string(), - )); - } - let runtime = GlobalIORuntime::instance(); let buffer_pool = BufferPool::create( runtime, @@ -423,7 +420,14 @@ impl SpillWriter { return Err(ErrorCode::Internal("SpillWriter should open first")); }; + let is_local = file_writer.has_opening_local(); + let start = std::time::Instant::now(); + let write_bytes = blocks.iter().map(DataBlock::memory_size).sum(); + let row_group_meta = file_writer.spill(blocks)?; + + record_write_profile(is_local, &start, write_bytes); + let ordinal = row_group_meta.ordinal().unwrap(); Ok(ordinal as _) } @@ -450,26 +454,28 @@ impl SpillWriter { }; let (metadata, union_file) = file_writer.finish()?; - let remote_path = union_file.remote_path().to_string(); - let parquet_metadata = Arc::new(metadata); - let total_size = parquet_metadata - .row_groups() - .iter() - .map(|rg| rg.compressed_size().max(0) as usize) - .sum(); + if let Some(path) = &union_file.local_path { + self.spiller.adapter.add_spill_file( + Location::Local(path.clone()), + Layout::Parquet, + path.size(), + ); + } self.spiller.adapter.add_spill_file( - Location::Remote(remote_path), + Location::Remote(union_file.remote_path.clone()), Layout::Parquet, - total_size, + union_file + .remote_size + .saturating_sub(union_file.remote_offset.unwrap_or_default()) as _, ); Ok(SpillReader { spiller: self.spiller, schema: self.schema, - parquet_metadata, - union_file: Some(union_file), + parquet_metadata: Arc::new(metadata), + union_file, dio: self.dio, }) } @@ -479,7 +485,7 @@ pub struct SpillReader { spiller: 
Spiller, schema: Arc, parquet_metadata: Arc, - union_file: Option, + union_file: UnionFile, dio: bool, } @@ -489,13 +495,9 @@ impl SpillReader { return Ok(Vec::new()); } - let union_file = self.union_file.take().ok_or_else(|| { - ErrorCode::Internal("window spill reader already consumed".to_string()) - })?; - self.spiller .load_row_groups( - union_file, + self.union_file.clone(), self.parquet_metadata.clone(), &self.schema, ordinals, @@ -507,7 +509,10 @@ impl SpillReader { impl SpillAdapter for Arc { fn add_spill_file(&self, location: Location, layout: Layout, size: usize) { - self.as_ref().add_spill_file(location, layout, size); + if matches!(location, Location::Remote(_)) { + self.incr_spill_progress(1, size); + } + self.as_ref().add_spill_file(location, layout); } fn get_spill_layout(&self, location: &Location) -> Option { @@ -523,7 +528,10 @@ pub struct SortAdapter { impl SpillAdapter for SortAdapter { fn add_spill_file(&self, location: Location, layout: Layout, size: usize) { match location { - Location::Remote(_) => self.ctx.as_ref().add_spill_file(location, layout, size), + Location::Remote(_) => { + self.ctx.as_ref().incr_spill_progress(1, size); + self.ctx.as_ref().add_spill_file(location, layout); + } Location::Local(temp_path) => { self.local_files.write().unwrap().insert(temp_path, layout); } diff --git a/src/query/service/src/spillers/inner.rs b/src/query/service/src/spillers/inner.rs index de180afbbf666..39650357e00fa 100644 --- a/src/query/service/src/spillers/inner.rs +++ b/src/query/service/src/spillers/inner.rs @@ -170,7 +170,7 @@ impl SpillerInner { let location = self.write_encodes(data_size, buf).await?; // Record statistics. 
- record_write_profile(&location, &instant, data_size); + record_write_profile(location.is_local(), &instant, data_size); let layout = columns_layout.pop().unwrap(); Ok((location, layout, data_size)) } @@ -273,27 +273,21 @@ impl SpillerInner { } } -pub(super) fn record_write_profile(location: &Location, start: &Instant, write_bytes: usize) { - match location { - Location::Remote(_) => { - Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteBytes, - write_bytes, - ); - Profile::record_usize_profile( - ProfileStatisticsName::RemoteSpillWriteTime, - start.elapsed().as_millis() as usize, - ); - } - Location::Local(_) => { - Profile::record_usize_profile(ProfileStatisticsName::LocalSpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::LocalSpillWriteBytes, write_bytes); - Profile::record_usize_profile( - ProfileStatisticsName::LocalSpillWriteTime, - start.elapsed().as_millis() as usize, - ); - } +pub(super) fn record_write_profile(is_local: bool, start: &Instant, write_bytes: usize) { + if !is_local { + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteBytes, write_bytes); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteTime, + start.elapsed().as_millis() as usize, + ); + } else { + Profile::record_usize_profile(ProfileStatisticsName::LocalSpillWriteCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::LocalSpillWriteBytes, write_bytes); + Profile::record_usize_profile( + ProfileStatisticsName::LocalSpillWriteTime, + start.elapsed().as_millis() as usize, + ); } } diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 4bad1aaa3738b..d41723fcbda6c 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ 
-216,6 +216,10 @@ impl FileWriter { let row_groups = row_groups.into_iter().map(Arc::unwrap_or_clone).collect(); Ok((ParquetMetaData::new(metadata, row_groups), file)) } + + pub fn has_opening_local(&self) -> bool { + self.writer.inner().has_opening_local() + } } struct LocalDst { @@ -264,7 +268,7 @@ impl UnionFileWriter { } fn finish(&mut self) -> io::Result { - self.remote_writer.take().unwrap().close()?; + let remote_size = self.remote_writer.take().unwrap().close()?.content_length(); match self.local.take() { Some( mut local @ LocalDst { @@ -285,20 +289,30 @@ impl UnionFileWriter { local_path: Some(local.path), remote_path: std::mem::take(&mut self.remote), remote_offset: None, + remote_size, }) } Some(LocalDst { path, .. }) => Ok(UnionFile { local_path: Some(path), remote_path: std::mem::take(&mut self.remote), remote_offset: Some(self.remote_offset), + remote_size, }), None => Ok(UnionFile { local_path: None, remote_path: std::mem::take(&mut self.remote), remote_offset: Some(0), + remote_size, }), } } + + pub fn has_opening_local(&self) -> bool { + self.local + .as_ref() + .map(|local| local.file.is_some()) + .unwrap_or(false) + } } impl io::Write for UnionFileWriter { @@ -363,17 +377,12 @@ impl io::Write for UnionFileWriter { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct UnionFile { - local_path: Option, - remote_path: String, - remote_offset: Option, -} - -impl UnionFile { - pub fn remote_path(&self) -> &str { - &self.remote_path - } + pub local_path: Option, + pub remote_path: String, + pub remote_offset: Option, + pub remote_size: u64, } pub(super) struct FileReader { @@ -482,6 +491,7 @@ impl SpillerInner { local_path, remote_path, remote_offset, + .. 
}: UnionFile, meta: Arc, schema: &DataSchema, From c40f4005f028b2dbf5f1a719fc4896b81c5724d5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 9 Oct 2025 12:36:59 +0800 Subject: [PATCH 37/46] x --- src/query/service/src/spillers/adapter.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 67447c93af267..bc7e18953bf70 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -422,11 +422,10 @@ impl SpillWriter { let is_local = file_writer.has_opening_local(); let start = std::time::Instant::now(); - let write_bytes = blocks.iter().map(DataBlock::memory_size).sum(); let row_group_meta = file_writer.spill(blocks)?; - record_write_profile(is_local, &start, write_bytes); + record_write_profile(is_local, &start, row_group_meta.compressed_size() as _); let ordinal = row_group_meta.ordinal().unwrap(); Ok(ordinal as _) From 3970d192505c17cdf179b38f1829e40e99104114 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 9 Oct 2025 12:36:50 +0800 Subject: [PATCH 38/46] dma.flush --- src/common/base/src/base/dma.rs | 241 ++++++++++++++---- .../service/src/spillers/async_buffer.rs | 3 - src/query/service/src/spillers/union_file.rs | 8 +- 3 files changed, 188 insertions(+), 64 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index b7b76da73e997..03e3b782efc0a 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -20,6 +20,7 @@ use std::fmt; use std::io; use std::io::IoSlice; use std::io::SeekFrom; +use std::io::Write; use std::ops::Range; use std::os::fd::AsFd; use std::os::fd::BorrowedFd; @@ -199,7 +200,7 @@ pub struct DmaFile { fd: F, alignment: Alignment, buf: Option, - length: usize, + written: usize, } impl DmaFile { @@ -232,19 +233,37 @@ impl DmaFile { } fn write_direct(&mut self) -> io::Result { - let buf = self.buffer(); - let buf_size = 
buf.len(); - match rustix::io::write(&self.fd, buf) { - Ok(n) => { - self.length += n; - if n != buf_size { - return Err(io::Error::other("short write")); + let buf = self.buf.as_ref().unwrap().as_slice(); + let mut written = 0; + + while written < buf.len() { + match rustix::io::write(&self.fd, &buf[written..]) { + Ok(0) => { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "write returned zero bytes", + )); + } + Ok(n) => { + written += n; + } + Err(err) => { + if err.kind() == io::ErrorKind::Interrupted { + continue; + } + return Err(err.into()); } - self.mut_buffer().clear(); - Ok(n) } - Err(e) => Err(e.into()), } + self.inc_written(written); + self.mut_buffer().clear(); + Ok(written) + } + + fn inc_written(&mut self, n: usize) { + debug_assert!(n >= self.alignment.as_usize()); + debug_assert_eq!(n, self.alignment.align_down(n)); + self.written = self.align_down(self.written) + n; } fn read_direct(&mut self, n: usize) -> io::Result { @@ -265,7 +284,7 @@ impl DmaFile { } fn truncate(&self, length: usize) -> io::Result<()> { - rustix::fs::ftruncate(&self.fd, length as u64).map_err(|e| e.into()) + rustix::fs::ftruncate(&self.fd, length as u64).map_err(io::Error::from) } pub fn size(&self) -> io::Result { @@ -273,7 +292,7 @@ impl DmaFile { } pub fn length(&self) -> usize { - self.length + self.written } } @@ -333,7 +352,7 @@ impl AsyncDmaFile { let fd = file.as_raw_fd(); let stat = asyncify(move || { rustix::fs::fstatvfs(unsafe { BorrowedFd::borrow_raw(fd) }) - .map_err(|e| e.into()) + .map_err(io::Error::from) }) .await?; Alignment::new(stat.f_bsize.max(512) as usize).unwrap() @@ -344,7 +363,7 @@ impl AsyncDmaFile { fd: file, alignment, buf: None, - length: 0, + written: 0, }) } @@ -380,7 +399,7 @@ impl AsyncDmaFile { fd: unsafe { BorrowedFd::borrow_raw(fd) }, alignment, buf: Some(buf), - length: 0, + written: 0, }; file.read_direct(remain).map(|n| (file.buf.unwrap(), n)) }) @@ -406,17 +425,18 @@ impl SyncDmaFile { } else { OFlags::RDONLY }; - 
rustix::fs::open(path, flags, rustix::fs::Mode::empty()).map_err(|e| e.into()) + rustix::fs::open(path, flags, rustix::fs::Mode::empty()).map_err(io::Error::from) } fn create_fd(path: impl rustix::path::Arg, dio: bool) -> io::Result { let flags = if cfg!(target_os = "linux") && dio { - OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::DIRECT + OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::RDWR | OFlags::DIRECT } else { - OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC + OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::RDWR }; - rustix::fs::open(path, flags, rustix::fs::Mode::empty()).map_err(|e| e.into()) + rustix::fs::open(path, flags, rustix::fs::Mode::from_raw_mode(0o666)) + .map_err(io::Error::from) } fn open_dma(fd: OwnedFd) -> io::Result> { @@ -427,7 +447,7 @@ impl SyncDmaFile { fd, alignment, buf: None, - length: 0, + written: 0, }) } @@ -485,7 +505,7 @@ impl DmaWriteBuf { fd: AsyncDmaFile::create_fd(path, dio).await?, alignment: self.allocator.0, buf: None, - length: 0, + written: 0, }; let file_length = self.size(); @@ -572,8 +592,8 @@ impl DmaWriteBuf { let len = data.len() * self.chunk; - let bufs = data.iter().map(|buf| IoSlice::new(buf)).collect::>(); - let written = rustix::io::writev(&file.fd, &bufs)?; + let mut io_slices: Vec<_> = data.iter().map(|buf| IoSlice::new(buf)).collect(); + let written = writev_all(&file.fd, &mut io_slices)?; let last = self.data.pop(); self.data.clear(); @@ -584,7 +604,7 @@ impl DmaWriteBuf { _ => (), } - file.length += written; + file.inc_written(written); if written != len { Err(io::Error::other("short write")) @@ -593,51 +613,99 @@ impl DmaWriteBuf { } } - pub fn flush_and_close(&mut self, mut file: SyncDmaFile) -> io::Result { + fn flush_inner(&mut self, file: &mut SyncDmaFile, close: bool) -> io::Result<()> { debug_assert_eq!(self.allocator.0, file.alignment); - if self.is_last_full() { - return self.flush_full_buffer(&mut file); + if self.data.is_empty() { + return Ok(()); } - let (diff, 
to_truncate) = match self.data.last_mut() { - Some(last) if last.is_empty() => { - self.data.pop(); - (0, 0) - } - Some(last) => { - let n = last.len(); - let align_up = file.align_up(n); - if align_up == n { - (self.chunk - n, 0) - } else { - unsafe { last.set_len(align_up) }; - (self.chunk - align_up, align_up - n) + let last = self + .data + .pop_if(|last| file.align_up(last.len()) > last.len()); + + let last = if let Some(mut last) = last { + if self.data.is_empty() { + use std::cmp::Ordering::*; + match (file.written - file.align_down(file.written)).cmp(&last.len()) { + Equal => return Ok(()), + Greater => unreachable!(), + Less => {} } } - None => unreachable!(), + let len = last.len(); + let align_up = file.align_up(len); + let pad = align_up - len; + debug_assert!(pad != 0); + unsafe { last.set_len(align_up) }; + Some((last, len, pad)) + } else { + None }; - let len = self.data.len() * self.chunk - diff; - let bufs = self + + let mut slices: Vec<_> = self .data .iter() .map(|buf| IoSlice::new(buf)) - .collect::>(); + .chain(last.as_ref().map(|last| IoSlice::new(&last.0))) + .collect(); + let written = writev_all(&file.fd, &mut slices[..])?; + self.data.clear(); - let written = rustix::io::writev(&file.fd, &bufs)?; - if written != len { - return Err(io::Error::other("short write")); - } + file.inc_written(written); + + if let Some((last, len, pad)) = last { + file.written -= pad; + file.truncate(file.written)?; - if to_truncate == 0 { - file.length += written; - return Ok(written); + if !close { + rustix::fs::seek( + &file.fd, + rustix::fs::SeekFrom::Start(file.align_down(file.written) as _), + ) + .map_err(io::Error::from)?; + self.write_all(&last[file.align_down(len)..(file.align_up(len) - pad)])?; + } } - file.length -= to_truncate; - file.truncate(file.length)?; - Ok(written - to_truncate) + Ok(()) + } + + pub fn flush_and_close(&mut self, mut file: SyncDmaFile) -> io::Result { + self.flush_inner(&mut file, true)?; + Ok(file.length()) + } + + pub fn 
flush(&mut self, file: &mut SyncDmaFile) -> io::Result<()> { + self.flush_inner(file, false) + } +} + +fn writev_all(fd: impl AsFd, mut slices: &mut [IoSlice<'_>]) -> io::Result { + let mut written = 0; + + while !slices.is_empty() { + let n = match rustix::io::writev(fd.as_fd(), slices) { + Ok(0) => { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "writev returned zero bytes", + )); + } + Ok(n) => n, + Err(err) => { + if err.kind() == io::ErrorKind::Interrupted { + continue; + } + return Err(err.into()); + } + }; + + written += n; + IoSlice::advance_slices(&mut slices, n); } + + Ok(written) } impl io::Write for DmaWriteBuf { @@ -768,6 +836,7 @@ pub async fn dma_read_file_range( #[cfg(test)] mod tests { + use std::io::Read; use std::io::Write; use super::*; @@ -928,4 +997,66 @@ mod tests { let buf = got.to_vec(); println!("{:?} {}", buf.as_ptr(), buf.capacity()); } + + #[test] + fn test_write() -> io::Result<()> { + let filename = "test_file"; + let _ = std::fs::remove_file(filename); + let mut file = SyncDmaFile::create(filename, true)?; + + let mut buf = DmaWriteBuf::new(file.alignment, file.alignment.as_usize() * 2); + + { + buf.write_all(b"1")?; + buf.flush(&mut file)?; + + assert_eq!(file.written, 1); + + let mut got = Vec::new(); + let mut read = std::fs::File::open(filename)?; + let n = read.read_to_end(&mut got)?; + assert_eq!(n, 1); + + assert_eq!(b"1".as_slice(), got.as_slice()); + } + + { + buf.write_all(b"2")?; + buf.write_all(b"3")?; + buf.flush(&mut file)?; + + assert_eq!(file.written, 3); + + let mut got = Vec::new(); + let mut read = std::fs::File::open(filename)?; + let n = read.read_to_end(&mut got)?; + assert_eq!(n, 3); + + assert_eq!(b"123".as_slice(), got.as_slice()); + } + + { + let data: Vec<_> = b"123" + .iter() + .copied() + .cycle() + .take(file.alignment.as_usize() * 3) + .collect(); + + buf.write_all(&data)?; + buf.flush(&mut file)?; + + assert_eq!(file.written, 3 + data.len()); + + let mut got = Vec::new(); + let mut read = 
std::fs::File::open(filename)?; + let n = read.read_to_end(&mut got)?; + assert_eq!(n, 3 + data.len()); + + let want: Vec<_> = [&b"123"[..], &data].concat(); + assert_eq!(want.as_slice(), got.as_slice()); + } + + Ok(()) + } } diff --git a/src/query/service/src/spillers/async_buffer.rs b/src/query/service/src/spillers/async_buffer.rs index 1b6facbce230c..e5036c1e4ca31 100644 --- a/src/query/service/src/spillers/async_buffer.rs +++ b/src/query/service/src/spillers/async_buffer.rs @@ -87,7 +87,6 @@ pub struct BufferPool { } impl BufferPool { - #[allow(dead_code)] pub fn create(executor: Arc, memory: usize, workers: usize) -> Arc { let (working_tx, working_rx) = async_channel::unbounded(); let (buffers_tx, buffers_rx) = async_channel::unbounded(); @@ -155,7 +154,6 @@ impl BufferPool { } } - #[allow(dead_code)] pub fn buffer_write(self: &Arc, writer: Writer) -> BufferWriter { BufferWriter::new(writer, self.clone()) } @@ -233,7 +231,6 @@ impl BufferWriter { Ok(()) } - #[allow(dead_code)] pub fn close(mut self) -> std::io::Result { self.flush()?; diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index d41723fcbda6c..e4017b3540a80 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -280,9 +280,7 @@ impl UnionFileWriter { let dma = local.buf.as_mut().unwrap(); let file = local.file.take().unwrap(); - let file_size = file.length() + dma.size(); - dma.flush_and_close(file)?; - + let file_size = dma.flush_and_close(file)?; local.path.set_size(file_size).unwrap(); Ok(UnionFile { @@ -368,9 +366,7 @@ impl io::Write for UnionFileWriter { .. 
}) = &mut self.local { - // warning: not completely flushed, data may be lost - dma.flush_full_buffer(file)?; - return Ok(()); + return dma.flush(file); } self.remote_writer.as_mut().unwrap().flush() From 6281530f8504f4e317b4d4ff0751eb54c842c4e3 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 9 Oct 2025 19:27:17 +0800 Subject: [PATCH 39/46] test --- src/query/service/src/spillers/union_file.rs | 134 ++++++++++++++----- 1 file changed, 99 insertions(+), 35 deletions(-) diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index e4017b3540a80..b7947e2e735ba 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -549,19 +549,23 @@ where #[cfg(test)] mod tests { + use databend_common_base::base::GlobalUniqName; use databend_common_base::runtime::GlobalIORuntime; + use databend_common_catalog::table_context::TableContext; + use databend_common_config::SpillConfig; use databend_common_exception::Result; use databend_common_expression::types::array::ArrayColumnBuilder; use databend_common_expression::types::number::Int32Type; use databend_common_expression::types::ArgType; use databend_common_expression::types::DataType; use databend_common_expression::types::StringType; - use databend_common_expression::types::UInt64Type; use databend_common_expression::Column; use databend_common_expression::FromData; use databend_common_storage::DataOperator; + use databend_storages_common_cache::TempDirManager; use parquet::file::properties::WriterProperties; use parquet::file::properties::WriterPropertiesPtr; + use tempfile::TempDir; use super::*; use crate::spillers::async_buffer::BufferPool; @@ -569,57 +573,117 @@ mod tests { use crate::test_kits::TestFixture; #[tokio::test(flavor = "multi_thread")] - async fn test_xxx() -> Result<()> { - let config = ConfigBuilder::create().off_log().build(); - let fixture = TestFixture::setup_with_config(&config).await?; - let _ctx = 
fixture.new_query_ctx().await?; - - let props = WriterProperties::default().into(); + async fn test_union_file_writer_without_local() -> Result<()> { + let spill_dir = TempDir::new().expect("create spill temp dir"); + let mut config = ConfigBuilder::create().off_log().build(); + config.spill = SpillConfig::new_for_test( + spill_dir.path().to_string_lossy().into_owned(), + 0.01, + 1 << 30, + ); - let block = DataBlock::new_from_columns(vec![ - UInt64Type::from_data(vec![7, 8, 9]), - StringType::from_data(vec!["c", "d", "e"]), - ]); + let fixture = TestFixture::setup_with_config(&config).await?; + let ctx = fixture.new_query_ctx().await?; - let data_schema = block.infer_schema(); let executor = GlobalIORuntime::instance(); let memory = 1024 * 1024 * 100; let pool = BufferPool::create(executor, memory, 3); let op = DataOperator::instance().operator(); - let path = "path"; - let writer = op.writer(path).await?; + let remote_path = format!( + "{}/{}", + ctx.query_id_spill_prefix(), + GlobalUniqName::unique() + ); + let writer = op.writer(&remote_path).await?; let remote = pool.buffer_write(writer); - // let dir = todo!(); - // let path = todo!(); + let mut writer = UnionFileWriter::without_local(remote_path.clone(), remote); + let mut expected = b"hello union writer".to_vec(); + writer.write_all(&expected)?; + let extra = b" write bytes"; + writer.write_all(extra)?; + expected.extend_from_slice(extra); + writer.flush()?; - // let file = SyncDmaFile::create(path, true)?; - // let align = todo!(); - // let buf = DmaWriteBuf::new(align, 4 * 1024 * 1024); + let file = writer.finish()?; + assert!(file.local_path.is_none()); + assert_eq!(file.remote_offset, Some(0)); + assert_eq!(file.remote_size, expected.len() as u64); - let file = UnionFileWriter::without_local(path.to_string(), remote); - let mut file_writer = FileWriter::new(props, &data_schema, file)?; + let reader = op.reader(&file.remote_path).await?; + let buffer = reader.read(0..file.remote_size).await?; + 
assert_eq!(buffer.to_vec(), expected); - let mut row_groups = vec![]; - let row_group = file_writer.spill(vec![block])?; - row_groups.push((*row_group).clone()); + Ok(()) + } - let (metadata, file) = file_writer.finish()?; + #[tokio::test(flavor = "multi_thread")] + async fn test_union_file_writer_with_local() -> Result<()> { + let spill_dir = TempDir::new().expect("create spill temp dir"); + let mut config = ConfigBuilder::create().off_log().build(); + config.spill = SpillConfig::new_for_test( + spill_dir.path().to_string_lossy().into_owned(), + 0.01, + 1 << 30, + ); - let input = FileReader { - meta: metadata.into(), - local: None, - remote_reader: op.reader(&file.remote_path).await?, - remote_offset: None, - }; + let fixture = TestFixture::setup_with_config(&config).await?; + let ctx = fixture.new_query_ctx().await?; - let builder = ArrowReaderBuilder::new(input).await?; - let stream = builder.with_batch_size(usize::MAX).build()?; + let executor = GlobalIORuntime::instance(); + let memory = 1024 * 1024 * 100; + + let pool = BufferPool::create(executor, memory, 3); + let op = DataOperator::instance().operator(); + + let remote_path = format!( + "{}/{}", + ctx.query_id_spill_prefix(), + GlobalUniqName::unique() + ); + let writer = op.writer(&remote_path).await?; + let remote = pool.buffer_write(writer); + + let query_id = ctx.get_id(); + let temp_dir = TempDirManager::instance() + .get_disk_spill_dir(memory, &query_id) + .expect("local spill directory should be available"); + let temp_path = temp_dir + .new_file_with_size(0)? 
+ .expect("spill temp file should be allocated"); + + let dio = false; + let file = SyncDmaFile::create(&temp_path, dio)?; + let buf = DmaWriteBuf::new(temp_dir.block_alignment(), 4 * 1024 * 1024); + + let mut union_writer = UnionFileWriter::new( + temp_dir.clone(), + temp_path, + file, + buf, + remote_path.clone(), + remote, + ); + + assert!(union_writer.has_opening_local()); + + let mut expected = b"bytes on disk".to_vec(); + union_writer.write_all(&expected)?; + let extra = b" via union writer"; + union_writer.write_all(extra)?; + expected.extend_from_slice(extra); + union_writer.flush()?; + + let file = union_writer.finish()?; + + let local_path = file.local_path.clone().expect("local path should exist"); + assert!(file.remote_offset.is_none()); + assert_eq!(file.remote_size, 0); - let blocks = load_blocks_from_stream(&data_schema, stream).await?; - println!("{:?}", blocks); + let local_bytes = std::fs::read(local_path.as_ref())?; + assert_eq!(local_bytes, expected); Ok(()) } From 46fe327cb1b18cb1ba8117a299ac7c37065821ca Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 10:20:50 +0800 Subject: [PATCH 40/46] x --- src/common/base/src/base/dma.rs | 2 +- src/query/service/src/spillers/union_file.rs | 41 +++++++++----------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 03e3b782efc0a..633e01bd1f869 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -577,7 +577,7 @@ impl DmaWriteBuf { .push(Vec::with_capacity_in(self.chunk, self.allocator)); } - pub fn flush_full_buffer(&mut self, file: &mut SyncDmaFile) -> io::Result { + pub fn flush_if_full(&mut self, file: &mut SyncDmaFile) -> io::Result { debug_assert_eq!(self.allocator.0, file.alignment); if self.size() < self.chunk { diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index b7947e2e735ba..3c33e979ff341 100644 --- 
a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -229,6 +229,17 @@ struct LocalDst { buf: Option, } +impl LocalDst { + fn close(&mut self) -> io::Result { + let file = self.file.take().unwrap(); + let mut dma = self.buf.take().unwrap(); + let file_size = dma.flush_and_close(file)?; + + self.path.set_size(file_size).unwrap(); + Ok(file_size) + } +} + pub struct UnionFileWriter { local: Option, remote: String, @@ -258,7 +269,7 @@ impl UnionFileWriter { } } - fn without_local(remote: String, remote_writer: BufferWriter) -> Self { + fn remote_only(remote: String, remote_writer: BufferWriter) -> Self { UnionFileWriter { local: None, remote, @@ -315,7 +326,7 @@ impl UnionFileWriter { impl io::Write for UnionFileWriter { fn write(&mut self, buf: &[u8]) -> io::Result { - let (dma_buf, offset) = if let Some( + if let Some( local @ LocalDst { file: Some(_), buf: Some(_), @@ -332,30 +343,14 @@ impl io::Write for UnionFileWriter { if local.dir.grow_size(&mut local.path, buf.len(), false)? { dma.write(buf)?; let file = local.file.as_mut().unwrap(); - dma.flush_full_buffer(file)?; + dma.flush_if_full(file)?; local.path.set_size(file.length()).unwrap(); return Ok(n); } - let mut file = local.file.take().unwrap(); - dma.flush_full_buffer(&mut file)?; - - let file_size = file.length(); - local.path.set_size(file_size).unwrap(); - drop(file); - - (local.buf.take().unwrap().into_data(), file_size) - } else { - (vec![], 0) + self.remote_offset = local.close()? 
as _; }; - if offset != 0 { - self.remote_offset = offset as _; - } - - for buf in dma_buf { - self.remote_writer.as_mut().unwrap().write(&buf)?; - } self.remote_writer.as_mut().unwrap().write(buf) } @@ -471,10 +466,10 @@ impl SpillerInner { let buf = DmaWriteBuf::new(align, chunk); UnionFileWriter::new(disk.clone(), path, file, buf, remote_location, remote) } else { - UnionFileWriter::without_local(remote_location, remote) + UnionFileWriter::remote_only(remote_location, remote) } } else { - UnionFileWriter::without_local(remote_location, remote) + UnionFileWriter::remote_only(remote_location, remote) }; let props = WriterProperties::default().into(); @@ -599,7 +594,7 @@ mod tests { let writer = op.writer(&remote_path).await?; let remote = pool.buffer_write(writer); - let mut writer = UnionFileWriter::without_local(remote_path.clone(), remote); + let mut writer = UnionFileWriter::remote_only(remote_path.clone(), remote); let mut expected = b"hello union writer".to_vec(); writer.write_all(&expected)?; let extra = b" write bytes"; From 73a0fca11f4c071e1486cab66b1c2525c12c1a11 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 12:10:48 +0800 Subject: [PATCH 41/46] x --- .../transform_window_partition_collect.rs | 17 ++-- .../partition/window_partition_buffer_v2.rs | 12 +-- src/query/service/src/spillers/adapter.rs | 66 ++++++++++++---- src/query/service/src/spillers/inner.rs | 77 +++++++++---------- src/query/service/src/spillers/union_file.rs | 13 ++-- 5 files changed, 110 insertions(+), 75 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 8d5430420b55d..9e5308b8f2a83 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -40,6 +40,7 @@ use crate::spillers::Spiller; use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; use crate::spillers::SpillerType; +use crate::spillers::WindowSpiller; enum WindowBuffer { V1(WindowPartitionBuffer), @@ -49,14 +50,15 @@ enum WindowBuffer { impl WindowBuffer { fn new( is_v2: bool, - spiller: Spiller, + partition_spiller: Spiller, + writer_spiller: WindowSpiller, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, ) -> Result { if is_v2 { let inner = WindowPartitionBufferV2::new( - spiller, + writer_spiller, num_partitions, sort_block_size, memory_settings, @@ -64,7 +66,7 @@ impl WindowBuffer { Ok(Self::V2(inner)) } else { let inner = WindowPartitionBuffer::new( - spiller, + partition_spiller, num_partitions, sort_block_size, memory_settings, @@ -179,15 +181,18 @@ impl TransformWindowPartitionCollect { use_parquet: settings.get_spilling_file_format()?.is_parquet(), }; - // Create an inner `Spiller` to spill data. + // Create spillers for window operator. let operator = DataOperator::instance().spill_operator(); - let spiller = Spiller::create(ctx, operator, spill_config)?; + let partition_spiller = + Spiller::create(ctx.clone(), operator.clone(), spill_config.clone())?; + let window_spiller = WindowSpiller::create(ctx, operator, spill_config)?; // Create the window partition buffer. let sort_block_size = settings.get_window_partition_sort_block_size()? 
as usize; let buffer = WindowBuffer::new( true, - spiller, + partition_spiller, + window_spiller, partitions.len(), sort_block_size, memory_settings, diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 3a86ab9769ea2..7066e3907bdc4 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -22,7 +22,7 @@ use databend_common_pipeline_transforms::MemorySettings; use super::concat_data_blocks; use crate::spillers::SpillReader; use crate::spillers::SpillWriter; -use crate::spillers::Spiller; +use crate::spillers::WindowSpiller; #[async_trait::async_trait] pub trait Reader: Send { @@ -46,7 +46,7 @@ pub trait Builder: Send + Sync { } #[async_trait::async_trait] -impl Builder for Spiller { +impl Builder for WindowSpiller { type Writer = SpillWriter; async fn create(&self, schema: Arc) -> Result { @@ -181,7 +181,7 @@ where } } -pub(super) type WindowPartitionBufferV2 = PartitionBuffer; +pub(super) type WindowPartitionBufferV2 = PartitionBuffer; pub(super) struct PartitionBuffer where B: Builder @@ -196,9 +196,11 @@ where B: Builder next_to_restore_partition_id: isize, } -impl PartitionBuffer { +impl PartitionBuffer +where B: Builder +{ pub fn new( - spiller: Spiller, + spiller: B, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index bc7e18953bf70..6ba40fe1b68f3 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -277,7 +277,7 @@ impl Spiller { }; // Record statistics. 
- record_read_profile(location, &instant, data.len()); + record_read_profile(location.is_local(), &instant, data.len()); // Deserialize partitioned data block. let mut partitioned_data = Vec::with_capacity(partitions.len()); @@ -311,7 +311,7 @@ impl Spiller { Location::Remote(loc) => self.operator.read_with(loc).range(data_range).await?, }; - record_read_profile(location, &instant, data.len()); + record_read_profile(location.is_local(), &instant, data.len()); deserialize_block(layout, data) } @@ -354,20 +354,55 @@ impl Spiller { .cloned() .collect() } +} - pub fn new_spill_writer(&self, schema: Arc) -> Result { +#[derive(Clone)] +pub struct WindowWriterAdapter { + ctx: Arc, + buffer_pool: Arc, + chunk_size: usize, +} + +impl WindowWriterAdapter { + fn add_spill_file(&self, location: Location, layout: Layout, size: usize) { + if location.is_remote() { + self.ctx.as_ref().incr_spill_progress(1, size); + self.ctx + .as_ref() + .add_spill_file(location.clone(), layout.clone()); + } + } +} + +pub type WindowSpiller = SpillerInner; + +impl WindowSpiller { + pub fn create( + ctx: Arc, + operator: Operator, + config: SpillerConfig, + ) -> Result { let runtime = GlobalIORuntime::instance(); let buffer_pool = BufferPool::create( runtime, WINDOW_SPILL_BUFFER_MEMORY_BYTES, WINDOW_SPILL_BUFFER_WORKERS, ); + Self::new( + WindowWriterAdapter { + ctx, + buffer_pool, + chunk_size: WINDOW_SPILL_CHUNK_SIZE, + }, + operator, + config, + ) + } + pub fn new_spill_writer(&self, schema: Arc) -> Result { Ok(SpillWriter { spiller: self.clone(), - buffer_pool, - dio: self.temp_dir.is_some(), - chunk_size: WINDOW_SPILL_CHUNK_SIZE, + chunk_size: self.adapter.chunk_size, schema, file_writer: None, }) @@ -389,9 +424,7 @@ const WINDOW_SPILL_BUFFER_WORKERS: usize = 2; const WINDOW_SPILL_CHUNK_SIZE: usize = 8 * 1024 * 1024; pub struct SpillWriter { - spiller: Spiller, - buffer_pool: Arc, - dio: bool, + spiller: WindowSpiller, chunk_size: usize, schema: Arc, file_writer: Option>, @@ -405,7 +438,11 
@@ impl SpillWriter { let writer = self .spiller - .new_file_writer(&self.schema, &self.buffer_pool, self.dio, self.chunk_size) + .new_file_writer( + &self.schema, + &self.spiller.adapter.buffer_pool, + self.chunk_size, + ) .await?; self.file_writer = Some(writer); Ok(()) @@ -475,17 +512,15 @@ impl SpillWriter { schema: self.schema, parquet_metadata: Arc::new(metadata), union_file, - dio: self.dio, }) } } pub struct SpillReader { - spiller: Spiller, + spiller: WindowSpiller, schema: Arc, parquet_metadata: Arc, union_file: UnionFile, - dio: bool, } impl SpillReader { @@ -500,7 +535,6 @@ impl SpillReader { self.parquet_metadata.clone(), &self.schema, ordinals, - self.dio, ) .await } @@ -613,9 +647,7 @@ impl LiteSpiller { Location::Local(_) => None, }) .collect(); - let op = self.0.local_operator.as_ref().unwrap_or(&self.0.operator); - - op.delete_iter(files).await?; + self.0.operator.delete_iter(files).await?; Ok(()) } } diff --git a/src/query/service/src/spillers/inner.rs b/src/query/service/src/spillers/inner.rs index 39650357e00fa..4fd22b727be09 100644 --- a/src/query/service/src/spillers/inner.rs +++ b/src/query/service/src/spillers/inner.rs @@ -100,7 +100,7 @@ pub trait SpillAdapter: Send + Sync + 'static { /// 3. Serialization and deserialization input data /// 4. Interact with the underlying storage engine to write and read spilled data #[derive(Clone)] -pub struct SpillerInner { +pub struct SpillerInner { pub(super) adapter: A, pub(super) operator: Operator, location_prefix: String, @@ -111,7 +111,7 @@ pub struct SpillerInner { _spiller_type: SpillerType, } -impl SpillerInner { +impl SpillerInner { pub fn new(adapter: A, operator: Operator, config: SpillerConfig) -> Result { let SpillerConfig { location_prefix, @@ -139,17 +139,6 @@ impl SpillerInner { }) } - /// Spill some [`DataBlock`] to storage. These blocks will be concat into one. 
- #[fastrace::trace(name = "Spiller::spill")] - pub async fn spill(&self, data_block: Vec) -> Result { - let (location, layout, data_size) = self.spill_unmanage(data_block).await?; - - // Record columns layout for spilled data. - self.adapter - .add_spill_file(location.clone(), layout, data_size); - Ok(location) - } - async fn spill_unmanage( &self, data_block: Vec, @@ -179,13 +168,6 @@ impl SpillerInner { format!("{}/{}", self.location_prefix, GlobalUniqName::unique()) } - /// Read a certain file to a [`DataBlock`]. - #[fastrace::trace(name = "Spiller::read_spilled_file")] - pub async fn read_spilled_file(&self, location: &Location) -> Result { - let layout = self.adapter.get_spill_layout(location).unwrap(); - self.read_unmanage_spilled_file(location, &layout).await - } - async fn read_unmanage_spilled_file( &self, location: &Location, @@ -219,7 +201,7 @@ impl SpillerInner { Location::Remote(loc) => self.operator.read(loc).await?, }; - record_read_profile(location, &instant, data.len()); + record_read_profile(location.is_local(), &instant, data.len()); deserialize_block(columns_layout, data) } @@ -273,6 +255,26 @@ impl SpillerInner { } } +impl SpillerInner { + /// Spill some [`DataBlock`] to storage. These blocks will be concat into one. + #[fastrace::trace(name = "Spiller::spill")] + pub async fn spill(&self, data_block: Vec) -> Result { + let (location, layout, data_size) = self.spill_unmanage(data_block).await?; + + // Record columns layout for spilled data. + self.adapter + .add_spill_file(location.clone(), layout, data_size); + Ok(location) + } + + /// Read a certain file to a [`DataBlock`]. 
+ #[fastrace::trace(name = "Spiller::read_spilled_file")]
+ pub async fn read_spilled_file(&self, location: &Location) -> Result {
+ let layout = self.adapter.get_spill_layout(location).unwrap();
+ self.read_unmanage_spilled_file(location, &layout).await
+ }
+}
+
 pub(super) fn record_write_profile(is_local: bool, start: &Instant, write_bytes: usize) {
 if !is_local {
 Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1);
@@ -291,23 +293,20 @@ pub(super) fn record_write_profile(is_local: bool, start: &Instant, write_bytes:
 }
 }

-pub(super) fn record_read_profile(location: &Location, start: &Instant, read_bytes: usize) {
-    match location {
-        Location::Remote(_) => {
-            Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillReadCount, 1);
-            Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillReadBytes, read_bytes);
-            Profile::record_usize_profile(
-                ProfileStatisticsName::RemoteSpillReadTime,
-                start.elapsed().as_millis() as usize,
-            );
-        }
-        Location::Local(_) => {
-            Profile::record_usize_profile(ProfileStatisticsName::LocalSpillReadCount, 1);
-            Profile::record_usize_profile(ProfileStatisticsName::LocalSpillReadBytes, read_bytes);
-            Profile::record_usize_profile(
-                ProfileStatisticsName::LocalSpillReadTime,
-                start.elapsed().as_millis() as usize,
-            );
-        }
+pub(super) fn record_read_profile(is_local: bool, start: &Instant, read_bytes: usize) {
+    if !is_local {
+        Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillReadCount, 1);
+        Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillReadBytes, read_bytes);
+        Profile::record_usize_profile(
+            ProfileStatisticsName::RemoteSpillReadTime,
+            start.elapsed().as_millis() as usize,
+        );
+    } else {
+        Profile::record_usize_profile(ProfileStatisticsName::LocalSpillReadCount, 1);
+        Profile::record_usize_profile(ProfileStatisticsName::LocalSpillReadBytes, read_bytes);
+        Profile::record_usize_profile(
+            ProfileStatisticsName::LocalSpillReadTime,
+            
start.elapsed().as_millis() as usize, + ); } } diff --git a/src/query/service/src/spillers/union_file.rs b/src/query/service/src/spillers/union_file.rs index 3c33e979ff341..546a1a679ee7f 100644 --- a/src/query/service/src/spillers/union_file.rs +++ b/src/query/service/src/spillers/union_file.rs @@ -53,7 +53,6 @@ use parquet::schema::types::SchemaDescriptor; use super::async_buffer::BufferPool; use super::async_buffer::BufferWriter; -use super::SpillAdapter; use super::SpillerInner; pub struct RowGroupEncoder { @@ -445,15 +444,14 @@ impl AsyncFileReader for FileReader { } } -impl SpillerInner { +impl SpillerInner { pub(super) async fn new_file_writer( &self, schema: &DataSchema, pool: &Arc, - dio: bool, chunk: usize, ) -> Result> { - let op = self.local_operator.as_ref().unwrap_or(&self.operator); + let op = &self.operator; let remote_location = self.create_unique_location(); let remote_writer = op.writer(&remote_location).await?; @@ -461,7 +459,7 @@ impl SpillerInner { let union = if let Some(disk) = &self.temp_dir { if let Some(path) = disk.new_file_with_size(0)? 
{ - let file = SyncDmaFile::create(&path, dio)?; + let file = SyncDmaFile::create(&path, true)?; let align = disk.block_alignment(); let buf = DmaWriteBuf::new(align, chunk); UnionFileWriter::new(disk.clone(), path, file, buf, remote_location, remote) @@ -487,15 +485,14 @@ impl SpillerInner { meta: Arc, schema: &DataSchema, row_groups: Vec, - dio: bool, ) -> Result> { - let op = self.local_operator.as_ref().unwrap_or(&self.operator); + let op = &self.operator; let input = FileReader { meta, local: if let Some(path) = local_path { let alignment = Some(self.temp_dir.as_ref().unwrap().block_alignment()); - let file = AsyncDmaFile::open(&path, dio, alignment).await?; + let file = AsyncDmaFile::open(&path, true, alignment).await?; Some((path, file)) } else { None From 6c6483dbee36bf406ce266ab1e2216d92540cd22 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 13:20:20 +0800 Subject: [PATCH 42/46] x --- .../src/physical_plans/physical_recluster.rs | 1 + .../physical_window_partition.rs | 1 + .../transform_window_partition_collect.rs | 72 +++++++++++-------- .../partition/window_partition_buffer_v2.rs | 6 +- src/query/service/src/spillers/adapter.rs | 29 +++----- src/query/service/src/spillers/mod.rs | 1 + 6 files changed, 57 insertions(+), 53 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_recluster.rs b/src/query/service/src/physical_plans/physical_recluster.rs index 1cd58dc8364f3..b95a3031c25d6 100644 --- a/src/query/service/src/physical_plans/physical_recluster.rs +++ b/src/query/service/src/physical_plans/physical_recluster.rs @@ -350,6 +350,7 @@ impl IPhysicalPlan for HilbertPartition { self.num_partitions, window_spill_settings.clone(), disk_spill.clone(), + false, CompactStrategy::new(self.rows_per_block, max_bytes_per_block), )?, ))) diff --git a/src/query/service/src/physical_plans/physical_window_partition.rs b/src/query/service/src/physical_plans/physical_window_partition.rs index 9239994c9df02..f51a9e39415f6 100644 --- 
a/src/query/service/src/physical_plans/physical_window_partition.rs +++ b/src/query/service/src/physical_plans/physical_window_partition.rs @@ -181,6 +181,7 @@ impl IPhysicalPlan for WindowPartition { num_partitions, window_spill_settings.clone(), disk_spill.clone(), + true, strategy, )?, ))) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 9e5308b8f2a83..d525f44f435f9 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -20,6 +20,7 @@ use std::any::Any; use std::collections::VecDeque; use std::sync::Arc; +use databend_common_base::runtime::GlobalIORuntime; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; @@ -30,17 +31,19 @@ use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_transforms::MemorySettings; use databend_common_settings::Settings; use databend_common_storage::DataOperator; +use either::Either; use super::window_partition_buffer_v2::WindowPartitionBufferV2; use super::WindowPartitionBuffer; use super::WindowPartitionMeta; use crate::pipelines::processors::transforms::DataProcessorStrategy; use crate::sessions::QueryContext; +use crate::spillers::BackpressureSpiller; +use crate::spillers::BufferPool; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; use crate::spillers::SpillerDiskConfig; use crate::spillers::SpillerType; -use crate::spillers::WindowSpiller; enum WindowBuffer { V1(WindowPartitionBuffer), @@ -49,29 +52,30 @@ enum WindowBuffer { impl WindowBuffer { fn new( - is_v2: bool, - partition_spiller: Spiller, - writer_spiller: WindowSpiller, + 
spiller: Either, num_partitions: usize, sort_block_size: usize, memory_settings: MemorySettings, ) -> Result { - if is_v2 { - let inner = WindowPartitionBufferV2::new( - writer_spiller, - num_partitions, - sort_block_size, - memory_settings, - )?; - Ok(Self::V2(inner)) - } else { - let inner = WindowPartitionBuffer::new( - partition_spiller, - num_partitions, - sort_block_size, - memory_settings, - )?; - Ok(Self::V1(inner)) + match spiller { + Either::Left(spiller) => { + let inner = WindowPartitionBuffer::new( + spiller, + num_partitions, + sort_block_size, + memory_settings, + )?; + Ok(Self::V1(inner)) + } + Either::Right(spiller) => { + let inner = WindowPartitionBufferV2::new( + spiller, + num_partitions, + sort_block_size, + memory_settings, + )?; + Ok(Self::V2(inner)) + } } } @@ -150,6 +154,7 @@ pub struct TransformWindowPartitionCollect { } impl TransformWindowPartitionCollect { + #[expect(clippy::too_many_arguments)] pub fn new( ctx: Arc, input: Arc, @@ -160,6 +165,7 @@ impl TransformWindowPartitionCollect { num_partitions: usize, memory_settings: MemorySettings, disk_spill: Option, + enable_backpressure_spiller: bool, strategy: S, ) -> Result { // Calculate the partition ids collected by the processor. @@ -183,20 +189,24 @@ impl TransformWindowPartitionCollect { // Create spillers for window operator. let operator = DataOperator::instance().spill_operator(); - let partition_spiller = - Spiller::create(ctx.clone(), operator.clone(), spill_config.clone())?; - let window_spiller = WindowSpiller::create(ctx, operator, spill_config)?; + let spiller = if !enable_backpressure_spiller { + Either::Left(Spiller::create(ctx, operator, spill_config)?) + } else { + let runtime = GlobalIORuntime::instance(); + let buffer_pool = BufferPool::create(runtime, 128 * 1024 * 1024, 3); + Either::Right(BackpressureSpiller::create( + ctx, + operator, + spill_config, + buffer_pool, + 8 * 1024 * 1024, + )?) + }; // Create the window partition buffer. 
let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; - let buffer = WindowBuffer::new( - true, - partition_spiller, - window_spiller, - partitions.len(), - sort_block_size, - memory_settings, - )?; + let buffer = + WindowBuffer::new(spiller, partitions.len(), sort_block_size, memory_settings)?; Ok(Self { input, diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index 7066e3907bdc4..c2bc456cdd240 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -20,9 +20,9 @@ use databend_common_expression::DataSchema; use databend_common_pipeline_transforms::MemorySettings; use super::concat_data_blocks; +use crate::spillers::BackpressureSpiller; use crate::spillers::SpillReader; use crate::spillers::SpillWriter; -use crate::spillers::WindowSpiller; #[async_trait::async_trait] pub trait Reader: Send { @@ -46,7 +46,7 @@ pub trait Builder: Send + Sync { } #[async_trait::async_trait] -impl Builder for WindowSpiller { +impl Builder for BackpressureSpiller { type Writer = SpillWriter; async fn create(&self, schema: Arc) -> Result { @@ -181,7 +181,7 @@ where } } -pub(super) type WindowPartitionBufferV2 = PartitionBuffer; +pub(super) type WindowPartitionBufferV2 = PartitionBuffer; pub(super) struct PartitionBuffer where B: Builder diff --git a/src/query/service/src/spillers/adapter.rs b/src/query/service/src/spillers/adapter.rs index 6ba40fe1b68f3..5bdf2c22011f7 100644 --- a/src/query/service/src/spillers/adapter.rs +++ b/src/query/service/src/spillers/adapter.rs @@ -24,7 +24,6 @@ use std::time::Instant; use databend_common_base::base::dma_buffer_to_bytes; use databend_common_base::base::dma_read_file_range; use 
databend_common_base::base::ProgressValues; -use databend_common_base::runtime::GlobalIORuntime; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -357,13 +356,13 @@ impl Spiller { } #[derive(Clone)] -pub struct WindowWriterAdapter { +pub struct BackpressureAdapter { ctx: Arc, buffer_pool: Arc, chunk_size: usize, } -impl WindowWriterAdapter { +impl BackpressureAdapter { fn add_spill_file(&self, location: Location, layout: Layout, size: usize) { if location.is_remote() { self.ctx.as_ref().incr_spill_progress(1, size); @@ -374,25 +373,21 @@ impl WindowWriterAdapter { } } -pub type WindowSpiller = SpillerInner; +pub type BackpressureSpiller = SpillerInner; -impl WindowSpiller { +impl BackpressureSpiller { pub fn create( ctx: Arc, operator: Operator, config: SpillerConfig, + buffer_pool: Arc, + chunk_size: usize, ) -> Result { - let runtime = GlobalIORuntime::instance(); - let buffer_pool = BufferPool::create( - runtime, - WINDOW_SPILL_BUFFER_MEMORY_BYTES, - WINDOW_SPILL_BUFFER_WORKERS, - ); Self::new( - WindowWriterAdapter { + BackpressureAdapter { ctx, buffer_pool, - chunk_size: WINDOW_SPILL_CHUNK_SIZE, + chunk_size, }, operator, config, @@ -419,12 +414,8 @@ pub struct Chunk { pub layout: Layout, } -const WINDOW_SPILL_BUFFER_MEMORY_BYTES: usize = 64 * 1024 * 1024; -const WINDOW_SPILL_BUFFER_WORKERS: usize = 2; -const WINDOW_SPILL_CHUNK_SIZE: usize = 8 * 1024 * 1024; - pub struct SpillWriter { - spiller: WindowSpiller, + spiller: BackpressureSpiller, chunk_size: usize, schema: Arc, file_writer: Option>, @@ -517,7 +508,7 @@ impl SpillWriter { } pub struct SpillReader { - spiller: WindowSpiller, + spiller: BackpressureSpiller, schema: Arc, parquet_metadata: Arc, union_file: UnionFile, diff --git a/src/query/service/src/spillers/mod.rs b/src/query/service/src/spillers/mod.rs index bf148c7117bba..f538c3a25eb44 100644 --- a/src/query/service/src/spillers/mod.rs +++ 
b/src/query/service/src/spillers/mod.rs @@ -24,6 +24,7 @@ mod test_memory; mod union_file; pub use adapter::*; +pub use async_buffer::BufferPool; pub use block_writer::*; pub use databend_common_pipeline_transforms::traits::Location; pub use inner::*; From 6c7a1536e04f9e4b5368838e6eb74b22fd051ed2 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 13:43:49 +0800 Subject: [PATCH 43/46] settings --- .../src/physical_plans/physical_window_partition.rs | 3 ++- .../window/partition/window_partition_buffer_v2.rs | 8 ++++---- src/query/settings/src/settings_default.rs | 7 +++++++ src/query/settings/src/settings_getter_setter.rs | 4 ++++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_window_partition.rs b/src/query/service/src/physical_plans/physical_window_partition.rs index f51a9e39415f6..0e00ed3ec2af4 100644 --- a/src/query/service/src/physical_plans/physical_window_partition.rs +++ b/src/query/service/src/physical_plans/physical_window_partition.rs @@ -161,6 +161,7 @@ impl IPhysicalPlan for WindowPartition { _ => unimplemented!(), }; let window_spill_settings = MemorySettings::from_window_settings(&builder.ctx)?; + let enable_backpressure_spiller = settings.get_enable_backpressure_spiller()?; let processor_id = AtomicUsize::new(0); builder.main_pipeline.add_transform(|input, output| { @@ -181,7 +182,7 @@ impl IPhysicalPlan for WindowPartition { num_partitions, window_spill_settings.clone(), disk_spill.clone(), - true, + enable_backpressure_spiller, strategy, )?, ))) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index c2bc456cdd240..d960e9140f687 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -107,10 +107,10 @@ where { fn default() -> Self { Self { - state: PartitionSpillState::Empty, - spilled_ordinals: Vec::new(), - buffered_blocks: Vec::new(), - buffered_size: 0, + state: Default::default(), + spilled_ordinals: Default::default(), + buffered_blocks: Default::default(), + buffered_size: Default::default(), } } } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 1fba150428237..749749ce80ac3 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -228,6 +228,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=500)), }), + ("enable_backpressure_spiller", DefaultSettingValue { + value: UserSettingValue::UInt64(1), + desc: "Use new backpressure spiller.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=1)), + }), ("max_spill_io_requests", DefaultSettingValue { value: UserSettingValue::UInt64(default_max_spill_io_requests), desc: "Sets the maximum number of concurrent spill I/O requests.", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 3ba9d9b0fa2d0..087b24b7196f4 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -897,6 +897,10 @@ impl Settings { self.try_get_u64("dynamic_sample_time_budget_ms") } + pub fn get_enable_backpressure_spiller(&self) -> Result { + Ok(self.try_get_u64("enable_backpressure_spiller")? 
!= 0) + } + pub fn get_max_spill_io_requests(&self) -> Result { self.try_get_u64("max_spill_io_requests") } From 81bc6fc1c167ac8ac5ff608408814d0af53f639f Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 14:23:45 +0800 Subject: [PATCH 44/46] fsync --- src/common/base/src/base/dma.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 633e01bd1f869..bc9adac150bee 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -283,10 +283,14 @@ impl DmaFile { } } - fn truncate(&self, length: usize) -> io::Result<()> { + fn truncate(&mut self, length: usize) -> io::Result<()> { rustix::fs::ftruncate(&self.fd, length as u64).map_err(io::Error::from) } + pub fn fsync(&mut self) -> io::Result<()> { + rustix::fs::fsync(&self.fd).map_err(io::Error::from) + } + pub fn size(&self) -> io::Result { Ok(rustix::fs::fstat(&self.fd)?.st_size as _) } @@ -1009,6 +1013,7 @@ mod tests { { buf.write_all(b"1")?; buf.flush(&mut file)?; + file.fsync()?; assert_eq!(file.written, 1); @@ -1024,6 +1029,7 @@ mod tests { buf.write_all(b"2")?; buf.write_all(b"3")?; buf.flush(&mut file)?; + file.fsync()?; assert_eq!(file.written, 3); @@ -1045,6 +1051,7 @@ mod tests { buf.write_all(&data)?; buf.flush(&mut file)?; + file.fsync()?; assert_eq!(file.written, 3 + data.len()); From ac2ae7bddeef8c0af9f738902d92a61230ba9d91 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 15:02:54 +0800 Subject: [PATCH 45/46] fix --- src/common/base/src/base/dma.rs | 66 +++++++++---------- .../partition/window_partition_buffer_v2.rs | 29 +++----- 2 files changed, 41 insertions(+), 54 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index bc9adac150bee..8d6d7ace71f69 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -200,7 +200,7 @@ pub struct DmaFile { fd: F, alignment: Alignment, buf: Option, - 
written: usize, + offset: usize, } impl DmaFile { @@ -235,9 +235,10 @@ impl DmaFile { fn write_direct(&mut self) -> io::Result { let buf = self.buf.as_ref().unwrap().as_slice(); let mut written = 0; + let offset = self.align_down(self.offset); while written < buf.len() { - match rustix::io::write(&self.fd, &buf[written..]) { + match rustix::io::pwrite(&self.fd, &buf[written..], (offset + written) as _) { Ok(0) => { return Err(io::Error::new( io::ErrorKind::WriteZero, @@ -246,6 +247,7 @@ impl DmaFile { } Ok(n) => { written += n; + self.offset = offset + written } Err(err) => { if err.kind() == io::ErrorKind::Interrupted { @@ -255,15 +257,14 @@ impl DmaFile { } } } - self.inc_written(written); self.mut_buffer().clear(); Ok(written) } - fn inc_written(&mut self, n: usize) { + fn inc_offset(&mut self, n: usize) { debug_assert!(n >= self.alignment.as_usize()); debug_assert_eq!(n, self.alignment.align_down(n)); - self.written = self.align_down(self.written) + n; + self.offset = self.align_down(self.offset) + n; } fn read_direct(&mut self, n: usize) -> io::Result { @@ -296,7 +297,7 @@ impl DmaFile { } pub fn length(&self) -> usize { - self.written + self.offset } } @@ -367,7 +368,7 @@ impl AsyncDmaFile { fd: file, alignment, buf: None, - written: 0, + offset: 0, }) } @@ -403,7 +404,7 @@ impl AsyncDmaFile { fd: unsafe { BorrowedFd::borrow_raw(fd) }, alignment, buf: Some(buf), - written: 0, + offset: 0, }; file.read_direct(remain).map(|n| (file.buf.unwrap(), n)) }) @@ -434,9 +435,9 @@ impl SyncDmaFile { fn create_fd(path: impl rustix::path::Arg, dio: bool) -> io::Result { let flags = if cfg!(target_os = "linux") && dio { - OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::RDWR | OFlags::DIRECT + OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::WRONLY | OFlags::DIRECT } else { - OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::RDWR + OFlags::EXCL | OFlags::CREATE | OFlags::TRUNC | OFlags::WRONLY }; rustix::fs::open(path, flags, 
rustix::fs::Mode::from_raw_mode(0o666)) @@ -451,7 +452,7 @@ impl SyncDmaFile { fd, alignment, buf: None, - written: 0, + offset: 0, }) } @@ -509,7 +510,7 @@ impl DmaWriteBuf { fd: AsyncDmaFile::create_fd(path, dio).await?, alignment: self.allocator.0, buf: None, - written: 0, + offset: 0, }; let file_length = self.size(); @@ -597,7 +598,7 @@ impl DmaWriteBuf { let len = data.len() * self.chunk; let mut io_slices: Vec<_> = data.iter().map(|buf| IoSlice::new(buf)).collect(); - let written = writev_all(&file.fd, &mut io_slices)?; + let written = writev_all(&file.fd, &mut io_slices, file.align_down(file.offset))?; let last = self.data.pop(); self.data.clear(); @@ -608,7 +609,7 @@ impl DmaWriteBuf { _ => (), } - file.inc_written(written); + file.inc_offset(written); if written != len { Err(io::Error::other("short write")) @@ -631,7 +632,7 @@ impl DmaWriteBuf { let last = if let Some(mut last) = last { if self.data.is_empty() { use std::cmp::Ordering::*; - match (file.written - file.align_down(file.written)).cmp(&last.len()) { + match (file.offset - file.align_down(file.offset)).cmp(&last.len()) { Equal => return Ok(()), Greater => unreachable!(), Less => {} @@ -653,21 +654,16 @@ impl DmaWriteBuf { .map(|buf| IoSlice::new(buf)) .chain(last.as_ref().map(|last| IoSlice::new(&last.0))) .collect(); - let written = writev_all(&file.fd, &mut slices[..])?; + let written = writev_all(&file.fd, &mut slices[..], file.align_down(file.offset))?; self.data.clear(); - file.inc_written(written); + file.inc_offset(written); if let Some((last, len, pad)) = last { - file.written -= pad; - file.truncate(file.written)?; + file.offset -= pad; + file.truncate(file.offset)?; if !close { - rustix::fs::seek( - &file.fd, - rustix::fs::SeekFrom::Start(file.align_down(file.written) as _), - ) - .map_err(io::Error::from)?; self.write_all(&last[file.align_down(len)..(file.align_up(len) - pad)])?; } } @@ -685,18 +681,21 @@ impl DmaWriteBuf { } } -fn writev_all(fd: impl AsFd, mut slices: &mut 
[IoSlice<'_>]) -> io::Result { +fn writev_all(fd: impl AsFd, mut slices: &mut [IoSlice<'_>], offset: usize) -> io::Result { let mut written = 0; while !slices.is_empty() { - let n = match rustix::io::writev(fd.as_fd(), slices) { + match rustix::io::pwritev(fd.as_fd(), slices, (offset + written) as _) { Ok(0) => { return Err(io::Error::new( io::ErrorKind::WriteZero, "writev returned zero bytes", )); } - Ok(n) => n, + Ok(n) => { + written += n; + IoSlice::advance_slices(&mut slices, n); + } Err(err) => { if err.kind() == io::ErrorKind::Interrupted { continue; @@ -704,9 +703,6 @@ fn writev_all(fd: impl AsFd, mut slices: &mut [IoSlice<'_>]) -> io::Result io::Result<()> { - let filename = "test_file"; + let filename = "test_write_file"; let _ = std::fs::remove_file(filename); let mut file = SyncDmaFile::create(filename, true)?; @@ -1015,7 +1011,7 @@ mod tests { buf.flush(&mut file)?; file.fsync()?; - assert_eq!(file.written, 1); + assert_eq!(file.offset, 1); let mut got = Vec::new(); let mut read = std::fs::File::open(filename)?; @@ -1031,7 +1027,7 @@ mod tests { buf.flush(&mut file)?; file.fsync()?; - assert_eq!(file.written, 3); + assert_eq!(file.offset, 3); let mut got = Vec::new(); let mut read = std::fs::File::open(filename)?; @@ -1053,7 +1049,7 @@ mod tests { buf.flush(&mut file)?; file.fsync()?; - assert_eq!(file.written, 3 + data.len()); + assert_eq!(file.offset, 3 + data.len()); let mut got = Vec::new(); let mut read = std::fs::File::open(filename)?; @@ -1064,6 +1060,8 @@ mod tests { assert_eq!(want.as_slice(), got.as_slice()); } + let _ = std::fs::remove_file(filename); + Ok(()) } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs index d960e9140f687..51c2bcb89893b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer_v2.rs @@ -134,22 +134,11 @@ where } fn fetch_blocks(&mut self, threshold: Option) -> Option> { - match threshold { - None => { - if self.buffered_blocks.is_empty() { - None - } else { - Some(self.buffered_blocks.clone()) - } - } - Some(threshold) => { - if self.buffered_size >= threshold { - self.buffered_size = 0; - Some(std::mem::take(&mut self.buffered_blocks)) - } else { - None - } - } + if self.buffered_size >= threshold.unwrap_or_default() { + self.buffered_size = 0; + Some(std::mem::take(&mut self.buffered_blocks)) + } else { + None } } @@ -189,7 +178,7 @@ where B: Builder spiller: B, partitions: Vec>, memory_settings: MemorySettings, - min_spill_size: usize, + min_row_group_size: usize, num_partitions: usize, sort_block_size: usize, can_spill: bool, @@ -212,7 +201,7 @@ where B: Builder spiller, partitions, memory_settings, - min_spill_size: 1024 * 1024, + min_row_group_size: 10 * 1024 * 1024, num_partitions, sort_block_size, can_spill: false, @@ -238,7 +227,7 @@ where B: Builder } let partition = &mut self.partitions[partition_id]; partition.add_block(data_block); - if !self.can_spill && partition.memory_size() >= self.min_spill_size { + if !self.can_spill && partition.memory_size() >= self.min_row_group_size { self.can_spill = true; } } @@ -273,7 +262,7 @@ where B: Builder } if let Some((partition_id, size)) = preferred_partition - && size >= self.min_spill_size + && size >= self.min_row_group_size { let partition = &mut self.partitions[partition_id]; let blocks = partition.fetch_blocks(None).unwrap(); From 74295f1e62e706196c58da224878a8a387e6ddfb Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 10 Oct 2025 19:20:30 +0800 Subject: [PATCH 46/46] typos --- .typos.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.typos.toml b/.typos.toml index 015175dee80dc..3ee56eaf62946 100644 --- a/.typos.toml +++ b/.typos.toml @@ -25,6 +25,10 @@ "creat" = "creat" 
"crate" = "crate" +[default.extend-identifiers] +## External +WRONLY = "WRONLY" + [files] extend-exclude = [ "**/Cargo.toml",