feat: Add experimental support for native Parquet writes #2812
Changes from 34 commits
@@ -0,0 +1,282 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Parquet writer operator for writing RecordBatches to Parquet files
use std::{
    any::Any,
    fmt,
    fmt::{Debug, Formatter},
    fs::File,
    sync::Arc,
};

use arrow::datatypes::SchemaRef;
use async_trait::async_trait;
use datafusion::{
    error::{DataFusionError, Result},
    execution::context::TaskContext,
    physical_expr::EquivalenceProperties,
    physical_plan::{
        execution_plan::{Boundedness, EmissionType},
        metrics::{ExecutionPlanMetricsSet, MetricsSet},
        DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
        SendableRecordBatchStream, Statistics,
    },
};
use futures::TryStreamExt;
use parquet::{
    arrow::ArrowWriter,
    basic::{Compression, ZstdLevel},
    file::properties::WriterProperties,
};

use crate::execution::shuffle::CompressionCodec;

/// Parquet writer operator that writes input batches to a Parquet file
#[derive(Debug)]
pub struct ParquetWriterExec {
    /// Input execution plan
    input: Arc<dyn ExecutionPlan>,
    /// Output file path

Contributor: is it file or folder?

Member (author): It is a folder. Files named part-&lt;partition&gt;.parquet are written into it.
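
For illustration (the path and partition count here are hypothetical): with output_path set to /tmp/query_output and three input partitions, the operator writes /tmp/query_output/part-00000.parquet, part-00001.parquet, and part-00002.parquet, one file per input partition.
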
    output_path: String,
    /// Compression codec
    compression: CompressionCodec,
    /// Partition ID (from Spark TaskContext)
    partition_id: i32,
    /// Column names to use in the output Parquet file
    column_names: Vec<String>,
    /// Metrics
    metrics: ExecutionPlanMetricsSet,
    /// Cache for plan properties
    cache: PlanProperties,
}

impl ParquetWriterExec {
    /// Create a new ParquetWriterExec
    pub fn try_new(
        input: Arc<dyn ExecutionPlan>,
        output_path: String,
        compression: CompressionCodec,
        partition_id: i32,
        column_names: Vec<String>,
    ) -> Result<Self> {
        // Preserve the input's partitioning so each partition writes its own file
        let input_partitioning = input.output_partitioning().clone();

        let cache = PlanProperties::new(
            EquivalenceProperties::new(Arc::clone(&input.schema())),
            input_partitioning,
            EmissionType::Final,
            Boundedness::Bounded,
        );

        Ok(ParquetWriterExec {
            input,
            output_path,
            compression,
            partition_id,
            column_names,
            metrics: ExecutionPlanMetricsSet::new(),
            cache,
        })
    }

    fn compression_to_parquet(&self) -> Result<Compression> {
        match self.compression {
            CompressionCodec::None => Ok(Compression::UNCOMPRESSED),
            CompressionCodec::Zstd(level) => Ok(Compression::ZSTD(ZstdLevel::try_new(level)?)),
            CompressionCodec::Lz4Frame => Ok(Compression::LZ4),
            CompressionCodec::Snappy => Ok(Compression::SNAPPY),
        }
    }
}

impl DisplayAs for ParquetWriterExec {
    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
                write!(
                    f,
                    "ParquetWriterExec: path={}, compression={:?}",
                    self.output_path, self.compression
                )
            }
            DisplayFormatType::TreeRender => unimplemented!(),
        }
    }
}

#[async_trait]
impl ExecutionPlan for ParquetWriterExec {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "ParquetWriterExec"
    }

    fn metrics(&self) -> Option<MetricsSet> {
        Some(self.metrics.clone_inner())
    }

    fn statistics(&self) -> Result<Statistics> {
        self.input.partition_statistics(None)
    }

    fn properties(&self) -> &PlanProperties {
        &self.cache
    }

    fn schema(&self) -> SchemaRef {
        self.input.schema()
    }

    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
        vec![&self.input]
    }

    fn with_new_children(
        self: Arc<Self>,
        children: Vec<Arc<dyn ExecutionPlan>>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        match children.len() {
            1 => Ok(Arc::new(ParquetWriterExec::try_new(
                Arc::clone(&children[0]),
                self.output_path.clone(),
                self.compression.clone(),
                self.partition_id,
                self.column_names.clone(),
            )?)),
            _ => Err(DataFusionError::Internal(
                "ParquetWriterExec requires exactly one child".to_string(),
            )),
        }
    }

    fn execute(
        &self,
        partition: usize,
        context: Arc<TaskContext>,
    ) -> Result<SendableRecordBatchStream> {
        let input = self.input.execute(partition, context)?;
        let input_schema = self.schema();
        let output_path = self.output_path.clone();
        let compression = self.compression_to_parquet()?;
        let column_names = self.column_names.clone();

        // Create output schema with correct column names
        let output_schema = if !column_names.is_empty() {
            // Replace the generic column names (col_0, col_1, etc.) with the actual names
            let fields: Vec<_> = input_schema
                .fields()
                .iter()
                .enumerate()
                .map(|(i, field)| {
                    if i < column_names.len() {
                        Arc::new(field.as_ref().clone().with_name(&column_names[i]))
                    } else {
                        Arc::clone(field)
                    }
                })
                .collect();
            Arc::new(arrow::datatypes::Schema::new(fields))
        } else {
            // No column names provided, use input schema as-is
            Arc::clone(&input_schema)
        };

        // Strip file:// or file: prefix if present
        let local_path = output_path
            .strip_prefix("file://")

Contributor: what if hdfs:// ?

Member (author): I added a fallback for now so that it falls back to Spark if the path does not start with file://.

            .or_else(|| output_path.strip_prefix("file:"))
            .unwrap_or(&output_path)
            .to_string();
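
To make the fallback described in the comment above concrete, here is a minimal sketch of the kind of scheme check it implies. The helper name is hypothetical and not part of this PR; the actual decision to fall back to Spark would be made while planning the query, before this operator is ever created.

```rust
// Hypothetical sketch: native writes only handle local filesystem paths for
// now; any other URI scheme (hdfs://, s3a://, ...) should fall back to Spark.
fn is_native_write_supported(output_path: &str) -> bool {
    output_path.starts_with("file:") || !output_path.contains("://")
}

fn main() {
    assert!(is_native_write_supported("file:///tmp/out"));
    assert!(is_native_write_supported("/tmp/out"));
    assert!(!is_native_write_supported("hdfs://namenode/warehouse/out"));
}
```
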
| // Create output directory | ||
| std::fs::create_dir_all(&local_path).map_err(|e| { | ||
| DataFusionError::Execution(format!( | ||
| "Failed to create output directory '{}': {}", | ||
| local_path, e | ||
| )) | ||
| })?; | ||
|
|
||
| // Generate part file name for this partition | ||
| let part_file = format!("{}/part-{:05}.parquet", local_path, self.partition_id); | ||
|

Contributor: this doesn't seem right, the extension will be different depending on the codec

Member: This file name is best generated by …

Contributor: Thanks @wForget, are you proposing to keep hardcoded names for the PR and replicate Spark …

Member: Yes, this PR seems to be missing some work related to file commit. My proposed write process might look like this: create a staging dir -> native write files to staging dir -> file commit (move and merge staging files) -> add or update partitions.

Member (author): I filed #2827 for implementing the file commit protocol. This PR adds a starting point for development. Once it is merged, other contributors can help add the missing features.
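
As a rough illustration of the staging-based flow proposed in this thread, the sketch below writes each partition's file into a staging directory and only moves it into the output directory on success. All names are made up, it assumes local filesystem paths, and the Parquet write itself is replaced by a stand-in; the final "add or update partitions" step would still happen on the Spark side.

```rust
use std::fs;

// Hypothetical sketch of the proposed commit flow (not part of this PR).
fn write_partition_with_staging(output_dir: &str, partition_id: i32) -> std::io::Result<()> {
    fs::create_dir_all(output_dir)?;

    // 1. Each task writes into its own staging directory.
    let staging_dir = format!("{}/_temporary/{}", output_dir, partition_id);
    fs::create_dir_all(&staging_dir)?;
    let staged_file = format!("{}/part-{:05}.parquet", staging_dir, partition_id);

    // Stand-in for the native Parquet write into the staging directory.
    fs::File::create(&staged_file)?;

    // 2. Commit: move the staged file into its final location. A failed task
    //    never reaches this step, so it leaves no partial file in the output.
    let committed_file = format!("{}/part-{:05}.parquet", output_dir, partition_id);
    fs::rename(&staged_file, &committed_file)?;

    // 3. Clean up this task's staging directory.
    fs::remove_dir_all(&staging_dir)
}
```

Spark's own writers also embed the codec in the file name (for example a .snappy.parquet suffix), which is the naming concern raised at the top of this thread.
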

        // Create the Parquet file
        let file = File::create(&part_file).map_err(|e| {
            DataFusionError::Execution(format!(
                "Failed to create output file '{}': {}",
                part_file, e
            ))
        })?;

        // Configure writer properties
        let props = WriterProperties::builder()
            .set_compression(compression)
            .build();

        let mut writer = ArrowWriter::try_new(file, Arc::clone(&output_schema), Some(props))
            .map_err(|e| DataFusionError::Execution(format!("Failed to create writer: {}", e)))?;

        // Clone schema for use in async closure
        let schema_for_write = Arc::clone(&output_schema);

        // Write batches
        let write_task = async move {
            let mut stream = input;

            while let Some(batch_result) = stream.try_next().await.transpose() {
                let batch = batch_result?;

                // Rename columns in the batch to match output schema
                let renamed_batch = if !column_names.is_empty() {
                    use arrow::record_batch::RecordBatch;
                    RecordBatch::try_new(Arc::clone(&schema_for_write), batch.columns().to_vec())
                        .map_err(|e| {
                            DataFusionError::Execution(format!(
                                "Failed to rename batch columns: {}",
                                e
                            ))
                        })?
                } else {
                    batch
                };

                writer.write(&renamed_batch).map_err(|e| {
                    DataFusionError::Execution(format!("Failed to write batch: {}", e))
                })?;
            }

            writer.close().map_err(|e| {
                DataFusionError::Execution(format!("Failed to close writer: {}", e))
            })?;

            // Return empty stream to indicate completion
            Ok::<_, DataFusionError>(futures::stream::empty())
        };

        // Execute the write task and convert to a stream
        use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
        Ok(Box::pin(RecordBatchStreamAdapter::new(

Contributor: what if the partition failed? what would happen with the folder?

Member (author): 🤷 this is all highly experimental so far

            output_schema,
            futures::stream::once(write_task).try_flatten(),
        )))
    }
}
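
One possible (purely hypothetical, not in this PR) mitigation for the failure concern raised above is a best-effort cleanup that removes a partition's partially written file when its write task fails:

```rust
// Hypothetical helper: if a partition's write fails, remove its partially
// written part file so the output folder is not left holding a truncated
// Parquet file. Errors from the cleanup itself are deliberately ignored.
fn cleanup_part_file_on_failure<T, E>(part_file: &str, result: Result<T, E>) -> Result<T, E> {
    if result.is_err() {
        let _ = std::fs::remove_file(part_file);
    }
    result
}
```

The staging/commit flow discussed earlier would address this more completely, since files would only appear in the output folder after a task commits.
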

Reviewer: does it mean also Iceberg writes?

Reviewer: Should we change to InsertIntoHadoopFsRelationCommand here?

Member (author): Makes sense. I updated this.