Skip to content

Commit 77a4cb7

Browse files
bert-beyondloops (Bert Vermeiren)
and others authored
Fix COPY TO does not produce an output file for the empty set (#18074)
## Which issue does this PR close? COPY TO does not produce a single output file for an empty set - Closes #18073 ## Rationale for this change Executing following sql does not effectively create a single output file on disk : COPY (SELECT 1 AS id WHERE FALSE) TO 'table_no_rows.parquet'; I would expect it creates a parquet file containing 0 rows including the schema metadata. The fact you can still query the schema of such a table is still valuable information. ## What changes are included in this PR? ## Are these changes tested? Additional COPY TO test added in the copy.slt sqllogictests ## Are there any user-facing changes? A file containing 0 rows will be created now --------- Co-authored-by: Bert Vermeiren <[email protected]>
1 parent b5b7f9b commit 77a4cb7

File tree

2 files changed

+36
-1
lines changed

2 files changed

+36
-1
lines changed

datafusion/datasource/src/write/demux.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ use datafusion_common::cast::{
4040
};
4141
use datafusion_common::{exec_datafusion_err, internal_datafusion_err, not_impl_err};
4242
use datafusion_common_runtime::SpawnedTask;
43-
use datafusion_execution::TaskContext;
4443

4544
use chrono::NaiveDate;
45+
use datafusion_execution::TaskContext;
4646
use futures::StreamExt;
4747
use object_store::path::Path;
4848
use rand::distr::SampleString;
@@ -68,6 +68,11 @@ pub type DemuxedStreamReceiver = UnboundedReceiver<(Path, RecordBatchReceiver)>;
6868
/// be written with the extension from the path. Otherwise the default extension
6969
/// will be used and the output will be split into multiple files.
7070
///
71+
/// Output file guarantees:
72+
/// - Partitioned files: Files are created only for non-empty partitions.
73+
/// - Single-file output: 1 file is always written, even when the stream is empty.
74+
/// - Multi-file output: Depending on the number of record batches, 0 or more files are written.
75+
///
7176
/// Examples of `base_output_path`
7277
/// * `tmp/dataset/` -> is a folder since it ends in `/`
7378
/// * `tmp/dataset` -> is still a folder since it does not end in `/` but has no valid file extension
@@ -171,6 +176,21 @@ async fn row_count_demuxer(
171176
max_rows_per_file
172177
};
173178

179+
if single_file_output {
180+
// ensure we have one file open, even when the input stream is empty
181+
open_file_streams.push(create_new_file_stream(
182+
&base_output_path,
183+
&write_id,
184+
part_idx,
185+
&file_extension,
186+
single_file_output,
187+
max_buffered_batches,
188+
&mut tx,
189+
)?);
190+
row_counts.push(0);
191+
part_idx += 1;
192+
}
193+
174194
while let Some(rb) = input.next().await.transpose()? {
175195
// ensure we have at least minimum_parallel_files open
176196
if open_file_streams.len() < minimum_parallel_files {

datafusion/sqllogictest/test_files/copy.slt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,21 @@ select * from validate_parquet_single;
426426
1 Foo
427427
2 Bar
428428

429+
# copy 0 rows to a single parquet file output
430+
query I
431+
COPY (SELECT 1 AS id WHERE FALSE) TO 'test_files/scratch/copy/table_no_rows.parquet';
432+
----
433+
0
434+
435+
statement ok
436+
CREATE EXTERNAL TABLE validate_parquet_single_no_rows STORED AS PARQUET LOCATION 'test_files/scratch/copy/table_no_rows.parquet';
437+
438+
# validate the parquet file contains 0 rows.
439+
query I
440+
SELECT count(id) FROM validate_parquet_single_no_rows;
441+
----
442+
0
443+
429444
# copy from table to folder of compressed json files
430445
query I
431446
COPY source_table to 'test_files/scratch/copy/table_json_gz' STORED AS JSON OPTIONS ('format.compression' gzip);

0 commit comments

Comments
 (0)