@@ -19,7 +19,6 @@

 use std::collections::HashMap;
 use std::sync::Arc;
-use std::sync::atomic::AtomicI64;

 use arrow_schema::SchemaRef as ArrowSchemaRef;
 use bytes::Bytes;
@@ -36,7 +35,6 @@ use parquet::thrift::{TCompactOutputProtocol, TSerializable};
 use thrift::protocol::TOutputProtocol;

 use super::location_generator::{FileNameGenerator, LocationGenerator};
-use super::track_writer::TrackWriter;
 use super::{FileWriter, FileWriterBuilder};
 use crate::arrow::{
     ArrowFileReader, DEFAULT_MAP_FIELD_NAME, NanValueCountVisitor, get_parquet_stat_max_as_datum,
@@ -87,7 +85,6 @@ impl<T: LocationGenerator, F: FileNameGenerator> FileWriterBuilder for ParquetWr
     type R = ParquetWriter;

     async fn build(self) -> Result<Self::R> {
-        let written_size = Arc::new(AtomicI64::new(0));
         let out_file = self.file_io.new_output(
             self.location_generator
                 .generate_location(&self.file_name_generator.generate_file_name()),
@@ -97,7 +94,6 @@ impl<T: LocationGenerator, F: FileNameGenerator> FileWriterBuilder for ParquetWr
             schema: self.schema.clone(),
             inner_writer: None,
             writer_properties: self.props,
-            written_size,
             current_row_num: 0,
             out_file,
             nan_value_count_visitor: NanValueCountVisitor::new(),
@@ -227,11 +223,8 @@ impl SchemaVisitor for IndexByParquetPathName {
 pub struct ParquetWriter {
     schema: SchemaRef,
     out_file: OutputFile,
-    inner_writer: Option<AsyncArrowWriter<AsyncFileWriter<TrackWriter>>>,
+    inner_writer: Option<AsyncArrowWriter<AsyncFileWriter<Box<dyn FileWrite>>>>,
     writer_properties: WriterProperties,
-    // written_size is only accurate after closing the inner writer,
-    // because the inner writer flushes data asynchronously.
-    written_size: Arc<AtomicI64>,
     current_row_num: usize,
     nan_value_count_visitor: NanValueCountVisitor,
 }
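The dropped field and its comment describe the pattern being removed: a wrapper around the output stream that adds every written chunk to a shared `AtomicI64`. Below is a minimal sketch of that pattern over `std::io::Write`, for illustration only; the real `TrackWriter` wraps this crate's async `FileWrite` trait, and the name `CountingWriter` here is made up:

```rust
use std::io::{self, Write};
use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};

/// Illustration of the removed pattern: count bytes as they pass through.
struct CountingWriter<W: Write> {
    inner: W,
    written: Arc<AtomicI64>,
}

impl<W: Write> Write for CountingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.inner.write(buf)?;
        // Visible to other holders of the Arc, but only consistent with the
        // file contents once all buffered data has actually been flushed.
        self.written.fetch_add(n as i64, Ordering::Relaxed);
        Ok(n)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}

fn main() -> io::Result<()> {
    let counter = Arc::new(AtomicI64::new(0));
    let mut w = CountingWriter {
        inner: Vec::<u8>::new(),
        written: counter.clone(),
    };
    w.write_all(b"hello parquet")?;
    w.flush()?;
    assert_eq!(counter.load(Ordering::Relaxed), 13);
    Ok(())
}
```

As the deleted comment says, the counter only agrees with the file once the inner writer has flushed, and `AsyncArrowWriter` buffers row groups internally, so reading the atomic mid-write under-reports. Asking the writer itself via `bytes_written()` (below) removes both the wrapper type and the shared state.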
@@ -534,8 +527,7 @@ impl FileWriter for ParquetWriter {
             writer
         } else {
             let arrow_schema: ArrowSchemaRef = Arc::new(self.schema.as_ref().try_into()?);
-            let inner_writer =
-                TrackWriter::new(self.out_file.writer().await?, self.written_size.clone());
+            let inner_writer = self.out_file.writer().await?;
             let async_writer = AsyncFileWriter::new(inner_writer);
             let writer = AsyncArrowWriter::try_new(
                 async_writer,
@@ -562,16 +554,16 @@
     }

     async fn close(mut self) -> Result<Vec<DataFileBuilder>> {
-        let writer = match self.inner_writer.take() {
+        let mut writer = match self.inner_writer.take() {
             Some(writer) => writer,
             None => return Ok(vec![]),
         };

-        let metadata = writer.close().await.map_err(|err| {
-            Error::new(ErrorKind::Unexpected, "Failed to close parquet writer.").with_source(err)
+        let metadata = writer.finish().await.map_err(|err| {
+            Error::new(ErrorKind::Unexpected, "Failed to finish parquet writer.").with_source(err)
         })?;

-        let written_size = self.written_size.load(std::sync::atomic::Ordering::Relaxed);
+        let written_size = writer.bytes_written();

         if self.current_row_num == 0 {
             self.out_file.delete().await.map_err(|err| {
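The switch from `close()` to `finish()` is what makes `bytes_written()` readable afterwards: `AsyncArrowWriter::close(self)` consumes the writer, while `finish(&mut self)` only borrows it, hence the new `let mut writer` binding. A standalone sketch of that call order, assuming an in-memory `Vec<u8>` as the write target and the three-argument `try_new` of recent arrow-rs releases:

```rust
use std::sync::Arc;

use arrow_array::{Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::AsyncArrowWriter;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;

    let mut writer = AsyncArrowWriter::try_new(Vec::<u8>::new(), schema, None)?;
    writer.write(&batch).await?;

    // finish() flushes buffered row groups and writes the footer, but only
    // borrows the writer, so the final byte count can still be queried.
    let _metadata = writer.finish().await?;
    println!("bytes written: {}", writer.bytes_written());
    Ok(())
}
```

`bytes_written()` returns `usize` directly, which is why the last hunk also drops the `as usize` cast.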
@@ -595,7 +587,7 @@
         Ok(vec![Self::parquet_to_data_file_builder(
             self.schema,
             parquet_metadata,
-            written_size as usize,
+            written_size,
             self.out_file.location().to_string(),
             self.nan_value_count_visitor.nan_value_counts,
         )?])