File tree Expand file tree Collapse file tree 4 files changed +14
-10
lines changed
Expand file tree Collapse file tree 4 files changed +14
-10
lines changed Original file line number Diff line number Diff line change @@ -404,8 +404,8 @@ class FileWriterImpl : public FileWriter {
404404 }
405405 // max_row_group_bytes is applied only after the row group has accumulated data.
406406 if (row_group_writer_ != nullptr && row_group_writer_->num_rows () > 0 ) {
407- double avg_row_size = row_group_writer_-> current_buffered_bytes () * 1.0 /
408- row_group_writer_->num_rows ();
407+ double avg_row_size =
408+ row_group_writer_-> total_buffered_bytes () * 1.0 / row_group_writer_->num_rows ();
409409 chunk_size = std::min (
410410 chunk_size,
411411 static_cast <int64_t >(this ->properties ().max_row_group_bytes () / avg_row_size));
@@ -496,11 +496,11 @@ class FileWriterImpl : public FileWriter {
496496 int64_t batch_size =
497497 std::min (max_row_group_length - group_rows, batch.num_rows () - offset);
498498 if (group_rows > 0 ) {
499- int64_t buffered_bytes = row_group_writer_->current_buffered_bytes ();
500- double avg_row_bytes = buffered_bytes * 1.0 / group_rows;
499+ int64_t buffered_bytes = row_group_writer_->total_buffered_bytes ();
500+ double avg_row_size = buffered_bytes * 1.0 / group_rows;
501501 batch_size = std::min (
502502 batch_size,
503- static_cast <int64_t >((max_row_group_bytes - buffered_bytes) / avg_row_bytes ));
503+ static_cast <int64_t >((max_row_group_bytes - buffered_bytes) / avg_row_size ));
504504 }
505505 if (batch_size > 0 ) {
506506 RETURN_NOT_OK (WriteBatch (offset, batch_size));
Original file line number Diff line number Diff line change @@ -68,10 +68,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
6868 return contents_->total_compressed_bytes_written ();
6969}
7070
71- int64_t RowGroupWriter::current_buffered_bytes () const {
71+ int64_t RowGroupWriter::total_buffered_bytes () const {
7272 return contents_->total_compressed_bytes () +
7373 contents_->total_compressed_bytes_written () +
74- contents_->estimated_buffered_value_bytes ();
74+ contents_->EstimatedBufferedValueBytes ();
7575}
7676
7777bool RowGroupWriter::buffered () const { return contents_->buffered (); }
@@ -201,7 +201,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
201201 return total_compressed_bytes_written;
202202 }
203203
204- int64_t estimated_buffered_value_bytes () const override {
204+ int64_t EstimatedBufferedValueBytes () const override {
205205 if (closed_) {
206206 return 0 ;
207207 }
Original file line number Diff line number Diff line change @@ -59,7 +59,7 @@ class PARQUET_EXPORT RowGroupWriter {
5959 // / \brief total compressed bytes written by the page writer
6060 virtual int64_t total_compressed_bytes_written () const = 0;
6161 // / \brief estimated size of the values that are not written to a page yet
62- virtual int64_t estimated_buffered_value_bytes () const = 0;
62+ virtual int64_t EstimatedBufferedValueBytes () const = 0;
6363
6464 virtual bool buffered () const = 0;
6565 };
@@ -103,7 +103,7 @@ class PARQUET_EXPORT RowGroupWriter {
103103 int64_t total_compressed_bytes_written () const ;
104104 // / \brief including compressed bytes in page writer and uncompressed data
105105 // / value buffer.
106- int64_t current_buffered_bytes () const ;
106+ int64_t total_buffered_bytes () const ;
107107
108108 // / Returns whether the current RowGroupWriter is in the buffered mode and is created
109109 // / by calling ParquetFileWriter::AppendBufferedRowGroup.
Original file line number Diff line number Diff line change 2626#include " arrow/io/caching.h"
2727#include " arrow/type_fwd.h"
2828#include " arrow/util/compression.h"
29+ #include " arrow/util/logging.h"
2930#include " arrow/util/type_fwd.h"
3031#include " parquet/encryption/encryption.h"
3132#include " parquet/exception.h"
@@ -417,13 +418,16 @@ class PARQUET_EXPORT WriterProperties {
417418 // / Specify the max number of rows to put in a single row group.
418419 // / Default 1Mi rows.
419420 Builder* max_row_group_length (int64_t max_row_group_length) {
421+ ARROW_CHECK_GT (max_row_group_length, 0 ) << " max_row_group_length must be positive" ;
420422 max_row_group_length_ = max_row_group_length;
421423 return this ;
422424 }
423425
424426 // / Specify the max number of bytes to put in a single row group.
427+ // / The size is estimated based on encoded and compressed data.
425428 // / Default 128MB.
426429 Builder* max_row_group_bytes (int64_t max_row_group_bytes) {
430+ ARROW_CHECK_GT (max_row_group_bytes, 0 ) << " max_row_group_bytes must be positive" ;
427431 max_row_group_bytes_ = max_row_group_bytes;
428432 return this ;
429433 }
You can’t perform that action at this time.
0 commit comments