Skip to content

Commit 13fe7b1

Browse files
committed
address comments
1 parent e19db37 commit 13fe7b1

File tree

4 files changed

+14
-10
lines changed

4 files changed

+14
-10
lines changed

cpp/src/parquet/arrow/writer.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,8 @@ class FileWriterImpl : public FileWriter {
404404
}
405405
// max_row_group_bytes is applied only after the row group has accumulated data.
406406
if (row_group_writer_ != nullptr && row_group_writer_->num_rows() > 0) {
407-
double avg_row_size = row_group_writer_->current_buffered_bytes() * 1.0 /
408-
row_group_writer_->num_rows();
407+
double avg_row_size =
408+
row_group_writer_->total_buffered_bytes() * 1.0 / row_group_writer_->num_rows();
409409
chunk_size = std::min(
410410
chunk_size,
411411
static_cast<int64_t>(this->properties().max_row_group_bytes() / avg_row_size));
@@ -496,11 +496,11 @@ class FileWriterImpl : public FileWriter {
496496
int64_t batch_size =
497497
std::min(max_row_group_length - group_rows, batch.num_rows() - offset);
498498
if (group_rows > 0) {
499-
int64_t buffered_bytes = row_group_writer_->current_buffered_bytes();
500-
double avg_row_bytes = buffered_bytes * 1.0 / group_rows;
499+
int64_t buffered_bytes = row_group_writer_->total_buffered_bytes();
500+
double avg_row_size = buffered_bytes * 1.0 / group_rows;
501501
batch_size = std::min(
502502
batch_size,
503-
static_cast<int64_t>((max_row_group_bytes - buffered_bytes) / avg_row_bytes));
503+
static_cast<int64_t>((max_row_group_bytes - buffered_bytes) / avg_row_size));
504504
}
505505
if (batch_size > 0) {
506506
RETURN_NOT_OK(WriteBatch(offset, batch_size));

cpp/src/parquet/file_writer.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
6868
return contents_->total_compressed_bytes_written();
6969
}
7070

71-
int64_t RowGroupWriter::current_buffered_bytes() const {
71+
int64_t RowGroupWriter::total_buffered_bytes() const {
7272
return contents_->total_compressed_bytes() +
7373
contents_->total_compressed_bytes_written() +
74-
contents_->estimated_buffered_value_bytes();
74+
contents_->EstimatedBufferedValueBytes();
7575
}
7676

7777
bool RowGroupWriter::buffered() const { return contents_->buffered(); }
@@ -201,7 +201,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
201201
return total_compressed_bytes_written;
202202
}
203203

204-
int64_t estimated_buffered_value_bytes() const override {
204+
int64_t EstimatedBufferedValueBytes() const override {
205205
if (closed_) {
206206
return 0;
207207
}

cpp/src/parquet/file_writer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class PARQUET_EXPORT RowGroupWriter {
5959
/// \brief total compressed bytes written by the page writer
6060
virtual int64_t total_compressed_bytes_written() const = 0;
6161
/// \brief estimated size of the values that are not written to a page yet
62-
virtual int64_t estimated_buffered_value_bytes() const = 0;
62+
virtual int64_t EstimatedBufferedValueBytes() const = 0;
6363

6464
virtual bool buffered() const = 0;
6565
};
@@ -103,7 +103,7 @@ class PARQUET_EXPORT RowGroupWriter {
103103
int64_t total_compressed_bytes_written() const;
104104
/// \brief including compressed bytes in page writer and uncompressed data
105105
/// value buffer.
106-
int64_t current_buffered_bytes() const;
106+
int64_t total_buffered_bytes() const;
107107

108108
/// Returns whether the current RowGroupWriter is in the buffered mode and is created
109109
/// by calling ParquetFileWriter::AppendBufferedRowGroup.

cpp/src/parquet/properties.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "arrow/io/caching.h"
2727
#include "arrow/type_fwd.h"
2828
#include "arrow/util/compression.h"
29+
#include "arrow/util/logging.h"
2930
#include "arrow/util/type_fwd.h"
3031
#include "parquet/encryption/encryption.h"
3132
#include "parquet/exception.h"
@@ -417,13 +418,16 @@ class PARQUET_EXPORT WriterProperties {
417418
/// Specify the max number of rows to put in a single row group.
418419
/// Default 1Mi rows.
419420
Builder* max_row_group_length(int64_t max_row_group_length) {
421+
ARROW_CHECK_GT(max_row_group_length, 0) << "max_row_group_length must be positive";
420422
max_row_group_length_ = max_row_group_length;
421423
return this;
422424
}
423425

424426
/// Specify the max number of bytes to put in a single row group.
427+
/// The size is estimated based on encoded and compressed data.
425428
/// Default 128MB.
426429
Builder* max_row_group_bytes(int64_t max_row_group_bytes) {
430+
ARROW_CHECK_GT(max_row_group_bytes, 0) << "max_row_group_bytes must be positive";
427431
max_row_group_bytes_ = max_row_group_bytes;
428432
return this;
429433
}

0 commit comments

Comments
 (0)