Skip to content

Commit e6ad2c4

Browse files
committed
Fix handling of other column-specific properties
1 parent a9fd162 commit e6ad2c4

File tree

2 files changed

+176
-6
lines changed

2 files changed

+176
-6
lines changed

cpp/src/parquet/properties.h

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,17 +323,47 @@ class PARQUET_EXPORT WriterProperties {
323323
properties.content_defined_chunking_enabled()),
324324
content_defined_chunking_options_(
325325
properties.content_defined_chunking_options()) {
326-
for (const auto& kvp : properties.column_properties_) {
327-
if (kvp.second.statistics_enabled() !=
326+
for (const auto& [col_path, col_props] : properties.column_properties_) {
327+
if (col_props.statistics_enabled() !=
328328
default_column_properties_.statistics_enabled()) {
329-
if (kvp.second.statistics_enabled()) {
330-
this->enable_statistics(kvp.first);
329+
if (col_props.statistics_enabled()) {
330+
this->enable_statistics(col_path);
331331
} else {
332-
this->disable_statistics(kvp.first);
332+
this->disable_statistics(col_path);
333333
}
334334
}
335+
336+
if (col_props.dictionary_enabled() !=
337+
default_column_properties_.dictionary_enabled()) {
338+
if (col_props.dictionary_enabled()) {
339+
this->enable_dictionary(col_path);
340+
} else {
341+
this->disable_dictionary(col_path);
342+
}
343+
}
344+
345+
if (col_props.page_index_enabled() !=
346+
default_column_properties_.page_index_enabled()) {
347+
if (col_props.page_index_enabled()) {
348+
this->enable_write_page_index(col_path);
349+
} else {
350+
this->disable_write_page_index(col_path);
351+
}
352+
}
353+
354+
if (col_props.compression() != default_column_properties_.compression()) {
355+
this->compression(col_path, col_props.compression());
356+
}
357+
358+
if (col_props.compression_level() !=
359+
default_column_properties_.compression_level()) {
360+
this->compression_level(col_path, col_props.compression_level());
361+
}
362+
363+
if (col_props.encoding() != default_column_properties_.encoding()) {
364+
this->encoding(col_path, col_props.encoding());
365+
}
335366
}
336-
// TODO: Other column specific properties
337367
}
338368

339369
/// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.

cpp/src/parquet/properties_test.cc

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,5 +149,145 @@ TEST(TestReaderProperties, GetStreamInsufficientData) {
149149
}
150150
}
151151

152+
struct WriterPropertiesTestCase {
153+
WriterPropertiesTestCase(std::shared_ptr<WriterProperties> props, std::string label)
154+
: properties(std::move(props)), label(std::move(label)) {}
155+
156+
std::shared_ptr<WriterProperties> properties;
157+
std::string label;
158+
};
159+
160+
void PrintTo(const WriterPropertiesTestCase& p, std::ostream* os) { *os << p.label; }
161+
162+
class WriterPropertiesTest : public testing::TestWithParam<WriterPropertiesTestCase> {};
163+
164+
TEST_P(WriterPropertiesTest, RoundTripThroughBuilder) {
165+
const std::shared_ptr<WriterProperties>& properties = GetParam().properties;
166+
const std::vector<std::shared_ptr<ColumnPath>> columns{
167+
ColumnPath::FromDotString("a"),
168+
ColumnPath::FromDotString("b"),
169+
};
170+
171+
const auto round_tripped = WriterProperties::Builder(*properties).build();
172+
173+
ASSERT_EQ(round_tripped->content_defined_chunking_enabled(),
174+
properties->content_defined_chunking_enabled());
175+
ASSERT_EQ(round_tripped->created_by(), properties->created_by());
176+
ASSERT_EQ(round_tripped->data_pagesize(), properties->data_pagesize());
177+
ASSERT_EQ(round_tripped->data_page_version(), properties->data_page_version());
178+
ASSERT_EQ(round_tripped->dictionary_index_encoding(),
179+
properties->dictionary_index_encoding());
180+
ASSERT_EQ(round_tripped->dictionary_pagesize_limit(),
181+
properties->dictionary_pagesize_limit());
182+
ASSERT_EQ(round_tripped->file_encryption_properties(),
183+
properties->file_encryption_properties());
184+
ASSERT_EQ(round_tripped->max_rows_per_page(), properties->max_rows_per_page());
185+
ASSERT_EQ(round_tripped->max_row_group_length(), properties->max_row_group_length());
186+
ASSERT_EQ(round_tripped->memory_pool(), properties->memory_pool());
187+
ASSERT_EQ(round_tripped->page_checksum_enabled(), properties->page_checksum_enabled());
188+
ASSERT_EQ(round_tripped->size_statistics_level(), properties->size_statistics_level());
189+
ASSERT_EQ(round_tripped->sorting_columns(), properties->sorting_columns());
190+
ASSERT_EQ(round_tripped->store_decimal_as_integer(),
191+
properties->store_decimal_as_integer());
192+
ASSERT_EQ(round_tripped->write_batch_size(), properties->write_batch_size());
193+
ASSERT_EQ(round_tripped->version(), properties->version());
194+
195+
const auto cdc_options = properties->content_defined_chunking_options();
196+
const auto round_tripped_cdc_options =
197+
round_tripped->content_defined_chunking_options();
198+
ASSERT_EQ(round_tripped_cdc_options.min_chunk_size, cdc_options.min_chunk_size);
199+
ASSERT_EQ(round_tripped_cdc_options.max_chunk_size, cdc_options.max_chunk_size);
200+
ASSERT_EQ(round_tripped_cdc_options.norm_level, cdc_options.norm_level);
201+
202+
for (const auto& column : columns) {
203+
const auto& column_properties = properties->column_properties(column);
204+
const auto& round_tripped_col = round_tripped->column_properties(column);
205+
206+
ASSERT_EQ(round_tripped_col.compression(), column_properties.compression());
207+
ASSERT_EQ(round_tripped_col.compression_level(),
208+
column_properties.compression_level());
209+
ASSERT_EQ(round_tripped_col.dictionary_enabled(),
210+
column_properties.dictionary_enabled());
211+
ASSERT_EQ(round_tripped_col.encoding(), column_properties.encoding());
212+
ASSERT_EQ(round_tripped_col.max_statistics_size(),
213+
column_properties.max_statistics_size());
214+
ASSERT_EQ(round_tripped_col.page_index_enabled(),
215+
column_properties.page_index_enabled());
216+
ASSERT_EQ(round_tripped_col.statistics_enabled(),
217+
column_properties.statistics_enabled());
218+
}
219+
}
220+
221+
std::vector<WriterPropertiesTestCase> writer_properties_test_cases() {
222+
std::vector<WriterPropertiesTestCase> test_cases;
223+
224+
test_cases.emplace_back(default_writer_properties(), "default_properties");
225+
226+
{
227+
WriterProperties::Builder builder;
228+
const auto column_a = ColumnPath::FromDotString("a");
229+
230+
builder.created_by("parquet-cpp-properties-test");
231+
builder.encoding(Encoding::BYTE_STREAM_SPLIT);
232+
builder.compression(Compression::ZSTD);
233+
builder.compression_level(2);
234+
builder.disable_dictionary();
235+
builder.disable_statistics();
236+
builder.enable_content_defined_chunking();
237+
builder.dictionary_pagesize_limit(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT - 1);
238+
builder.write_batch_size(DEFAULT_WRITE_BATCH_SIZE - 1);
239+
builder.max_row_group_length(DEFAULT_MAX_ROW_GROUP_LENGTH - 1);
240+
builder.data_pagesize(kDefaultDataPageSize - 1);
241+
builder.max_rows_per_page(kDefaultMaxRowsPerPage - 1);
242+
builder.data_page_version(ParquetDataPageVersion::V2);
243+
builder.version(ParquetVersion::type::PARQUET_2_4);
244+
builder.enable_store_decimal_as_integer();
245+
builder.disable_write_page_index();
246+
builder.set_size_statistics_level(SizeStatisticsLevel::ColumnChunk);
247+
builder.set_sorting_columns(std::vector<SortingColumn>{
248+
SortingColumn{.column_idx = 1, .descending = true, .nulls_first = false}});
249+
250+
test_cases.emplace_back(builder.build(), "override_defaults");
251+
}
252+
253+
const auto column_a = ColumnPath::FromDotString("a");
254+
{
255+
WriterProperties::Builder builder;
256+
builder.disable_dictionary();
257+
builder.enable_dictionary(column_a);
258+
test_cases.emplace_back(builder.build(), "dictionary_column_override");
259+
}
260+
{
261+
WriterProperties::Builder builder;
262+
builder.disable_statistics();
263+
builder.enable_statistics(column_a);
264+
test_cases.emplace_back(builder.build(), "statistics_column_override");
265+
}
266+
{
267+
WriterProperties::Builder builder;
268+
builder.compression(Compression::SNAPPY);
269+
builder.compression(column_a, Compression::UNCOMPRESSED);
270+
builder.compression_level(column_a, 2);
271+
test_cases.emplace_back(builder.build(), "compression_column_override");
272+
}
273+
{
274+
WriterProperties::Builder builder;
275+
builder.encoding(Encoding::UNDEFINED);
276+
builder.encoding(column_a, Encoding::BYTE_STREAM_SPLIT);
277+
test_cases.emplace_back(builder.build(), "encoding_column_override");
278+
}
279+
{
280+
WriterProperties::Builder builder;
281+
builder.disable_write_page_index();
282+
builder.enable_write_page_index(column_a);
283+
test_cases.emplace_back(builder.build(), "page_index_column_override");
284+
}
285+
286+
return test_cases;
287+
}
288+
289+
INSTANTIATE_TEST_SUITE_P(WriterPropertiesTest, WriterPropertiesTest,
290+
::testing::ValuesIn(writer_properties_test_cases()));
291+
152292
} // namespace test
153293
} // namespace parquet

0 commit comments

Comments
 (0)