Skip to content

Commit ded148c

Browse files
authored
apacheGH-41667: [C++][Parquet] Refuse writing non-nullable column that contains nulls (apache#44921)
### Rationale for this change A non-nullable column that contains nulls would result in an invalid Parquet file, so we'd rather raise an error when writing. This detection is only implemented for leaf columns. Implementing it for non-leaf columns would be more involved, and also doesn't actually seem necessary. ### Are these changes tested? Yes. ### Are there any user-facing changes? Raising a clear error when trying to write invalid data to Parquet, instead of letting the Parquet writer silently generate an invalid file. * GitHub Issue: apache#41667 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent fb8e812 commit ded148c

File tree

3 files changed

+32
-5
lines changed

3 files changed

+32
-5
lines changed

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3349,6 +3349,27 @@ TEST(TestArrowWrite, CheckChunkSize) {
33493349
WriteTable(*table, ::arrow::default_memory_pool(), sink, chunk_size));
33503350
}
33513351

3352+
void CheckWritingNonNullableColumnWithNulls(std::shared_ptr<::arrow::Field> field,
3353+
std::string json_batch) {
3354+
ARROW_SCOPED_TRACE("field = ", field, ", json_batch = ", json_batch);
3355+
auto schema = ::arrow::schema({field});
3356+
auto table = ::arrow::TableFromJSON(schema, {json_batch});
3357+
auto sink = CreateOutputStream();
3358+
EXPECT_RAISES_WITH_MESSAGE_THAT(
3359+
Invalid, ::testing::HasSubstr("is declared non-nullable but contains nulls"),
3360+
WriteTable(*table, ::arrow::default_memory_pool(), sink));
3361+
}
3362+
3363+
TEST(TestArrowWrite, InvalidSchema) {
3364+
// GH-41667: nulls in non-nullable column
3365+
CheckWritingNonNullableColumnWithNulls(
3366+
::arrow::field("a", ::arrow::int32(), /*nullable=*/false),
3367+
R"([{"a": 456}, {"a": null}])");
3368+
CheckWritingNonNullableColumnWithNulls(
3369+
::arrow::field("a", ::arrow::utf8(), /*nullable=*/false),
3370+
R"([{"a": "foo"}, {"a": null}])");
3371+
}
3372+
33523373
void DoNestedValidate(const std::shared_ptr<::arrow::DataType>& inner_type,
33533374
const std::shared_ptr<::arrow::Field>& outer_field,
33543375
const std::shared_ptr<Buffer>& buffer,

cpp/src/parquet/column_writer.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
13011301
bool single_nullable_element =
13021302
(level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
13031303
leaf_field_nullable;
1304+
if (!leaf_field_nullable && leaf_array.null_count() != 0) {
1305+
return Status::Invalid("Column '", descr_->name(),
1306+
"' is declared non-nullable but contains nulls");
1307+
}
13041308
bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
13051309
if (maybe_parent_nulls) {
13061310
ARROW_ASSIGN_OR_RAISE(

java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,9 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
9595
// DenseUnion
9696
List<Field> childFields = new ArrayList<>();
9797
childFields.add(
98-
new Field(
99-
"int-child", new FieldType(false, new ArrowType.Int(32, true), null, null), null));
98+
new Field("int-child", FieldType.notNullable(new ArrowType.Int(32, true)), null));
10099
Field structField =
101-
new Field(
102-
"struct", new FieldType(true, ArrowType.Struct.INSTANCE, null, null), childFields);
100+
new Field("struct", FieldType.nullable(ArrowType.Struct.INSTANCE), childFields);
103101
Field[] fields =
104102
new Field[] {
105103
Field.nullablePrimitive("null", ArrowType.Null.INSTANCE),
@@ -239,7 +237,11 @@ private VectorSchemaRoot generateAllTypesVector(BufferAllocator allocator) {
239237
largeListWriter.integer().writeInt(1);
240238
largeListWriter.endList();
241239

242-
((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class).set(1, 1);
240+
var intChildVector =
241+
((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class);
242+
// set a non-null value at index 0 since the field is not nullable
243+
intChildVector.set(0, 0);
244+
intChildVector.set(1, 1);
243245
return root;
244246
}
245247

0 commit comments

Comments
 (0)