diff --git a/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroupBuilder/DremelShredder.php b/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroupBuilder/DremelShredder.php index fccd50057..609a84245 100644 --- a/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroupBuilder/DremelShredder.php +++ b/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroupBuilder/DremelShredder.php @@ -50,9 +50,11 @@ public function shred(Column $column, array $row) : FlatColumnData return $flatData; } - private function shredFlat(FlatColumn $column, mixed $value, int $definitionLevel, int $repetitionLevel, FlatColumnData $data) : void + private function shredFlat(FlatColumn $column, mixed $value, int $definitionLevel, int $repetitionLevel, FlatColumnData $data, bool $validate = true) : void { - $this->validator->validate($column, $value); + if ($validate) { + $this->validator->validate($column, $value); + } if (!$column->repetition()?->isRequired() && $value !== null) { $definitionLevel++; @@ -84,7 +86,7 @@ private function shredList(NestedColumn $column, ?array $listValue, int $definit if ($listElementColumn instanceof FlatColumn) { if ($listValue === null) { - $this->shredFlat($listElementColumn, null, $definitionLevel, $repetitionLevel - 1, $data); + $this->shredFlat($listElementColumn, null, $definitionLevel, $repetitionLevel - 1, $data, false); return; } @@ -94,7 +96,7 @@ private function shredList(NestedColumn $column, ?array $listValue, int $definit } if (!\count($listValue)) { - $this->shredFlat($listElementColumn, null, $definitionLevel, $repetitionLevel - 1, $data); + $this->shredFlat($listElementColumn, null, $definitionLevel, $repetitionLevel - 1, $data, false); return; } @@ -208,7 +210,7 @@ private function shredMap(NestedColumn $column, ?array $mapValue, int $definitio if (!\count($mapValue)) { $this->shredFlat($keyColumn->makeOptional(), null, $definitionLevel, $repetitionLevel - 1, $data); - $this->shredFlat($valueColumn, null, $definitionLevel, $repetitionLevel - 1, $data); + $this->shredFlat($valueColumn, null, $definitionLevel, $repetitionLevel - 1, $data, false); return; } diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelListsTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelListsTest.php index 3bbdac43c..7696f370c 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelListsTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelListsTest.php @@ -1396,6 +1396,54 @@ public function test_optional_list_optional_struct_optional_struct_optional_int3 self::assertEquals($rows, (new DremelAssembler(DataConverter::initialize(Options::default())))->assemble($schema->get('l'), $flatData)); } + #[TestWith([ + [ + ['l' => []], + ], + [ + 'l.list.element' => [ + 'repetition_levels' => [0], + 'definition_levels' => [1], + 'values' => [], + ], + ], + ])] + #[TestWith([ + [ + ['l' => [1, 2]], + ], + [ + 'l.list.element' => [ + 'repetition_levels' => [0, 1], + 'definition_levels' => [2, 2], + 'values' => [1, 2], + ], + ], + ])] + public function test_optional_list_required_int32(array $rows, array $expectedColumnData) : void + { + $schema = Schema::with(NestedColumn::list('l', ListElement::int32(true))); + + $dremel = new DremelShredder(new ColumnDataValidator(), DataConverter::initialize(Options::default())); + + self::assertEquals('OPTIONAL,REPEATED,REQUIRED', $schema->get('l.list.element')->repetitions()); + self::assertEquals(2, $schema->get('l.list.element')->repetitions()->maxDefinitionLevel()); + self::assertEquals(1, $schema->get('l.list.element')->repetitions()->maxRepetitionLevel()); + + /** + * @var ?FlatColumnData $flatData + */ + $flatData = \array_reduce( + $rows, + static fn (?FlatColumnData $flatData, array $row) => $flatData === null + ? $dremel->shred($schema->get('l'), $row) + : $flatData->merge($dremel->shred($schema->get('l'), $row)) + ); + + self::assertEquals($expectedColumnData, $flatData->normalize()); + self::assertEquals($rows, (new DremelAssembler(DataConverter::initialize(Options::default())))->assemble($schema->get('l'), $flatData)); + } + #[TestWith([ [ ['l' => [1, 2, 3]], diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelMapsTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelMapsTest.php index 878f77d77..b35865c55 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelMapsTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/ParquetFile/RowGroupBuilder/Dremel/DremelMapsTest.php @@ -894,6 +894,83 @@ public function test_optional_map_string_optional_struct_optional_list_optional_ } } + #[TestWith( + [ + [ + ['m' => []], + ], + [ + 'm.key_value.key' => [ + 'repetition_levels' => [0], + 'definition_levels' => [1], + 'values' => [], + ], + 'm.key_value.value' => [ + 'repetition_levels' => [0], + 'definition_levels' => [1], + 'values' => [], + ], + ], + ] + )] + #[TestWith( + [ + [ + ['m' => ['a' => 1, 'b' => 2]], + ], + [ + 'm.key_value.key' => [ + 'repetition_levels' => [0, 1], + 'definition_levels' => [2, 2], + 'values' => ['a', 'b'], + ], + 'm.key_value.value' => [ + 'repetition_levels' => [0, 1], + 'definition_levels' => [2, 2], + 'values' => [1, 2], + ], + ], + ] + )] + public function test_optional_map_string_required_int32(array $rows, array $expectedFlatData, ?string $exceptionMessage = null) : void + { + $schema = Schema::with(NestedColumn::map('m', MapKey::string(), MapValue::int32(true))); + + $dremel = new DremelShredder(new ColumnDataValidator(), DataConverter::initialize(Options::default())); + + self::assertEquals('OPTIONAL,REPEATED,REQUIRED', $schema->get('m.key_value.key')->repetitions()); + self::assertEquals('OPTIONAL,REPEATED,REQUIRED', $schema->get('m.key_value.value')->repetitions()); + + self::assertEquals(2, $schema->get('m.key_value.key')->repetitions()->maxDefinitionLevel()); + self::assertEquals(1, $schema->get('m.key_value.key')->repetitions()->maxRepetitionLevel()); + + self::assertEquals(2, $schema->get('m.key_value.value')->repetitions()->maxDefinitionLevel()); + self::assertEquals(1, $schema->get('m.key_value.value')->repetitions()->maxRepetitionLevel()); + + if ($exceptionMessage) { + $this->expectExceptionMessage($exceptionMessage); + \array_reduce( + $rows, + static fn (?FlatColumnData $flatData, array $row) => $flatData === null + ? $dremel->shred($schema->get('m'), $row) + : $flatData->merge($dremel->shred($schema->get('m'), $row)) + ); + } else { + /** + * @var ?FlatColumnData $flatData + */ + $flatData = \array_reduce( + $rows, + static fn (?FlatColumnData $flatData, array $row) => $flatData === null + ? $dremel->shred($schema->get('m'), $row) + : $flatData->merge($dremel->shred($schema->get('m'), $row)) + ); + + self::assertEquals($expectedFlatData, $flatData->normalize()); + self::assertEquals($rows, (new DremelAssembler(DataConverter::initialize(Options::default())))->assemble($schema->get('m'), $flatData)); + } + } + #[TestWith([ [ ['m' => null],