4747#include " arrow/testing/gtest_util.h"
4848#include " arrow/testing/random.h"
4949#include " arrow/testing/util.h"
50+ #include " arrow/type_fwd.h"
5051#include " arrow/type_traits.h"
5152#include " arrow/util/checked_cast.h"
5253#include " arrow/util/config.h" // for ARROW_CSV definition
7172#include " parquet/column_writer.h"
7273#include " parquet/file_writer.h"
7374#include " parquet/page_index.h"
75+ #include " parquet/properties.h"
7476#include " parquet/test_util.h"
7577
7678using arrow::Array;
@@ -88,6 +90,7 @@ using arrow::DictionaryArray;
8890using arrow::ListArray;
8991using arrow::PrimitiveArray;
9092using arrow::ResizableBuffer;
93+ using arrow::Result;
9194using arrow::Scalar;
9295using arrow::Status;
9396using arrow::Table;
@@ -621,15 +624,28 @@ class ParquetIOTestBase : public ::testing::Test {
621624 return ParquetFileWriter::Open (sink_, schema);
622625 }
623626
627+ Result<std::unique_ptr<FileReader>> ReaderFromBuffer (
628+ const std::shared_ptr<Buffer>& buffer,
629+ const ArrowReaderProperties& properties = default_arrow_reader_properties()) {
630+ FileReaderBuilder builder;
631+ std::unique_ptr<FileReader> out;
632+ RETURN_NOT_OK (builder.Open (std::make_shared<BufferReader>(buffer)));
633+ RETURN_NOT_OK (builder.memory_pool (::arrow::default_memory_pool ())
634+ ->properties (properties)
635+ ->Build (&out));
636+ return out;
637+ }
638+
639+ Result<std::unique_ptr<FileReader>> ReaderFromSink (
640+ const ArrowReaderProperties& properties = default_arrow_reader_properties()) {
641+ ARROW_ASSIGN_OR_RAISE (auto buffer, sink_->Finish ());
642+ return ReaderFromBuffer (buffer, properties);
643+ }
644+
624645 void ReaderFromSink (
625646 std::unique_ptr<FileReader>* out,
626647 const ArrowReaderProperties& properties = default_arrow_reader_properties()) {
627- ASSERT_OK_AND_ASSIGN (auto buffer, sink_->Finish ());
628- FileReaderBuilder builder;
629- ASSERT_OK_NO_THROW (builder.Open (std::make_shared<BufferReader>(buffer)));
630- ASSERT_OK_NO_THROW (builder.memory_pool (::arrow::default_memory_pool ())
631- ->properties (properties)
632- ->Build (out));
648+ ASSERT_OK_NO_THROW (ReaderFromSink (properties).Value (out));
633649 }
634650
635651 void ReadSingleColumnFile (std::unique_ptr<FileReader> file_reader,
@@ -647,16 +663,20 @@ class ParquetIOTestBase : public ::testing::Test {
647663 ASSERT_OK ((*out)->ValidateFull ());
648664 }
649665
650- void ReadAndCheckSingleColumnFile (const Array& values) {
666+ void ReadAndCheckSingleColumnFile (std::unique_ptr<FileReader> file_reader,
667+ const Array& values) {
651668 std::shared_ptr<Array> out;
652-
653- std::unique_ptr<FileReader> reader;
654- ReaderFromSink (&reader);
655- ReadSingleColumnFile (std::move (reader), &out);
656-
669+ ReadSingleColumnFile (std::move (file_reader), &out);
657670 AssertArraysEqual (values, *out);
658671 }
659672
673+ void ReadAndCheckSingleColumnFile (
674+ const Array& values,
675+ const ArrowReaderProperties& properties = default_arrow_reader_properties()) {
676+ ASSERT_OK_AND_ASSIGN (auto file_reader, ReaderFromSink (properties));
677+ ReadAndCheckSingleColumnFile (std::move (file_reader), values);
678+ }
679+
660680 void ReadTableFromFile (std::unique_ptr<FileReader> reader, bool expect_metadata,
661681 std::shared_ptr<Table>* out) {
662682 ASSERT_OK_NO_THROW (reader->ReadTable (out));
@@ -776,8 +796,16 @@ class TestReadDecimals : public ParquetIOTestBase {
776796 /* rep_levels=*/ nullptr , byte_arrays.data ());
777797 column_writer->Close ();
778798 file_writer->Close ();
799+ ASSERT_OK_AND_ASSIGN (auto buffer, sink_->Finish ());
779800
780- ReadAndCheckSingleColumnFile (expected);
801+ // The binary_type setting shouldn't affect the results
802+ for (auto binary_type : {::arrow::Type::BINARY, ::arrow::Type::LARGE_BINARY,
803+ ::arrow::Type::BINARY_VIEW}) {
804+ ArrowReaderProperties properties;
805+ properties.set_binary_type (binary_type);
806+ ASSERT_OK_AND_ASSIGN (auto reader, ReaderFromBuffer (buffer, properties));
807+ ReadAndCheckSingleColumnFile (std::move (reader), expected);
808+ }
781809 }
782810};
783811
@@ -1390,50 +1418,56 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
13901418 AssertArraysEqual (*values, *chunked_array->chunk (0 ));
13911419}
13921420
1393- using TestLargeBinaryParquetIO = TestParquetIO<::arrow::LargeBinaryType>;
1394-
1395- TEST_F (TestLargeBinaryParquetIO, Basics) {
1396- const char * json = " [\" foo\" , \"\" , null, \"\xff\" ]" ;
1397-
1398- const auto large_type = ::arrow::large_binary ();
1399- const auto narrow_type = ::arrow::binary ();
1400- const auto large_array = ::arrow::ArrayFromJSON (large_type, json);
1401- const auto narrow_array = ::arrow::ArrayFromJSON (narrow_type, json);
1402-
1403- // When the original Arrow schema isn't stored, a LargeBinary array
1404- // is decoded as Binary (since there is no specific Parquet logical
1405- // type for it).
1406- this ->RoundTripSingleColumn (large_array, narrow_array,
1407- default_arrow_writer_properties ());
1421+ class TestBinaryLikeParquetIO : public ParquetIOTestBase {
1422+ public:
1423+ void CheckRoundTrip (std::string_view json, ::arrow::Type::type binary_type,
1424+ const std::shared_ptr<DataType>& specific_type,
1425+ const std::shared_ptr<DataType>& fallback_type) {
1426+ const auto specific_array = ::arrow::ArrayFromJSON (specific_type, json);
1427+ const auto fallback_array = ::arrow::ArrayFromJSON (fallback_type, json);
1428+
1429+ // When the original Arrow schema isn't stored, the array is decoded as
1430+ // the fallback type (since there is no specific Parquet logical
1431+ // type for it).
1432+ this ->RoundTripSingleColumn (specific_array, /* expected=*/ fallback_array,
1433+ default_arrow_writer_properties ());
1434+
1435+ // When the original Arrow schema isn't stored and a binary_type is set,
1436+ // the array is decoded as the specific type.
1437+ ArrowReaderProperties reader_properties;
1438+ reader_properties.set_binary_type (binary_type);
1439+ this ->RoundTripSingleColumn (specific_array, /* expected=*/ specific_array,
1440+ default_arrow_writer_properties (), reader_properties);
1441+ this ->RoundTripSingleColumn (fallback_array, /* expected=*/ specific_array,
1442+ default_arrow_writer_properties (), reader_properties);
1443+
1444+ // When the original Arrow schema is stored, the array is decoded as the
1445+ // specific type.
1446+ const auto writer_properties =
1447+ ArrowWriterProperties::Builder ().store_schema ()->build ();
1448+ this ->RoundTripSingleColumn (specific_array, /* expected=*/ specific_array,
1449+ writer_properties);
1450+ }
1451+ };
14081452
1409- // When the original Arrow schema is stored, the LargeBinary array
1410- // is read back as LargeBinary.
1411- const auto arrow_properties =
1412- ::parquet::ArrowWriterProperties::Builder ().store_schema()->build();
1413- this ->RoundTripSingleColumn (large_array, large_array, arrow_properties);
1453+ TEST_F (TestBinaryLikeParquetIO, LargeBinary) {
1454+ CheckRoundTrip (" [\" foo\" , \"\" , null, \"\xff\" ]" , ::arrow::Type::LARGE_BINARY,
1455+ ::arrow::large_binary (), ::arrow::binary());
14141456}
14151457
1416- using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>;
1417-
1418- TEST_F (TestLargeStringParquetIO, Basics) {
1419- const char * json = R"( ["foo", "", null, "bar"])" ;
1420-
1421- const auto large_type = ::arrow::large_utf8 ();
1422- const auto narrow_type = ::arrow::utf8 ();
1423- const auto large_array = ::arrow::ArrayFromJSON (large_type, json);
1424- const auto narrow_array = ::arrow::ArrayFromJSON (narrow_type, json);
1458+ TEST_F (TestBinaryLikeParquetIO, BinaryView) {
1459+ CheckRoundTrip (" [\" foo\" , \"\" , null, \"\xff\" ]" , ::arrow::Type::BINARY_VIEW,
1460+ ::arrow::binary_view (), ::arrow::binary());
1461+ }
14251462
1426- // When the original Arrow schema isn't stored, a LargeBinary array
1427- // is decoded as Binary (since there is no specific Parquet logical
1428- // type for it).
1429- this ->RoundTripSingleColumn (large_array, narrow_array,
1430- default_arrow_writer_properties ());
1463+ TEST_F (TestBinaryLikeParquetIO, LargeString) {
1464+ CheckRoundTrip (R"( ["foo", "", null, "bar"])" , ::arrow::Type::LARGE_BINARY,
1465+ ::arrow::large_utf8 (), ::arrow::utf8());
1466+ }
14311467
1432- // When the original Arrow schema is stored, the LargeBinary array
1433- // is read back as LargeBinary.
1434- const auto arrow_properties =
1435- ::parquet::ArrowWriterProperties::Builder ().store_schema()->build();
1436- this ->RoundTripSingleColumn (large_array, large_array, arrow_properties);
1468+ TEST_F (TestBinaryLikeParquetIO, StringView) {
1469+ CheckRoundTrip (R"( ["foo", "", null, "bar"])" , ::arrow::Type::BINARY_VIEW,
1470+ ::arrow::utf8_view (), ::arrow::utf8());
14371471}
14381472
14391473using TestJsonParquetIO = TestParquetIO<::arrow::extension::JsonExtensionType>;
0 commit comments