|
17 | 17 |
|
18 | 18 | #include "arrow/dataset/file_parquet.h" |
19 | 19 |
|
| 20 | +#include <functional> |
20 | 21 | #include <memory> |
21 | 22 | #include <thread> |
22 | 23 | #include <utility> |
|
25 | 26 | #include "arrow/compute/api_scalar.h" |
26 | 27 | #include "arrow/dataset/dataset_internal.h" |
27 | 28 | #include "arrow/dataset/parquet_encryption_config.h" |
| 29 | +#include "arrow/dataset/scanner.h" |
28 | 30 | #include "arrow/dataset/test_util_internal.h" |
29 | 31 | #include "arrow/io/interfaces.h" |
30 | 32 | #include "arrow/io/memory.h" |
@@ -133,6 +135,29 @@ class ParquetFormatHelper { |
133 | 135 | } |
134 | 136 | }; |
135 | 137 |
|
// Test helper: a BufferReader whose asynchronous reads are artificially slow.
// Every ReadAsync call increments read_async_count and then submits a task
// through SubmitIO on the caller-supplied IOContext; that task sleeps for one
// second before performing the actual DoReadAt(position, nbytes).
| 138 | +class DelayedBufferReader : public ::arrow::io::BufferReader { |
| 139 | + public: |
| 140 | + explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer) |
| 141 | + : ::arrow::io::BufferReader(buffer) {} |
| 142 | + |
// Returns the future for the delayed read of `nbytes` bytes at `position`.
// `self` is a shared_ptr copy captured by the submitted task, so the task
// holds its own reference to this reader while it is queued or running.
// NOTE(review): DeferNotOk presumably converts a failed submission into a
// failed future - confirm against its definition.
| 143 | + ::arrow::Future<std::shared_ptr<Buffer>> ReadAsync( |
| 144 | + const ::arrow::io::IOContext& io_context, int64_t position, |
| 145 | + int64_t nbytes) override { |
| 146 | + read_async_count.fetch_add(1); |
| 147 | + auto self = std::dynamic_pointer_cast<DelayedBufferReader>(shared_from_this()); |
| 148 | + return DeferNotOk(::arrow::io::internal::SubmitIO( |
| 149 | + io_context, [self, position, nbytes]() -> Result<std::shared_ptr<Buffer>> { |
| 150 | + std::this_thread::sleep_for(std::chrono::seconds(1)); |
| 151 | + return self->DoReadAt(position, nbytes); |
| 152 | + })); |
| 153 | + } |
| 154 | + |
// Number of ReadAsync calls made on this reader so far.
| 155 | + std::atomic<int> read_async_count{0}; |
| 156 | +}; |
| 157 | + |
// Callback type accepted by TestMultithreadedRegression. It is invoked with
// the ScanOptions being prepared for a scan and a thread pool created for
// that scan, and decides where in the options the pool gets installed.
| 158 | +using CustomizeScanOptionsWithThreadPool = |
| 159 | + std::function<void(ScanOptions&, arrow::internal::ThreadPool*)>; |
| 160 | + |
136 | 161 | class TestParquetFileFormat : public FileFormatFixtureMixin<ParquetFormatHelper> { |
137 | 162 | public: |
138 | 163 | RecordBatchIterator Batches(Fragment* fragment) { |
@@ -183,6 +208,51 @@ class TestParquetFileFormat : public FileFormatFixtureMixin<ParquetFormatHelper> |
183 | 208 | EXPECT_EQ(SingleBatch(parquet_fragment.get())->num_rows(), expected + 1); |
184 | 209 | } |
185 | 210 | } |
| 211 | + |
// Shared body of the multithreaded regression tests. Writes generated data to
// an in-memory Parquet buffer, then twice over: scans it through a
// DelayedBufferReader with pre-buffering enabled, starts an unordered async
// scan, queues extra delayed reads, and drops the scanner while that work may
// still be pending. `customizer` chooses how the per-iteration thread pool is
// attached to the ScanOptions.
| 212 | + void TestMultithreadedRegression(CustomizeScanOptionsWithThreadPool customizer) { |
| 213 | + auto reader = MakeGeneratedRecordBatch(schema({field("utf8", utf8())}), 10000, 100); |
| 214 | + ASSERT_OK_AND_ASSIGN(auto buffer, ParquetFormatHelper::Write(reader.get())); |
| 215 | + |
// Both vectors are declared outside the loop so the extra read futures and
// the thread pools stay alive until the end of this function.
| 216 | + std::vector<Future<>> completes; |
| 217 | + std::vector<std::shared_ptr<arrow::internal::ThreadPool>> pools; |
| 218 | + |
| 219 | + for (int idx = 0; idx < 2; ++idx) { |
| 220 | + auto buffer_reader = std::make_shared<DelayedBufferReader>(buffer); |
| 221 | + auto source = std::make_shared<FileSource>(buffer_reader, buffer->size()); |
| 222 | + auto fragment = MakeFragment(*source); |
| 223 | + std::shared_ptr<Scanner> scanner; |
| 224 | + |
// Build the scanner in its own scope so that only `scanner` (plus the pool
// stored in `pools`) survives past this point.
| 225 | + { |
| 226 | + auto options = std::make_shared<ScanOptions>(); |
| 227 | + ASSERT_OK_AND_ASSIGN(auto thread_pool, arrow::internal::ThreadPool::Make(1)); |
| 228 | + pools.emplace_back(thread_pool); |
// Let the caller decide where the freshly created pool is plugged in.
| 229 | + customizer(*options, pools.back().get()); |
| 230 | + auto fragment_scan_options = std::make_shared<ParquetFragmentScanOptions>(); |
| 231 | + fragment_scan_options->arrow_reader_properties->set_pre_buffer(true); |
| 232 | + |
| 233 | + options->fragment_scan_options = fragment_scan_options; |
| 234 | + ScannerBuilder builder(ArithmeticDatasetFixture::schema(), fragment, options); |
| 235 | + |
| 236 | + ASSERT_OK(builder.UseThreads(true)); |
| 237 | + ASSERT_OK(builder.BatchSize(10000)); |
| 238 | + ASSERT_OK_AND_ASSIGN(scanner, builder.Finish()); |
| 239 | + } |
| 240 | + |
// The returned batch and the async scan future are intentionally unused; the
// calls are made only to start scan work on the delayed reader.
| 241 | + ASSERT_OK_AND_ASSIGN(auto batch, scanner->Head(10000)); |
| 242 | + [[maybe_unused]] auto fut = scanner->ScanBatchesUnorderedAsync(); |
| 243 | + // Issue extra ReadAsync calls directly on the reader to create additional |
| 244 | + // pending futures and make the state machine more complex. |
| 245 | + for (int yy = 0; yy < 16; yy++) { |
| 246 | + completes.emplace_back( |
| 247 | + buffer_reader->ReadAsync(::arrow::io::IOContext(), 0, 1001)); |
| 248 | + } |
// Release the scanner before the delayed reads collected above are waited on.
| 249 | + scanner = nullptr; |
| 250 | + } |
| 251 | + |
// Block until every extra delayed read has finished.
| 252 | + for (auto& f : completes) { |
| 253 | + f.Wait(); |
| 254 | + } |
| 255 | + } |
186 | 256 | }; |
187 | 257 |
|
188 | 258 | TEST_F(TestParquetFileFormat, InspectFailureWithRelevantError) { |
@@ -904,73 +974,25 @@ TEST(TestParquetStatistics, NoNullCount) { |
904 | 974 | } |
905 | 975 | } |
906 | 976 |
|
// Rows removed by this change: the previous location of the
// DelayedBufferReader test helper. Its ReadAsync counts each call and submits
// a task via SubmitIO that sleeps for one second before calling DoReadAt.
907 | | -class DelayedBufferReader : public ::arrow::io::BufferReader { |
908 | | - public: |
909 | | - explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer) |
910 | | - : ::arrow::io::BufferReader(buffer) {} |
911 | | - |
912 | | - ::arrow::Future<std::shared_ptr<Buffer>> ReadAsync( |
913 | | - const ::arrow::io::IOContext& io_context, int64_t position, |
914 | | - int64_t nbytes) override { |
915 | | - read_async_count.fetch_add(1); |
916 | | - auto self = std::dynamic_pointer_cast<DelayedBufferReader>(shared_from_this()); |
917 | | - return DeferNotOk(::arrow::io::internal::SubmitIO( |
918 | | - io_context, [self, position, nbytes]() -> Result<std::shared_ptr<Buffer>> { |
919 | | - std::this_thread::sleep_for(std::chrono::seconds(1)); |
920 | | - return self->DoReadAt(position, nbytes); |
921 | | - })); |
922 | | - } |
923 | | - |
924 | | - std::atomic<int> read_async_count{0}; |
925 | | -}; |
926 | | - |
927 | 977 | TEST_F(TestParquetFileFormat, MultithreadedScanRegression) { |
928 | 978 | // GH-38438: This test is similar to MultithreadedScan, but it uses a |
929 | 979 | // custom Executor and DelayedBufferReader to mock async execution and make |
930 | 980 | // the state machine more complex. |
931 | | - auto reader = MakeGeneratedRecordBatch(schema({field("utf8", utf8())}), 10000, 100); |
932 | | - |
933 | | - ASSERT_OK_AND_ASSIGN(auto buffer, ParquetFormatHelper::Write(reader.get())); |
934 | | - |
935 | | - std::vector<Future<>> completes; |
936 | | - std::vector<std::shared_ptr<arrow::internal::ThreadPool>> pools; |
937 | | - |
938 | | - for (int idx = 0; idx < 2; ++idx) { |
939 | | - auto buffer_reader = std::make_shared<DelayedBufferReader>(buffer); |
940 | | - auto source = std::make_shared<FileSource>(buffer_reader, buffer->size()); |
941 | | - auto fragment = MakeFragment(*source); |
942 | | - std::shared_ptr<Scanner> scanner; |
943 | | - |
944 | | - { |
945 | | - auto options = std::make_shared<ScanOptions>(); |
946 | | - ASSERT_OK_AND_ASSIGN(auto thread_pool, arrow::internal::ThreadPool::Make(1)); |
947 | | - pools.emplace_back(thread_pool); |
948 | | - options->io_context = |
949 | | - ::arrow::io::IOContext(::arrow::default_memory_pool(), pools.back().get()); |
950 | | - auto fragment_scan_options = std::make_shared<ParquetFragmentScanOptions>(); |
951 | | - fragment_scan_options->arrow_reader_properties->set_pre_buffer(true); |
952 | | - |
953 | | - options->fragment_scan_options = fragment_scan_options; |
954 | | - ScannerBuilder builder(ArithmeticDatasetFixture::schema(), fragment, options); |
955 | | - |
956 | | - ASSERT_OK(builder.UseThreads(true)); |
957 | | - ASSERT_OK(builder.BatchSize(10000)); |
958 | | - ASSERT_OK_AND_ASSIGN(scanner, builder.Finish()); |
959 | | - } |
960 | | - |
961 | | - ASSERT_OK_AND_ASSIGN(auto batch, scanner->Head(10000)); |
962 | | - [[maybe_unused]] auto fut = scanner->ScanBatchesUnorderedAsync(); |
963 | | - // Random ReadAsync calls, generate some futures to make the state machine |
964 | | - // more complex. |
965 | | - for (int yy = 0; yy < 16; yy++) { |
966 | | - completes.emplace_back(buffer_reader->ReadAsync(::arrow::io::IOContext(), 0, 1001)); |
967 | | - } |
968 | | - scanner = nullptr; |
969 | | - } |
// The customizer below builds the scan's IOContext on the thread pool handed
// in by TestMultithreadedRegression, together with the default memory pool.
| 981 | + CustomizeScanOptionsWithThreadPool customize_io_context = |
| 982 | + [](ScanOptions& options, arrow::internal::ThreadPool* pool) { |
| 983 | + options.io_context = ::arrow::io::IOContext(::arrow::default_memory_pool(), pool); |
| 984 | + }; |
| 985 | + TestMultithreadedRegression(customize_io_context); |
| 986 | +} |
970 | 987 |
|
971 | | - for (auto& f : completes) { |
972 | | - f.Wait(); |
973 | | - } |
| 988 | +TEST_F(TestParquetFileFormat, MultithreadedComputeRegression) { |
| 989 | + // GH-43694: Same scenario as MultithreadedScanRegression, but the thread |
| 990 | + // pool is installed as the scan's CPU executor instead of its IO context. |
| 991 | + CustomizeScanOptionsWithThreadPool customize_cpu_executor = |
| 992 | + [](ScanOptions& options, arrow::internal::ThreadPool* pool) { |
| 993 | + options.cpu_executor = pool; |
| 994 | + }; |
| 995 | + TestMultithreadedRegression(customize_cpu_executor); |
974 | 996 | } |
975 | 997 |
|
976 | 998 | } // namespace dataset |
|
0 commit comments