Skip to content

Commit 56eb2e7

Browse files
authored
Add ability to set the source_info of parquet_reader_options (rapidsai#20253)
This is useful for multi-task chunked reading: the user passes in a "template" set of options, and the tasking framework reconstructs the options for each individual source location without having to laboriously copy every option over (and remember to update that copy whenever new options are added). Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Matthew Roeschke (https://github.com/mroeschke) - Paul Mattione (https://github.com/pmattione-nvidia) URL: rapidsai#20253
1 parent e534472 commit 56eb2e7

32 files changed

+236
-29
lines changed

cpp/include/cudf/io/avro.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ class avro_reader_options {
9595
*/
9696
[[nodiscard]] size_type get_num_rows() const { return _num_rows; }
9797

98+
/**
99+
* @brief Sets source info.
100+
*
101+
* @param src The source info.
102+
*/
103+
void set_source(source_info src)
{
  // `src` is taken by value, so it is moved into the member rather than copied.
  _source = std::move(src);
}
104+
98105
/**
99106
* @brief Set names of the column to be read.
100107
*

cpp/include/cudf/io/csv.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,13 @@ class csv_reader_options {
486486
*/
487487
[[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
488488

489+
/**
490+
* @brief Sets source info.
491+
*
492+
* @param src The source info.
493+
*/
494+
void set_source(source_info src)
{
  // `src` is taken by value, so it is moved into the member rather than copied.
  _source = std::move(src);
}
495+
489496
/**
490497
* @brief Sets compression format of the source.
491498
*

cpp/include/cudf/io/json.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,13 @@ class json_reader_options {
379379
*/
380380
[[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
381381

382+
/**
383+
* @brief Sets source info.
384+
*
385+
* @param src The source info.
386+
*/
387+
void set_source(source_info src)
{
  // `src` is taken by value, so it is moved into the member rather than copied.
  _source = std::move(src);
}
388+
382389
/**
383390
* @brief Set data types for columns to be read.
384391
*

cpp/include/cudf/io/orc.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,13 @@ class orc_reader_options {
203203

204204
// Setters
205205

206+
/**
207+
* @brief Sets source info.
208+
*
209+
* @param src The source info.
210+
*/
211+
void set_source(source_info src)
{
  // `src` is taken by value, so it is moved into the member rather than copied.
  _source = std::move(src);
}
212+
206213
/**
207214
* @brief Sets names of the column to read.
208215
*

cpp/include/cudf/io/parquet.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,13 @@ class parquet_reader_options {
257257
*/
258258
[[nodiscard]] bool is_enabled_use_jit_filter() const { return _use_jit_filter; }
259259

260+
/**
261+
* @brief Set a new source location
262+
*
263+
* @param src New `source_info`.
264+
*/
265+
void set_source(source_info src)
{
  // `src` is taken by value, so it is moved into the member rather than copied.
  _source = std::move(src);
}
266+
260267
/**
261268
* @brief Sets the names of columns to be read from all input sources.
262269
*

cpp/tests/io/parquet_reader_test.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3681,3 +3681,25 @@ TEST_F(ParquetReaderTest, ByteBoundsAndFilters)
36813681
CUDF_TEST_EXPECT_TABLES_EQUAL(read->view(), expected->view());
36823682
}
36833683
}
3684+
3685+
// Checks that reader options built with an empty source_info can have their
// source supplied afterwards through set_source().
TEST_F(ParquetReaderTest, LateBindSourceInfo)
{
  // Fixed seed for the random table generator.
  srand(31337);
  auto reference = create_random_fixed_table<int>(4, 4, false);

  // Write the reference table to a temporary Parquet file.
  auto parquet_path = temp_env->get_temp_filepath("LateBindSourceInfo.parquet");
  cudf::io::parquet_writer_options writer_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{parquet_path}, *reference);
  cudf::io::write_parquet(writer_opts);

  // Build the reader options with no source attached.
  cudf::io::parquet_reader_options reader_opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{});

  // Reading with the empty source is expected to throw.
  EXPECT_THROW(cudf::io::read_parquet(reader_opts), cudf::logic_error);

  // Attach the real file and read again using the same options object.
  reader_opts.set_source(cudf::io::source_info{parquet_path});

  auto loaded = cudf::io::read_parquet(reader_opts);

  CUDF_TEST_EXPECT_TABLES_EQUAL(loaded.tbl->view(), reference->view());
}

python/pylibcudf/pylibcudf/io/avro.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ cdef class AvroReaderOptions:
1616
cdef avro_reader_options c_obj
1717
cdef SourceInfo source
1818
cpdef void set_columns(self, list col_names)
19+
cpdef void set_source(self, SourceInfo src)
1920

2021

2122
cdef class AvroReaderOptionsBuilder:

python/pylibcudf/pylibcudf/io/avro.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ from pylibcudf.io.types import SourceInfo, TableWithMetadata
77
__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]
88

99
class AvroReaderOptions:
10+
def set_columns(self, col_names: list[str]) -> None: ...
11+
def set_source(self, src: SourceInfo) -> None: ...
1012
@staticmethod
1113
def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...
1214

python/pylibcudf/pylibcudf/io/avro.pyx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,21 @@ cdef class AvroReaderOptions:
7070
vec.push_back(str(name).encode())
7171
self.c_obj.set_columns(vec)
7272

73+
cpdef void set_source(self, SourceInfo src):
    """
    Set a new source info location.

    Parameters
    ----------
    src : SourceInfo
        New source information, replacing existing information.

    Returns
    -------
    None
    """
    self.c_obj.set_source(src.c_obj)
    # Keep the wrapper's `source` attribute in step with the C++ options.
    # NOTE(review): `source` is presumably held so the Python-side SourceInfo
    # (and whatever buffers it references) outlives the options object;
    # without this assignment it would still point at the previous source.
    self.source = src
87+
7388

7489
cdef class AvroReaderOptionsBuilder:
7590
cpdef AvroReaderOptionsBuilder columns(self, list col_names):

python/pylibcudf/pylibcudf/io/csv.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ cdef class CsvReaderOptions:
4545
cpdef void set_true_values(self, list true_values)
4646
cpdef void set_false_values(self, list false_values)
4747
cpdef void set_na_values(self, list na_values)
48+
cpdef void set_source(self, SourceInfo src)
4849

4950

5051
cdef class CsvReaderOptionsBuilder:

0 commit comments

Comments
 (0)