
Commit 7fcfd68

sahil1105 authored and IsaacWarren committed
Skip casting of dict-encoded columns in CastingGenerator
1 parent e688141 commit 7fcfd68

File tree: 1 file changed, +24 -10 lines changed


recipe/patches/0004-Bodo-Changes.patch

Lines changed: 24 additions & 10 deletions
@@ -1,5 +1,5 @@
 diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
-index 1f8b6cc488..322d50e598 100644
+index 1f8b6cc488..586a5122a5 100644
 --- a/cpp/src/arrow/dataset/file_parquet.cc
 +++ b/cpp/src/arrow/dataset/file_parquet.cc
 @@ -26,16 +26,23 @@
@@ -26,32 +26,40 @@ index 1f8b6cc488..322d50e598 100644
  #include "arrow/util/tracing_internal.h"
  #include "parquet/arrow/reader.h"
  #include "parquet/arrow/schema.h"
-@@ -555,6 +562,59 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
+@@ -555,6 +562,68 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
    });
  }
 
 +struct CastingGenerator {
 +  CastingGenerator(RecordBatchGenerator source, std::shared_ptr<Schema> final_schema,
-+                   arrow::MemoryPool* pool = arrow::default_memory_pool())
++                   const std::unordered_set<std::string>& cols_to_skip,
++                   MemoryPool* pool = default_memory_pool())
 +      : source_(source),
 +        final_schema_(final_schema),
++        cols_to_skip_(cols_to_skip),
 +        exec_ctx(std::make_shared<compute::ExecContext>(pool)) {}
 +
 +  Future<std::shared_ptr<RecordBatch>> operator()() {
 +    return this->source_().Then([this](const std::shared_ptr<RecordBatch>& next)
 +                                    -> Result<std::shared_ptr<RecordBatch>> {
-+      if (IsIterationEnd(next) || this->final_schema_.get() == nullptr) {
++      if (IsIterationEnd(next) || this->final_schema_ == nullptr) {
 +        return next;
 +      }
-+      std::vector<std::shared_ptr<::arrow::Array>> out_cols;
-+      std::vector<std::shared_ptr<arrow::Field>> out_schema_fields;
++      std::vector<std::shared_ptr<Array>> out_cols;
++      std::vector<std::shared_ptr<Field>> out_schema_fields;
 +
 +      bool changed = false;
 +      for (const auto& field : this->final_schema_->fields()) {
 +        FieldRef field_ref = FieldRef(field->name());
 +        ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> column,
 +                              field_ref.GetOneOrNone(*next));
 +        if (column) {
++          if (this->cols_to_skip_.count(field->name())) {
++            out_cols.emplace_back(std::move(column));
++            // Maintain the original input type.
++            out_schema_fields.emplace_back(field->WithType(column->type()));
++            continue;
++          }
 +          if (!column->type()->Equals(field->type())) {
 +            // Referenced field was present but didn't have the expected type.
 +            ARROW_ASSIGN_OR_RAISE(
@@ -80,13 +88,14 @@ index 1f8b6cc488..322d50e598 100644
 +
 +  RecordBatchGenerator source_;
 +  std::shared_ptr<Schema> final_schema_;
++  const std::unordered_set<std::string>& cols_to_skip_;
 +  std::shared_ptr<compute::ExecContext> exec_ctx;
 +};
 +
  struct SlicingGenerator {
    SlicingGenerator(RecordBatchGenerator source, int64_t batch_size)
        : state(std::make_shared<State>(source, batch_size)) {}
-@@ -617,6 +677,9 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
+@@ -617,6 +686,9 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
       [this, options, parquet_fragment, pre_filtered,
        row_groups](const std::shared_ptr<parquet::arrow::FileReader>& reader) mutable
           -> Result<RecordBatchGenerator> {
@@ -96,7 +105,7 @@ index 1f8b6cc488..322d50e598 100644
         // Ensure that parquet_fragment has FileMetaData
         RETURN_NOT_OK(parquet_fragment->EnsureCompleteMetadata(reader.get()));
         if (!pre_filtered) {
-@@ -633,12 +696,19 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
+@@ -633,12 +705,24 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
             kParquetTypeName, options.get(), default_fragment_scan_options));
         int batch_readahead = options->batch_readahead;
         int64_t rows_to_readahead = batch_readahead * options->batch_size;
@@ -113,8 +122,13 @@ index 1f8b6cc488..322d50e598 100644
 +      ARROW_ASSIGN_OR_RAISE(auto generator, reader->GetRecordBatchGenerator(
 +                                                reader, row_groups, column_projection,
 +                                                cpu_executor, rows_to_readahead));
-+      RecordBatchGenerator casted =
-+          CastingGenerator(std::move(generator), options->dataset_schema, options->pool);
++      // We need to skip casting the dictionary columns since the dataset_schema doesn't
++      // have the dictionary-encoding information. Parquet reader will return them with the
++      // dictionary type, which is what we eventually want.
++      const std::unordered_set<std::string>& dict_cols =
++          parquet_fragment->parquet_format_.reader_options.dict_columns;
++      RecordBatchGenerator casted = CastingGenerator(
++          std::move(generator), options->dataset_schema, dict_cols, options->pool);
        RecordBatchGenerator sliced =
 -          SlicingGenerator(std::move(generator), options->batch_size);
 +          SlicingGenerator(std::move(casted), options->batch_size);
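
Why the cast has to be skipped: columns listed in ParquetFileFormat::reader_options.dict_columns are materialized by the Parquet reader as dictionary-encoded arrays, while options->dataset_schema records only the plain storage type, so casting them to the dataset schema would strip the dictionary encoding the reader options asked for. A minimal sketch of how such columns are configured (the surrounding dataset wiring and the "category" column name are illustrative assumptions, not part of this commit):

#include <memory>

#include <arrow/dataset/file_parquet.h>

int main() {
  auto format = std::make_shared<arrow::dataset::ParquetFileFormat>();
  // Any column named here is decoded by the Parquet reader as
  // dictionary(int32, <storage type>), so CastingGenerator must leave it
  // uncast: the dataset schema only knows the plain storage type
  // (e.g. utf8 rather than dictionary<int32, utf8>).
  format->reader_options.dict_columns.insert("category");
  return 0;
}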

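For the new per-batch logic in isolation, here is a standalone sketch of what CastingGenerator now does to each batch; CastBatch is a hypothetical synchronous distillation for illustration, not code from the patch (it drops the generator plumbing and the `changed` short-circuit):

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include <arrow/api.h>
#include <arrow/compute/cast.h>

// Cast each column of `batch` to its type in `final_schema`, except the
// columns named in `cols_to_skip` (the dict-encoded ones), which keep the
// type the Parquet reader produced.
arrow::Result<std::shared_ptr<arrow::RecordBatch>> CastBatch(
    const std::shared_ptr<arrow::RecordBatch>& batch,
    const std::shared_ptr<arrow::Schema>& final_schema,
    const std::unordered_set<std::string>& cols_to_skip) {
  std::vector<std::shared_ptr<arrow::Array>> out_cols;
  std::vector<std::shared_ptr<arrow::Field>> out_fields;
  for (const auto& field : final_schema->fields()) {
    std::shared_ptr<arrow::Array> column = batch->GetColumnByName(field->name());
    if (column == nullptr) continue;  // field absent from this batch
    if (cols_to_skip.count(field->name())) {
      // Skip the cast; record the reader's (dictionary) type in the
      // output schema so schema and data stay consistent.
      out_fields.push_back(field->WithType(column->type()));
      out_cols.push_back(std::move(column));
      continue;
    }
    if (!column->type()->Equals(field->type())) {
      // Column present but with an unexpected type: cast to the schema type.
      ARROW_ASSIGN_OR_RAISE(arrow::Datum casted,
                            arrow::compute::Cast(column, field->type()));
      column = casted.make_array();
    }
    out_fields.push_back(field);
    out_cols.push_back(std::move(column));
  }
  return arrow::RecordBatch::Make(arrow::schema(out_fields), batch->num_rows(),
                                  std::move(out_cols));
}

The design choice mirrors the diff: a skipped column keeps whatever type the reader produced, and the output field is rebuilt with field->WithType(column->type()) rather than taken verbatim from the dataset schema.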
0 commit comments
