
Commit 7fcfd68

sahil1105 authored and IsaacWarren committed
Skip casting of dict-encoded columns in CastingGenerator
1 parent e688141 commit 7fcfd68

File tree: 1 file changed, +24 -10 lines changed


recipe/patches/0004-Bodo-Changes.patch

Lines changed: 24 additions & 10 deletions
@@ -1,5 +1,5 @@
 diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
-index 1f8b6cc488..322d50e598 100644
+index 1f8b6cc488..586a5122a5 100644
 --- a/cpp/src/arrow/dataset/file_parquet.cc
 +++ b/cpp/src/arrow/dataset/file_parquet.cc
 @@ -26,16 +26,23 @@
@@ -26,32 +26,40 @@ index 1f8b6cc488..322d50e598 100644
  #include "arrow/util/tracing_internal.h"
  #include "parquet/arrow/reader.h"
  #include "parquet/arrow/schema.h"
-@@ -555,6 +562,59 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
+@@ -555,6 +562,68 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
    });
  }
 
 +struct CastingGenerator {
 +  CastingGenerator(RecordBatchGenerator source, std::shared_ptr<Schema> final_schema,
-+                   arrow::MemoryPool* pool = arrow::default_memory_pool())
++                   const std::unordered_set<std::string>& cols_to_skip,
++                   MemoryPool* pool = default_memory_pool())
 +      : source_(source),
 +        final_schema_(final_schema),
++        cols_to_skip_(cols_to_skip),
 +        exec_ctx(std::make_shared<compute::ExecContext>(pool)) {}
 +
 +  Future<std::shared_ptr<RecordBatch>> operator()() {
 +    return this->source_().Then([this](const std::shared_ptr<RecordBatch>& next)
 +                                    -> Result<std::shared_ptr<RecordBatch>> {
-+      if (IsIterationEnd(next) || this->final_schema_.get() == nullptr) {
++      if (IsIterationEnd(next) || this->final_schema_ == nullptr) {
 +        return next;
 +      }
-+      std::vector<std::shared_ptr<::arrow::Array>> out_cols;
-+      std::vector<std::shared_ptr<arrow::Field>> out_schema_fields;
++      std::vector<std::shared_ptr<Array>> out_cols;
++      std::vector<std::shared_ptr<Field>> out_schema_fields;
 +
 +      bool changed = false;
 +      for (const auto& field : this->final_schema_->fields()) {
 +        FieldRef field_ref = FieldRef(field->name());
 +        ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> column,
 +                              field_ref.GetOneOrNone(*next));
 +        if (column) {
++          if (this->cols_to_skip_.count(field->name())) {
++            out_cols.emplace_back(std::move(column));
++            // Maintain the original input type.
++            out_schema_fields.emplace_back(field->WithType(column->type()));
++            continue;
++          }
 +          if (!column->type()->Equals(field->type())) {
 +            // Referenced field was present but didn't have the expected type.
 +            ARROW_ASSIGN_OR_RAISE(
@@ -80,13 +88,14 @@ index 1f8b6cc488..322d50e598 100644
 +
 +  RecordBatchGenerator source_;
 +  std::shared_ptr<Schema> final_schema_;
++  const std::unordered_set<std::string>& cols_to_skip_;
 +  std::shared_ptr<compute::ExecContext> exec_ctx;
 +};
 +
  struct SlicingGenerator {
    SlicingGenerator(RecordBatchGenerator source, int64_t batch_size)
        : state(std::make_shared<State>(source, batch_size)) {}
-@@ -617,6 +677,9 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
+@@ -617,6 +686,9 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
       [this, options, parquet_fragment, pre_filtered,
        row_groups](const std::shared_ptr<parquet::arrow::FileReader>& reader) mutable
           -> Result<RecordBatchGenerator> {
@@ -96,7 +105,7 @@ index 1f8b6cc488..322d50e598 100644
         // Ensure that parquet_fragment has FileMetaData
         RETURN_NOT_OK(parquet_fragment->EnsureCompleteMetadata(reader.get()));
         if (!pre_filtered) {
-@@ -633,12 +696,19 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
+@@ -633,12 +705,24 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
             kParquetTypeName, options.get(), default_fragment_scan_options));
         int batch_readahead = options->batch_readahead;
         int64_t rows_to_readahead = batch_readahead * options->batch_size;
@@ -113,8 +122,13 @@ index 1f8b6cc488..322d50e598 100644
 +      ARROW_ASSIGN_OR_RAISE(auto generator, reader->GetRecordBatchGenerator(
 +                                                reader, row_groups, column_projection,
 +                                                cpu_executor, rows_to_readahead));
-+      RecordBatchGenerator casted =
-+          CastingGenerator(std::move(generator), options->dataset_schema, options->pool);
++      // We need to skip casting the dictionary columns since the dataset_schema doesn't
++      // have the dictionary-encoding information. Parquet reader will return them with the
++      // dictionary type, which is what we eventually want.
++      const std::unordered_set<std::string>& dict_cols =
++          parquet_fragment->parquet_format_.reader_options.dict_columns;
++      RecordBatchGenerator casted = CastingGenerator(
++          std::move(generator), options->dataset_schema, dict_cols, options->pool);
        RecordBatchGenerator sliced =
 -          SlicingGenerator(std::move(generator), options->batch_size);
 +          SlicingGenerator(std::move(casted), options->batch_size);
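
Why the cast has to be skipped: columns listed in ParquetFileFormat::reader_options.dict_columns are materialized by the Parquet reader as dictionary-encoded arrays, while options->dataset_schema records only the plain storage type, so casting them to the dataset schema would strip the dictionary encoding the reader options asked for. A minimal sketch of how such columns are configured (the surrounding dataset wiring and the "category" column name are illustrative assumptions, not part of this commit):

#include <memory>

#include <arrow/dataset/file_parquet.h>

int main() {
  auto format = std::make_shared<arrow::dataset::ParquetFileFormat>();
  // Any column named here is decoded by the Parquet reader as
  // dictionary(int32, <storage type>), so CastingGenerator must leave it
  // uncast: the dataset schema only knows the plain storage type
  // (e.g. utf8 rather than dictionary<int32, utf8>).
  format->reader_options.dict_columns.insert("category");
  return 0;
}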

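For the new per-batch logic in isolation, here is a standalone sketch of what CastingGenerator now does to each batch; CastBatch is a hypothetical synchronous distillation for illustration, not code from the patch (it drops the generator plumbing and the `changed` short-circuit):

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include <arrow/api.h>
#include <arrow/compute/cast.h>

// Cast each column of `batch` to its type in `final_schema`, except the
// columns named in `cols_to_skip` (the dict-encoded ones), which keep the
// type the Parquet reader produced.
arrow::Result<std::shared_ptr<arrow::RecordBatch>> CastBatch(
    const std::shared_ptr<arrow::RecordBatch>& batch,
    const std::shared_ptr<arrow::Schema>& final_schema,
    const std::unordered_set<std::string>& cols_to_skip) {
  std::vector<std::shared_ptr<arrow::Array>> out_cols;
  std::vector<std::shared_ptr<arrow::Field>> out_fields;
  for (const auto& field : final_schema->fields()) {
    std::shared_ptr<arrow::Array> column = batch->GetColumnByName(field->name());
    if (column == nullptr) continue;  // field absent from this batch
    if (cols_to_skip.count(field->name())) {
      // Skip the cast; record the reader's (dictionary) type in the
      // output schema so schema and data stay consistent.
      out_fields.push_back(field->WithType(column->type()));
      out_cols.push_back(std::move(column));
      continue;
    }
    if (!column->type()->Equals(field->type())) {
      // Column present but with an unexpected type: cast to the schema type.
      ARROW_ASSIGN_OR_RAISE(arrow::Datum casted,
                            arrow::compute::Cast(column, field->type()));
      column = casted.make_array();
    }
    out_fields.push_back(field);
    out_cols.push_back(std::move(column));
  }
  return arrow::RecordBatch::Make(arrow::schema(out_fields), batch->num_rows(),
                                  std::move(out_cols));
}

The design choice mirrors the diff: a skipped column keeps whatever type the reader produced, and the output field is rebuilt with field->WithType(column->type()) rather than taken verbatim from the dataset schema.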
0 commit comments
