n0r0shi
diff --git a/CMake/resolve_dependency_modules/cudf.cmake b/CMake/resolve_dependency_modules/cudf.cmake
Lines changed: 9 additions & 9 deletions
diff --git a/velox/experimental/cudf/connectors/hive/CudfHiveDataSource.cpp b/velox/experimental/cudf/connectors/hive/CudfHiveDataSource.cpp
Lines changed: 72 additions & 173 deletions
@@ -30,35 +30,35 @@ set(
 )
 velox_resolve_dependency_url(rapids_cmake)
 
-# rmm commit 3d6669c from 2026-02-09
+# rmm commit f225c62 from 2026-02-10
 set(VELOX_rmm_VERSION 26.04)
-set(VELOX_rmm_COMMIT 3d6669cd21e15080a0af2dc18f991060be2a4c3c)
+set(VELOX_rmm_COMMIT f225c620fc177cbdd807dc67bfb53fa4b4272e9f)
 set(
   VELOX_rmm_BUILD_SHA256_CHECKSUM
-  1d2575d7a0fb492feaabc6917e7db33eb5c446a8e8eea301c54bd7e3d25fe66c
+  c6a5b4855802d7c17c0a0a978f2643792b9e17a6b773dd6bbb2f99b16496b38a
 )
 set(VELOX_rmm_SOURCE_URL "https://github.com/rapidsai/rmm/archive/${VELOX_rmm_COMMIT}.tar.gz")
 velox_resolve_dependency_url(rmm)
 
-# kvikio commit 593245b from 2026-02-05
+# kvikio commit 62b3d6f from 2026-02-10
 set(VELOX_kvikio_VERSION 26.04)
-set(VELOX_kvikio_COMMIT 593245b7799b6ea91eed77dd03ce4c9a4e158465)
+set(VELOX_kvikio_COMMIT 62b3d6fa4ba1fc518b0069a8214b1283e63df019)
 set(
   VELOX_kvikio_BUILD_SHA256_CHECKSUM
-  bcd03423b727fb0a23551a8ad3c6fcb58eaf0eb54ded7cdce914ea07a60ea1d7
+  b590a3b6f2d1fa0b36e07d26361d0a5f3cbcce25582f8ee07bcfcf5796ea37b7
 )
 set(
   VELOX_kvikio_SOURCE_URL
   "https://github.com/rapidsai/kvikio/archive/${VELOX_kvikio_COMMIT}.tar.gz"
 )
 velox_resolve_dependency_url(kvikio)
 
-# cudf commit fc213fc from 2026-02-10
+# cudf commit e6ba1fe from 2026-02-11
 set(VELOX_cudf_VERSION 26.04 CACHE STRING "cudf version")
-set(VELOX_cudf_COMMIT fc213fc1ad889e2edf291b5555764ce677cb5dfa)
+set(VELOX_cudf_COMMIT e6ba1feee8f056ce2e245f771403cfa5f598d813)
 set(
   VELOX_cudf_BUILD_SHA256_CHECKSUM
-  f2c8eec90cb8571188e5bdd32222da208c353bbe1066c56bb3052a988feda1a2
+  19137d306db0ddbf4eebb4333e0257de4815563039b1e805829ce2dcc525c3f5
 )
 set(VELOX_cudf_SOURCE_URL "https://github.com/rapidsai/cudf/archive/${VELOX_cudf_COMMIT}.tar.gz")
 velox_resolve_dependency_url(cudf)
 
@@ -47,6 +47,7 @@
 #include <cudf/transform.hpp>
 
 #include <cuda_runtime.h>
+#include <nvtx3/nvtx3.hpp>
 
 #include <filesystem>
 #include <memory>
@@ -87,6 +88,7 @@ CudfHiveDataSource::CudfHiveDataSource(
         outputName);
 
     auto* handle = static_cast<const hive::HiveColumnHandle*>(it->second.get());
+    readColumnSet_.emplace(handle->name());
     readColumnNames_.emplace_back(handle->name());
   }
 
@@ -100,11 +102,9 @@ CudfHiveDataSource::CudfHiveDataSource(
     subfieldFilters_.emplace(k.clone(), v->clone());
     // Add fields in the filter to the columns to read if not there
     for (const auto& [field, _] : subfieldFilters_) {
-      if (std::find(
-              readColumnNames_.begin(),
-              readColumnNames_.end(),
-              field.toString()) == readColumnNames_.end()) {
-        readColumnNames_.push_back(field.toString());
+      if (readColumnSet_.count(field.toString()) == 0) {
+        readColumnSet_.emplace(field.toString());
+        readColumnNames_.emplace_back(field.toString());
       }
     }
   }
@@ -115,11 +115,9 @@ CudfHiveDataSource::CudfHiveDataSource(
     remainingFilterExprSet_ = expressionEvaluator_->compile(remainingFilter);
     for (const auto& field : remainingFilterExprSet_->distinctFields()) {
       // Add fields in the filter to the columns to read if not there
-      if (std::find(
-              readColumnNames_.begin(),
-              readColumnNames_.end(),
-              field->name()) == readColumnNames_.end()) {
-        readColumnNames_.push_back(field->name());
+      if (readColumnSet_.count(field->name()) == 0) {
+        readColumnSet_.emplace(field->name());
+        readColumnNames_.emplace_back(field->name());
       }
     }
 
@@ -199,180 +197,80 @@ std::optional<RowVectorPtr> CudfHiveDataSource::next(
   auto startTimeUs = getCurrentTimeMicro();
 
   if (not useExperimentalSplitReader_) {
-    // Read a table chunk using the regular parquet reader
-    VELOX_CHECK_NOT_NULL(splitReader_, "Regular cudf split reader not present");
+    // Read table using the regular cudf parquet reader
+    VELOX_CHECK_NOT_NULL(splitReader_, "cudf parquet reader not present");
 
     if (not splitReader_->has_next()) {
       return nullptr;
     }
-    // Read a table chunk
+
     auto tableWithMetadata = splitReader_->read_chunk();
     cudfTable = std::move(tableWithMetadata.tbl);
     metadata = std::move(tableWithMetadata.metadata);
   } else {
-    // Read a table chunk using the experimental parquet reader
+    // Read table using the experimental parquet reader
     VELOX_CHECK_NOT_NULL(
-        exptSplitReader_, "Experimental cudf split reader not present");
+        exptSplitReader_, "cuDF hybrid scan reader not present");
+    VELOX_CHECK_NOT_NULL(hybridScanState_, "hybrid scan state not present");
 
-    // TODO(mh): Replace this with chunked hybrid scan APIs when available in
-    // the pinned cuDF version
-    std::call_once(*tableMaterialized_, [&]() {
+    std::call_once(*hybridScanState_->isHybridScanSetup_, [&]() {
       auto rowGroupIndices = exptSplitReader_->all_row_groups(readerOptions_);
 
-      // Temporary options used for filtering row groups. TODO(mh): Remove this
-      // once PR https://github.com/rapidsai/cudf/pull/20604 is merged
-      auto tmpOptions = readerOptions_;
-
-      if (readerOptions_.get_filter().has_value()) {
-        // Filter expression converter
-        auto exprConverter = referenceToNameConverter(
-            readerOptions_.get_filter(),
-            exptSplitReader_->parquet_metadata().schema,
-            readColumnNames_);
-        tmpOptions.set_filter(exprConverter.convertedExpression());
-
-        // Create a temporary split reader for filtering row groups. TODO(mh):
-        // Remove this once PR https://github.com/rapidsai/cudf/pull/20604 is
-        // merged or the pinned cuDF version is updated
-        auto footerBytes = fetchFooterBytes(dataSource_);
-        auto tmpExptSplitReader = std::make_unique<CudfHybridScanReader>(
-            cudf::host_span<uint8_t const>{
-                footerBytes->data(), footerBytes->size()},
-            tmpOptions);
-        rowGroupIndices = tmpExptSplitReader->filter_row_groups_with_stats(
-            rowGroupIndices, tmpOptions, stream_);
+      // Filter row groups using row group byte ranges
+      if (readerOptions_.get_skip_bytes() > 0 or
+          readerOptions_.get_num_bytes().has_value()) {
+        rowGroupIndices = exptSplitReader_->filter_row_groups_with_byte_range(
+            rowGroupIndices, readerOptions_);
       }
 
-      // Workaround: Set a dummy filter expression to avoid erroneous assertion
-      // in `payload_column_chunks_byte_ranges`. TODO(mh): Remove this once PR
-      // https://github.com/rapidsai/cudf/pull/20604 is merged
-      if (not tmpOptions.get_filter().has_value()) {
-        auto scalar = cudf::numeric_scalar<int32_t>(0, false, stream_);
-        auto literal = cudf::ast::literal(scalar);
-        auto filter =
-            cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, literal);
-        tmpOptions.set_filter(filter);
+      // Filter row groups using column chunk statistics
+      if (readerOptions_.get_filter().has_value()) {
+        rowGroupIndices = exptSplitReader_->filter_row_groups_with_stats(
+            rowGroupIndices, readerOptions_, stream_);
       }
 
       // Get column chunk byte ranges to fetch
       const auto columnChunkByteRanges =
-          exptSplitReader_->payload_column_chunks_byte_ranges(
-              rowGroupIndices, tmpOptions);
-      // Fetch row group data device buffers
-      std::vector<rmm::device_buffer> columnChunkBuffers(
-          columnChunkByteRanges.size());
-      std::vector<std::future<size_t>> ioFutures{};
-      ioFutures.reserve(columnChunkByteRanges.size());
-      std::for_each(
-          thrust::counting_iterator<size_t>(0),
-          thrust::counting_iterator(columnChunkByteRanges.size()),
-          [&](auto idx) {
-            const auto& byteRange = columnChunkByteRanges[idx];
-            auto& buffer = columnChunkBuffers[idx];
-
-            // Pad the buffer size to be a multiple of 8 bytes
-            constexpr size_t bufferPaddingMultiple = 8;
-            buffer = rmm::device_buffer(
-                cudf::util::round_up_safe<size_t>(
-                    byteRange.size(), bufferPaddingMultiple),
-                stream_,
-                cudf::get_current_device_resource_ref());
-            // Directly read the column chunk data to the device buffer if
-            // supported
-            if (auto bufferedInput =
-                    dynamic_cast<BufferedInputDataSource*>(dataSource_.get())) {
-              bufferedInput->enqueueForDevice(
-                  static_cast<uint64_t>(byteRange.offset()),
-                  static_cast<uint64_t>(byteRange.size()),
-                  static_cast<uint8_t*>(buffer.data()));
-            } else if (
-                dataSource_->supports_device_read() and
-                dataSource_->is_device_read_preferred(byteRange.size())) {
-              ioFutures.emplace_back(dataSource_->device_read_async(
-                  byteRange.offset(),
-                  byteRange.size(),
-                  static_cast<uint8_t*>(buffer.data()),
-                  stream_));
-            } else {
-              // Read the column chunk data to the host buffer and copy it to
-              // the device buffer
-              auto hostBuffer =
-                  dataSource_->host_read(byteRange.offset(), byteRange.size());
-              CUDF_CUDA_TRY(cudaMemcpyAsync(
-                  buffer.data(),
-                  hostBuffer->data(),
-                  byteRange.size(),
-                  cudaMemcpyHostToDevice,
-                  stream_.value()));
-            }
-          });
-
-      if (auto bufferedInput =
-              dynamic_cast<BufferedInputDataSource*>(dataSource_.get())) {
-        bufferedInput->load(stream_);
-      }
+          exptSplitReader_->all_column_chunks_byte_ranges(
+              rowGroupIndices, readerOptions_);
+
+      // Fetch column chunk byte ranges
+      nvtxRangePush("fetchByteRanges");
+
+      // Tuple containing a vector of device buffers, a vector of device spans
+      // for each input byte range, and a future to wait for all reads to
+      // complete
+      auto ioData = fetchByteRangesAsync(
+          dataSource_,
+          columnChunkByteRanges,
+          stream_,
+          cudf::get_current_device_resource_ref());
+
+      // Wait for all pending reads to complete
+      std::get<2>(ioData).wait();
+      nvtxRangePop();
+
+      // Save state for hybrid scan reader for future calls to `next()`
+      hybridScanState_->columnChunkBuffers_ = std::move(std::get<0>(ioData));
+      hybridScanState_->columnChunkData_ = std::move(std::get<1>(ioData));
 
-      // Wait for all IO futures to complete
-      std::for_each(ioFutures.begin(), ioFutures.end(), [](auto& future) {
-        future.get();
-      });
-
-      // Convert device buffers to device spans
-      auto columnChunkData = [&]() {
-        std::vector<cudf::device_span<uint8_t const>> columnChunkData;
-        columnChunkData.reserve(columnChunkBuffers.size());
-        std::transform(
-            columnChunkBuffers.begin(),
-            columnChunkBuffers.end(),
-            std::back_inserter(columnChunkData),
-            [](auto& buffer) {
-              return cudf::device_span<uint8_t const>{
-                  static_cast<uint8_t*>(buffer.data()), buffer.size()};
-            });
-        return columnChunkData;
-      }();
-
-      // Create an all true row mask to read the table in one go without output
-      // filtering. TODO(mh): Remove this once PR
-      // https://github.com/rapidsai/cudf/pull/20604 is merged
-      const auto totalRows =
-          exptSplitReader_->total_rows_in_row_groups(rowGroupIndices);
-
-      auto const scalarTrue = cudf::numeric_scalar<bool>(true, true, stream_);
-      auto allTrueRowMask =
-          cudf::make_column_from_scalar(scalarTrue, totalRows, stream_);
-
-      // Read the table in one go
-      auto tableWithMetadata = exptSplitReader_->materialize_payload_columns(
+      exptSplitReader_->setup_chunking_for_all_columns(
+          cudfHiveConfig_->maxChunkReadLimit(),
+          cudfHiveConfig_->maxPassReadLimit(),
           rowGroupIndices,
-          columnChunkData,
-          allTrueRowMask->view(),
-          cudf::io::parquet::experimental::use_data_page_mask::NO,
+          hybridScanState_->columnChunkData_,
           readerOptions_,
           stream_,
           cudf::get_current_device_resource_ref());
-
-      // Store the read metadata
-      metadata = std::move(tableWithMetadata.metadata);
-
-      // Apply the subfield filter manually since we passed an all true row mask
-      if (readerOptions_.get_filter().has_value()) {
-        std::unique_ptr<cudf::table> table = std::move(tableWithMetadata.tbl);
-        auto filterMask = cudf::compute_column(
-            *table, readerOptions_.get_filter().value(), stream_);
-        cudfTable = cudf::apply_boolean_mask(
-            table->view(),
-            filterMask->view(),
-            stream_,
-            cudf::get_current_device_resource_ref());
-      } else {
-        cudfTable = std::move(tableWithMetadata.tbl);
-      }
     });
 
-    if (cudfTable == nullptr) {
+    if (not exptSplitReader_->has_next_table_chunk()) {
       return nullptr;
     }
+
+    auto tableWithMetadata = exptSplitReader_->materialize_all_columns_chunk();
+    cudfTable = std::move(tableWithMetadata.tbl);
+    metadata = std::move(tableWithMetadata.metadata);
   }
 
   TotalScanTimeCallbackData* callbackData =
@@ -506,18 +404,18 @@ void CudfHiveDataSource::addSplit(std::shared_ptr<ConnectorSplit> split) {
   if (splitReader_ or exptSplitReader_) {
     splitReader_.reset();
     exptSplitReader_.reset();
-    tableMaterialized_.reset();
+    hybridScanState_.reset();
   }
 
   // Create a cudf split reader
   if (useExperimentalSplitReader_) {
     exptSplitReader_ = createExperimentalSplitReader();
+    hybridScanState_ = std::make_unique<
+        facebook::velox::cudf_velox::connector::hive::HybridScanState>();
   } else {
     splitReader_ = createSplitReader();
   }
 
-  tableMaterialized_ = std::make_unique<std::once_flag>();
-
   // TODO: `completedBytes_` should be updated in `next()` as we read more and
   // more table bytes
   try {
@@ -591,7 +489,7 @@ void CudfHiveDataSource::setupCudfDataSourceAndOptions() {
   }();
 
   if (dataSource_ == nullptr) {
-    dataSource_ = std::move(makeDataSourcesFromSourceInfo(sourceInfo).front());
+    dataSource_ = std::move(cudf::io::make_datasources(sourceInfo).front());
   }
 
   // Reader options
@@ -640,29 +538,30 @@ CudfHybridScanReaderPtr CudfHiveDataSource::createExperimentalSplitReader() {
   stream_ = cudfGlobalStreamPool().get_stream();
 
   // Create a hybrid scan reader
-  auto const footerBytes = fetchFooterBytes(dataSource_);
-  auto exptSplitReader = std::make_unique<CudfHybridScanReader>(
-      cudf::host_span<uint8_t const>{footerBytes->data(), footerBytes->size()},
-      readerOptions_);
+  nvtxRangePush("hybridScanReader");
+  auto const footerBuffer = fetchFooterBytes(dataSource_);
+  auto splitReader =
+      std::make_unique<CudfHybridScanReader>(*footerBuffer, readerOptions_);
+  nvtxRangePop();
 
   // Setup page index if available
-  auto const pageIndexByteRange = exptSplitReader->page_index_byte_range();
+  auto const pageIndexByteRange = splitReader->page_index_byte_range();
   if (not pageIndexByteRange.is_empty()) {
-    auto const pageIndexBytes = dataSource_->host_read(
-        pageIndexByteRange.offset(), pageIndexByteRange.size());
-    exptSplitReader->setup_page_index(
-        cudf::host_span<uint8_t const>{
-            pageIndexBytes->data(), pageIndexBytes->size()});
+    nvtxRangePush("setupPageIndex");
+    auto const pageIndexBuffer =
+        fetchPageIndexBytes(dataSource_, pageIndexByteRange);
+    splitReader->setup_page_index(*pageIndexBuffer);
+    nvtxRangePop();
   }
 
-  return exptSplitReader;
+  return splitReader;
 }
 
 void CudfHiveDataSource::resetSplit() {
   split_.reset();
   splitReader_.reset();
   exptSplitReader_.reset();
-  tableMaterialized_.reset();
+  hybridScanState_.reset();
   dataSource_.reset();
 }