use raft 23.08 (#681)

benfred · web-flow · commit 42832574f1a2 · 2023-07-20T15:35:36.000-07:00
Update to the latest RAFT version. RAFT 23.08 adds a sort option to the `select_k` method (rapidsai/raft#1615), which means we no longer have to sort the output ourselves.
diff --git a/implicit/gpu/CMakeLists.txt b/implicit/gpu/CMakeLists.txt
@@ -14,7 +14,7 @@ else()
     add_cython_target(_cuda CXX)
 
     # use rapids-cmake to install dependencies
-    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake
+    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake
         ${CMAKE_BINARY_DIR}/RAPIDS.cmake)
     include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
     include(rapids-cmake)
@@ -57,10 +57,10 @@ else()
     # get raft
     # note: we're using RAFT in header only mode right now - mainly to reduce binary
     # size of the compiled wheels
-    rapids_cpm_find(raft 23.06
+    rapids_cpm_find(raft 23.08
         CPM_ARGS
           GIT_REPOSITORY  https://github.com/rapidsai/raft.git
-          GIT_TAG         branch-23.06
+          GIT_TAG         branch-23.08
           DOWNLOAD_ONLY   YES
     )
     include_directories(${raft_SOURCE_DIR}/cpp/include)
diff --git a/implicit/gpu/knn.cu b/implicit/gpu/knn.cu
@@ -236,25 +236,17 @@ void KnnQuery::topk_impl(const Matrix &items, const Matrix &query, int k,
     }
 
     auto current_k = std::min(k, static_cast<int>(temp_distances.cols));
-    rmm::device_uvector<float> best_distances(temp_distances.rows * current_k,
-                                              stream, mr.get());
-    rmm::device_uvector<int> best_indices(temp_distances.rows * current_k,
-                                          stream, mr.get());
 
     auto distance_view = raft::make_device_matrix_view<const float, int64_t>(
         temp_distances, temp_distances.rows, temp_distances.cols);
 
     raft::matrix::select_k<float, int>(
         handle, distance_view, std::nullopt,
         raft::make_device_matrix_view<float, int64_t>(
-            best_distances.data(), temp_distances.rows, current_k),
+            distances + start * k, temp_distances.rows, current_k),
         raft::make_device_matrix_view<int, int64_t>(
-            best_indices.data(), temp_distances.rows, current_k),
-        false);
-
-    // raft::select_k doesn't sort inputs - so we have to do it here
-    argsort(best_indices.data(), best_distances.data(), temp_distances.rows,
-            current_k, indices + start * k, distances + start * k);
+            indices + start * k, temp_distances.rows, current_k),
+        false, true);
     // TODO: callback per batch (show progress etc)
   }
 
@@ -271,44 +263,6 @@ void KnnQuery::topk_impl(const Matrix &items, const Matrix &query, int k,
   }
 }
 
-void KnnQuery::argsort(const int *input_indices, const float *input_distances,
-                       int rows, int cols, int *indices, float *distances) {
-  rmm::cuda_stream_view stream;
-  auto segment_offsets = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<int>(0),
-      [=] __device__(int i) { return i * cols; });
-
-  void *temp_mem = NULL;
-  size_t temp_size = 0;
-
-  // sort the values.
-  if (rows > 1) {
-    auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-        NULL, temp_size, input_distances, distances, input_indices, indices,
-        rows * cols, rows, segment_offsets, segment_offsets + 1, 0,
-        sizeof(float) * 8, stream);
-    CHECK_CUDA(err);
-    temp_mem = mr->allocate(temp_size, stream);
-    err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
-        temp_mem, temp_size, input_distances, distances, input_indices, indices,
-        rows * cols, rows, segment_offsets, segment_offsets + 1, 0,
-        sizeof(float) * 8, stream);
-    CHECK_CUDA(err);
-  } else {
-    size_t temp_size = 0;
-    auto err = cub::DeviceRadixSort::SortPairsDescending(
-        NULL, temp_size, input_distances, distances, input_indices, indices,
-        cols, 0, sizeof(float) * 8, stream);
-    CHECK_CUDA(err);
-    temp_mem = mr->allocate(temp_size, stream);
-    err = cub::DeviceRadixSort::SortPairsDescending(
-        temp_mem, temp_size, input_distances, distances, input_indices, indices,
-        cols, 0, sizeof(float) * 8, stream);
-    CHECK_CUDA(err);
-  }
-  mr->deallocate(temp_mem, temp_size, stream);
-}
-
 KnnQuery::~KnnQuery() {}
 
 } // namespace gpu
diff --git a/implicit/gpu/knn.h b/implicit/gpu/knn.h
@@ -25,9 +25,6 @@ class KnnQuery {
                  const COOMatrix *query_filter = NULL,
                  Vector<int> *item_filter = NULL);
 
-  void argsort(const int *input_indices, const float *input_distances, int rows,
-               int cols, int *indices, float *distances);
-
 protected:
   std::unique_ptr<rmm::mr::device_memory_resource> mr;
   raft::resources handle;