
Commit 1eacd9b

Drop codegen support of gather (but not takeAlongAxis) (#5907)
Gather allows the non-gathered dimensions of the output to be smaller than those of the lookup tensor, which complicates indexing. TensorIndexer does not yet support this case; only the legacy indexer does. Note that takeAlongAxis, which is a restricted case of gather, is supported.

The motivation is to remove the legacy indexer, and this is the only remaining fallback case. One way to support it would be to decompose gather into a takeAlongAxis and a slice. For now, this PR disables codegen of gather and delegates it to ExprEval.

Note that the cross-entropy benchmark does use gather rather than takeAlongAxis, so there's a pending change needed in Thunder; see #3924 (comment). While this is a perf regression, at this point I think it's more important to remove the large technical debt.

In a follow-up PR, I'll remove the legacy indexer. This PR just inserts an assertion that no fallback is necessary, which should hold given the scheduler changes.
1 parent 1bded43 commit 1eacd9b
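
For context, the shape rule behind the change can be seen with plain ATen calls. This is a sketch for illustration only, not code from this PR; nvfuser's gather and takeAlongAxis ops follow the same semantics as at::gather and its exact-size restriction, respectively.

// Lookup tensor of shape {128, 128}, gathered along dim 1.
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({128, 128}, options);

// Non-exact case: the non-gathered extent of the index (4) is smaller than
// that of the lookup tensor (128), and the output follows the index shape.
// This is the form that codegen no longer supports.
at::Tensor idx = at::randint(0, 128, {4, 128}, options_i);
at::Tensor out = at::gather(t0, 1, idx); // shape {4, 128}

// Exact case, which takeAlongAxis models: every non-gathered extent of the
// index matches the lookup tensor, e.g. an index of shape {128, 1}. Only
// this form remains on the codegen path.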

File tree: 6 files changed, +18 −136 lines


csrc/id_model/indexing.cpp

Lines changed: 2 additions & 0 deletions

@@ -34,6 +34,8 @@
 namespace nvfuser {
 
 TensorIndexer::TensorIndexer(IdModel& id_model) : id_model_(id_model) {
+  NVF_ERROR(isSupported(id_model.fusion()));
+
   buildLoopIndexMap();
 
   if (isDebugDumpEnabled(DebugDumpOption::IndexingVerbose)) {

csrc/scheduler/expr_eval_sched.cpp

Lines changed: 1 addition & 0 deletions

@@ -64,6 +64,7 @@ bool ExprEvalScheduler::canScheduleCompileTime(Fusion* fusion) {
   // TODO: remove IndexPutAccumulateOp
   if (exprs.front()
           ->isOneOf<
+              GatherOp,
               ScatterOp,
               SdpaFwdOp,
               SdpaBwdOp,
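With GatherOp added to this list, a segment whose only expression is a gather can be claimed by the ExprEval scheduler. Below is a hedged sketch of how that might be exercised from a test, reusing helpers that appear elsewhere in this diff (makeContigTensor, FusionExecutorCache, validateSegmentation); the shapes are illustrative, not taken from the PR.

// A fusion containing a single non-exact gather.
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr;
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(2);
auto tv1 = makeContigTensor(2, DataType::Int);
fusion.addInput(tv0);
fusion.addInput(tv1);
auto tv2 = gather(tv0, 1, tv1);
fusion.addOutput(tv2);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({32, 32}, options);
at::Tensor t1 = at::randint(0, 32, {8, 32}, options_i); // non-exact index

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs({t0, t1});

// Expect the whole fusion to be handled by ExprEval rather than codegen.
validateSegmentation(
    executor_cache.getMostRecentKernelRuntime(), {SchedulerType::ExprEval});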

csrc/scheduler/registry.cpp

Lines changed: 10 additions & 0 deletions

@@ -64,6 +64,16 @@ bool checkCanSchedule(Fusion* fusion, SchedulerType scheduler_type) {
     return false;
   }
 
+  // Support of non-exact gather was dropped when the legacy indexer was
+  // deprecated
+  if (std::ranges::any_of(
+          ir_utils::getOpsOfType<GatherOp>(fusion),
+          [](GatherOp* gather) { return !gather->exactSizes(); })) {
+    scheduler_debug_utils::canScheduleRejectReason(
+        scheduler_type, "Non-exact gather ops");
+    return false;
+  }
+
   // Fusions with `MatmulOp, LinearOp, MmaOp` can only be accepted by Matmul
   // scheduler.
   if (scheduler_type != SchedulerType::Matmul &&
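The rejection hinges on GatherOp::exactSizes(). As a minimal sketch of what the check above distinguishes (this mirrors the new code and adds nothing beyond it):

for (GatherOp* op : ir_utils::getOpsOfType<GatherOp>(fusion)) {
  if (op->exactSizes()) {
    // takeAlongAxis-style gather: non-gathered index extents match the
    // lookup tensor, so TensorIndexer can index it and codegen schedulers
    // may still accept the fusion.
  } else {
    // Non-exact gather: every codegen scheduler now rejects the fusion
    // here, and segmentation routes the expression to ExprEval.
  }
}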

tests/cpp/test_gather.cpp

Lines changed: 1 addition & 134 deletions

@@ -582,7 +582,7 @@ TEST_F(GatherTest, TakeAlongAxisIntermediateTensorReduction1) {
 
   validateSegmentation(
       executor_cache.getMostRecentKernelRuntime(),
-      {SchedulerType::Reduction, SchedulerType::PointWise});
+      {SchedulerType::Reduction, SchedulerType::ExprEval});
 
   testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__);
 }
@@ -1126,137 +1126,4 @@ TEST_F(GatherTest, TakeAlongAxisCrossEntropyLoss) {
   testValidate(fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
 }
 
-// Test grouped reduction on IterType::GatherScatter
-TEST_F(GatherTest, GatherIterGoupedReduction) {
-  const int max_dim_size = 128;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-
-  int rank = 3;
-  int dim = 2;
-
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  TensorView* tv1 = makeContigTensor(rank);
-  TensorView* tv_idx = makeContigTensor(rank, DataType::Int);
-  fusion.addInput(tv1);
-  fusion.addInput(tv_idx);
-  auto tv_gather = gather(tv1, dim, tv_idx);
-  auto tv_sum = sum(tv_gather, {0}, false);
-  fusion.addOutput(tv_sum);
-
-  // simply gather all elements
-  auto input_dims =
-      std::vector<int64_t>({max_dim_size, max_dim_size, max_dim_size});
-  auto index_dims = input_dims;
-  std::vector<int64_t> input2_dims(rank - 1, 0);
-  for (int idim = 0; idim < rank - 1; ++idim) {
-    input2_dims[idim] = index_dims[idim + 1];
-  }
-
-  at::Tensor t0 = at::randn(input_dims, options);
-  at::Tensor idx = at::randint(0, input_dims[dim], index_dims, options_i);
-
-  auto reduction_scheduler =
-      SchedulerEntry::makeSchedulerInstance(SchedulerType::Reduction);
-  SchedulerRuntimeInfo runtime_info(&fusion, {t0, idx});
-  auto heuristic_params =
-      reduction_scheduler->computeHeuristics(&fusion, runtime_info);
-  auto rparams = heuristic_params->as<ReductionParams>();
-
-  // Enforce vectorization so we can group them
-  const int vect_factor = 2;
-  rparams->vectorize_iter_dom = true;
-  rparams->unroll_factor_iter_dom = vect_factor;
-  // Enforce grid reduction, which requires a determined BIDy
-  // If the heuristic does not have a BIDy, bind it to 2
-  rparams->cross_grid_inner_reduction = true;
-  rparams->split_grid_dim_inner_reduction = true;
-  rparams->grid_dim_inner_reduction = ParallelType::BIDy;
-  if (!rparams->lparams.hasDim(ParallelType::BIDy)) {
-    rparams->lparams.bind(2L, ParallelType::BIDy);
-  }
-
-  reduction_scheduler->schedule(&fusion, rparams);
-
-  // lowering & check iteration grouped reductions
-  GpuLower gpulw(&fusion);
-  gpulw.run();
-  NVF_CHECK(
-      gpulw.kernel()->summary().has_iter_grouped_reductions,
-      "There must be iter domain grouped reductions.");
-  NVF_CHECK(
-      gpulw.kernel()->summary().num_grouped_iterations == vect_factor,
-      "Expected ",
-      vect_factor,
-      " grouped iterations, found ",
-      gpulw.kernel()->summary().num_grouped_iterations);
-
-  KernelExecutor ke;
-  auto lparams = rparams->lparams;
-  ke.compile(&fusion, {t0, idx}, lparams);
-  auto cg_outputs = ke.run({t0, idx}, {}, lparams);
-
-  auto t_gather = at::gather(t0, dim, idx);
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {t0, idx},
-      {t_gather.sum(0)},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(GatherTest, SameTvUsedAsLookupAndIndex) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  // Create three input tensors
-  auto tv0 = makeContigTensor(2);
-  auto tv1 = makeContigTensor(2, DataType::Int);
-  auto tv2 = makeContigTensor(2, DataType::Int);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-
-  auto tv3 = gather(tv0, 1, tv1);
-  auto tv4 = gather(tv1, 1, tv2);
-  auto tv5 = castOp(DataType::Float, tv4);
-  auto tv6 = add(tv3, tv5);
-  fusion.addOutput(tv6);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-
-  // Create test tensors
-  std::vector<int64_t> dims{4, 6};
-  at::Tensor t0 = at::randn(dims, options);
-  at::Tensor t1 = at::randint(0, dims[1], dims, options_i);
-  at::Tensor t2 = at::randint(0, dims[1], dims, options_i);
-
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
-
-  auto runtime = executor_cache.getMostRecentKernelRuntime();
-  auto scheduled_fusion = runtime->executors()
-                              .back()
-                              ->as<KernelExecutor>()
-                              ->compiledKernel()
-                              ->kernel();
-  auto tv1_uses = scheduled_fusion->inputs().at(1)->uses();
-  EXPECT_EQ(tv1_uses.size(), 2);
-  EXPECT_THAT(
-      tv1_uses,
-      testing::UnorderedElementsAre(
-          testing::Truly([](Expr* e) { return e->isA<GatherOp>(); }),
-          testing::Truly([](Expr* e) { return e->isA<LoadStoreOp>(); })));
-
-  // Validate the result
-  testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
-}
 } // namespace nvfuser

tests/cpp/test_persistent_buffer.cpp

Lines changed: 3 additions & 1 deletion

@@ -1941,7 +1941,9 @@ TEST_F(PersistentBufferTest, BufferGatherLookupTv) {
   auto tv2 = sum(tv1, {1});
   auto tv3 = broadcast(tv2, {false, true});
   auto tv4 = broadcast(index_tv, {false, true});
-  auto tv5 = gather(tv0, 1, tv4);
+  // Use takeAlongAxis rather than gather as codegen does not support
+  // the latter
+  auto tv5 = takeAlongAxis(tv0, tv4, 1);
   auto tv6 = maybeCastOp(DataType::BFloat16, tv5);
   auto tv7 = add(tv3, tv6);
   auto tv8 = add(tv1, tv7);

tests/cpp/test_reduction.cpp

Lines changed: 1 addition & 1 deletion

@@ -2563,7 +2563,7 @@ TEST_F(ReductionTest, CrossEntropyGatherPattern) {
   fusion.addInput(labels);
 
   auto tv2 = broadcast(labels, {false, true});
-  auto tv3 = gather(log_probs, 1, tv2);
+  auto tv3 = takeAlongAxis(log_probs, tv2, 1);
   auto tv4 = squeeze(tv3, std::vector<bool>({false, true}));
 
   fusion.addOutput(tv4);
