
Commit a32e667

pifon2a authored and Google-ML-Automation committed
[XLA:GPU] Enable vectorization of the indices operand for scatter.
PiperOrigin-RevId: 715329926
1 parent 8a11e8e commit a32e667
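Note: in plain terms, the ScatterWithDistributedIndices emitter previously read one scatter index per loop iteration; after this change, each warp loop step covers `indices_vector_size` consecutive indices so the loads can be emitted as one vector read. A minimal standalone C++ sketch of the loop restructuring (illustrative names, not the MLIR emitter itself):

```cpp
#include <cstdint>
#include <functional>
#include <vector>

// Sketch: visit the indices a warp owns. Before this commit the loop advanced
// one index at a time; now an outer loop steps by the vector width and an
// inner loop walks the lanes, so the loads coalesce into one vector read.
void VisitWarpIndices(const std::vector<int32_t>& indices, int64_t index_start,
                      int64_t num_indices_per_warp,
                      int64_t indices_vector_size,
                      const std::function<void(int64_t, int32_t)>& visit) {
  for (int64_t i = 0; i < num_indices_per_warp / indices_vector_size; ++i) {
    for (int64_t j = 0; j < indices_vector_size; ++j) {
      const int64_t id = i * indices_vector_size + j;  // de-vectorized id
      visit(id, indices[index_start + id]);
    }
  }
}
```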

3 files changed: +128 −78 lines

xla/backends/gpu/codegen/emitters/scatter.cc

Lines changed: 108 additions & 66 deletions
```diff
@@ -81,6 +81,7 @@ using emitters::PartitionedComputations;
 using emitters::ProvideParameter;
 using llvm::APFloat;
 using llvm::APInt;
+using llvm::ArrayRef;
 using llvm::SmallVector;
 using mlir::AffineExpr;
 using mlir::AffineMap;
@@ -171,7 +172,7 @@ Value UpdateIsInbounds(ImplicitLocOpBuilder& b, Value is_inbounds,
       .front();
 }
 
-SmallVector<Value> Pack(llvm::ArrayRef<ValueRange> ranges) {
+SmallVector<Value> Pack(ArrayRef<ValueRange> ranges) {
   int64_t total_size = 0;
   for (auto& range : ranges) {
     total_size += range.size();
@@ -184,8 +185,7 @@ SmallVector<Value> Pack(llvm::ArrayRef<ValueRange> ranges) {
   return result;
 }
 
-SmallVector<ValueRange> Unpack(ValueRange range,
-                               llvm::ArrayRef<int64_t> sizes) {
+SmallVector<ValueRange> Unpack(ValueRange range, ArrayRef<int64_t> sizes) {
   int64_t total_size = 0;
   for (auto& size : sizes) {
     total_size += size;
```
```diff
@@ -213,29 +213,42 @@ SmallVector<Value, 4> PadWithZeros(ValueRange values, int64_t size,
 }
 
 // Creates a new indexing map that is the same as `map` but with the range
-// variable at `range_var_index` replaced with the new dimension variable at
-// `dimension_{dim_var_size)`. Potentially, it can be moved to indexing_map.h.
-IndexingMap ConvertRangeVariableToDimension(const IndexingMap& map,
-                                            int64_t range_var_index) {
+// variables at `range_var_indices` converted to new dimension variables and
+// appended to the end of the dimension variable list. Potentially, it can be
+// moved to indexing_map.h.
+IndexingMap ConvertRangeVariableToDimension(
+    const IndexingMap& map, ArrayRef<int64_t> range_var_indices) {
+  CHECK(std::is_sorted(range_var_indices.begin(), range_var_indices.end()));
   auto* mlir_context = map.GetMLIRContext();
 
   AffineMap affine_map = map.GetAffineMap();
-  // Update the affine map.
+  // Update the affine map and the variables.
+  std::vector<IndexingMap::Variable> dims = map.GetDimVars();
+  std::vector<IndexingMap::Variable> range_vars;
+  std::vector<IndexingMap::Variable> rt_vars = map.GetRTVars();
   SmallVector<AffineExpr, 4> symbol_replacements;
   symbol_replacements.reserve(affine_map.getNumSymbols());
+  int64_t range_var_count = 0;
+  int64_t range_var_indices_count = range_var_indices.size();
   for (int i = 0; i < affine_map.getNumSymbols(); ++i) {
-    if (i == range_var_index) {
+    auto range_var = map.GetRangeVar(i);
+    if (range_var_count < range_var_indices_count &&
+        i == range_var_indices[range_var_count]) {
       symbol_replacements.push_back(
           getAffineDimExpr(affine_map.getNumDims(), mlir_context));
+      dims.push_back(range_var);
+      range_var_count++;
     } else {
       symbol_replacements.push_back(
-          getAffineSymbolExpr(i > range_var_index ? i - 1 : i, mlir_context));
+          getAffineSymbolExpr(i - range_var_count, mlir_context));
+      range_vars.push_back(range_var);
     }
   }
 
   AffineMap converted_affine_map = affine_map.replaceDimsAndSymbols(
-      {}, symbol_replacements, affine_map.getNumDims() + 1,
-      affine_map.getNumSymbols() - 1);
+      {}, symbol_replacements,
+      affine_map.getNumDims() + range_var_indices_count,
+      affine_map.getNumSymbols() - range_var_indices_count);
 
   // Update the constraints.
   std::vector<std::pair<AffineExpr, Interval>> constraints;
@@ -244,13 +257,6 @@ IndexingMap ConvertRangeVariableToDimension(const IndexingMap& map,
     constraints.push_back({constraint.first.replaceSymbols(symbol_replacements),
                            constraint.second});
   }
-  // Update the variables.
-  std::vector<IndexingMap::Variable> dims = map.GetDimVars();
-  std::vector<IndexingMap::Variable> range_vars = map.GetRangeVars();
-  std::vector<IndexingMap::Variable> rt_vars = map.GetRTVars();
-
-  dims.push_back(range_vars[range_var_index]);
-  range_vars.erase(range_vars.begin() + range_var_index);
   return IndexingMap{converted_affine_map, std::move(dims),
                      std::move(range_vars), std::move(rt_vars), constraints};
 }
```
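For intuition, a dependency-free sketch of the renumbering above: each range variable named in `range_var_indices` becomes a trailing dimension, and the remaining symbols are shifted down to close the gaps. In this sketch each promoted variable gets its own new dimension; the commit itself only ever passes a single index (`{0}`), for which the two behaviors coincide. Names are illustrative.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Toy model of ConvertRangeVariableToDimension's renumbering: symbol s_i is
// either promoted to a new trailing dimension or shifted down by the number
// of symbols already promoted before it.
std::vector<std::string> RenumberSymbols(
    int num_dims, int num_symbols, const std::vector<int>& range_var_indices) {
  assert(std::is_sorted(range_var_indices.begin(), range_var_indices.end()));
  std::vector<std::string> replacements;
  int promoted = 0;
  for (int i = 0; i < num_symbols; ++i) {
    if (promoted < static_cast<int>(range_var_indices.size()) &&
        i == range_var_indices[promoted]) {
      replacements.push_back("d" + std::to_string(num_dims + promoted));
      ++promoted;
    } else {
      replacements.push_back("s" + std::to_string(i - promoted));
    }
  }
  return replacements;
}

int main() {
  // 2 dims, 4 symbols, promote s1 and s3: s0 -> s0, s1 -> d2, s2 -> s1, s3 -> d3.
  for (const std::string& r : RenumberSymbols(2, 4, {1, 3})) {
    std::printf("%s ", r.c_str());
  }
  std::printf("\n");
}
```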
```diff
@@ -299,7 +305,8 @@ class EmitterHelper {
 
   Value WriteAccumulatorToOutput(ImplicitLocOpBuilder& b,
                                  Value write_to_output_required,
-                                 ValueRange thread_and_block_ids, Value iv,
+                                 ValueRange thread_and_block_ids,
+                                 Value index_id,
                                  const IndexingMap& slice_indexing,
                                  ValueRange offsets, Value accumulator,
                                  Value output_tensor) const;
@@ -370,10 +377,10 @@ SmallVector<Value> EmitterHelper::WriteAccumulatedElementToOutput(
 
 Value EmitterHelper::WriteAccumulatorToOutput(
     ImplicitLocOpBuilder& b, Value write_to_output_required,
-    ValueRange thread_and_block_ids, Value iv,
+    ValueRange thread_and_block_ids, Value index_id,
     const IndexingMap& slice_indexing, ValueRange offsets, Value accumulator,
     Value output_tensor) const {
-  SmallVector<Value> dims = Pack({thread_and_block_ids, iv});
+  SmallVector<Value> dims = Pack({thread_and_block_ids, index_id});
   return EmitUpdateIf(
       b, write_to_output_required, output_tensor,
       [&](ImplicitLocOpBuilder& if_builder) -> SmallVector<Value> {
@@ -569,9 +576,9 @@ absl::Status ScatterWithDistributedUpdates::EmitEntryFunctionImpl(
     ValueRange thread_and_block_ids, Value output_tensor) const {
   if (VLOG_IS_ON(5)) {
     llvm::errs() << "Settings for ScatterWithDistributedUpdates: \n"
-                 << "vector_size_: " << vector_size_ << "\n"
-                 << "num_warps_: " << num_warps_ << "\n"
-                 << "num_blocks_: " << num_blocks_;
+                 << "vector_size: " << vector_size_ << "\n"
+                 << "num_warps: " << num_warps_ << "\n"
+                 << "num_blocks: " << num_blocks_ << "\n";
   }
   EmitNaiveImplementation(b, description_, helper, updates_map, indices_map,
                           thread_and_block_ids, output_tensor);
@@ -581,10 +588,11 @@ absl::Status ScatterWithDistributedUpdates::EmitEntryFunctionImpl(
 ScatterWithDistributedIndices::ScatterWithDistributedIndices(
     const HloFusionAnalysis& analysis, const ScatterDescription& description,
     int64_t vector_size, int64_t num_warps_per_slice,
-    int64_t num_indices_per_warp)
+    int64_t num_indices_per_warp, int64_t indices_vector_size)
     : ScatterFusion(analysis, description, vector_size),
       num_warps_per_slice_(num_warps_per_slice),
-      num_indices_per_warp_(num_indices_per_warp) {
+      num_indices_per_warp_(num_indices_per_warp),
+      indices_vector_size_(indices_vector_size) {
   num_warps_ = kNumWarpsPerBlock;
   num_blocks_ = CeilOfRatio(description.num_slices * num_warps_per_slice_,
                             num_indices_per_warp_ * num_warps_);
@@ -605,32 +613,40 @@ void ScatterWithDistributedIndices::ComputeIndexing(
       (block_x * num_warps_ + warp_id) % num_warps_per_slice_;
   auto lane_id = thread_x % warp_size_;
   auto index_id_loop = getAffineSymbolExpr(0, ctx);
+  auto index_vector_id = getAffineSymbolExpr(1, ctx);
 
-  auto index_id_expr = slice_id * num_indices_per_warp_ + index_id_loop;
-  std::pair<AffineExpr, Interval> index_id_constraint =
-      std::make_pair(index_id_expr, Interval{0, description_.num_slices - 1});
+  auto vectorized_index_id_expr = slice_id * num_indices_per_warp_ +
+                                  index_id_loop * indices_vector_size_ +
+                                  index_vector_id;
 
   auto grid_vars =
       DimVarsFromGPUGrid({num_warps_ * warp_size_, 1, 1, num_blocks_, 1, 1});
   if (indices_map) {
-    auto index_dim_loop = getAffineSymbolExpr(1, ctx);
+    auto index_dim_loop = getAffineSymbolExpr(2, ctx);
     *indices_map = IndexingMap{
-        AffineMap::get(6, 2, {index_id_expr, index_dim_loop}, ctx),
+        AffineMap::get(6, 3, {vectorized_index_id_expr, index_dim_loop}, ctx),
         grid_vars,
-        {IndexingMap::Variable{{0, num_indices_per_warp_ - 1}, "index_id_loop"},
+        {IndexingMap::Variable{
+             {0, num_indices_per_warp_ / indices_vector_size_ - 1},
+             "index_id_loop"},
+         IndexingMap::Variable{{0, indices_vector_size_ - 1},
+                               "index_vector_id"},
         IndexingMap::Variable{{0, description_.index_vector_length - 1},
                                "index_dim"}},
         /*rt_vars=*/{},
-        {index_id_constraint}};
+        {std::make_pair(vectorized_index_id_expr,
+                        Interval{0, description_.num_slices - 1})}};
 
     indices_map->Simplify();
   }
 
   if (updates_map) {
+    auto index_id = getAffineSymbolExpr(0, ctx);
     auto update_dim_loop = getAffineSymbolExpr(1, ctx);
     auto vector_id = getAffineSymbolExpr(2, ctx);
     auto num_elements_per_slice = Product(description_.slice_shape);
 
+    auto index_id_expr = slice_id * num_indices_per_warp_ + index_id;
     auto linear_slice_index =
         warp_id_in_slice * warp_size_ * vector_size_ +
         update_dim_loop * vector_size_ * warp_size_ * num_warps_per_slice_ +
@@ -652,7 +668,8 @@ void ScatterWithDistributedIndices::ComputeIndexing(
          IndexingMap::Variable{{0, vector_size_ - 1}, "vector_id"}},
         /*rt_vars=*/{},
         std::vector<std::pair<AffineExpr, Interval>>{
-            index_id_constraint,
+            std::make_pair(index_id_expr,
+                           Interval{0, description_.num_slices - 1}),
             std::make_pair(linear_slice_index,
                            Interval{0, num_elements_per_slice - 1})}};
 
```
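The indices map now splits a warp's index loop into `index_id_loop` (outer) and `index_vector_id` (inner lane). A quick self-check, with hypothetical sizes, that the new `vectorized_index_id_expr` enumerates exactly the contiguous ids the scalar `index_id_expr` covered before:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical sizes, chosen only for illustration.
  const int64_t num_indices_per_warp = 8;
  const int64_t indices_vector_size = 4;
  const int64_t slice_id = 3;  // which contiguous chunk of indices this warp owns

  // vectorized_index_id_expr = slice_id * num_indices_per_warp
  //                          + index_id_loop * indices_vector_size
  //                          + index_vector_id
  int64_t expected = slice_id * num_indices_per_warp;
  for (int64_t loop = 0; loop < num_indices_per_warp / indices_vector_size;
       ++loop) {
    for (int64_t lane = 0; lane < indices_vector_size; ++lane) {
      const int64_t id =
          slice_id * num_indices_per_warp + loop * indices_vector_size + lane;
      assert(id == expected++);  // same ids, traversed vector-wise
    }
  }
  return 0;
}
```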
```diff
@@ -695,11 +712,12 @@ absl::Status ScatterWithDistributedIndices::EmitEntryFunctionImpl(
     ValueRange thread_and_block_ids, Value output_tensor) const {
   if (VLOG_IS_ON(5)) {
     llvm::errs() << "Settings for ScatterWithDistributedIndices: \n"
-                 << "vector_size_: " << vector_size_ << "\n"
-                 << "num_warps_: " << num_warps_ << "\n"
-                 << "num_blocks_: " << num_blocks_
-                 << "num_warps_per_slice_: " << num_warps_per_slice_ << "\n"
-                 << "num_indices_per_warp_: " << num_indices_per_warp_;
+                 << "vector_size: " << vector_size_ << "\n"
+                 << "num_warps: " << num_warps_ << "\n"
+                 << "num_blocks: " << num_blocks_ << "\n"
+                 << "num_warps_per_slice: " << num_warps_per_slice_ << "\n"
+                 << "num_indices_per_warp: " << num_indices_per_warp_ << "\n"
+                 << "indices_vector_size: " << indices_vector_size_ << "\n";
   }
   if (num_indices_per_warp_ == 1) {
     EmitNaiveImplementation(b, description_, helper, updates_map, indices_map,
@@ -709,12 +727,17 @@ absl::Status ScatterWithDistributedIndices::EmitEntryFunctionImpl(
   MLIRContext* mlir_context = b.getContext();
 
   auto thread_id_to_update_id_map = IndexingMap(
-      AffineMap::get(6, 1, {updates_map.GetAffineMap().getResult(0)},
+      AffineMap::get(6, 2, {indices_map.GetAffineMap().getResult(0)},
                      mlir_context),
-      updates_map.GetDimVars(),
-      /*range_vars = */ {updates_map.GetRangeVars().front()},
-      /*rt vars = */ {});
-  IndexingMap slice_indexing = ConvertRangeVariableToDimension(updates_map, 0);
+      indices_map.GetDimVars(),
+      /*range_vars = */
+      {indices_map.GetRangeVars().begin(),
+       indices_map.GetRangeVars().begin() + 2},
+      /*rt vars = */ {}, indices_map.GetConstraints());
+
+  // Convert index_id_loop and index_vector_id to dimension variables.
+  IndexingMap slice_indexing =
+      ConvertRangeVariableToDimension(updates_map, {0});
 
   // Prepare loop initial values. Inits are packed as
   // [index_changed, is_inbounds, index_0, ..., accumulator].
@@ -740,7 +763,14 @@ absl::Status ScatterWithDistributedIndices::EmitEntryFunctionImpl(
         Value iter_is_inbounds = iter_args_unpack[2].front();
         Value iter_acc = iter_args_unpack[3].front();
         Value iter_output = iter_args_unpack[4].front();
-        Value iter_slice_id = ivs.front();
+        CHECK_EQ(ivs.size(), 2);
+        Value index_loop_id = ivs.front();
+        Value index_vector_id = ivs.back();
+        Value iter_slice_id = nested_b.create<arith::AddIOp>(
+            nested_b.create<arith::MulIOp>(
+                index_loop_id,
+                nested_b.create<arith::ConstantIndexOp>(indices_vector_size_)),
+            index_vector_id);
 
         SmallVector<Value> offsets =
             PadWithZeros(trimmed_offsets, output_rank, nested_b);
```
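With two induction variables, the emitter rebuilds the scalar slice id with one `arith.muli` and one `arith.addi`. The scalar equivalent of that pair (a sketch, not the MLIR itself):

```cpp
#include <cstdint>

// Scalar equivalent of the emitted arith.muli / arith.addi pair: recover the
// flat per-warp index id from the outer loop iv and the vector-lane iv.
int64_t IterSliceId(int64_t index_loop_id, int64_t index_vector_id,
                    int64_t indices_vector_size) {
  return index_loop_id * indices_vector_size + index_vector_id;
}
```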
```diff
@@ -926,32 +956,44 @@ std::unique_ptr<ScatterFusion> CreateScatterFusion(
 
   int64_t max_active_warps =
       kNumWarpsPerBlock * analysis.device_info().core_count();
-  // For sorted scatter, we try to estimate the number of updates per warp by
-  // computing the ratio of the number of the given updates to the number of the
-  // possible valid indices. If we do not have multiple updates per warp, there
-  // is no reason to use this algorithm.
   // TODO(b/385081952): Investigate why bf16 and f64 leads to incorrect results.
-  if (description.scatter->indices_are_sorted() &&
-      description.elem_type != BF16 && num_slices > 2 * max_active_warps) {
-    int64_t num_indices_per_warp = CeilOfRatio(
-        num_slices, GetNumPossibleValidIndices(
-                        description.slice_shape, description.output_shape,
-                        description.index_vector_length));
-    int64_t num_warps_per_slice = 1;
-    if (num_indices_per_warp > 2 &&
-        num_active_threads_per_warp > warp_size / 2) {
-      return std::make_unique<ScatterWithDistributedIndices>(
-          analysis, description, vector_size, num_warps_per_slice,
-          num_indices_per_warp);
-    }
-  }
   // If we have enough data, we assign each warp to process a single
   // slice.
   if (num_slices > max_active_warps &&
       num_active_threads_per_warp > warp_size / 2) {
+    int64_t num_indices_per_warp = 1;
+    int64_t indices_vector_size = 1;
+    int64_t num_warps_per_slice = 1;
+    // For sorted scatter, we try to estimate the number of updates per warp
+    // by computing the ratio of the number of the given updates to the number
+    // of the possible valid indices. If we do not have multiple updates per
+    // warp, there is no reason to use this algorithm.
+    if (description.scatter->indices_are_sorted()) {
+      num_indices_per_warp = CeilOfRatio(
+          num_slices,
+          std::max(max_active_warps,
+                   GetNumPossibleValidIndices(
+                       description.slice_shape, description.output_shape,
+                       description.index_vector_length)));
+
+      // If the index_vector_length is 1, we can vectorize the indices read.
+      int64_t index_elem_type_bits = primitive_util::BitWidth(
+          description.scatter->scatter_indices()->shape().element_type());
+      int64_t max_vectorized_indices =
+          kMaxVectorizedBits / index_elem_type_bits;
+      if (description.index_vector_length == 1 &&
+          num_indices_per_warp > max_vectorized_indices) {
+        // Pad num_indices_per_warp to the next multiple of
+        // max_vectorized_indices.
+        num_indices_per_warp =
+            CeilOfRatio(num_indices_per_warp, max_vectorized_indices) *
+            max_vectorized_indices;
+        indices_vector_size = max_vectorized_indices;
+      }
+    }
     return std::make_unique<ScatterWithDistributedIndices>(
-        analysis, description, vector_size,
-        /*num_warps_per_slice=*/1, /*num_indices_per_warp=*/1);
+        analysis, description, vector_size, num_warps_per_slice,
+        num_indices_per_warp, indices_vector_size);
   }
   // Otherwise, we distribute the linearized updates tensor.
   vector_size =
```
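A worked example of the new sizing logic, assuming 32-bit scatter indices and taking `kMaxVectorizedBits` = 128 (the constant's actual value lives elsewhere in this file; 128 is an assumption here): `max_vectorized_indices` = 4, so an estimate of 10 indices per warp is padded up to 12 and the indices vector size becomes 4.

```cpp
#include <cstdint>
#include <cstdio>

int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  const int64_t kMaxVectorizedBits = 128;   // assumed value, for illustration
  const int64_t index_elem_type_bits = 32;  // e.g. S32 scatter indices
  const int64_t max_vectorized_indices =
      kMaxVectorizedBits / index_elem_type_bits;  // 4

  int64_t num_indices_per_warp = 10;  // from the sorted-scatter estimate
  int64_t indices_vector_size = 1;
  if (num_indices_per_warp > max_vectorized_indices) {
    // Pad to the next multiple of max_vectorized_indices: 10 -> 12.
    num_indices_per_warp =
        CeilOfRatio(num_indices_per_warp, max_vectorized_indices) *
        max_vectorized_indices;
    indices_vector_size = max_vectorized_indices;
  }
  std::printf("num_indices_per_warp=%lld indices_vector_size=%lld\n",
              static_cast<long long>(num_indices_per_warp),
              static_cast<long long>(indices_vector_size));
}
```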

xla/backends/gpu/codegen/emitters/scatter.h

Lines changed: 12 additions & 6 deletions
```diff
@@ -147,26 +147,29 @@ class ScatterWithDistributedUpdates : public ScatterFusion {
   %acc = vector<num_iter x vector_size>
 
   // #indices_map
-  %updated_accumulator, %updated_out = for %i = 0 to %num_indices_per_warp_ {
-    %new_indices = PadWithZeros(ExtractOffsets(%indices_operand, %i))
+  %updated_accumulator, %updated_out
+     = for %i = 0 to %num_indices_per_warp_ step %indices_vector_size_ {
+    for %j = 0 to %indices_vector_size_ step 1 {
+      %index = %i * %indices_vector_size_ + %j + %index_start(bl_x, th_x)
+      %new_indices = PadWithZeros(ExtractOffsets(%indices_operand, %index))
       %indices_changed = EmitInequalityCheck(%new_indices, %indices)
-      if (%indices_changed && %i != 0) {
+      if (%indices_changed && %index != 0) {
         %output_tensor = WriteAccumulatorToOutput(%current_acc, %current_out);
       }
       if (%indices_changed) {
         %inbounds = EmitBoundsCheck(%new_indices, %slice_shape, %output_shape)
       }
       if (%inbounds) {
         if (%indices_changed) {
-          // updates_map(%i)
+          // updates_map(%index)
           for %j = 0 to %num_slice_iterations_per_warp step 1 {
             for %k = 0 to %vector_size step 1 {
               %update_elem = GetUpdateElement
               %acc = %update_elem
             }
           }
         } else {
-          // updates_map(%i)
+          // updates_map(%index)
           for %j = 0 to %num_slice_iterations_per_warp step 1 {
             for %k = 0 to %vector_size step 1 {
               %update_elem = GetUpdateElement
@@ -184,7 +187,8 @@ class ScatterWithDistributedIndices : public ScatterFusion {
                                 const ScatterDescription& description,
                                 int64_t vector_size,
                                 int64_t num_warps_per_slice,
-                                int64_t num_indices_per_warp);
+                                int64_t num_indices_per_warp,
+                                int64_t indices_vector_size);
 
  protected:
   void ComputeIndexing(mlir::MLIRContext* ctx, IndexingMap* updates_map,
@@ -206,6 +210,8 @@ class ScatterWithDistributedIndices : public ScatterFusion {
   // The number of indices that every warp iterates over. This is a useful
   // setting, if we know that the indices tensor is sorted.
   int64_t num_indices_per_warp_;
+  // Vector size for the indices operand.
+  int64_t indices_vector_size_;
 };
 
 std::unique_ptr<ScatterFusion> CreateScatterFusion(
```
