Skip to content

Commit 42a164f

Browse files
thomasjoerg and Google-ML-Automation
authored and committed
[XLA:GPU] Use CUB RadixSort for bf16 sorts in Numpy order (NaNs go last).
The support is limited to bf16. Generalizing this to other dtypes is straightforward and will follow in a separate change.

PiperOrigin-RevId: 702237308
1 parent d88f7d5 commit 42a164f

File tree

4 files changed

+288
-21
lines changed

4 files changed

+288
-21
lines changed

xla/service/gpu/tests/gpu_cub_sort_test.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,39 @@ ENTRY main {
9292
EXPECT_TRUE(RunAndCompare(std::move(hlo_module), ErrorSpec{0, 0}));
9393
}
9494

95+
// Sorting bf16 keys with a JAX-style "Numpy order" comparator: NaNs (of either
// sign) go last and -0/+0 compare equal. The comparator first canonicalizes
// -0 -> +0 and (-)NaN -> NaN via selects, then compares with
// direction=GT, type=TOTALORDER.
TEST_F(CubSortKeysTest, CompareToReferenceNumpyOrderGt) {
  constexpr char kHlo[] = R"(
numpy_order_comparator {
  lhs = bf16[] parameter(0)
  lhs_is_nan = pred[] compare(lhs, lhs), direction=NE
  c_nan = bf16[] constant(nan)
  c_zero = bf16[] constant(0)
  lhs_is_zero = pred[] compare(lhs, c_zero), direction=EQ
  lhs_no_neg_zero = bf16[] select(lhs_is_zero, c_zero, lhs)
  lhs_no_neg_zero_or_nan = bf16[] select(lhs_is_nan, c_nan, lhs_no_neg_zero)
  rhs = bf16[] parameter(1)
  rhs_is_nan = pred[] compare(rhs, rhs), direction=NE
  rhs_is_zero = pred[] compare(rhs, c_zero), direction=EQ
  rhs_no_neg_zero = bf16[] select(rhs_is_zero, c_zero, rhs)
  rhs_no_neg_zero_or_nan = bf16[] select(rhs_is_nan, c_nan, rhs_no_neg_zero)
  ROOT compare.20017 = pred[] compare(lhs_no_neg_zero_or_nan, rhs_no_neg_zero_or_nan), direction=GT, type=TOTALORDER
}

ENTRY main {
  p = bf16[8] parameter(0)
  nans_and_zeros = bf16[8] constant({nan, -nan, nan, -nan, 0.0, -0.0, 0.0, -0.0})
  values = bf16[16] concatenate(p, nans_and_zeros), dimensions={0}
  ROOT sort = bf16[16] sort(values), dimensions={0}, is_stable=true, to_apply=numpy_order_comparator
})";
  // The pipeline must rewrite this sort into a CUB sort custom call.
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> optimized_hlo_module,
                          GetOptimizedModule(kHlo));
  EXPECT_TRUE(HloWasRewrittenToUseCubSort(*optimized_hlo_module));

  // Compare against the reference with zero tolerance.
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
                          ParseAndReturnVerifiedModule(kHlo));
  EXPECT_TRUE(RunAndCompare(std::move(hlo_module), ErrorSpec{0, 0}));
}
127+
95128
// This test verifies an issue where sort was launched on the wrong stream,
96129
// causing subtle timing bugs: b/347239322.
97130
TEST_P(CubSortKeysTest, SortWithSlice) {

xla/service/gpu/transforms/BUILD

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2846,12 +2846,13 @@ cc_library(
28462846
visibility = ["//xla/service/gpu:__subpackages__"] + if_google(["//learning/brain/engprod/xwatch:__subpackages__"]),
28472847
deps = [
28482848
"//xla:comparison_util",
2849+
"//xla:literal_util",
28492850
"//xla:shape_util",
28502851
"//xla:util",
28512852
"//xla:xla_data_proto_cc",
28522853
"//xla/hlo/ir:hlo",
28532854
"//xla/hlo/pass:hlo_pass",
2854-
"//xla/hlo/transforms:stable_sort_expander",
2855+
"//xla/service:pattern_matcher",
28552856
"//xla/service/gpu:cublas_cudnn",
28562857
"//xla/service/gpu/runtime:cub_sort_thunk",
28572858
"@com_google_absl//absl/container:flat_hash_set",

xla/service/gpu/transforms/sort_rewriter.cc

Lines changed: 201 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License.
1616
#include "xla/service/gpu/transforms/sort_rewriter.h"
1717

1818
#include <algorithm>
19+
#include <cstddef>
1920
#include <cstdint>
2021
#include <memory>
2122
#include <optional>
@@ -31,8 +32,11 @@ limitations under the License.
3132
#include "xla/hlo/ir/hlo_instruction.h"
3233
#include "xla/hlo/ir/hlo_instructions.h"
3334
#include "xla/hlo/ir/hlo_module.h"
35+
#include "xla/hlo/ir/hlo_opcode.h"
36+
#include "xla/literal_util.h"
3437
#include "xla/service/gpu/cublas_cudnn.h"
3538
#include "xla/service/gpu/runtime/cub_sort_thunk.h"
39+
#include "xla/service/pattern_matcher.h"
3640
#include "xla/shape.h"
3741
#include "xla/shape_util.h"
3842
#include "xla/util.h"
@@ -45,13 +49,75 @@ namespace xla {
4549
namespace gpu {
4650
namespace {
4751

52+
namespace m = match;
53+
54+
// Floating point numbers can be sorted in two ways:
// * Default order (aka total order):
//   -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN.
// * Numpy sorts NaNs last, even when negative:
//   -Inf < -Finite < +/-0 < +Finite < +Inf < +/-NaN.
// Note that negative and positive zeros are considered equal and appear in
// the result in the same order as they appear in the input. The same applies
// to negative and positive NaNs.
enum class SortOrderType {
  kDefaultOrder,  // IEEE total order, produced by a plain two-parameter compare.
  kNumpyOrder,    // NaNs last; -0/+0 (and -NaN/+NaN) treated as equal.
};
66+
4867
// Analyze sort comparer function.
struct SortComputationAnalysis {
  int key_operand;  // 0 or 1
  bool descending;
  // Whether the comparator implements the default (total) order or the Numpy
  // order (NaNs last); see SortOrderType above.
  SortOrderType sort_order;
  // Element type of the keys handed to CUB. For Numpy order this is U16
  // (synthetic radix-sortable keys), not the original operand type; see
  // AnalyzeSortOp.
  PrimitiveType key_type;
  // Element type of the values; only set when sorting (key, value) pairs.
  std::optional<PrimitiveType> value_type;
};
5375

54-
std::pair<int64_t, int64_t> ParametersFromCmpOperands(
76+
bool MatchConstNan(const HloInstruction* op) {
77+
const auto const_nan = DynCast<HloConstantInstruction>(op);
78+
if (const_nan == nullptr) {
79+
return false;
80+
}
81+
return const_nan->literal().GetAsString({}) == "nan";
82+
}
83+
84+
// Matches the HLO pattern used to ensure Numpy sort order. This is how JAX
85+
// lowers `lax.sort` to HLO comparators.
86+
int ParamNumberOfCanonicalizedZerosAndNans(const HloInstruction* select) {
87+
const HloInstruction* param = nullptr;
88+
const HloInstruction* maybe_const_nan;
89+
if (!Match(select,
90+
m::Select(
91+
m::Compare(m::Parameter(&param), m::Parameter(&param))
92+
.WithComparisonDirection(ComparisonDirection::kNe),
93+
m::Constant(&maybe_const_nan),
94+
m::Select(
95+
m::Compare(m::Parameter(&param),
96+
m::ConstantEffectiveScalar(0))
97+
.WithComparisonDirection(ComparisonDirection::kEq),
98+
m::ConstantEffectiveScalar(0), m::Parameter(&param))))) {
99+
return -1;
100+
}
101+
if (!MatchConstNan(maybe_const_nan)) {
102+
return -1;
103+
}
104+
return param->parameter_number();
105+
}
106+
107+
// Returns numbers of the parameters used in a comparator for Numpy sort order.
108+
std::pair<int64_t, int64_t> ParamNumberOfNumpySortComparator(
109+
const HloCompareInstruction* cmp_op) {
110+
const HloInstruction *select0, *select1;
111+
if (!Match(cmp_op, m::Compare(m::Op(&select0), m::Op(&select1)))) {
112+
return std::pair<int64_t, int64_t>(-1, -1);
113+
}
114+
return std::pair<int64_t, int64_t>(
115+
ParamNumberOfCanonicalizedZerosAndNans(select0),
116+
ParamNumberOfCanonicalizedZerosAndNans(select1));
117+
}
118+
119+
// Returns numbers of the parameters used in a simple comparator.
120+
std::pair<int64_t, int64_t> ParamNumberOfSimpleSortComparator(
55121
const HloCompareInstruction* cmp_op) {
56122
if (cmp_op == nullptr) {
57123
return std::pair<int64_t, int64_t>(-1, -1);
@@ -79,10 +145,25 @@ std::optional<SortComputationAnalysis> AnalyzeCompareOp(
79145
return std::nullopt;
80146
}
81147

82-
// Compare should operate on the function parameters for a single tensor.
83-
auto [index0, index1] = ParametersFromCmpOperands(compare);
84-
if (index0 == -1 || index1 == -1) {
85-
return std::nullopt;
148+
// Determine the sort order and the parameters used in the comparator.
149+
SortOrderType sort_order;
150+
int64_t index0, index1;
151+
auto [simple_sort_index0, simple_sort_index1] =
152+
ParamNumberOfSimpleSortComparator(compare);
153+
if (simple_sort_index0 != -1 && simple_sort_index1 != -1) {
154+
sort_order = SortOrderType::kDefaultOrder;
155+
index0 = simple_sort_index0;
156+
index1 = simple_sort_index1;
157+
} else {
158+
auto [numpy_sort_index0, numpy_sort_index1] =
159+
ParamNumberOfNumpySortComparator(compare);
160+
if (numpy_sort_index0 != -1 && numpy_sort_index1 != -1) {
161+
sort_order = SortOrderType::kNumpyOrder;
162+
index0 = numpy_sort_index0;
163+
index1 = numpy_sort_index1;
164+
} else {
165+
return std::nullopt;
166+
}
86167
}
87168

88169
// When sorting a pair of tensors, the parameters should be adjacent.
@@ -95,27 +176,54 @@ std::optional<SortComputationAnalysis> AnalyzeCompareOp(
95176
bool descending = compare->direction() == ComparisonDirection::kGt ||
96177
compare->direction() == ComparisonDirection::kGe;
97178
bool reverse = first_index != index0;
98-
return SortComputationAnalysis{first_index / 2, descending != reverse};
179+
return SortComputationAnalysis{first_index / 2, descending != reverse,
180+
sort_order};
99181
}
100182

101183
// Analyzes the sort op's comparator and, on success, returns the complete
// analysis including the key/value element types to hand to CUB. Returns
// std::nullopt when the comparator is not a supported pattern or the
// operand configuration is unsupported.
std::optional<SortComputationAnalysis> AnalyzeSortOp(
    const HloSortInstruction& sort_op) {
  auto computation = sort_op.called_computations().front();

  auto sort_analysis = AnalyzeCompareOp(computation->root_instruction());
  if (!sort_analysis.has_value()) {
    return std::nullopt;
  }

  PrimitiveType sort_key_type =
      sort_op.operand(sort_analysis->key_operand)->shape().element_type();
  // Sort values are only present if sorting a pair of tensors.
  std::optional<PrimitiveType> sort_value_type;
  if (sort_op.operand_count() == 2) {
    // The value operand of the sort op is either 0 or 1, the opposite of the
    // key operand.
    int value_index = 1 - sort_analysis->key_operand;
    sort_value_type = sort_op.operand(value_index)->shape().element_type();
  }
  // For sorting in Numpy order, synthetic keys are materialized. The synthetic
  // keys and the original values are sorted as pairs.
  if (sort_analysis->sort_order == SortOrderType::kNumpyOrder) {
    // TODO(tjoerg): Add support for dtypes besides bf16.
    if (sort_key_type != BF16) {
      return std::nullopt;
    }
    // Sorting a pair of input tensors is not supported. The keys to sort on
    // will be generated synthetically.
    if (sort_op.operand_count() != 1) {
      return std::nullopt;
    }
    // The synthetic keys are u16 bit patterns; the original bf16 input is
    // carried along as the value half of the pair.
    sort_key_type = U16;
    sort_value_type = BF16;
  }
  return SortComputationAnalysis{
      sort_analysis->key_operand, sort_analysis->descending,
      sort_analysis->sort_order, sort_key_type, sort_value_type};
}
108221

109222
// Create runner for CUB sort operation.
// Fails when no CUB kernel was compiled for the analyzed key/value type
// combination.
absl::StatusOr<std::unique_ptr<CubSortRunnerInterface>> CreateRunner(
    const SortComputationAnalysis& sort_analysis) {
  return CubSortRunnerInterface::Create(sort_analysis.key_type,
                                        sort_analysis.value_type);
}
120228

121229
// Restore the result shape after sorting a pair of tensors.
@@ -131,6 +239,65 @@ HloInstruction* UnpackResultPair(HloSortInstruction* sort_op,
131239
return sort_op->AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
132240
}
133241

242+
// Add HLO ops to materialize sort keys for Numpy sort order from the sort op's
243+
// operand.
244+
HloInstruction* AddNumpySortKey(HloInstruction* operand) {
245+
Shape value_shape = operand->shape();
246+
Shape key_shape = ShapeUtil::ChangeElementType(value_shape, U16);
247+
Shape pred_shape = ShapeUtil::ChangeElementType(value_shape, PRED);
248+
// Canonicalize zeros, i.e. replace -0 with +0.
249+
HloInstruction* const_zero = operand->AddInstruction(
250+
HloInstruction::CreateConstant(LiteralUtil::Zero(BF16)));
251+
HloInstruction* broadcasted_zero = operand->AddInstruction(
252+
HloInstruction::CreateBroadcast(value_shape, const_zero, {}));
253+
HloInstruction* is_zero =
254+
operand->AddInstruction(HloInstruction::CreateCompare(
255+
pred_shape, operand, broadcasted_zero, ComparisonDirection::kEq));
256+
HloInstruction* canonicalized_zeros =
257+
operand->AddInstruction(HloInstruction::CreateTernary(
258+
value_shape, HloOpcode::kSelect, is_zero, broadcasted_zero, operand));
259+
// Canonicalize NaNs, i.e. replace -NaN with NaN.
260+
HloInstruction* const_nan = operand->AddInstruction(
261+
HloInstruction::CreateConstant(LiteralUtil::NanValue(BF16).value()));
262+
HloInstruction* broadcasted_nan = operand->AddInstruction(
263+
HloInstruction::CreateBroadcast(value_shape, const_nan, {}));
264+
// Only NaNs are not equal to themselves.
265+
HloInstruction* is_nan =
266+
operand->AddInstruction(HloInstruction::CreateCompare(
267+
pred_shape, operand, operand, ComparisonDirection::kNe));
268+
HloInstruction* canonicalized_nans = operand->AddInstruction(
269+
HloInstruction::CreateTernary(value_shape, HloOpcode::kSelect, is_nan,
270+
broadcasted_nan, canonicalized_zeros));
271+
// To convert the input values into a radix-sortable bitwise representation,
272+
// the following transformations take place prior to sorting:
273+
// * For positive floating point values, the sign bit is inverted.
274+
// * For negative floating point values, the full key is inverted.
275+
HloInstruction* is_negative =
276+
operand->AddInstruction(HloInstruction::CreateCompare(
277+
pred_shape, canonicalized_nans, broadcasted_zero,
278+
ComparisonDirection::kLt));
279+
HloInstruction* bitcast_convert = operand->AddInstruction(
280+
HloInstruction::CreateBitcastConvert(key_shape, canonicalized_nans));
281+
HloInstruction* constant_8000 = operand->AddInstruction(
282+
HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint16_t>(32768)));
283+
HloInstruction* broadcasted_8000 = operand->AddInstruction(
284+
HloInstruction::CreateBroadcast(key_shape, constant_8000, {}));
285+
HloInstruction* inverted_sign =
286+
operand->AddInstruction(HloInstruction::CreateBinary(
287+
key_shape, HloOpcode::kXor, broadcasted_8000, bitcast_convert));
288+
HloInstruction* constant_ffff = operand->AddInstruction(
289+
HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint16_t>(65535)));
290+
HloInstruction* broadcasted_ffff = operand->AddInstruction(
291+
HloInstruction::CreateBroadcast(key_shape, constant_ffff, {}));
292+
HloInstruction* inverted_bits =
293+
operand->AddInstruction(HloInstruction::CreateBinary(
294+
key_shape, HloOpcode::kXor, broadcasted_ffff, bitcast_convert));
295+
HloInstruction* sort_keys = operand->AddInstruction(
296+
HloInstruction::CreateTernary(key_shape, HloOpcode::kSelect, is_negative,
297+
inverted_bits, inverted_sign));
298+
return sort_keys;
299+
}
300+
134301
} // namespace
135302

136303
// Rewrites a single sort instruction with a custom call.
@@ -144,7 +311,7 @@ absl::StatusOr<bool> SortRewriter::RunOnInstruction(
144311
int64_t batch_size = Product(operand_shape.dimensions()) /
145312
operand_shape.dimensions(sort_op->sort_dimension());
146313

147-
TF_ASSIGN_OR_RETURN(auto runner, CreateRunner(sort_op, sort_analysis));
314+
TF_ASSIGN_OR_RETURN(auto runner, CreateRunner(sort_analysis));
148315
TF_ASSIGN_OR_RETURN(
149316
int64_t scratch_size,
150317
runner->GetScratchSize(Product(operand_shape.dimensions()), batch_size));
@@ -156,12 +323,22 @@ absl::StatusOr<bool> SortRewriter::RunOnInstruction(
156323
}
157324

158325
// Values are only present if sorting a pair of tensors.
159-
HloInstruction* keys = sort_op->mutable_operand(sort_analysis.key_operand);
326+
HloInstruction* keys;
160327
HloInstruction* values = nullptr;
328+
bool sorting_pairs = sort_op->operand_count() == 2;
329+
330+
keys = sort_op->mutable_operand(sort_analysis.key_operand);
161331
int value_index = 1 - sort_analysis.key_operand;
162-
if (sort_op->operand_count() == 2) {
332+
if (sorting_pairs) {
163333
values = sort_op->mutable_operand(value_index);
164334
}
335+
// For sorting in Numpy order, materialize synthetic keys and treat the
336+
// original input as values.
337+
if (sort_analysis.sort_order == SortOrderType::kNumpyOrder) {
338+
sorting_pairs = true;
339+
keys = AddNumpySortKey(sort_op->mutable_operand(sort_analysis.key_operand));
340+
values = sort_op->mutable_operand(sort_analysis.key_operand);
341+
}
165342

166343
// Build the resulting shape for the custom call.
167344
std::vector<Shape> shapes{keys->shape()};
@@ -184,10 +361,14 @@ absl::StatusOr<bool> SortRewriter::RunOnInstruction(
184361

185362
// Build the replacement instruction.
186363
HloInstruction* replacement;
187-
if (sort_op->operand_count() == 1) {
364+
if (!sorting_pairs) {
188365
replacement =
189366
sort_op->parent()->AddInstruction(HloInstruction::CreateGetTupleElement(
190367
sort_op->shape(), custom_call, 0));
368+
} else if (sort_analysis.sort_order == SortOrderType::kNumpyOrder) {
369+
// Discard the synthetic keys generated for sorting in Numpy order.
370+
replacement = sort_op->AddInstruction(
371+
HloInstruction::CreateGetTupleElement(values->shape(), custom_call, 1));
191372
} else {
192373
replacement = UnpackResultPair(sort_op, custom_call,
193374
/*swap=*/sort_analysis.key_operand == 1);
@@ -254,7 +435,7 @@ bool IsCubCompatibleSort(const HloSortInstruction* sort_op) {
254435
VLOG(2) << "Only simple compare computations are supported";
255436
return false;
256437
}
257-
if (!CreateRunner(sort_op, *sort_analysis).ok()) {
438+
if (!CreateRunner(*sort_analysis).ok()) {
258439
VLOG(2) << "Unsupported operand types (no compiled CUB kernels)";
259440
return false;
260441
}

0 commit comments

Comments
 (0)