
Commit be9504c

mooskagh authored and Google-ML-Automation committed
[XLA:GPU] Require packed dot operands to be packed along contracting dimension.
For now, only do that if `--xla_gpu_experimental_pack_dot_operands_along_k_dimension` is set.

PiperOrigin-RevId: 715355925
1 parent 34acbf0 commit be9504c
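
The flag defaults to off, so the new layout constraint only takes effect when the debug option is set. A minimal sketch of enabling it programmatically on an HloModule (mirroring what the new tests below do; `module` stands in for an existing module):

    // Sketch only: flip the experimental option on a module's debug options.
    // The setter is the one registered by this commit.
    DebugOptions debug_options = module->config().debug_options();
    debug_options.set_xla_gpu_experimental_pack_dot_operands_along_k_dimension(
        true);
    module->mutable_config().set_debug_options(debug_options);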

File tree

7 files changed: +164 -19 lines changed


xla/debug_options_flags.cc

Lines changed: 8 additions & 0 deletions
@@ -323,6 +323,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_pjrt_allow_auto_layout_in_hlo(false);
   opts.set_xla_gpu_enable_scatter_determinism_expander(true);
   opts.set_xla_gpu_unsupported_enable_ragged_all_to_all_decomposer(false);
+  opts.set_xla_gpu_experimental_pack_dot_operands_along_k_dimension(false);
   return opts;
 }
 
@@ -2230,6 +2231,13 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "Enable windowed einsum rewrite for all-to-all+gemm pattern, "
       "This optimization slices the all-to-all into smaller all-to-alls."
       "It is an experimental feature."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_experimental_pack_dot_operands_along_k_dimension",
+      bool_setter_for(
+          &DebugOptions::
+              set_xla_gpu_experimental_pack_dot_operands_along_k_dimension),
+      debug_options->xla_gpu_experimental_pack_dot_operands_along_k_dimension(),
+      "For sub-byte dot operands, layout them along contracting dimensions."));
 }  // NOLINT(readability/fn_size)
 
 // Allocates flag_values and flag_objects; this function must not be called more

xla/service/gpu/fusions/triton/triton_fusion_emitter_device_legacy_test.cc

Lines changed: 13 additions & 5 deletions
@@ -4136,18 +4136,22 @@ HloModule m
 ENTRY e {
   parameter_0 = bf16[32,4,36]{2,1,0} parameter(0)
   parameter_1 = bf16[40,4,36]{2,1,0} parameter(1)
-  ROOT dot.16450 = bf16[4,32,40]{2,1,0} dot(parameter_0, parameter_1), lhs_batch_dims={1}, lhs_contracting_dims={2}, rhs_batch_dims={1}, rhs_contracting_dims={2}
+  ROOT dot.16450 = bf16[4,32,40]{2,1,0} dot(parameter_0, parameter_1),
+    lhs_batch_dims={1}, lhs_contracting_dims={2},
+    rhs_batch_dims={1}, rhs_contracting_dims={2}
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           GetOptimizedModule(kHloText));
 
+  // The contracting dims were already minor, so the layout is unchanged
+  // (non-major batch dims are fine).
   EXPECT_THAT(module->entry_computation()
                   ->root_instruction()
                   ->fused_instructions_computation()
                   ->root_instruction(),
-              GmockMatch(m::Dot(m::Op().WithShape(BF16, {32, 4, 36}, {2, 0, 1}),
-                                m::Op().WithShape(BF16, {40, 4, 36}, {2, 0, 1}))
+              GmockMatch(m::Dot(m::Op().WithShape(BF16, {32, 4, 36}, {2, 1, 0}),
+                                m::Op().WithShape(BF16, {40, 4, 36}, {2, 1, 0}))
                              .WithShape(BF16, {4, 32, 40}, {2, 1, 0})));
 }
 
@@ -4161,18 +4165,22 @@ HloModule m
 ENTRY e {
   parameter_1 = bf16[16,16,48]{2,1,0} parameter(1)
   parameter_2 = bf16[16,48,32]{2,1,0} parameter(0)
-  ROOT dot.16125 = bf16[16,16,32]{2,1,0} dot(parameter_1, parameter_2), lhs_batch_dims={1}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+  ROOT dot.16125 = bf16[16,16,32]{2,1,0} dot(parameter_1, parameter_2),
+    lhs_batch_dims={1}, lhs_contracting_dims={2},
+    rhs_batch_dims={0}, rhs_contracting_dims={1}
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           GetOptimizedModule(kHloText));
 
+  // lhs already has minor contracting dims, so its layout is unchanged.
+  // rhs changes layout to have minor contracting dims.
   EXPECT_THAT(
       module->entry_computation()
           ->root_instruction()
           ->fused_instructions_computation()
           ->root_instruction(),
-      GmockMatch(m::Dot(m::Op().WithShape(BF16, {16, 16, 48}, {2, 0, 1}),
+      GmockMatch(m::Dot(m::Op().WithShape(BF16, {16, 16, 48}, {2, 1, 0}),
                         m::Op().WithShape(BF16, {16, 48, 32}, {1, 2, 0}))
                      .WithShape(BF16, {16, 16, 32}, {2, 1, 0})));
 }
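
As a reminder of the layout notation used in the matchers above, minor_to_major lists logical dimensions from fastest-varying to slowest, so {2, 1, 0} keeps dim 2 innermost. A small illustrative snippet (not part of the change; standard ShapeUtil API assumed):

    // Illustrative only: the lhs shape from the first test with its expected
    // layout. Dim 2 is the contracting dim and is the minor-most physical
    // dim, so no relayout of the operand is needed.
    Shape lhs = ShapeUtil::MakeShapeWithDenseLayout(
        BF16, {32, 4, 36}, /*minor_to_major=*/{2, 1, 0});
    CHECK_EQ(lhs.layout().minor_to_major(0), 2);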

xla/service/gpu/transforms/BUILD

Lines changed: 2 additions & 0 deletions
@@ -2167,6 +2167,7 @@ cc_library(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:dnn",
         "//xla/tsl/util:env_var",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
 
@@ -2188,6 +2189,7 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
         "//xla/hlo/testlib:filecheck",
+        "//xla/hlo/testlib:pattern_matcher_gmock",
         "//xla/service:computation_layout",
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",

xla/service/gpu/transforms/layout_assignment.cc

Lines changed: 63 additions & 12 deletions
@@ -24,6 +24,7 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 
@@ -329,13 +330,21 @@ bool DotCanSupportShapeWithLayout(const HloInstruction* dot,
       .ok();
 }
 
+bool IsPackedInstruction(const HloInstruction* instruction) {
+  return primitive_util::IsSubByteNonPredType(
+             instruction->shape().element_type()) ||
+         (instruction->opcode() == HloOpcode::kConvert &&
+          primitive_util::IsSubByteNonPredType(
+              instruction->operand(0)->shape().element_type()));
+}
+
 }  // namespace
 
 absl::Status GpuLayoutAssignment::AddDotBackendConstraints(
     LayoutConstraints* constraints, HloDotInstruction* instruction) {
   struct Side {
     size_t operand_no;
-    const Shape* shape;
+    const HloInstruction* operand;
     absl::Span<const int64_t> batch_dims;
     absl::Span<const int64_t> contracting_dims;
     PrimitiveType type;
 
@@ -344,12 +353,13 @@ absl::Status GpuLayoutAssignment::AddDotBackendConstraints(
   auto make_side =
       [&](size_t operand_no, absl::Span<const int64_t> batch_dims,
           absl::Span<const int64_t> contracting_dims) -> absl::StatusOr<Side> {
-    Side side = {operand_no, &instruction->operand(operand_no)->shape(),
-                 batch_dims, contracting_dims};
-    side.type = side.shape->element_type();
-    TF_ASSIGN_OR_RETURN(side.non_contracting_dims,
-                        GetNonContractingDims(*side.shape, side.batch_dims,
-                                              side.contracting_dims));
+    Side side = {operand_no, instruction->operand(operand_no), batch_dims,
+                 contracting_dims};
+    side.type = side.operand->shape().element_type();
+    TF_ASSIGN_OR_RETURN(
+        side.non_contracting_dims,
+        GetNonContractingDims(side.operand->shape(), side.batch_dims,
+                              side.contracting_dims));
     return side;
   };
   const DotDimensionNumbers& dot_dims = instruction->dot_dimension_numbers();
 
@@ -372,6 +382,11 @@ absl::Status GpuLayoutAssignment::AddDotBackendConstraints(
           ->config()
           .debug_options()
           .xla_gpu_ensure_minor_dot_contraction_dims();
+  const bool pack_along_contracting_dims =
+      instruction->GetModule()
+          ->config()
+          .debug_options()
+          .xla_gpu_experimental_pack_dot_operands_along_k_dimension();
 
   const bool is_bf16_to_bf16 =
       (output_type == PrimitiveType::BF16 && lhs.type == PrimitiveType::BF16 &&
 
@@ -388,11 +403,11 @@ absl::Status GpuLayoutAssignment::AddDotBackendConstraints(
       is_s8_to_s32 || is_fp8_to_fp8;
 
   for (const Side& side : {lhs, rhs}) {
-    if (both_operands_require_minor_contraction_dims) {
-      TF_RETURN_IF_ERROR(SetOperandMajorToMinorLayout(
-          instruction, side.operand_no,
-          /*dim_groups=*/
-          {side.batch_dims, side.non_contracting_dims, side.contracting_dims}));
+    if ((IsPackedInstruction(side.operand) && pack_along_contracting_dims) ||
+        both_operands_require_minor_contraction_dims) {
+      TF_RETURN_IF_ERROR(SetDotOperandLayoutToMinorContracting(
+          instruction, side.operand_no, side.batch_dims, side.contracting_dims,
+          side.non_contracting_dims));
     } else if (!side.batch_dims.empty() || side.contracting_dims.size() > 1 ||
                side.non_contracting_dims.size() > 1) {
       TF_RETURN_IF_ERROR(SetDotOperandLayout(
 
@@ -571,6 +586,42 @@ absl::Status GpuLayoutAssignment::SetDotOperandLayout(
       /*dim_groups=*/{batch_dims, row_dims, col_dims});
 }
 
+absl::Status GpuLayoutAssignment::SetDotOperandLayoutToMinorContracting(
+    const HloInstruction* instruction, int64_t operand,
+    absl::Span<const int64_t> batch_dims,
+    absl::Span<const int64_t> contracting_dims,
+    absl::Span<const int64_t> noncontracting_dims) {
+  Shape shape = instruction->operand(operand)->shape();
+
+  if (shape.has_layout() &&
+      shape.layout().minor_to_major_size() >= contracting_dims.size()) {
+    // Check that the contracting dimensions are physically minor, i.e. check
+    // that minor physical dimensions all point to contracting logical
+    // dimensions.
+    bool contracting_dims_are_minor = true;
+    const auto& minor_to_major = shape.layout().minor_to_major();
+    for (int64_t i = 0; i < contracting_dims.size(); ++i) {
+      if (!absl::c_linear_search(contracting_dims, minor_to_major[i])) {
+        contracting_dims_are_minor = false;
+        break;
+      }
+    }
+
+    // If contracting dims are already minor, and the layout is valid, keep it.
+    if (contracting_dims_are_minor &&
+        MatrixLayout::For(shape, batch_dims, noncontracting_dims,
+                          contracting_dims)
+            .ok()) {
+      // Re-set the operand layout, so it becomes mandatory.
+      return SetOperandLayout(shape, instruction, operand);
+    }
+  }
+  return SetOperandMajorToMinorLayout(
+      instruction, operand,
+      /*dim_groups=*/
+      {batch_dims, noncontracting_dims, contracting_dims});
+}
+
 absl::Status GpuLayoutAssignment::SetOperandMajorToMinorLayout(
     const HloInstruction* instruction, int64_t operand,
     std::initializer_list<absl::Span<const int64_t>> dim_groups) {
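
In effect, the new helper does one of two things for a packed operand: if the operand already carries a layout whose minor physical dims are the contracting dims and MatrixLayout::For accepts it, that layout is re-set as mandatory; otherwise the operand is forced to major-to-minor order over {batch, noncontracting, contracting}. A worked illustration for the s4 case exercised by the tests below (sketch only, standard ShapeUtil API assumed):

    // For an s4[5120,128] operand with contracting_dims={0}, the forced
    // grouping {batch, noncontracting, contracting} = {{}, {1}, {0}} gives
    // minor_to_major = {0, 1}: the contracting (K) dim becomes minor-most,
    // so the packed sub-byte values are contiguous along K.
    Shape forced = ShapeUtil::MakeShapeWithDenseLayout(
        S4, {5120, 128}, /*minor_to_major=*/{0, 1});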

xla/service/gpu/transforms/layout_assignment.h

Lines changed: 6 additions & 0 deletions
@@ -65,6 +65,12 @@ class GpuLayoutAssignment : public LayoutAssignment {
                                    absl::Span<const int64_t> row_dims,
                                    absl::Span<const int64_t> col_dims);
 
+  absl::Status SetDotOperandLayoutToMinorContracting(
+      const HloInstruction* instruction, int64_t operand,
+      absl::Span<const int64_t> batch_dims,
+      absl::Span<const int64_t> contracting_dims,
+      absl::Span<const int64_t> noncontracting_dims);
+
   absl::Status SetDotLayout(const HloInstruction* instruction,
                             LayoutConstraints* constraints);
 
xla/service/gpu/transforms/layout_assignment_test.cc

Lines changed: 68 additions & 1 deletion
@@ -27,12 +27,12 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/hlo/testlib/filecheck.h"
+#include "xla/hlo/testlib/pattern_matcher_gmock.h"
 #include "xla/layout.h"
 #include "xla/layout_util.h"
 #include "xla/service/computation_layout.h"
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/service/pattern_matcher.h"
-#include "xla/service/pattern_matcher_gmock.h"
 #include "xla/shape.h"
 #include "xla/shape_layout.h"
 #include "xla/shape_util.h"
 
@@ -770,6 +770,73 @@ TEST_F(LayoutAssignmentTest, AutoLayoutE4M3ContractingMinorFirst) {
                   .WithShape(F32, {128, 10240}, {1, 0})));
 }
 
+TEST_F(LayoutAssignmentTest, AutoLayoutS4DotContractingMinorLhs) {
+  const char* hlo = R"(
+HloModule AutoLayoutS4DotContractingMinorLhs
+
+ENTRY main {
+  p0 = s4[5120,128] parameter(0)
+  p0.c = bf16[5120,128] convert(p0)
+  p1 = bf16[5120,10240] parameter(1)
+  ROOT dot = bf16[128,10240] dot(p0.c, p1), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> m,
+      ParseAndReturnUnverifiedModule(
+          hlo, {}, HloParserOptions().set_fill_missing_layouts(false)));
+  DebugOptions debug_options = m->config().debug_options();
+  debug_options.set_xla_gpu_experimental_pack_dot_operands_along_k_dimension(
+      true);
+  m->mutable_config().set_debug_options(debug_options);
+  ComputationLayout computation_layout(
+      m->entry_computation()->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, GetGpuComputeCapability(), GetDnnVersion(),
+      GetDeviceDescription());
+  EXPECT_THAT(layout_assignment.Run(m.get()), IsOkAndHolds(true));
+  EXPECT_THAT(m->entry_computation()->parameter_instruction(0),
+              GmockMatch(m::Parameter(0).WithShape(S4, {5120, 128}, {0, 1})));
+  EXPECT_THAT(
+      m->entry_computation()->parameter_instruction(1),
+      GmockMatch(m::Parameter(1).WithShape(BF16, {5120, 10240}, {1, 0})));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot().WithShape(BF16, {128, 10240}, {1, 0})));
+}
+
+TEST_F(LayoutAssignmentTest, AutoLayoutS4DotContractingMinorRhs) {
+  const char* hlo = R"(
+HloModule AutoLayoutS4DotContractingMinorRhs
+
+ENTRY main {
+  p0 = bf16[5120,128] parameter(0)
+  p1 = s4[5120,10240] parameter(1)
+  p1.c = bf16[5120,10240] convert(p1)
+  ROOT dot = bf16[128,10240] dot(p0, p1.c), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> m,
+      ParseAndReturnUnverifiedModule(
+          hlo, {}, HloParserOptions().set_fill_missing_layouts(false)));
+  DebugOptions debug_options = m->config().debug_options();
+  debug_options.set_xla_gpu_experimental_pack_dot_operands_along_k_dimension(
+      true);
+  m->mutable_config().set_debug_options(debug_options);
+  ComputationLayout computation_layout(
+      m->entry_computation()->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, GetGpuComputeCapability(), GetDnnVersion(),
+      GetDeviceDescription());
+  EXPECT_THAT(layout_assignment.Run(m.get()), IsOkAndHolds(true));
+  EXPECT_THAT(m->entry_computation()->parameter_instruction(0),
+              GmockMatch(m::Parameter(0).WithShape(BF16, {5120, 128}, {1, 0})));
+  EXPECT_THAT(m->entry_computation()->parameter_instruction(1),
+              GmockMatch(m::Parameter(1).WithShape(S4, {5120, 10240}, {0, 1})));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Dot().WithShape(BF16, {128, 10240}, {1, 0})));
+}
+
 TEST_F(LayoutAssignmentTest, VariadicReduceSameOperandLayout) {
   const char* module_str = R"(
 HloModule variadic_reduce
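
One detail worth noting in these tests: the dot consumes a bf16 convert of the s4 parameter, and IsPackedInstruction also treats a convert whose input is sub-byte as packed, which is why the expected {0, 1} layout shows up on the s4 parameter itself rather than stopping at the bf16 convert. A tiny illustrative check of the type predicate involved (sketch only):

    // S4 is a sub-byte, non-PRED type, so operands of (or converts from) S4
    // are eligible for the packing constraint; BF16 on its own is not.
    CHECK(primitive_util::IsSubByteNonPredType(S4));
    CHECK(!primitive_util::IsSubByteNonPredType(BF16));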

xla/xla.proto

Lines changed: 4 additions & 1 deletion
@@ -1113,7 +1113,10 @@ message DebugOptions {
   // xla_gpu_multi_streamed_windowed_einsum is set to true.
   bool xla_gpu_experimental_enable_alltoall_windowed_einsum = 360;
 
-  // Next id: 362
+  // For sub-byte dot operands, layout them along contracting dimensions.
+  bool xla_gpu_experimental_pack_dot_operands_along_k_dimension = 362;
+
+  // Next id: 363
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
