Commit c01b39d

Fix convolution fp16 performance drop on gfx12xx (#403)
* Remove hardcoded convolution NHWC layout assignment for fp16 precision.

* PR openxla#32773: [ROCm] Fix convolution fp16 performance drop on gfx11xx, gfx12xx

Imported from GitHub PR openxla#32773

📝 Summary of Changes
Remove hardcoded NHWC convolution layout for fp16 precision.

🎯 Justification
Performance drops for fp16 precision on gfx11xx and gfx12xx GPUs were observed internally, as well as by the [community](jax-ml/jax#30548).

🚀 Kind of Contribution
🐛 Bug Fix

📊 Benchmark
A community member provided the [script](jax-ml/jax#30548 (comment)) with which the profiling can be done.

Significant performance improvement for fp16 on gfx12xx (after the fix):

```
Running on: rocm:0
Testing float32... Avg time: 0.092307 s, Throughput: 1.68 TFLOP/s
Testing float16... Avg time: 0.011742 s, Throughput: 13.17 TFLOP/s
Testing bfloat16... Avg time: 0.011989 s, Throughput: 12.90 TFLOP/s
```

Results of the profiling before the fix:

```
Running on: rocm:0
Testing float32... Avg time: 0.092312 s, Throughput: 1.67 TFLOP/s
Testing float16... Avg time: 0.775142 s, Throughput: 0.20 TFLOP/s
Testing bfloat16... Avg time: 0.011990 s, Throughput: 12.90 TFLOP/s
```

@xla-rotation can you please review this PR?

Copybara import of the project:

--
c9fdba7 by Aleksa Arsic <[email protected]>:

Remove hardcoded convolution NHWC layout assignment for fp16 precision.

--
69660d1 by Aleksa Arsic <[email protected]>:

Add unit tests for ROCm layout assignment.

Merging this change closes openxla#32773

COPYBARA_INTEGRATE_REVIEW=openxla#32773 from ROCm:ci_fix-hardcoded-NHWC-conv-layout-for-fp16 69660d1
PiperOrigin-RevId: 822022522
1 parent: bae387e

File tree: 3 files changed (+172, -8 lines)
xla/service/gpu/transforms/layout_assignment.cc

Lines changed: 5 additions & 7 deletions

```diff
@@ -161,14 +161,9 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
     }
   }

-  const auto* rocm_compute_capability =
-      std::get_if<se::RocmComputeCapability>(&gpu_version);
-  if (rocm_compute_capability && input_ty == F16) return kAllNHWC;
-
-  // If we're not Volta or not fp16/bfloat16, or not conv2D, the decision is
-  // easy: Use NCHW.
   const bool isFloat16 = (input_ty == F16) || (input_ty == BF16);
   if (std::holds_alternative<se::CudaComputeCapability>(gpu_version)) {
+    // CUDA:
     // If we're not Volta or not fp16/bfloat16, or not conv2D, the decision is
     // easy: Use NCHW.
     const auto* cuda_compute_capability =
@@ -181,6 +176,9 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
       return kAllNCHW;
     }
   } else if (std::holds_alternative<se::RocmComputeCapability>(gpu_version)) {
+    // ROCm:
+    // If we do not have NHWC layout support or not fp16/bfloat16, or not
+    // conv2D, or ROCm NHWC is disabled the decision is to use NCHW.
     bool is_enabled = false;
     TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_USE_ROCM_NHWC",
                                         /*default_val=*/false, &is_enabled));
@@ -195,7 +193,7 @@ HeuristicLayoutAssignment(const HloInstruction* instr,

   VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();

-  // For other Volta f16 convolutions, use NHWC.
+  // For other f16 convolutions, use NHWC.
   return kAllNHWC;
 }
```
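For orientation, here is a minimal, self-contained sketch of the decision flow after this change. It is not the XLA source: the signature, the enums, and the `RocmNhwcEnabled()` / `gpu_supports_nhwc` names are stand-ins invented for illustration; only the `TF_USE_ROCM_NHWC` variable (default `false`) and the NCHW/NHWC outcomes come from the hunks above.

```cpp
// Minimal sketch (not the verbatim XLA source) of the layout heuristic
// after this patch; conv2D and DNN-version checks are elided.
#include <cstdlib>
#include <cstring>
#include <iostream>

enum class Layout { kAllNCHW, kAllNHWC };
enum class DType { kF16, kBF16, kF32 };
enum class Vendor { kCuda, kRocm };

// Stand-in for tsl::ReadBoolFromEnvVar("TF_USE_ROCM_NHWC",
//                                      /*default_val=*/false, ...).
bool RocmNhwcEnabled() {
  const char* v = std::getenv("TF_USE_ROCM_NHWC");
  return v != nullptr && std::strcmp(v, "true") == 0;
}

Layout HeuristicLayout(Vendor vendor, DType ty, bool gpu_supports_nhwc) {
  const bool is_float16 = ty == DType::kF16 || ty == DType::kBF16;
  if (vendor == Vendor::kCuda) {
    // CUDA: not fp16/bf16 (or pre-Volta, elided here) -> NCHW.
    if (!is_float16) return Layout::kAllNCHW;
  } else {
    // ROCm: the old unconditional "fp16 -> NHWC" fast path is gone.
    // NHWC now requires fp16/bf16, NHWC layout support, and the opt-in
    // TF_USE_ROCM_NHWC env var; everything else falls back to NCHW.
    if (!is_float16 || !gpu_supports_nhwc || !RocmNhwcEnabled()) {
      return Layout::kAllNCHW;
    }
  }
  // For other f16 convolutions, use NHWC.
  return Layout::kAllNHWC;
}

int main() {
  // With TF_USE_ROCM_NHWC unset (the default), an fp16 convolution on a
  // gfx12xx (RDNA) part now picks NCHW instead of being forced to NHWC.
  Layout l = HeuristicLayout(Vendor::kRocm, DType::kF16,
                             /*gpu_supports_nhwc=*/true);
  std::cout << (l == Layout::kAllNCHW ? "NCHW" : "NHWC") << "\n";
}
```

The removed fast path is the core of the bug: fp16 short-circuited to NHWC before the support and opt-in checks ever ran, which is what produced the slow fp16 path on gfx11xx/gfx12xx.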

xla/service/gpu/transforms/layout_assignment_test.cc

Lines changed: 158 additions & 0 deletions

```diff
@@ -61,6 +61,10 @@ class LayoutAssignmentTest : public HloTestBase {
     return backend().default_stream_executor()->GetDeviceDescription();
   }

+  se::RocmComputeCapability GetRocmComputeCapability() {
+    return GetDeviceDescription().rocm_compute_capability();
+  }
+
   se::CudaComputeCapability GetCudaComputeCapability() {
     return GetDeviceDescription().cuda_compute_capability();
   }
@@ -580,6 +584,160 @@ ENTRY entry {
   EXPECT_EQ(output_layout, LayoutUtil::GetDefaultLayoutForR3());
 }

+TEST_F(LayoutAssignmentTest, FP16ROCmConvolutionHasNCHWLayoutRDNA) {
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f16[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f16[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestRDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {1,2,3,0} for both inputs and for the output, therefore, in order to
+  // get to the desired NCHW_OIHW->NCHW layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{1,2,3,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+}
+
+TEST_F(LayoutAssignmentTest, FP32ROCmConvolutionHasNCHWLayoutRDNA) {
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f32[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f32[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestRDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {1,2,3,0} for both inputs and for the output, therefore, in order to
+  // get to the desired NCHW_OIHW->NCHW layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{1,2,3,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+}
+
+TEST_F(LayoutAssignmentTest, FP16ROCmConvolutionHasNHWCLayoutCDNA) {
+  // Enable ROCm NHWC for this test
+  setenv("TF_USE_ROCM_NHWC", "true", 1);
+
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f16[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f16[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestCDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {3,1,2,0} (transpose the middle dimensions) for both inputs and for the
+  // output, therefore, in order to get to the desired NHWC_OHWI->NHWC layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{3,1,2,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{3,1,2,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{3,1,2,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+
+  // Clean up after the test
+  unsetenv("TF_USE_ROCM_NHWC");
+}
+
+TEST_F(LayoutAssignmentTest, FP32ROCmConvolutionHasNCHWLayoutCDNA) {
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f32[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f32[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestCDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {1,2,3,0} for both inputs and for the output, therefore, in order to
+  // get to the desired NCHW_OIHW->NCHW layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{1,2,3,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+}
+
 TEST_F(LayoutAssignmentTest, CuDNNConvolutionHasNHWCLayoutPostHopper) {
   const char* hlo = R"(
   ENTRY entry {
```

xla/stream_executor/device_description.h

Lines changed: 9 additions & 1 deletion

```diff
@@ -56,6 +56,14 @@ class RocmComputeCapability {

   RocmComputeCapability() = default;

+  static RocmComputeCapability EarliestCDNASupport() {
+    return RocmComputeCapability{"gfx908"};
+  }
+
+  static RocmComputeCapability EarliestRDNASupport() {
+    return RocmComputeCapability{"gfx1030"};
+  }
+
   std::string gcn_arch_name() const { return gcn_arch_name_; }

   std::string ToString() const { return gcn_arch_name(); }
@@ -92,7 +100,7 @@ class RocmComputeCapability {
       "gfx1030",  // RX68xx / RX69xx
       "gfx1100",  // RX7900
       "gfx1101",  // RX7700 / RX7800
-      "gfx1103", "gfx1150", "gfx1151", "gfx1200", "gfx1201",
+      "gfx1103", "gfx1150", "gfx1151", "gfx1200", "gfx1201"
   };

   bool is_supported_gfx_version() const {
```
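The two new named constructors give tests a single place that encodes "earliest supported CDNA" (gfx908) and "earliest supported RDNA" (gfx1030), instead of hard-coding a gcn arch string at every call site. Below is a simplified, self-contained mirror of the pattern; the real `RocmComputeCapability` carries more state and methods than shown here.

```cpp
// Simplified mirror of the factory helpers added above (illustration only).
#include <iostream>
#include <string>

class RocmComputeCapability {
 public:
  RocmComputeCapability() = default;
  explicit RocmComputeCapability(std::string gcn_arch_name)
      : gcn_arch_name_(std::move(gcn_arch_name)) {}

  // Named constructors: tests that want "any CDNA" or "any RDNA" behavior
  // pin the earliest supported arch of each family in one place.
  static RocmComputeCapability EarliestCDNASupport() {
    return RocmComputeCapability{"gfx908"};
  }
  static RocmComputeCapability EarliestRDNASupport() {
    return RocmComputeCapability{"gfx1030"};
  }

  std::string gcn_arch_name() const { return gcn_arch_name_; }

 private:
  std::string gcn_arch_name_;
};

int main() {
  std::cout << RocmComputeCapability::EarliestCDNASupport().gcn_arch_name()
            << "\n";  // gfx908
  std::cout << RocmComputeCapability::EarliestRDNASupport().gcn_arch_name()
            << "\n";  // gfx1030
}
```

If the support window ever moves, only these two factories need updating; every test that uses them follows automatically.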
