Commit c01b39d

Fix convolution fp16 performance drop on gfx12xx (#403)
* Remove hardcoded convolution NHWC layout assignment for fp16 precision.

* PR openxla#32773: [ROCm] Fix convolution fp16 performance drop on gfx11xx, gfx12xx

Imported from GitHub PR openxla#32773

📝 Summary of Changes
Remove hardcoded NHWC convolution layout for fp16 precision.

🎯 Justification
Performance drops for fp16 precision on gfx11xx and gfx12xx GPUs were observed internally, as well as by the [community](jax-ml/jax#30548).

🚀 Kind of Contribution
🐛 Bug Fix

📊 Benchmark
A community member provided the [script](jax-ml/jax#30548 (comment)) with which the profiling can be done.

Significant performance improvement for fp16 on gfx12xx (after the fix):

```
Running on: rocm:0
Testing float32... Avg time: 0.092307 s, Throughput: 1.68 TFLOP/s
Testing float16... Avg time: 0.011742 s, Throughput: 13.17 TFLOP/s
Testing bfloat16... Avg time: 0.011989 s, Throughput: 12.90 TFLOP/s
```

Results of the profiling before the fix:

```
Running on: rocm:0
Testing float32... Avg time: 0.092312 s, Throughput: 1.67 TFLOP/s
Testing float16... Avg time: 0.775142 s, Throughput: 0.20 TFLOP/s
Testing bfloat16... Avg time: 0.011990 s, Throughput: 12.90 TFLOP/s
```

@xla-rotation can you please review this PR?

Copybara import of the project:

--
c9fdba7 by Aleksa Arsic <[email protected]>:

Remove hardcoded convolution NHWC layout assignment for fp16 precision.

--
69660d1 by Aleksa Arsic <[email protected]>:

Add unit tests for ROCm layout assignment.

Merging this change closes openxla#32773

COPYBARA_INTEGRATE_REVIEW=openxla#32773 from ROCm:ci_fix-hardcoded-NHWC-conv-layout-for-fp16 69660d1
PiperOrigin-RevId: 822022522
1 parent: bae387e

File tree: 3 files changed (+172, -8 lines)
xla/service/gpu/transforms/layout_assignment.cc

Lines changed: 5 additions & 7 deletions

```diff
@@ -161,14 +161,9 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
     }
   }

-  const auto* rocm_compute_capability =
-      std::get_if<se::RocmComputeCapability>(&gpu_version);
-  if (rocm_compute_capability && input_ty == F16) return kAllNHWC;
-
-  // If we're not Volta or not fp16/bfloat16, or not conv2D, the decision is
-  // easy: Use NCHW.
   const bool isFloat16 = (input_ty == F16) || (input_ty == BF16);
   if (std::holds_alternative<se::CudaComputeCapability>(gpu_version)) {
+    // CUDA:
     // If we're not Volta or not fp16/bfloat16, or not conv2D, the decision is
     // easy: Use NCHW.
     const auto* cuda_compute_capability =
@@ -181,6 +176,9 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
       return kAllNCHW;
     }
   } else if (std::holds_alternative<se::RocmComputeCapability>(gpu_version)) {
+    // ROCm:
+    // If we do not have NHWC layout support or not fp16/bfloat16, or not
+    // conv2D, or ROCm NHWC is disabled the decision is to use NCHW.
     bool is_enabled = false;
     TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_USE_ROCM_NHWC",
                                         /*default_val=*/false, &is_enabled));
@@ -195,7 +193,7 @@ HeuristicLayoutAssignment(const HloInstruction* instr,

   VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();

-  // For other Volta f16 convolutions, use NHWC.
+  // For other f16 convolutions, use NHWC.
   return kAllNHWC;
 }
```
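For orientation, here is a minimal, self-contained sketch of the decision flow after this change. It is not the XLA source: the signature, the enums, and the `RocmNhwcEnabled()` / `gpu_supports_nhwc` names are stand-ins invented for illustration; only the `TF_USE_ROCM_NHWC` variable (default `false`) and the NCHW/NHWC outcomes come from the hunks above.

```cpp
// Minimal sketch (not the verbatim XLA source) of the layout heuristic
// after this patch; conv2D and DNN-version checks are elided.
#include <cstdlib>
#include <cstring>
#include <iostream>

enum class Layout { kAllNCHW, kAllNHWC };
enum class DType { kF16, kBF16, kF32 };
enum class Vendor { kCuda, kRocm };

// Stand-in for tsl::ReadBoolFromEnvVar("TF_USE_ROCM_NHWC",
//                                      /*default_val=*/false, ...).
bool RocmNhwcEnabled() {
  const char* v = std::getenv("TF_USE_ROCM_NHWC");
  return v != nullptr && std::strcmp(v, "true") == 0;
}

Layout HeuristicLayout(Vendor vendor, DType ty, bool gpu_supports_nhwc) {
  const bool is_float16 = ty == DType::kF16 || ty == DType::kBF16;
  if (vendor == Vendor::kCuda) {
    // CUDA: not fp16/bf16 (or pre-Volta, elided here) -> NCHW.
    if (!is_float16) return Layout::kAllNCHW;
  } else {
    // ROCm: the old unconditional "fp16 -> NHWC" fast path is gone.
    // NHWC now requires fp16/bf16, NHWC layout support, and the opt-in
    // TF_USE_ROCM_NHWC env var; everything else falls back to NCHW.
    if (!is_float16 || !gpu_supports_nhwc || !RocmNhwcEnabled()) {
      return Layout::kAllNCHW;
    }
  }
  // For other f16 convolutions, use NHWC.
  return Layout::kAllNHWC;
}

int main() {
  // With TF_USE_ROCM_NHWC unset (the default), an fp16 convolution on a
  // gfx12xx (RDNA) part now picks NCHW instead of being forced to NHWC.
  Layout l = HeuristicLayout(Vendor::kRocm, DType::kF16,
                             /*gpu_supports_nhwc=*/true);
  std::cout << (l == Layout::kAllNCHW ? "NCHW" : "NHWC") << "\n";
}
```

The removed fast path is the core of the bug: fp16 short-circuited to NHWC before the support and opt-in checks ever ran, which is what produced the slow fp16 path on gfx11xx/gfx12xx.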

xla/service/gpu/transforms/layout_assignment_test.cc

Lines changed: 158 additions & 0 deletions

```diff
@@ -61,6 +61,10 @@ class LayoutAssignmentTest : public HloTestBase {
     return backend().default_stream_executor()->GetDeviceDescription();
   }

+  se::RocmComputeCapability GetRocmComputeCapability() {
+    return GetDeviceDescription().rocm_compute_capability();
+  }
+
   se::CudaComputeCapability GetCudaComputeCapability() {
     return GetDeviceDescription().cuda_compute_capability();
   }
@@ -580,6 +584,160 @@ ENTRY entry {
   EXPECT_EQ(output_layout, LayoutUtil::GetDefaultLayoutForR3());
 }

+TEST_F(LayoutAssignmentTest, FP16ROCmConvolutionHasNCHWLayoutRDNA) {
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f16[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f16[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestRDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {1,2,3,0} for both inputs and for the output, therefore, in order to
+  // get to the desired NCHW_OIHW->NCHW layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{1,2,3,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+}
+
+TEST_F(LayoutAssignmentTest, FP32ROCmConvolutionHasNCHWLayoutRDNA) {
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f32[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f32[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestRDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {1,2,3,0} for both inputs and for the output, therefore, in order to
+  // get to the desired NCHW_OIHW->NCHW layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{1,2,3,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+}
+
+TEST_F(LayoutAssignmentTest, FP16ROCmConvolutionHasNHWCLayoutCDNA) {
+  // Enable ROCm NHWC for this test
+  setenv("TF_USE_ROCM_NHWC", "true", 1);
+
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f16[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f16[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestCDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {3,1,2,0} (transpose the middle dimensions) for both inputs and for the
+  // output, therefore, in order to get to the desired NHWC_OHWI->NHWC layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{3,1,2,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{3,1,2,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{3,1,2,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+
+  // Clean up after the test
+  unsetenv("TF_USE_ROCM_NHWC");
+}
+
+TEST_F(LayoutAssignmentTest, FP32ROCmConvolutionHasNCHWLayoutCDNA) {
+  const char* hlo = R"(
+  ENTRY entry {
+    p0 = f32[2,64,64,16]{3,2,1,0} parameter(0)
+    p1 = f32[6,16,3,32]{3,2,1,0} parameter(1)
+    ROOT conv = (f64[2,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call(p0, p1),
+        window={size=3x3 pad=1_1x1_1}, dim_labels=b10f_o10i->b10f,
+        custom_call_target="__cudnn$convForward"
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      hlo_module->entry_computation()->ComputeProgramShape());
+
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, se::RocmComputeCapability::EarliestCDNASupport(),
+      GetDnnVersion(), GetDeviceDescription());
+
+  EXPECT_THAT(layout_assignment.Run(hlo_module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  // We start from b10f_o10i->b10f, meaning that the inputs start out as
+  // NWHC_OWHI->NWHC. Layout assignment should yield layouts of the form
+  // {1,2,3,0} for both inputs and for the output, therefore, in order to
+  // get to the desired NCHW_OIHW->NCHW layout.
+  EXPECT_THAT(
+      RunFileCheck(hlo_module->ToString(HloPrintOptions::ShortParsable()), R"(
+  // CHECK-DAG: [[P0:[^ ]+]] = {{.*}} parameter(0)
+  // CHECK-DAG: [[P1:[^ ]+]] = {{.*}} parameter(1)
+  // CHECK-DAG: [[COPY_P0:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P0]])
+  // CHECK-DAG: [[COPY_P1:[^ ]+]] = {{.*}}{1,2,3,0} copy([[P1]])
+  // CHECK: [[CONV:[^ ]+]] = {{.*}}{1,2,3,0}, {{.*}} custom-call([[COPY_P0]], [[COPY_P1]])
+  )"),
+      absl_testing::IsOkAndHolds(true));
+}
+
 TEST_F(LayoutAssignmentTest, CuDNNConvolutionHasNHWCLayoutPostHopper) {
   const char* hlo = R"(
   ENTRY entry {
```

xla/stream_executor/device_description.h

Lines changed: 9 additions & 1 deletion

```diff
@@ -56,6 +56,14 @@ class RocmComputeCapability {

   RocmComputeCapability() = default;

+  static RocmComputeCapability EarliestCDNASupport() {
+    return RocmComputeCapability{"gfx908"};
+  }
+
+  static RocmComputeCapability EarliestRDNASupport() {
+    return RocmComputeCapability{"gfx1030"};
+  }
+
   std::string gcn_arch_name() const { return gcn_arch_name_; }

   std::string ToString() const { return gcn_arch_name(); }
@@ -92,7 +100,7 @@ class RocmComputeCapability {
       "gfx1030",  // RX68xx / RX69xx
       "gfx1100",  // RX7900
       "gfx1101",  // RX7700 / RX7800
-      "gfx1103", "gfx1150", "gfx1151", "gfx1200", "gfx1201",
+      "gfx1103", "gfx1150", "gfx1151", "gfx1200", "gfx1201"
   };

   bool is_supported_gfx_version() const {
```
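The two new named constructors give tests a single place that encodes "earliest supported CDNA" (gfx908) and "earliest supported RDNA" (gfx1030), instead of hard-coding a gcn arch string at every call site. Below is a simplified, self-contained mirror of the pattern; the real `RocmComputeCapability` carries more state and methods than shown here.

```cpp
// Simplified mirror of the factory helpers added above (illustration only).
#include <iostream>
#include <string>

class RocmComputeCapability {
 public:
  RocmComputeCapability() = default;
  explicit RocmComputeCapability(std::string gcn_arch_name)
      : gcn_arch_name_(std::move(gcn_arch_name)) {}

  // Named constructors: tests that want "any CDNA" or "any RDNA" behavior
  // pin the earliest supported arch of each family in one place.
  static RocmComputeCapability EarliestCDNASupport() {
    return RocmComputeCapability{"gfx908"};
  }
  static RocmComputeCapability EarliestRDNASupport() {
    return RocmComputeCapability{"gfx1030"};
  }

  std::string gcn_arch_name() const { return gcn_arch_name_; }

 private:
  std::string gcn_arch_name_;
};

int main() {
  std::cout << RocmComputeCapability::EarliestCDNASupport().gcn_arch_name()
            << "\n";  // gfx908
  std::cout << RocmComputeCapability::EarliestRDNASupport().gcn_arch_name()
            << "\n";  // gfx1030
}
```

If the support window ever moves, only these two factories need updating; every test that uses them follows automatically.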
