Commit bb77caf

[GPU][DT] Update data layout strategy for pingpong ukernels (#21957)
As a follow-up to #21914 and #21919, this specializes data layout selection for the newly added pingpong ukernels. It also adds end-to-end data-tiling matmul tests with ukernels enabled.

Signed-off-by: Yu-Zhewen <[email protected]>
1 parent af6366a commit bb77caf

File tree

3 files changed: +292 −8 lines changed


compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.cpp

Lines changed: 28 additions & 6 deletions
@@ -496,15 +496,37 @@ Attribute TensorUKernelProviderAttr::getDataLayoutForUKernel(
     return {};
   }
   SmallVector<Type> types = encodingAttr.getElementTypesArray();
+  SmallVector<int64_t> iterationSizes = encodingAttr.getIterationSizesArray();
+  if (types.size() != 3 || iterationSizes.size() != 3) {
+    return {};
+  }
+  // Match the layouts based on UKernels implementation:
+  // https://github.com/iree-org/iree/tree/main/compiler/plugins/target/ROCM/builtins/mlir_ukernel
   Type f16 = Float16Type::get(encoding.getContext());
   Type f32 = Float32Type::get(encoding.getContext());
-  if (types.size() != 3 || types[0] != f16 || types[1] != f16 ||
-      types[2] != f32) {
-    return {};
+  Type f8E4M3FNUZ = Float8E4M3FNUZType::get(encoding.getContext());
+  if (types[0] == f16 && types[1] == f16 && types[2] == f32) {
+    // UKernel: pingpong_dt_large_f16.
+    return IREE::GPU::DataTiledMMAAttr::get(
+        encoding.getContext(), IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16,
+        8, 2, 4, 4, 1);
+  }
+  if (types[0] == f8E4M3FNUZ && types[1] == f8E4M3FNUZ && types[2] == f32) {
+    /// TODO(#21865): Remove the upper bound (8192) once the scratch memory
+    /// issue is resolved.
+    if (iterationSizes[1] >= 2048 && iterationSizes[1] <= 8192) {
+      // UKernel: pingpong_dt_large_f8E4M3FNUZ.
+      return IREE::GPU::DataTiledMMAAttr::get(
+          encoding.getContext(),
+          IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ, 8, 2, 4, 4, 1);
+    } else {
+      // UKernel: pingpong_dt_medium_f8E4M3FNUZ.
+      return IREE::GPU::DataTiledMMAAttr::get(
+          encoding.getContext(),
+          IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ, 8, 1, 2, 8, 2);
+    }
   }
-  return IREE::GPU::DataTiledMMAAttr::get(
-      encoding.getContext(), IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16, 8,
-      2, 4, 4, 1);
+  return {};
 }
 
 //===----------------------------------------------------------------------===//
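
To make the new selection policy easier to follow, here is a minimal, self-contained C++ sketch of the branching the hunk above implements. The enum, struct, and function names below are illustrative stand-ins, not the real IREE API; the actual code returns IREE::GPU::DataTiledMMAAttr values rather than the simplified PingpongLayout struct used here.

// Sketch of the pingpong ukernel layout selection (stand-in types only).
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

enum class ElemType { F16, F8E4M3FNUZ, F32, Other };

struct PingpongLayout {
  std::string intrinsic;  // MFMA intrinsic name.
  int intrinsicsM, subgroupsM, intrinsicsN, subgroupsN, intrinsicsK;
};

// `types` is [lhs, rhs, acc]; `iterationSizes` is [M, N, K]. In this sketch a
// dynamic dimension is represented by a negative sentinel, so a dynamic N
// fails the range check and falls back to the medium f8 layout.
std::optional<PingpongLayout>
selectPingpongLayout(const std::vector<ElemType> &types,
                     const std::vector<int64_t> &iterationSizes) {
  if (types.size() != 3 || iterationSizes.size() != 3)
    return std::nullopt;
  if (types[0] == ElemType::F16 && types[1] == ElemType::F16 &&
      types[2] == ElemType::F32) {
    // UKernel: pingpong_dt_large_f16.
    return PingpongLayout{"MFMA_F32_16x16x16_F16", 8, 2, 4, 4, 1};
  }
  if (types[0] == ElemType::F8E4M3FNUZ && types[1] == ElemType::F8E4M3FNUZ &&
      types[2] == ElemType::F32) {
    // TODO(#21865): the 8192 upper bound works around a scratch-memory issue.
    if (iterationSizes[1] >= 2048 && iterationSizes[1] <= 8192) {
      // UKernel: pingpong_dt_large_f8E4M3FNUZ.
      return PingpongLayout{"MFMA_F32_16x16x32_F8E4M3FNUZ", 8, 2, 4, 4, 1};
    }
    // UKernel: pingpong_dt_medium_f8E4M3FNUZ.
    return PingpongLayout{"MFMA_F32_16x16x32_F8E4M3FNUZ", 8, 1, 2, 8, 2};
  }
  return std::nullopt;  // No matching pingpong ukernel; caller falls back.
}

In short, f16/f16/f32 always gets the large f16 layout, while f8E4M3FNUZ picks between the large and medium variants based on the static N size, with the medium variant trading subgroup distribution along M for more unrolling along N and K.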

compiler/plugins/target/ROCM/test/materialize_encoding_ukernel_gfx942.mlir

Lines changed: 164 additions & 2 deletions
@@ -45,8 +45,9 @@
   #hal.pipeline.binding<storage_buffer>
 ]>
 
-func.func @matmul_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+func.func @matmul_f16_f16_f32_large_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
   %c0 = arith.constant 0 : index
+  // M, N, K are dynamic.
   %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
   %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
   %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index
@@ -75,7 +76,168 @@ func.func @matmul_lowering_ukernel_provider() attributes {hal.executable.target
       -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
   return
 }
-// CHECK-LABEL: matmul_lowering_ukernel_provider
+// CHECK-LABEL: matmul_f16_f16_f32_large_lowering_ukernel_provider
 // CHECK: iree_codegen.inner_tiled
 // CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
 // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
+
+// -----
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
+  abi = "hip",
+  iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>,
+  iree_codegen.target_info = #iree_gpu.target<
+    arch = "gfx942",
+    features = "",
+    wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8,
+      storage = b64|b32|b16|b8,
+      subgroup = shuffle|arithmetic,
+      dot = dp4xi8toi32,
+      mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>,
+        <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>,
+        <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>,
+        <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>,
+        <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>,
+        <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>,
+        <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>
+      ],
+      subgroup_size_choices = [64],
+      max_workgroup_sizes = [1024, 1024, 1024],
+      max_thread_count_per_workgroup = 1024,
+      max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+      max_load_instruction_bits = 128,
+      simds_per_wgp = 4,
+      vgpr_space_bits = 16384>
+  >,
+  iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider,
+  ukernels = "none"
+}>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#pipeline_layout_3 = #hal.pipeline.layout<constants = 3, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+
+func.func @matmul_f8_f8_f32_medium_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c0 = arith.constant 0 : index
+  // M, N, K are dynamic.
+  %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
+  %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
+  %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_rhs>>{%K, %N}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+      -> tensor<?x?xf8E4M3FNUZ, #encoding_lhs>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_rhs>>{%K, %N}
+      -> tensor<?x?xf8E4M3FNUZ, #encoding_rhs>
+  %5 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+      -> tensor<?x?xf32, #encoding_result>
+  %6 = linalg.matmul
+      ins(%3, %4 : tensor<?x?xf8E4M3FNUZ, #encoding_lhs>,
+                   tensor<?x?xf8E4M3FNUZ, #encoding_rhs>)
+      outs(%5 : tensor<?x?xf32, #encoding_result>)
+      -> tensor<?x?xf32, #encoding_result>
+  iree_tensor_ext.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+      : tensor<?x?xf32, #encoding_result>
+      -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+  return
+}
+// CHECK-LABEL: matmul_f8_f8_f32_medium_lowering_ukernel_provider
+// CHECK: iree_codegen.inner_tiled
+// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
+// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, intrinsics_n = 2, subgroups_n = 8, intrinsics_k = 2>
+
+// -----
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
+  abi = "hip",
+  iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>,
+  iree_codegen.target_info = #iree_gpu.target<
+    arch = "gfx942",
+    features = "",
+    wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8,
+      storage = b64|b32|b16|b8,
+      subgroup = shuffle|arithmetic,
+      dot = dp4xi8toi32,
+      mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>,
+        <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>,
+        <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>,
+        <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>,
+        <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>,
+        <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>,
+        <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>
+      ],
+      subgroup_size_choices = [64],
+      max_workgroup_sizes = [1024, 1024, 1024],
+      max_thread_count_per_workgroup = 1024,
+      max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+      max_load_instruction_bits = 128,
+      simds_per_wgp = 4,
+      vgpr_space_bits = 16384>
+  >,
+  iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider,
+  ukernels = "none"
+}>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, 2048, ?]>
+#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, 2048, ?]>
+#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f8E4M3FNUZ, f8E4M3FNUZ, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, 2048, ?]>
+#pipeline_layout_3 = #hal.pipeline.layout<constants = 2, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+
+func.func @matmul_f8_f8_f32_large_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c0 = arith.constant 0 : index
+  // M, K are dynamic, and N is static as 2048.
+  %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
+  %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>>{%K}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x2048xf32, #encoding_result>>{%M}
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf8E4M3FNUZ, #encoding_lhs>>{%M, %K}
+      -> tensor<?x?xf8E4M3FNUZ, #encoding_lhs>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, 2048], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>>{%K}
+      -> tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>
+  %5 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, 2048], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x2048xf32, #encoding_result>>{%M}
+      -> tensor<?x2048xf32, #encoding_result>
+  %6 = linalg.matmul
+      ins(%3, %4 : tensor<?x?xf8E4M3FNUZ, #encoding_lhs>,
+                   tensor<?x2048xf8E4M3FNUZ, #encoding_rhs>)
+      outs(%5 : tensor<?x2048xf32, #encoding_result>)
+      -> tensor<?x2048xf32, #encoding_result>
+  iree_tensor_ext.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, 2048], strides = [1, 1]
+      : tensor<?x2048xf32, #encoding_result>
+      -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x2048xf32, #encoding_result>>{%M}
+  return
+}
+// CHECK-LABEL: matmul_f8_f8_f32_large_lowering_ukernel_provider
+// CHECK: iree_codegen.inner_tiled
+// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
+// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x32_F8E4M3FNUZ, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
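To see what the three CHECK'd layouts mean in practice, here is a small, hypothetical C++ back-of-the-envelope calculation of the per-workgroup tile each one implies. It assumes (this is an assumption, not something the diff states) that every dimension of the intrinsic shape is scaled by its corresponding intrinsics_* and subgroups_* factors.

// Hypothetical tile-shape calculation for the data_tiled_mma_layout attrs
// checked above, under the scaling assumption described in the lead-in.
#include <array>
#include <cstdio>

struct MmaLayout {
  std::array<int, 3> intrinsicMNK;  // Intrinsic shape, e.g. {16, 16, 32}.
  int intrinsicsM, subgroupsM, intrinsicsN, subgroupsN, intrinsicsK;
};

std::array<int, 3> tileShape(const MmaLayout &l) {
  return {l.intrinsicMNK[0] * l.intrinsicsM * l.subgroupsM,
          l.intrinsicMNK[1] * l.intrinsicsN * l.subgroupsN,
          l.intrinsicMNK[2] * l.intrinsicsK};
}

int main() {
  // pingpong_dt_large_f16: MFMA_F32_16x16x16_F16 with 8, 2, 4, 4, 1
  // -> 256x256x16 under the stated assumption.
  auto f16Large = tileShape({{16, 16, 16}, 8, 2, 4, 4, 1});
  // pingpong_dt_medium_f8E4M3FNUZ: MFMA_F32_16x16x32_F8E4M3FNUZ with
  // 8, 1, 2, 8, 2 -> 128x256x64.
  auto f8Medium = tileShape({{16, 16, 32}, 8, 1, 2, 8, 2});
  // pingpong_dt_large_f8E4M3FNUZ: MFMA_F32_16x16x32_F8E4M3FNUZ with
  // 8, 2, 4, 4, 1 -> 256x256x32.
  auto f8Large = tileShape({{16, 16, 32}, 8, 2, 4, 4, 1});
  std::printf("f16 large: %dx%dx%d, f8 medium: %dx%dx%d, f8 large: %dx%dx%d\n",
              f16Large[0], f16Large[1], f16Large[2], f8Medium[0], f8Medium[1],
              f8Medium[2], f8Large[0], f8Large[1], f8Large[2]);
  return 0;
}

Read this way, the medium f8 variant halves the M tile and doubles the K unrolling relative to the large variant, which matches its use for the fully dynamic N case tested above.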

tests/e2e/matmul/CMakeLists.txt

Lines changed: 100 additions & 0 deletions
@@ -1510,6 +1510,72 @@ iree_generated_e2e_runner_test(
     "requires-gpu-cdna3"
 )
 
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_cdna3_dt_f8E4M3FNUZ_tensor_ukernel_medium
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f8E4M3FNUZ"
+    "--acc_type=f32"
+    "--shapes=custom_mnk"
+    "--mnk=1024,1024,1024"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling=false"
+    "--iree-dispatch-creation-data-tiling"
+    "--iree-hip-encoding-layout-resolver=data-tiling"
+    "--iree-llvmgpu-test-combine-layout-transformation=true"
+    "--iree-hip-enable-tensor-ukernels"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_cdna3_dt_f8E4M3FNUZ_tensor_ukernel_large
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f8E4M3FNUZ"
+    "--acc_type=f32"
+    "--shapes=custom_mnk"
+    "--mnk=2048,2048,2048"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling=false"
+    "--iree-dispatch-creation-data-tiling"
+    "--iree-hip-encoding-layout-resolver=data-tiling"
+    "--iree-llvmgpu-test-combine-layout-transformation=true"
+    "--iree-hip-enable-tensor-ukernels"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
 iree_generated_e2e_runner_test(
   NAME
     e2e_matmul_cdna3_dt_f64
@@ -1631,6 +1697,40 @@ iree_generated_e2e_runner_test(
     "requires-gpu-cdna3"
 )
 
+
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_dt_tensor_ukernel_f16f16f32_large
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f16"
+    "--acc_type=f32"
+    "--shapes=custom_mnk"
+    "--mnk=1024,1024,1024"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "rocm"
+  DRIVERS
+    "hip"
+  COMPILER_FLAGS
+    ${IREE_HIP_TEST_COMPILER_FLAGS}
+    "--iree-opt-data-tiling=false"
+    "--iree-dispatch-creation-data-tiling"
+    "--iree-hip-encoding-layout-resolver=data-tiling"
+    "--iree-llvmgpu-test-combine-layout-transformation=true"
+    "--iree-hip-enable-tensor-ukernels"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-cdna3"
+)
+
 iree_generated_e2e_runner_test(
   NAME
     e2e_matmul_tensor_ukernel_bf16bf16f32_large
