Skip to content

Commit bd48f46

Browse files
Fix usage of kernel-repeats in tuningRunner (#2126)
1 parent 219a818 commit bd48f46

File tree

7 files changed

+49
-31
lines changed

7 files changed

+49
-31
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: rocmlir-gen --operation conv_bwd_data -t f16 --arch %arch --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 1 --in_channels 192 --in_h 64 --in_w 64 --out_channels 384 --fil_h 4 --fil_w 4 --dilation_h 1 --dilation_w 1 --conv_stride_h 2 --conv_stride_w 2 --padding_h 1 --padding_w 1 --groupsize 1 --kernel-repeats 10 --perf_config 'v3:128,64,8,128,64,16,1,1,2,1,1' | rocmlir-driver --kernel-pipeline=full | FileCheck %s
1+
// RUN: rocmlir-gen --operation conv_bwd_data -t f16 --arch %arch --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 1 --in_channels 192 --in_h 64 --in_w 64 --out_channels 384 --fil_h 4 --fil_w 4 --dilation_h 1 --dilation_w 1 --conv_stride_h 2 --conv_stride_w 2 --padding_h 1 --padding_w 1 --groupsize 1 --perf_config 'v3:128,64,8,128,64,16,1,1,2,1,1' | rocmlir-driver --kernel-pipeline=full | FileCheck %s
22

33
// CHECK: gpu.binary {{.*}} rock.blocks_per_cu = {{.*}} : i32
44

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// This is an E2E compilation test to ensure the gfx908-specific inlineAsm hack
22
// does not blow up in the backend compiler.
33

4-
// RUN: rocmlir-gen -operation gemm -t f32 -out_datatype f32 --arch gfx908:sramecc+:xnack- --num_cu 120 -g 1 -m 24576 -k 768 -n 3072 -transA=False -transB=False --kernel-repeats 5 --perf_config= | rocmlir-driver -c | FileCheck %s
4+
// RUN: rocmlir-gen -operation gemm -t f32 -out_datatype f32 --arch gfx908:sramecc+:xnack- --num_cu 120 -g 1 -m 24576 -k 768 -n 3072 -transA=False -transB=False --perf_config= | rocmlir-driver -c | FileCheck %s
55
// CHECK: gpu.binary

mlir/test/rocmlir-gen/options.mlir

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// RUN: not rocmlir-gen -operation gemm -t f32 -out_datatype f32 --arch %arch -g 1 -m 1024 -k 1024 -n 1024 -transA=False -transB=False --kernel-repeats=100 2>&1 | FileCheck %s --check-prefix=ERR_KERNEL_REPEATS
2+
// ERR_KERNEL_REPEATS: --kernel-repeats is only supported with host harness (-ph) or CPU validation (-pv).

mlir/test/rocmlir-gen/problem-key.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919
// RUN: rocmlir-gen --arch gfx942 --operation attention -seq_len_q 256 -seq_len_k 512 -head_dim_qk 64 -head_dim_v 32 -t f16 -return_lse -split_kv 8 -g 1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_10
2020
// CHECK_10: -t f16 -transQ false -transK false -transV false -transO false -causal false -return_lse true -split_kv 8 -num_heads_q 1 -num_heads_kv 1 -g 1 -seq_len_q 256 -seq_len_k 512 -head_dim_qk 64 -head_dim_v 32
2121

22-
// RUN: rocmlir-gen --arch gfx942 --operation conv -t f16 --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 64 --in_channels 256 --in_h 20 --in_w 20 --out_channels 256 --fil_h 7 --fil_w 7 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 3 --padding_w 3 --groupsize 256 --kernel-repeats 1 --perf_config=v3:32,256,2,32,32,4,1,1,2,1,1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_DEPTHWISE_CONV
22+
// RUN: rocmlir-gen --arch gfx942 --operation conv -t f16 --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 64 --in_channels 256 --in_h 20 --in_w 20 --out_channels 256 --fil_h 7 --fil_w 7 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 3 --padding_w 3 --groupsize 256 --perf_config=v3:32,256,2,32,32,4,1,1,2,1,1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_DEPTHWISE_CONV
2323
// CHECK_DEPTHWISE_CONV: convfp16 -F 1 -f GNC01 -I NGC01 -O NGC01 -n 64 -c 256 -H 20 -W 20 -k 256 -y 7 -x 7 -p 3 -q 3 -u 1 -v 1 -l 1 -j 1 -g 256
2424

25-
// RUN: rocmlir-gen --arch gfx942 --operation conv -t f16 --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 64 --in_channels 256 --in_h 20 --in_w 20 --out_channels 256 --fil_h 7 --fil_w 7 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 3 --padding_w 3 --groupsize 128 --kernel-repeats 1 --perf_config=v3:32,256,2,32,32,4,1,1,2,1,1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_GROUP_CONV
25+
// RUN: rocmlir-gen --arch gfx942 --operation conv -t f16 --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 64 --in_channels 256 --in_h 20 --in_w 20 --out_channels 256 --fil_h 7 --fil_w 7 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 3 --padding_w 3 --groupsize 128 --perf_config=v3:32,256,2,32,32,4,1,1,2,1,1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_GROUP_CONV
2626
// CHECK_GROUP_CONV: convfp16 -F 1 -f GNC01 -I NGC01 -O NGC01 -n 64 -c 256 -H 20 -W 20 -k 256 -y 7 -x 7 -p 3 -q 3 -u 1 -v 1 -l 1 -j 1 -g 128
2727

28-
// RUN: rocmlir-gen --arch gfx942 --operation conv -t f16 --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 64 --in_channels 256 --in_h 20 --in_w 20 --out_channels 512 --fil_h 7 --fil_w 7 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 3 --padding_w 3 --groupsize 128 --kernel-repeats 1 --perf_config=v3:32,256,2,32,32,4,1,1,2,1,1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_GROUP_CONV2
28+
// RUN: rocmlir-gen --arch gfx942 --operation conv -t f16 --fil_layout gkc01 --in_layout ngc01 --out_layout ngk01 --batchsize 64 --in_channels 256 --in_h 20 --in_w 20 --out_channels 512 --fil_h 7 --fil_w 7 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 3 --padding_w 3 --groupsize 128 --perf_config=v3:32,256,2,32,32,4,1,1,2,1,1 | rocmlir-gen --emit-tuning-key - | FileCheck %s --check-prefixes=CHECK_GROUP_CONV2
2929
// CHECK_GROUP_CONV2: convfp16 -F 1 -f GNC01 -I NGC01 -O NGC01 -n 64 -c 256 -H 20 -W 20 -k 512 -y 7 -x 7 -p 3 -q 3 -u 1 -v 1 -l 1 -j 1 -g 128
3030

3131
// Checking numCU

mlir/tools/rocmlir-gen/rocmlir-gen.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5507,6 +5507,14 @@ int main(int argc, char **argv) {
55075507
module = ModuleOp::create(UnknownLoc::get(&context));
55085508
}
55095509

5510+
if (kernelRepeats.getNumOccurrences() > 0 && !genCPUValidation &&
5511+
!genHostHarness) {
5512+
llvm::errs()
5513+
<< "--kernel-repeats is only supported with host harness (-ph) or "
5514+
"CPU validation (-pv).\n";
5515+
return EXIT_FAILURE;
5516+
}
5517+
55105518
if (genCloneHarness.getValue()) {
55115519
populateCloneHarnessLogic(*module);
55125520
} else if (!hasUserKernel) {

mlir/utils/performance/perfRunner.py

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -499,8 +499,9 @@ def generate_mlir_driver_commandline(self, rocmlir_gen_flags, kernel_repeats=MLI
499499
str(self.conv_stride_w), '--padding_h',
500500
str(self.padding_h), '--padding_w',
501501
str(self.padding_w), '--groupsize',
502-
str(self.group), '--kernel-repeats',
503-
str(kernel_repeats), f"--perf_config={self.perfconfig}"
502+
str(self.group),
503+
*(['--kernel-repeats', str(kernel_repeats)] if kernel_repeats is not None else []),
504+
f"--perf_config={self.perfconfig}"
504505
])
505506
result += ' '
506507
if rocmlir_gen_flags != '':
@@ -696,7 +697,7 @@ def get_gemm_configurations(filename,
696697

697698
# Skip unsupported datatypes
698699
if datatype == 'f4E2M1FN':
699-
## TODO: use information from AMDArchDB when it becomes available to determine supported chips
700+
# TODO: use information from AMDArchDB when it becomes available to determine supported chips
700701
supported_chips = {'gfx950'}
701702
if get_chip() not in supported_chips:
702703
continue
@@ -926,32 +927,28 @@ def set_perfconfig(self, perf_config):
926927
self.perfconfig = perf_config
927928

928929
def generate_mlir_driver_commandline(self, rocmlir_gen_flags, kernel_repeats=MLIR_N_REPEATS):
929-
cmd_parts = [
930+
result = ' '.join([
930931
'-operation', 'gemm', '-t', self.datatype, '-out_datatype', self.out_dtype, '--arch',
931932
self.arch, '--num_cu',
932933
str(self.num_cu), '-g',
933934
str(self.g), '-m',
934935
str(self.m), '-k',
935936
str(self.k), '-n',
936-
str(self.n), f"-transA={self.trans_a}", f"-transB={self.trans_b}"
937-
]
937+
str(self.n), f"-transA={self.trans_a}", f"-transB={self.trans_b}",
938+
*(['--kernel-repeats', str(kernel_repeats)] if kernel_repeats is not None else []),
939+
f"--perf_config={self.perfconfig}"
940+
])
938941

939942
if self.scaled_gemm:
940-
cmd_parts.append('-scaledGemm')
943+
result += ' -scaledGemm'
941944
if self.scale_a_dtype:
942-
cmd_parts.extend(['-scale_a_dtype', self.scale_a_dtype])
945+
result += f' -scale_a_dtype {self.scale_a_dtype}'
943946
if self.scale_b_dtype:
944-
cmd_parts.extend(['-scale_b_dtype', self.scale_b_dtype])
947+
result += f' -scale_b_dtype {self.scale_b_dtype}'
945948
if self.trans_scale_a:
946-
cmd_parts.append(f"-transScaleA={self.trans_scale_a}")
949+
result += f' -transScaleA {str(self.trans_scale_a)}'
947950
if self.trans_scale_b:
948-
cmd_parts.append(f"-transScaleB={self.trans_scale_b}")
949-
950-
cmd_parts.extend(
951-
['--kernel-repeats',
952-
str(kernel_repeats), f"--perf_config={self.perfconfig}"])
953-
954-
result = ' '.join(cmd_parts)
951+
result += f' -transScaleB {str(self.trans_scale_b)}'
955952

956953
result += ' '
957954
if rocmlir_gen_flags != '':
@@ -1194,7 +1191,8 @@ def generate_mlir_driver_commandline(self, rocmlir_gen_flags, kernel_repeats=MLI
11941191
f'--dilation_w={self.dilation_w}', f'--conv_stride_h={self.conv_stride_h}',
11951192
f'--conv_stride_w={self.conv_stride_w}', f'--padding_h={self.padding_h}',
11961193
f'--padding_w={self.padding_w}', f'--groupsize={self.group}', f'--gemmO={self.o}',
1197-
f'--kernel-repeats={kernel_repeats}', f"--perf_config={self.perfconfig}"
1194+
*(['--kernel-repeats', str(kernel_repeats)] if kernel_repeats is not None else []),
1195+
f"--perf_config={self.perfconfig}"
11981196
])
11991197
result += ' '
12001198
if rocmlir_gen_flags != '':
@@ -1363,8 +1361,9 @@ def generate_mlir_driver_commandline(self, rocmlir_gen_flags, kernel_repeats=MLI
13631361
str(self.k), '-n',
13641362
str(self.n), '-gemmO',
13651363
str(self.o), f"-transA={self.trans_a}", f"-transB={self.trans_b}",
1366-
f"-transC={self.trans_c}", f"-transO={self.trans_o}", '--kernel-repeats',
1367-
str(kernel_repeats), f"--perf_config={self.perfconfig}"
1364+
f"-transC={self.trans_c}", f"-transO={self.trans_o}",
1365+
*(['--kernel-repeats', str(kernel_repeats)] if kernel_repeats is not None else []),
1366+
f"--perf_config={self.perfconfig}"
13681367
])
13691368
result += ' '
13701369
if rocmlir_gen_flags != '':
@@ -1691,17 +1690,25 @@ def run_config_with_mlir(config: PerfConfiguration,
16911690
# remove the result file generated by rocprof in previous benchmarking
16921691
if os.path.exists(get_profiler_output_path(arch, BENCHMARKING_STATS_FILE_NAME)):
16931692
os.remove(get_profiler_output_path(arch, BENCHMARKING_STATS_FILE_NAME))
1694-
commandline_options = config.generate_mlir_driver_commandline(rocmlir_gen_flags)
1693+
use_tuning_driver = (not use_rocprof) and bool(config.perfconfig)
1694+
use_host_harness = not use_tuning_driver
1695+
1696+
rocmlir_gen_flags = rocmlir_gen_flags + ' -ph' if use_host_harness else ''
1697+
# Use kernel_repeats only when ' -ph' is passed to rocmlir-gen; otherwise use None.
1698+
# This is because the kernel-repeats flag is only supported with host harness or CPU validation.
1699+
kernel_repeats = MLIR_N_REPEATS if use_host_harness else None
1700+
1701+
commandline_options = config.generate_mlir_driver_commandline(rocmlir_gen_flags, kernel_repeats)
1702+
rocmlir_gen_cmd = paths.mlir_paths.rocmlir_gen_path + ' ' + commandline_options
16951703
if debug:
16961704
print("Running MLIR Benchmark: ", repr(config))
16971705

16981706
nanoseconds = np.nan
16991707

17001708
# Use HIP timing via tuning-driver if rocprof is disabled and perfconfig is present
1701-
if not use_rocprof and config.perfconfig:
1709+
if use_tuning_driver:
17021710
if debug:
17031711
print("Using HIP timing for benchmarking")
1704-
rocmlir_gen_cmd = paths.mlir_paths.rocmlir_gen_path + ' ' + commandline_options
17051712
tuning_driver_command = [
17061713
paths.mlir_paths.rocmlir_tuning_driver_path, f'--benchmark-config={config.perfconfig}',
17071714
f'--num-iterations={MLIR_N_REPEATS}', f'--warmup-iterations={WARMUP_ITERATIONS}',
@@ -1719,7 +1726,6 @@ def run_config_with_mlir(config: PerfConfiguration,
17191726
else:
17201727
if debug:
17211728
print("Using rocprof for benchmarking")
1722-
rocmlir_gen_cmd = paths.mlir_paths.rocmlir_gen_path + ' -ph ' + commandline_options
17231729
rocmlir_driver_cmd = [paths.mlir_paths.rocmlir_driver_path, '-c']
17241730
mlir_cpu_runner_args = [
17251731
f'--shared-libs={paths.mlir_paths.libmlir_rocm_runtime_path},{paths.mlir_paths.libconv_validation_wrappers_path},{paths.mlir_paths.libmlir_runtime_utils_path},{paths.mlir_paths.libmlir_c_runner_utils_path}',

mlir/utils/performance/tuningRunner.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,10 @@ def tune_mlir_kernels(configs, conf_class, paths: Paths, options: Options):
183183
test_vector = config.to_command_line()
184184
print("Tuning:", test_vector, file=sys.stderr)
185185
command_line_options = config.generate_mlir_driver_commandline(
186-
options.rocmlir_gen_flags, kernel_repeats=MLIR_N_REPEATS)
187-
# Note, we don't need the -ph, this goes to the tuning driver
186+
options.rocmlir_gen_flags, kernel_repeats=None)
187+
# Note, we don't need the -ph, this goes to the tuning driver.
188+
# Because we don't set -ph, kernel_repeats is set to None.
189+
# This is because the kernel-repeats flag is only supported with host harness or CPU validation.
188190
kernel_gen_command = paths.mlir_paths.rocmlir_gen_path + ' ' + command_line_options
189191
kernel_gen = subprocess.Popen(kernel_gen_command.split(),
190192
stdout=subprocess.PIPE,

0 commit comments

Comments
 (0)