Skip to content

Commit 4423eaf

Browse files
authored
Fix wrong check in rocmlir-gen and other bugs in perfRunner (#1936)
Fix wrong check in rocmlir-gen and other bugs in perfRunner regarding convolution layouts
1 parent c89ff86 commit 4423eaf

File tree

5 files changed

+85
-104
lines changed

5 files changed

+85
-104
lines changed
Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
1-
// Check the guards of tensor layouts in rock-driver
21

3-
// RUN: not rocmlir-gen --arch %arch -p -fil_layout ykcx 2>&1 | FileCheck %s --check-prefix=ERR1
4-
// RUN: not rocmlir-gen --arch %arch -p -fil_layout kycx 2>&1 | FileCheck %s --check-prefix=ERR2
5-
// RUN: not rocmlir-gen --arch %arch -p -in_layout nhcw 2>&1 | FileCheck %s --check-prefix=ERR3
6-
// RUN: not rocmlir-gen --arch %arch -p -in_layout chnw 2>&1 | FileCheck %s --check-prefix=ERR4
2+
// RUN: rocmlir-gen --arch %arch -p -fil_layout ykcx 2>&1 | FileCheck %s
3+
// RUN: rocmlir-gen --arch %arch -p -fil_layout kycx 2>&1 | FileCheck %s
4+
// RUN: rocmlir-gen --arch %arch -p -in_layout nhcw 2>&1 | FileCheck %s
5+
// RUN: rocmlir-gen --arch %arch -p -in_layout chnw 2>&1 | FileCheck %s
76

8-
ERR1: Unsupported filter layout
9-
ERR2: Unsupported filter layout
10-
ERR3: Unsupported input layout
11-
ERR4: Unsupported input layout
7+
CHECK: rock.conv
Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: rocmlir-gen --arch gfx900 --operation gemm -p -ph --kernel-repeats=5 | FileCheck %s --check-prefix=GEMM
2-
// RUN: rocmlir-gen --arch gfx942 -pv --operation conv_bwd_weight -t f32 --fil_layout k01c --in_layout n01c --out_layout n01k --batchsize 64 --in_channels 1024 --in_h 14 --in_w 14 --out_channels 256 --fil_h 1 --fil_w 1 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 0 --padding_w 0 --groupsize 1 --kernel-repeats 5 | FileCheck %s --check-prefix=CONV_WRW
2+
// RUN: rocmlir-gen --arch gfx942 -pv --operation conv_bwd_weight -t f32 --fil_layout k01c --in_layout n01c --out_layout n01k --batchsize 64 --in_channels 1024 --in_h 14 --in_w 14 --out_channels 256 --fil_h 1 --fil_w 1 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 0 --padding_w 0 --groupsize 1 --kernel-repeats 5 | FileCheck %s --check-prefix=CONV_WRW
33
// RUN: rocmlir-gen --arch gfx942 -pv_with_gpu --operation conv_bwd_weight -t f32 --fil_layout k01c --in_layout n01c --out_layout n01k --batchsize 64 --in_channels 1024 --in_h 14 --in_w 14 --out_channels 256 --fil_h 1 --fil_w 1 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 0 --padding_w 0 --groupsize 1 --kernel-repeats 5 | FileCheck %s --check-prefix=CONV_WRW_GPU
44

55
// GEMM-LABEL: @rock_gemm_gpu
@@ -10,28 +10,28 @@
1010
// GEMM-NEXT: func.call @rock_gemm
1111
// GEMM-NEXT: }
1212

13-
// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
13+
// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
1414
// CONV_WRW: rock.init_kernel
15-
// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
15+
// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
1616
// CONV_WRW: rock.conv_bwd_weight
17-
// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_gpu
17+
// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_gpu
1818
// CONV_WRW-DAG: %[[one:.*]] = arith.constant 1 : index
1919
// CONV_WRW-DAG: %[[five:.*]] = arith.constant 5 : index
2020
// CONV_WRW-DAG: %[[zero:.*]] = arith.constant 0 : index
2121
// CONV_WRW: scf.for %{{.*}} = %[[zero]] to %[[five]] step %[[one]] {
22-
// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
23-
// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
22+
// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
23+
// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
2424
// CONV_WRW-NEXT: }
2525

26-
// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
26+
// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
2727
// CONV_WRW_GPU: rock.init_kernel
28-
// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
28+
// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
2929
// CONV_WRW_GPU: rock.conv_bwd_weight
30-
// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_gpu
30+
// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_gpu
3131
// CONV_WRW_GPU-DAG: %[[zero:.*]] = arith.constant 0 : index
3232
// CONV_WRW_GPU-DAG: %[[one:.*]] = arith.constant 1 : index
3333
// CONV_WRW_GPU-DAG: %[[five:.*]] = arith.constant 5 : index
3434
// CONV_WRW_GPU: scf.for %{{.*}} = %[[zero]] to %[[five]] step %[[one]] {
35-
// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
36-
// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
35+
// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
36+
// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
3737
// CONV_WRW_GPU-NEXT: }

mlir/tools/rocmlir-gen/rocmlir-gen.cpp

Lines changed: 22 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -943,35 +943,34 @@ namespace test {
943943
void registerTestDialect(DialectRegistry &);
944944
} // namespace test
945945

946-
static void correctConvParameters() {
947-
std::string filterLayoutValue = filterLayout.getValue();
948-
949-
// yxcgk not implement yet
950-
if (filterLayoutValue.find('g') == std::string::npos &&
951-
(filterLayoutValue.substr(0, 2) == "kc" ||
952-
(filterLayoutValue[0] == 'k' && filterLayoutValue.back() == 'c') ||
953-
filterLayoutValue.substr(filterLayoutValue.size() - 2) == "ck"))
954-
filterLayout = "g" + filterLayoutValue;
946+
static bool isConv(rock::KernelType kernelType) {
947+
return kernelType == rock::KernelType::Conv ||
948+
kernelType == rock::KernelType::ConvBwdData ||
949+
kernelType == rock::KernelType::ConvBwdWeight ||
950+
kernelType == rock::KernelType::ConvElementwiseGemm;
951+
}
955952

956-
auto addGToLayout = [&](std::string ch,
957-
std::string &layoutValue) -> std::string {
953+
static void correctConvParameters() {
954+
auto addGToLayout = [](std::string &layoutValue) -> std::string {
958955
std::string layout;
959956
if (layoutValue.find('g') == std::string::npos) {
960-
if (layoutValue.substr(0, 2) == "n" + ch)
961-
layout = "ng" + ch + layoutValue.substr(2);
962-
else if (layoutValue[0] == 'n' && layoutValue.back() == ch[0])
963-
layout = layoutValue.substr(0, layoutValue.size() - 1) + "g" + ch;
964-
else
965-
layout = "g" + layoutValue;
966-
} else
957+
// Always add 'g' after 'n' when it's missing
958+
size_t nPos = layoutValue.find('n');
959+
assert(nPos != std::string::npos);
960+
layout =
961+
layoutValue.substr(0, nPos + 1) + "g" + layoutValue.substr(nPos + 1);
962+
} else {
967963
layout = layoutValue;
964+
}
968965
return layout;
969966
};
970967

971-
inputLayout = addGToLayout("c", inputLayout.getValue());
972-
outputLayout = addGToLayout("k", outputLayout.getValue());
968+
if (filterLayout.getValue().find('g') == std::string::npos)
969+
filterLayout = "g" + filterLayout.getValue();
970+
inputLayout = addGToLayout(inputLayout.getValue());
971+
outputLayout = addGToLayout(outputLayout.getValue());
973972

974-
// +++pf: update old key names.
973+
// update old key names.
975974
std::replace(filterLayout.getValue().begin(), filterLayout.getValue().end(),
976975
'y', '0');
977976
std::replace(filterLayout.getValue().begin(), filterLayout.getValue().end(),
@@ -1080,28 +1079,6 @@ static void correctConvParameters() {
10801079
paddingDepthRight = in_right_pad_d + (di_minimum - di_specified);
10811080
}
10821081

1083-
static void verifyConvLayout() {
1084-
std::string filterLayoutValue = filterLayout.getValue();
1085-
std::string inputLayoutValue = inputLayout.getValue();
1086-
1087-
if (filterLayoutValue.find("yx") == std::string::npos &&
1088-
filterLayoutValue.find("xy") == std::string::npos &&
1089-
filterLayoutValue.find("01") == std::string::npos &&
1090-
filterLayoutValue.find("10") == std::string::npos) {
1091-
llvm::errs() << "Unsupported filter layout: disjointed yx!\n";
1092-
exit(1);
1093-
}
1094-
1095-
if (inputLayoutValue.find("hw") == std::string::npos &&
1096-
inputLayoutValue.find("wh") == std::string::npos &&
1097-
inputLayoutValue.find("01") == std::string::npos &&
1098-
inputLayoutValue.find("10") == std::string::npos) {
1099-
1100-
llvm::errs() << "Unsupported input layout: disjointed hw!\n";
1101-
exit(1);
1102-
}
1103-
}
1104-
11051082
static void populateDefaults() {
11061083
const bool isGemm = operation == rock::KernelType::Gemm;
11071084
const bool isAttention = operation == rock::KernelType::Attention;
@@ -5033,10 +5010,9 @@ int main(int argc, char **argv) {
50335010
outputDataType = canonicaliseF8Type(outputDataType);
50345011
}
50355012

5036-
if (operation != rock::KernelType::Gemm) {
5037-
verifyConvLayout();
5013+
if (isConv(operation))
50385014
correctConvParameters();
5039-
}
5015+
50405016
populateDefaults();
50415017

50425018
bool hasUserKernel = !testFuncName.empty();

mlir/utils/jenkins/Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1239,7 +1239,7 @@ pipeline {
12391239
sh 'date --utc +%Y-%m-%d > perf-run-date'
12401240
sh 'ls -l /dev/kfd'
12411241
sh 'ls -l /dev/dri'
1242-
// Run MLIR vs MIOpend perf benchmarks.
1242+
// Run MLIR vs MIOpen perf benchmarks.
12431243
sh """python3 ./bin/perfRunner.py --op=conv --batch_all \
12441244
--configs_file=${convToUse} \
12451245
--tuning_db=${WORKSPACE}/build/mlir_tuning_${CHIP}.tsv \

mlir/utils/performance/perfRunner.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,7 @@
4242
OUTPUT_DATA_TYPES_MAP = {'f32': 'f32', 'f16': 'f16', 'bf16': 'bf16', 'i8': 'i32', 'fp8':'f32',
4343
'fp8_fp8': 'f32', 'fp8_bf8': 'f32', 'bf8_fp8': 'f32',
4444
'bf8_bf8': 'f32'}
45-
MLIR_N_REPEATS = 5
46-
47-
MLIR_FILTER_LAYOUTS = {"NCHW": "kcyx", "NCHWG": "kcyxg", "NHWC": "kyxc", "NHWCG": "kyxcg",
48-
"NC01": "kc01", "NC01G": "kc01g", "N01C": "k01c", "N01CG": "k01cg",
49-
"GNC01":"gkc01", "GN01C":"gk01c"}
50-
MLIR_OUTPUT_LAYOUTS = {"NCHW": "nkhw", "NCHWG": "nkhwg", "NHWC": "nhwk", "NHWCG": "nhwkg",
51-
"NC01": "nk01", "NC01G": "nk01g", "N01C": "n01k", "N01CG": "n01kg",
52-
"NGC01":"ngk01", "N01GC": "n01gk"}
53-
INVERSE_FILTER_LAYOUTS = {v: k for k, v in MLIR_FILTER_LAYOUTS.items()}
54-
INVERSE_OUTPUT_LAYOUTS = {v: k for k, v in MLIR_OUTPUT_LAYOUTS.items()}
45+
MLIR_N_REPEATS = 100
5546

5647
FILTER_LAYOUT_MAP = {'N':'k', 'C':'c', 'H':'0', 'W':'1', 'G':'g'}
5748
INPUT_LAYOUT_MAP = {'N':'n', 'C':'c', 'H':'0', 'W':'1', 'G':'g'}
@@ -64,6 +55,14 @@
6455
INFO_ARCH_NAME = re.compile(r"Name:\s*(.*)")
6556
INFO_ARCH_CU = re.compile(r"Compute Unit:\s*(.*)")
6657

58+
def inverse_output_layouts(output_layout):
59+
map = {"n": "N", "k": "C", "h": "H", "w": "W", "g": "G", "0": "0", "1": "1"}
60+
return "".join(map[char] for char in output_layout)
61+
62+
def inverse_filter_layouts(filter_layout):
63+
map = {"k": "N", "c": "C", "y": "H", "x": "W", "g": "G", "0": "0", "1": "1"}
64+
return "".join(map[char] for char in filter_layout)
65+
6766
@dataclass
6867
class MLIRPaths:
6968
rocmlir_gen_path : str
@@ -285,15 +284,15 @@ def runPipeline(proc_specs):
285284
for p in procs:
286285
p.wait()
287286
if p.returncode != 0:
288-
raise OSError(str(p.stderr))
287+
raise OSError(str(p.stderr.read()))
289288
outs, errs = p.communicate()
290-
return outs, errs
289+
return outs, True
291290
except Exception as err:
292291
print(f"Error: {err}")
293292
print(f"Failing command: {' '.join(p.args)}")
294293
print(f"Failing pipeline: {' | '.join([' '.join(proc) for proc in proc_specs])}")
295294
outs, errs = p.communicate()
296-
return outs, errs
295+
return outs, False
297296

298297
class PerfConfiguration:
299298
TABLE_COLUMNS = []
@@ -537,8 +536,8 @@ def fromCommandLine(cls, argv, arch, numCU):
537536
def toCommandLine(self):
538537
return (f"conv{ {'f32':'', 'f16':'fp16', 'bf16':'bfp16', 'i8':'int8','fp8_fp8':'fp8_fp8', 'fp8': 'fp8'}[self.dataType]} "
539538
+ f"-F { {'fwd':1, 'bwd':2, 'wrw':4}[self.direction]} "
540-
+ f"-f {INVERSE_FILTER_LAYOUTS[self.filterLayout]} -I {self.inputLayout.upper()} "
541-
+ f"-O {INVERSE_OUTPUT_LAYOUTS[self.outputLayout]} "
539+
+ f"-f {inverse_filter_layouts(self.filterLayout)} -I {self.inputLayout.upper()} "
540+
+ f"-O {inverse_output_layouts(self.outputLayout)} "
542541
+ f"-n {self.n} -c {self.c} -H {self.hi} -W {self.wi} -k {self.k} "
543542
+ f"-y {self.y} -x {self.x} -p {self.paddingH} -q {self.paddingW} "
544543
+ f"-u {self.convStrideH} -v {self.convStrideW} -l {self.dilationH} "
@@ -593,17 +592,17 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
593592
MIOpenDriverCommand = [MIOPENDRIVER, *commandLine, '-V', '0', '-t', '1']
594593
print("Running MIOpen Benchmark: ", ' '.join(commandLine))
595594
# invoke MIOpenDriver.
596-
outs,errs = runPipeline([MIOpenDriverCommand])
597-
if len(errs) == 0:
595+
outs, noerr = runPipeline([MIOpenDriverCommand])
596+
nanoSeconds = np.nan
597+
if noerr:
598598
# convert bytes to str
599599
outs = outs.decode('utf-8')
600600
# Extract Elapsed time in ms from the output of MIOpenDriver
601601
# Use regular expression to match the contents between
602602
# "Elasped: " (note the space at the end) and "ms"
603603
elapsedTimeInMs = ELAPSED_TIME_RE.search(outs).group(1)
604604
nanoSeconds = float(elapsedTimeInMs)*1.0e6
605-
else:
606-
nanoSeconds = np.nan
605+
607606
return config.tableEntry(nanoSeconds)
608607

609608
def getGemmConfigurations(fileName, dataTypes=DATA_TYPES_GEMM, outDataTypeMap=OUTPUT_DATA_TYPES_MAP):
@@ -1109,7 +1108,7 @@ def fromCommandLine(cls, argv, arch, numCU):
11091108

11101109
def toCommandLine(self):
11111110
return (f"-t {self.dataType} "
1112-
+ f"-f {INVERSE_FILTER_LAYOUTS[self.filterLayout]} -I {self.inputLayout.upper()} "
1111+
+ f"-f {inverse_filter_layouts(self.filterLayout)} -I {self.inputLayout.upper()} "
11131112
+ f"-transC {str(self.transC).lower()} -transO {str(self.transO).lower()} "
11141113
+ f"-n {self.n} -c {self.c} -H {self.hi} -W {self.wi} -k {self.k} "
11151114
+ f"-y {self.y} -x {self.x} -p {self.paddingH} -q {self.paddingW} "
@@ -1458,9 +1457,12 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
14581457
print(f"Running rocBLAS benchmark {config!r}")
14591458
profilerCommand = [paths.mlir_paths.rocblas_benchmark_driver_path] + \
14601459
benchmarkArgs.split()
1461-
outs,errs = runPipeline([profilerCommand])
1462-
milliSeconds = getMilliseconds(outs)
1463-
nanoSeconds = milliSeconds*1e6
1460+
outs, noerr = runPipeline([profilerCommand])
1461+
nanoSeconds = np.nan
1462+
if noerr:
1463+
milliSeconds = getMilliseconds(outs)
1464+
nanoSeconds = milliSeconds*1e6
1465+
14641466
return config.tableEntry(nanoSeconds)
14651467

14661468
class CKGemmConfig(GemmConfiguration):
@@ -1479,9 +1481,12 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
14791481

14801482
profilerCommand = [paths.mlir_paths.ck_gemm_benchmark_driver_path] + \
14811483
benchmarkArgs.split()
1482-
outs,errs = runPipeline([profilerCommand])
1483-
milliSeconds = getMilliseconds(outs)
1484-
nanoSeconds = milliSeconds*1e6
1484+
outs, noerr = runPipeline([profilerCommand])
1485+
nanoSeconds = np.nan
1486+
if noerr:
1487+
milliSeconds = getMilliseconds(outs)
1488+
nanoSeconds = milliSeconds*1e6
1489+
14851490
return config.tableEntry(nanoSeconds)
14861491

14871492
def runConfigWithMLIR(config: PerfConfiguration, paths: Paths, arch, rocmlir_gen_flags, debug=True):
@@ -1496,7 +1501,12 @@ def runConfigWithMLIR(config: PerfConfiguration, paths: Paths, arch, rocmlir_gen
14961501
mlir_cpu_runner_args = [f'--shared-libs={paths.mlir_paths.libmlir_rocm_runtime_path},{paths.mlir_paths.libconv_validation_wrappers_path},{paths.mlir_paths.libmlir_runtime_utils_path},{paths.mlir_paths.libmlir_c_runner_utils_path}', '--entry-point-result=void']
14971502
profilerCommand = [ROCPROF] + getMetricArgsForRocprof(arch) + ['--kernel-trace', '--stats', '-o', BENCHMARKING_RESULT_FILE_NAME, '--' ,paths.mlir_paths.cpu_runner_path] + mlir_cpu_runner_args
14981503

1499-
runPipeline([rocmlirGenCommand.split(), rocmlirDriverCommand, profilerCommand])
1504+
outs, noerr = runPipeline([rocmlirGenCommand.split(), rocmlirDriverCommand, profilerCommand])
1505+
nanoSeconds = np.nan
1506+
if noerr:
1507+
nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
1508+
1509+
return nanoSeconds
15001510

15011511
# Benchmarking function.
15021512
def benchmarkMLIR(commandLine, confClass, paths: Paths, arch, numCU, tuningDb: MaybeTuningDb, rocmlir_gen_flags):
@@ -1508,9 +1518,7 @@ def benchmarkMLIR(commandLine, confClass, paths: Paths, arch, numCU, tuningDb: M
15081518
else: # Tuning DB present but doesn't contain config, return N/A
15091519
return config.tableEntry(np.nan)
15101520

1511-
runConfigWithMLIR(config, paths, arch, rocmlir_gen_flags)
1512-
# get nanoseconds from rocprof output.
1513-
nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
1521+
nanoSeconds = runConfigWithMLIR(config, paths, arch, rocmlir_gen_flags)
15141522
return config.tableEntry(nanoSeconds)
15151523

15161524
#Generate MLIR vs. MIOpen or rocBLAS performance results
@@ -1682,7 +1690,12 @@ def runFusionKernel(filename, rocmlirGenArgs, paths: Paths):
16821690
mlir_cpu_runner_args = [f'--shared-libs={paths.mlir_paths.libmlir_rocm_runtime_path},{paths.mlir_paths.libconv_validation_wrappers_path},{paths.mlir_paths.libmlir_runtime_utils_path},{paths.mlir_paths.libmlir_c_runner_utils_path}', '--entry-point-result=void']
16831691
profilerCommand = [ROCPROF] + getMetricArgsForRocprof(chip) + ['--kernel-trace', '--stats', '-o', BENCHMARKING_RESULT_FILE_NAME] + ['--', paths.mlir_paths.cpu_runner_path] + mlir_cpu_runner_args
16841692
commands.append(profilerCommand)
1685-
runPipeline(commands)
1693+
outs, noerr = runPipeline(commands)
1694+
nanoSeconds = np.nan
1695+
if noerr:
1696+
nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
1697+
1698+
return nanoSeconds
16861699

16871700
# Generate fusion vs. gemm/conv performance results
16881701
def benchmarkFusionKernels(test_dir, paths: Paths, arch, numCU, tuningDb: MaybeTuningDb):
@@ -1747,18 +1760,14 @@ def benchmarkFusionKernels(test_dir, paths: Paths, arch, numCU, tuningDb: MaybeT
17471760

17481761
# Run fusion test
17491762
rocmlirGenArgs = ['-ph', '-fut='+futName+'_wrapper', '--perf_config='+bestPerf, '-']
1750-
runFusionKernel(filename, rocmlirGenArgs, paths)
1751-
# Get nanoseconds of fusion test
1752-
nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
1763+
nanoSeconds = runFusionKernel(filename, rocmlirGenArgs, paths)
17531764
oneEntry = config.tableEntry(nanoSeconds)
17541765
# Keep the best performance
17551766
if testVector in perfResults and oneEntry['TFlops'] <= perfResults[testVector]['TFlops']:
17561767
continue
17571768

17581769
# Run gemm or conv op with the same configuration
1759-
runConfigWithMLIR(config, paths, arch, '')
1760-
# Get nanoseconds of gemm/conv
1761-
nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
1770+
nanoSeconds = runConfigWithMLIR(config, paths, arch, '')
17621771
oneEntry['MLIR TFlops'] = config.computeTFlops(nanoSeconds)
17631772
oneEntry['Fusion/MLIR'] = oneEntry['TFlops']/oneEntry['MLIR TFlops']
17641773
oneEntry['FileName'] = filename

0 commit comments

Comments
 (0)