Fix wrong check in rocmlir-gen and other bugs in perfRunner (#1936)

dhernandez0 · web-flow · commit 4423eafa21a3 · 2025-08-07T11:13:17.000+02:00
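Fix wrong check in rocmlir-gen and other bugs in perfRunner regarding convolution layouts.

- rocmlir-gen: remove verifyConvLayout, which rejected filter/input layouts whose spatial dimensions are not adjacent (e.g. ykcx, nhcw), and run correctConvParameters only for convolution kernel types instead of for every non-GEMM operation.
- rocmlir-gen: when a layout has no 'g', prepend it to the filter layout and insert it right after 'n' in the input and output layouts.
- perfRunner: build the layout strings used in toCommandLine character by character instead of from fixed lookup tables, make runPipeline return a success flag so that failed runs are reported as NaN, and raise MLIR_N_REPEATS from 5 to 100.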
diff --git a/mlir/test/rocmlir-driver/layout_driver.mlir b/mlir/test/rocmlir-driver/layout_driver.mlir
@@ -1,11 +1,7 @@
-// Check the guards of tensor layouts in rock-driver 
 
-// RUN: not rocmlir-gen --arch %arch -p -fil_layout ykcx 2>&1 | FileCheck %s --check-prefix=ERR1
-// RUN: not rocmlir-gen --arch %arch -p -fil_layout kycx 2>&1 | FileCheck %s --check-prefix=ERR2
-// RUN: not rocmlir-gen --arch %arch -p -in_layout nhcw  2>&1 | FileCheck %s --check-prefix=ERR3
-// RUN: not rocmlir-gen --arch %arch -p -in_layout chnw  2>&1 | FileCheck %s --check-prefix=ERR4
+// RUN: rocmlir-gen --arch %arch -p -fil_layout ykcx 2>&1 | FileCheck %s
+// RUN: rocmlir-gen --arch %arch -p -fil_layout kycx 2>&1 | FileCheck %s
+// RUN: rocmlir-gen --arch %arch -p -in_layout nhcw  2>&1 | FileCheck %s
+// RUN: rocmlir-gen --arch %arch -p -in_layout chnw  2>&1 | FileCheck %s
 
-ERR1: Unsupported filter layout
-ERR2: Unsupported filter layout
-ERR3: Unsupported input layout
-ERR4: Unsupported input layout
+CHECK: rock.conv
diff --git a/mlir/test/rocmlir-gen/kernel-repeats.mlir b/mlir/test/rocmlir-gen/kernel-repeats.mlir
@@ -1,5 +1,5 @@
 // RUN: rocmlir-gen --arch gfx900 --operation gemm -p -ph --kernel-repeats=5 | FileCheck %s --check-prefix=GEMM
-// RUN: rocmlir-gen --arch gfx942  -pv --operation conv_bwd_weight -t f32 --fil_layout k01c --in_layout n01c --out_layout n01k --batchsize 64 --in_channels 1024 --in_h 14 --in_w 14 --out_channels 256 --fil_h 1 --fil_w 1 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 0 --padding_w 0 --groupsize 1 --kernel-repeats 5 | FileCheck %s --check-prefix=CONV_WRW
+// RUN: rocmlir-gen --arch gfx942 -pv --operation conv_bwd_weight -t f32 --fil_layout k01c --in_layout n01c --out_layout n01k --batchsize 64 --in_channels 1024 --in_h 14 --in_w 14 --out_channels 256 --fil_h 1 --fil_w 1 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 0 --padding_w 0 --groupsize 1 --kernel-repeats 5 | FileCheck %s --check-prefix=CONV_WRW
 // RUN: rocmlir-gen --arch gfx942 -pv_with_gpu --operation conv_bwd_weight -t f32 --fil_layout k01c --in_layout n01c --out_layout n01k --batchsize 64 --in_channels 1024 --in_h 14 --in_w 14 --out_channels 256 --fil_h 1 --fil_w 1 --dilation_h 1 --dilation_w 1 --conv_stride_h 1 --conv_stride_w 1 --padding_h 0 --padding_w 0 --groupsize 1 --kernel-repeats 5 | FileCheck %s --check-prefix=CONV_WRW_GPU
 
 // GEMM-LABEL: @rock_gemm_gpu
@@ -10,28 +10,28 @@
 // GEMM-NEXT: func.call @rock_gemm
 // GEMM-NEXT: }
 
-// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
+// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
 // CONV_WRW: rock.init_kernel
-// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
+// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
 // CONV_WRW: rock.conv_bwd_weight
-// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_gpu
+// CONV_WRW-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_gpu
 // CONV_WRW-DAG: %[[one:.*]] = arith.constant 1 : index
 // CONV_WRW-DAG: %[[five:.*]] = arith.constant 5 : index
 // CONV_WRW-DAG: %[[zero:.*]] = arith.constant 0 : index
 // CONV_WRW: scf.for %{{.*}} = %[[zero]] to %[[five]] step %[[one]] {
-// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
-// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
+// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
+// CONV_WRW-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
 // CONV_WRW-NEXT: }
 
-// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
+// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
 // CONV_WRW_GPU: rock.init_kernel
-// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
+// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
 // CONV_WRW_GPU: rock.conv_bwd_weight
-// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_n01gc_n01gk_gpu
+// CONV_WRW_GPU-LABEL: func.func @rock_conv_bwd_weight_gk01c_ng01c_ng01k_gpu
 // CONV_WRW_GPU-DAG: %[[zero:.*]] = arith.constant 0 : index
 // CONV_WRW_GPU-DAG: %[[one:.*]] = arith.constant 1 : index
 // CONV_WRW_GPU-DAG: %[[five:.*]] = arith.constant 5 : index
 // CONV_WRW_GPU: scf.for %{{.*}} = %[[zero]] to %[[five]] step %[[one]] {
-// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_0
-// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_n01gc_n01gk_1
+// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_0
+// CONV_WRW_GPU-NEXT: func.call @rock_conv_bwd_weight_gk01c_ng01c_ng01k_1
 // CONV_WRW_GPU-NEXT: }
diff --git a/mlir/tools/rocmlir-gen/rocmlir-gen.cpp b/mlir/tools/rocmlir-gen/rocmlir-gen.cpp
@@ -943,35 +943,34 @@ namespace test {
 void registerTestDialect(DialectRegistry &);
 } // namespace test
 
-static void correctConvParameters() {
-  std::string filterLayoutValue = filterLayout.getValue();
-
-  // yxcgk not implement yet
-  if (filterLayoutValue.find('g') == std::string::npos &&
-      (filterLayoutValue.substr(0, 2) == "kc" ||
-       (filterLayoutValue[0] == 'k' && filterLayoutValue.back() == 'c') ||
-       filterLayoutValue.substr(filterLayoutValue.size() - 2) == "ck"))
-    filterLayout = "g" + filterLayoutValue;
+static bool isConv(rock::KernelType kernelType) {
+  return kernelType == rock::KernelType::Conv ||
+         kernelType == rock::KernelType::ConvBwdData ||
+         kernelType == rock::KernelType::ConvBwdWeight ||
+         kernelType == rock::KernelType::ConvElementwiseGemm;
+}
 
-  auto addGToLayout = [&](std::string ch,
-                          std::string &layoutValue) -> std::string {
+static void correctConvParameters() {
+  auto addGToLayout = [](std::string &layoutValue) -> std::string {
     std::string layout;
     if (layoutValue.find('g') == std::string::npos) {
-      if (layoutValue.substr(0, 2) == "n" + ch)
-        layout = "ng" + ch + layoutValue.substr(2);
-      else if (layoutValue[0] == 'n' && layoutValue.back() == ch[0])
-        layout = layoutValue.substr(0, layoutValue.size() - 1) + "g" + ch;
-      else
-        layout = "g" + layoutValue;
-    } else
+      // Always add 'g' after 'n' when it's missing
+      size_t nPos = layoutValue.find('n');
+      assert(nPos != std::string::npos);
+      layout =
+          layoutValue.substr(0, nPos + 1) + "g" + layoutValue.substr(nPos + 1);
+    } else {
       layout = layoutValue;
+    }
     return layout;
   };
 
-  inputLayout = addGToLayout("c", inputLayout.getValue());
-  outputLayout = addGToLayout("k", outputLayout.getValue());
+  if (filterLayout.getValue().find('g') == std::string::npos)
+    filterLayout = "g" + filterLayout.getValue();
+  inputLayout = addGToLayout(inputLayout.getValue());
+  outputLayout = addGToLayout(outputLayout.getValue());
 
-  // +++pf:  update old key names.
+  // update old key names.
   std::replace(filterLayout.getValue().begin(), filterLayout.getValue().end(),
                'y', '0');
   std::replace(filterLayout.getValue().begin(), filterLayout.getValue().end(),
@@ -1080,28 +1079,6 @@ static void correctConvParameters() {
     paddingDepthRight = in_right_pad_d + (di_minimum - di_specified);
 }
 
-static void verifyConvLayout() {
-  std::string filterLayoutValue = filterLayout.getValue();
-  std::string inputLayoutValue = inputLayout.getValue();
-
-  if (filterLayoutValue.find("yx") == std::string::npos &&
-      filterLayoutValue.find("xy") == std::string::npos &&
-      filterLayoutValue.find("01") == std::string::npos &&
-      filterLayoutValue.find("10") == std::string::npos) {
-    llvm::errs() << "Unsupported filter layout: disjointed yx!\n";
-    exit(1);
-  }
-
-  if (inputLayoutValue.find("hw") == std::string::npos &&
-      inputLayoutValue.find("wh") == std::string::npos &&
-      inputLayoutValue.find("01") == std::string::npos &&
-      inputLayoutValue.find("10") == std::string::npos) {
-
-    llvm::errs() << "Unsupported input layout: disjointed hw!\n";
-    exit(1);
-  }
-}
-
 static void populateDefaults() {
   const bool isGemm = operation == rock::KernelType::Gemm;
   const bool isAttention = operation == rock::KernelType::Attention;
@@ -5033,10 +5010,9 @@ int main(int argc, char **argv) {
     outputDataType = canonicaliseF8Type(outputDataType);
   }
 
-  if (operation != rock::KernelType::Gemm) {
-    verifyConvLayout();
+  if (isConv(operation))
     correctConvParameters();
-  }
+
   populateDefaults();
 
   bool hasUserKernel = !testFuncName.empty();
diff --git a/mlir/utils/jenkins/Jenkinsfile b/mlir/utils/jenkins/Jenkinsfile
@@ -1239,7 +1239,7 @@ pipeline {
                                             sh 'date --utc +%Y-%m-%d > perf-run-date'
                                             sh 'ls -l /dev/kfd'
                                             sh 'ls -l /dev/dri'
-                                            // Run MLIR vs MIOpend perf benchmarks.
+                                            // Run MLIR vs MIOpen perf benchmarks.
                                             sh """python3 ./bin/perfRunner.py --op=conv --batch_all \
                                                 --configs_file=${convToUse} \
                                                 --tuning_db=${WORKSPACE}/build/mlir_tuning_${CHIP}.tsv \
diff --git a/mlir/utils/performance/perfRunner.py b/mlir/utils/performance/perfRunner.py
@@ -42,16 +42,7 @@
 OUTPUT_DATA_TYPES_MAP = {'f32': 'f32', 'f16': 'f16', 'bf16': 'bf16', 'i8': 'i32', 'fp8':'f32',
                          'fp8_fp8': 'f32', 'fp8_bf8': 'f32', 'bf8_fp8': 'f32',
                          'bf8_bf8': 'f32'}
-MLIR_N_REPEATS = 5
-
-MLIR_FILTER_LAYOUTS = {"NCHW": "kcyx", "NCHWG": "kcyxg", "NHWC": "kyxc", "NHWCG": "kyxcg",
-                        "NC01": "kc01", "NC01G": "kc01g", "N01C": "k01c", "N01CG": "k01cg",
-                        "GNC01":"gkc01", "GN01C":"gk01c"}
-MLIR_OUTPUT_LAYOUTS = {"NCHW": "nkhw", "NCHWG": "nkhwg", "NHWC": "nhwk", "NHWCG": "nhwkg",
-                        "NC01": "nk01", "NC01G": "nk01g", "N01C": "n01k", "N01CG": "n01kg",
-                        "NGC01":"ngk01", "N01GC": "n01gk"}
-INVERSE_FILTER_LAYOUTS = {v: k for k, v in MLIR_FILTER_LAYOUTS.items()}
-INVERSE_OUTPUT_LAYOUTS = {v: k for k, v in MLIR_OUTPUT_LAYOUTS.items()}
+MLIR_N_REPEATS = 100
 
 FILTER_LAYOUT_MAP = {'N':'k', 'C':'c', 'H':'0', 'W':'1', 'G':'g'}
 INPUT_LAYOUT_MAP = {'N':'n', 'C':'c', 'H':'0', 'W':'1', 'G':'g'}
@@ -64,6 +55,14 @@
 INFO_ARCH_NAME = re.compile(r"Name:\s*(.*)")
 INFO_ARCH_CU = re.compile(r"Compute Unit:\s*(.*)")
 
+def inverse_output_layouts(output_layout):
+    map = {"n": "N", "k": "C", "h": "H", "w": "W", "g": "G", "0": "0", "1": "1"}
+    return "".join(map[char] for char in output_layout)
+    
+def inverse_filter_layouts(filter_layout):
+    map = {"k": "N", "c": "C", "y": "H", "x": "W", "g": "G", "0": "0", "1": "1"}
+    return "".join(map[char] for char in filter_layout)
+
 @dataclass
 class MLIRPaths:
     rocmlir_gen_path : str
@@ -285,15 +284,15 @@ def runPipeline(proc_specs):
         for p in procs:
             p.wait()
             if p.returncode != 0:
-                raise OSError(str(p.stderr))
+                raise OSError(str(p.stderr.read()))
         outs, errs = p.communicate()
-        return outs, errs
+        return outs, True
     except Exception as err:
         print(f"Error:  {err}")
         print(f"Failing command:  {' '.join(p.args)}")
         print(f"Failing pipeline:  {' | '.join([' '.join(proc) for proc in proc_specs])}")
         outs, errs = p.communicate()
-    return outs, errs
+    return outs, False
 
 class PerfConfiguration:
     TABLE_COLUMNS = []
@@ -537,8 +536,8 @@ def fromCommandLine(cls, argv, arch, numCU):
     def toCommandLine(self):
         return (f"conv{ {'f32':'', 'f16':'fp16', 'bf16':'bfp16', 'i8':'int8','fp8_fp8':'fp8_fp8', 'fp8': 'fp8'}[self.dataType]} "
                 + f"-F { {'fwd':1, 'bwd':2, 'wrw':4}[self.direction]} "
-                + f"-f {INVERSE_FILTER_LAYOUTS[self.filterLayout]} -I {self.inputLayout.upper()} "
-                + f"-O {INVERSE_OUTPUT_LAYOUTS[self.outputLayout]} "
+                + f"-f {inverse_filter_layouts(self.filterLayout)} -I {self.inputLayout.upper()} "
+                + f"-O {inverse_output_layouts(self.outputLayout)} "
                 + f"-n {self.n} -c {self.c} -H {self.hi} -W {self.wi} -k {self.k} "
                 + f"-y {self.y} -x {self.x} -p {self.paddingH} -q {self.paddingW} "
                 + f"-u {self.convStrideH} -v {self.convStrideW} -l {self.dilationH} "
@@ -593,17 +592,17 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
         MIOpenDriverCommand = [MIOPENDRIVER, *commandLine, '-V', '0', '-t', '1']
         print("Running MIOpen Benchmark: ", ' '.join(commandLine))
         # invoke MIOpenDriver.
-        outs,errs = runPipeline([MIOpenDriverCommand])
-        if len(errs) == 0:
+        outs, noerr = runPipeline([MIOpenDriverCommand])
+        nanoSeconds = np.nan
+        if noerr:
             # convert bytes to str
             outs = outs.decode('utf-8')
             # Extract Elapsed time in ms from the output of MIOpenDriver
             # Use regular expression to match the contents between
             # "Elasped: " (note the space at the end) and "ms"
             elapsedTimeInMs = ELAPSED_TIME_RE.search(outs).group(1)
             nanoSeconds = float(elapsedTimeInMs)*1.0e6
-        else:
-            nanoSeconds = np.nan
+            
         return config.tableEntry(nanoSeconds)
 
 def getGemmConfigurations(fileName, dataTypes=DATA_TYPES_GEMM, outDataTypeMap=OUTPUT_DATA_TYPES_MAP):
@@ -1109,7 +1108,7 @@ def fromCommandLine(cls, argv, arch, numCU):
     
     def toCommandLine(self):
         return (f"-t {self.dataType} "
-                + f"-f {INVERSE_FILTER_LAYOUTS[self.filterLayout]} -I {self.inputLayout.upper()} "
+                + f"-f {inverse_filter_layouts(self.filterLayout)} -I {self.inputLayout.upper()} "
                 + f"-transC {str(self.transC).lower()} -transO {str(self.transO).lower()} "
                 + f"-n {self.n} -c {self.c} -H {self.hi} -W {self.wi} -k {self.k} "
                 + f"-y {self.y} -x {self.x} -p {self.paddingH} -q {self.paddingW} "
@@ -1458,9 +1457,12 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
         print(f"Running rocBLAS benchmark {config!r}")
         profilerCommand = [paths.mlir_paths.rocblas_benchmark_driver_path] + \
             benchmarkArgs.split()
-        outs,errs = runPipeline([profilerCommand])
-        milliSeconds = getMilliseconds(outs)
-        nanoSeconds = milliSeconds*1e6
+        outs, noerr = runPipeline([profilerCommand])
+        nanoSeconds = np.nan
+        if noerr:
+            milliSeconds = getMilliseconds(outs)
+            nanoSeconds = milliSeconds*1e6
+            
         return config.tableEntry(nanoSeconds)
 
 class CKGemmConfig(GemmConfiguration):
@@ -1479,9 +1481,12 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
 
         profilerCommand = [paths.mlir_paths.ck_gemm_benchmark_driver_path] + \
             benchmarkArgs.split()
-        outs,errs = runPipeline([profilerCommand])
-        milliSeconds = getMilliseconds(outs)
-        nanoSeconds = milliSeconds*1e6
+        outs, noerr = runPipeline([profilerCommand])
+        nanoSeconds = np.nan
+        if noerr:
+            milliSeconds = getMilliseconds(outs)
+            nanoSeconds = milliSeconds*1e6
+
         return config.tableEntry(nanoSeconds)
 
 def runConfigWithMLIR(config: PerfConfiguration, paths: Paths, arch, rocmlir_gen_flags, debug=True):
@@ -1496,7 +1501,12 @@ def runConfigWithMLIR(config: PerfConfiguration, paths: Paths, arch, rocmlir_gen
     mlir_cpu_runner_args = [f'--shared-libs={paths.mlir_paths.libmlir_rocm_runtime_path},{paths.mlir_paths.libconv_validation_wrappers_path},{paths.mlir_paths.libmlir_runtime_utils_path},{paths.mlir_paths.libmlir_c_runner_utils_path}', '--entry-point-result=void']
     profilerCommand = [ROCPROF] + getMetricArgsForRocprof(arch) + ['--kernel-trace', '--stats', '-o', BENCHMARKING_RESULT_FILE_NAME, '--' ,paths.mlir_paths.cpu_runner_path] + mlir_cpu_runner_args
 
-    runPipeline([rocmlirGenCommand.split(), rocmlirDriverCommand, profilerCommand])
+    outs, noerr = runPipeline([rocmlirGenCommand.split(), rocmlirDriverCommand, profilerCommand])
+    nanoSeconds = np.nan
+    if noerr:
+        nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
+
+    return nanoSeconds
 
 # Benchmarking function.
 def benchmarkMLIR(commandLine, confClass, paths: Paths, arch, numCU, tuningDb: MaybeTuningDb, rocmlir_gen_flags):
@@ -1508,9 +1518,7 @@ def benchmarkMLIR(commandLine, confClass, paths: Paths, arch, numCU, tuningDb: M
         else: # Tuning DB present but doesn't contain config, return N/A
             return config.tableEntry(np.nan)
 
-    runConfigWithMLIR(config, paths, arch, rocmlir_gen_flags)
-    # get nanoseconds from rocprof output.
-    nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
+    nanoSeconds = runConfigWithMLIR(config, paths, arch, rocmlir_gen_flags)
     return config.tableEntry(nanoSeconds)
 
 #Generate MLIR vs. MIOpen or rocBLAS performance results
@@ -1682,7 +1690,12 @@ def runFusionKernel(filename, rocmlirGenArgs, paths: Paths):
     mlir_cpu_runner_args = [f'--shared-libs={paths.mlir_paths.libmlir_rocm_runtime_path},{paths.mlir_paths.libconv_validation_wrappers_path},{paths.mlir_paths.libmlir_runtime_utils_path},{paths.mlir_paths.libmlir_c_runner_utils_path}', '--entry-point-result=void']
     profilerCommand = [ROCPROF] + getMetricArgsForRocprof(chip) + ['--kernel-trace', '--stats', '-o', BENCHMARKING_RESULT_FILE_NAME] + ['--', paths.mlir_paths.cpu_runner_path] + mlir_cpu_runner_args
     commands.append(profilerCommand)
-    runPipeline(commands)
+    outs, noerr = runPipeline(commands)
+    nanoSeconds = np.nan
+    if noerr:
+        nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
+
+    return nanoSeconds
 
 # Generate fusion vs. gemm/conv performance results
 def benchmarkFusionKernels(test_dir, paths: Paths, arch, numCU, tuningDb: MaybeTuningDb):
@@ -1747,18 +1760,14 @@ def benchmarkFusionKernels(test_dir, paths: Paths, arch, numCU, tuningDb: MaybeT
 
         # Run fusion test
         rocmlirGenArgs = ['-ph', '-fut='+futName+'_wrapper', '--perf_config='+bestPerf, '-']
-        runFusionKernel(filename, rocmlirGenArgs, paths)
-        # Get nanoseconds of fusion test
-        nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
+        nanoSeconds = runFusionKernel(filename, rocmlirGenArgs, paths)
         oneEntry = config.tableEntry(nanoSeconds)
         # Keep the best performance
         if testVector in perfResults and oneEntry['TFlops'] <= perfResults[testVector]['TFlops']:
             continue
 
         # Run gemm or conv op with the same configuration
-        runConfigWithMLIR(config, paths, arch, '')
-        # Get nanoseconds of gemm/conv
-        nanoSeconds = getNanoSeconds(getProfilerOutputPath(arch, BENCHMARKING_STATS_FILE_NAME))
+        nanoSeconds = runConfigWithMLIR(config, paths, arch, '')
         oneEntry['MLIR TFlops'] = config.computeTFlops(nanoSeconds)
         oneEntry['Fusion/MLIR'] = oneEntry['TFlops']/oneEntry['MLIR TFlops']
         oneEntry['FileName'] = filename