4242OUTPUT_DATA_TYPES_MAP = {'f32' : 'f32' , 'f16' : 'f16' , 'bf16' : 'bf16' , 'i8' : 'i32' , 'fp8' :'f32' ,
4343 'fp8_fp8' : 'f32' , 'fp8_bf8' : 'f32' , 'bf8_fp8' : 'f32' ,
4444 'bf8_bf8' : 'f32' }
45- MLIR_N_REPEATS = 5
46-
47- MLIR_FILTER_LAYOUTS = {"NCHW" : "kcyx" , "NCHWG" : "kcyxg" , "NHWC" : "kyxc" , "NHWCG" : "kyxcg" ,
48- "NC01" : "kc01" , "NC01G" : "kc01g" , "N01C" : "k01c" , "N01CG" : "k01cg" ,
49- "GNC01" :"gkc01" , "GN01C" :"gk01c" }
50- MLIR_OUTPUT_LAYOUTS = {"NCHW" : "nkhw" , "NCHWG" : "nkhwg" , "NHWC" : "nhwk" , "NHWCG" : "nhwkg" ,
51- "NC01" : "nk01" , "NC01G" : "nk01g" , "N01C" : "n01k" , "N01CG" : "n01kg" ,
52- "NGC01" :"ngk01" , "N01GC" : "n01gk" }
53- INVERSE_FILTER_LAYOUTS = {v : k for k , v in MLIR_FILTER_LAYOUTS .items ()}
54- INVERSE_OUTPUT_LAYOUTS = {v : k for k , v in MLIR_OUTPUT_LAYOUTS .items ()}
45+ MLIR_N_REPEATS = 100
5546
5647FILTER_LAYOUT_MAP = {'N' :'k' , 'C' :'c' , 'H' :'0' , 'W' :'1' , 'G' :'g' }
5748INPUT_LAYOUT_MAP = {'N' :'n' , 'C' :'c' , 'H' :'0' , 'W' :'1' , 'G' :'g' }
6455INFO_ARCH_NAME = re .compile (r"Name:\s*(.*)" )
6556INFO_ARCH_CU = re .compile (r"Compute Unit:\s*(.*)" )
6657
def inverse_output_layouts(output_layout):
    """Convert a lower-case output-layout string to its upper-case layout name.

    The conversion is done character by character: ``n`` -> ``N``,
    ``k`` -> ``C``, ``h`` -> ``H``, ``w`` -> ``W``, ``g`` -> ``G``, while the
    positional digits ``0`` and ``1`` are kept as they are.  For example,
    ``"nkhw"`` becomes ``"NCHW"`` and ``"ngk01"`` becomes ``"NGC01"``.

    Args:
        output_layout: layout string made only of the characters listed above.

    Returns:
        The translated layout string, same length as the input.

    Raises:
        KeyError: if ``output_layout`` contains any other character.
    """
    # Named so that it does not shadow the builtin ``map``.
    dim_names = {"n": "N", "k": "C", "h": "H", "w": "W", "g": "G", "0": "0", "1": "1"}
    try:
        return "".join(dim_names[dim] for dim in output_layout)
    except KeyError as err:
        # Same exception type as a plain dict lookup, but say which layout was bad.
        raise KeyError(
            f"unknown dimension {err.args[0]!r} in output layout {output_layout!r}"
        ) from None
61+
def inverse_filter_layouts(filter_layout):
    """Convert a lower-case filter-layout string to its upper-case layout name.

    The conversion is done character by character: ``k`` -> ``N``,
    ``c`` -> ``C``, ``y`` -> ``H``, ``x`` -> ``W``, ``g`` -> ``G``, while the
    positional digits ``0`` and ``1`` are kept as they are.  For example,
    ``"kcyx"`` becomes ``"NCHW"`` and ``"gkc01"`` becomes ``"GNC01"``.

    Args:
        filter_layout: layout string made only of the characters listed above.

    Returns:
        The translated layout string, same length as the input.

    Raises:
        KeyError: if ``filter_layout`` contains any other character.
    """
    # Named so that it does not shadow the builtin ``map``.
    dim_names = {"k": "N", "c": "C", "y": "H", "x": "W", "g": "G", "0": "0", "1": "1"}
    try:
        return "".join(dim_names[dim] for dim in filter_layout)
    except KeyError as err:
        # Same exception type as a plain dict lookup, but say which layout was bad.
        raise KeyError(
            f"unknown dimension {err.args[0]!r} in filter layout {filter_layout!r}"
        ) from None
65+
6766@dataclass
6867class MLIRPaths :
6968 rocmlir_gen_path : str
@@ -285,15 +284,15 @@ def runPipeline(proc_specs):
285284 for p in procs :
286285 p .wait ()
287286 if p .returncode != 0 :
288- raise OSError (str (p .stderr ))
287+ raise OSError (str (p .stderr . read () ))
289288 outs , errs = p .communicate ()
290- return outs , errs
289+ return outs , True
291290 except Exception as err :
292291 print (f"Error: { err } " )
293292 print (f"Failing command: { ' ' .join (p .args )} " )
294293 print (f"Failing pipeline: { ' | ' .join ([' ' .join (proc ) for proc in proc_specs ])} " )
295294 outs , errs = p .communicate ()
296- return outs , errs
295+ return outs , False
297296
298297class PerfConfiguration :
299298 TABLE_COLUMNS = []
@@ -537,8 +536,8 @@ def fromCommandLine(cls, argv, arch, numCU):
537536 def toCommandLine (self ):
538537 return (f"conv{ {'f32' :'' , 'f16' :'fp16' , 'bf16' :'bfp16' , 'i8' :'int8' ,'fp8_fp8' :'fp8_fp8' , 'fp8' : 'fp8' }[self .dataType ]} "
539538 + f"-F { {'fwd' :1 , 'bwd' :2 , 'wrw' :4 }[self .direction ]} "
540- + f"-f { INVERSE_FILTER_LAYOUTS [ self .filterLayout ] } -I { self .inputLayout .upper ()} "
541- + f"-O { INVERSE_OUTPUT_LAYOUTS [ self .outputLayout ] } "
539+ + f"-f { inverse_filter_layouts ( self .filterLayout ) } -I { self .inputLayout .upper ()} "
540+ + f"-O { inverse_output_layouts ( self .outputLayout ) } "
542541 + f"-n { self .n } -c { self .c } -H { self .hi } -W { self .wi } -k { self .k } "
543542 + f"-y { self .y } -x { self .x } -p { self .paddingH } -q { self .paddingW } "
544543 + f"-u { self .convStrideH } -v { self .convStrideW } -l { self .dilationH } "
@@ -593,17 +592,17 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
593592 MIOpenDriverCommand = [MIOPENDRIVER , * commandLine , '-V' , '0' , '-t' , '1' ]
594593 print ("Running MIOpen Benchmark: " , ' ' .join (commandLine ))
595594 # invoke MIOpenDriver.
596- outs ,errs = runPipeline ([MIOpenDriverCommand ])
597- if len (errs ) == 0 :
595+ outs , noerr = runPipeline ([MIOpenDriverCommand ])
596+ nanoSeconds = np .nan
597+ if noerr :
598598 # convert bytes to str
599599 outs = outs .decode ('utf-8' )
600600 # Extract Elapsed time in ms from the output of MIOpenDriver
601601 # Use regular expression to match the contents between
602602 # "Elasped: " (note the space at the end) and "ms"
603603 elapsedTimeInMs = ELAPSED_TIME_RE .search (outs ).group (1 )
604604 nanoSeconds = float (elapsedTimeInMs )* 1.0e6
605- else :
606- nanoSeconds = np .nan
605+
607606 return config .tableEntry (nanoSeconds )
608607
609608def getGemmConfigurations (fileName , dataTypes = DATA_TYPES_GEMM , outDataTypeMap = OUTPUT_DATA_TYPES_MAP ):
@@ -1109,7 +1108,7 @@ def fromCommandLine(cls, argv, arch, numCU):
11091108
11101109 def toCommandLine (self ):
11111110 return (f"-t { self .dataType } "
1112- + f"-f { INVERSE_FILTER_LAYOUTS [ self .filterLayout ] } -I { self .inputLayout .upper ()} "
1111+ + f"-f { inverse_filter_layouts ( self .filterLayout ) } -I { self .inputLayout .upper ()} "
11131112 + f"-transC { str (self .transC ).lower ()} -transO { str (self .transO ).lower ()} "
11141113 + f"-n { self .n } -c { self .c } -H { self .hi } -W { self .wi } -k { self .k } "
11151114 + f"-y { self .y } -x { self .x } -p { self .paddingH } -q { self .paddingW } "
@@ -1458,9 +1457,12 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
14581457 print (f"Running rocBLAS benchmark { config !r} " )
14591458 profilerCommand = [paths .mlir_paths .rocblas_benchmark_driver_path ] + \
14601459 benchmarkArgs .split ()
1461- outs ,errs = runPipeline ([profilerCommand ])
1462- milliSeconds = getMilliseconds (outs )
1463- nanoSeconds = milliSeconds * 1e6
1460+ outs , noerr = runPipeline ([profilerCommand ])
1461+ nanoSeconds = np .nan
1462+ if noerr :
1463+ milliSeconds = getMilliseconds (outs )
1464+ nanoSeconds = milliSeconds * 1e6
1465+
14641466 return config .tableEntry (nanoSeconds )
14651467
14661468class CKGemmConfig (GemmConfiguration ):
@@ -1479,9 +1481,12 @@ def benchmarkExternal(cls, commandLine, paths: Paths, arch, numCU):
14791481
14801482 profilerCommand = [paths .mlir_paths .ck_gemm_benchmark_driver_path ] + \
14811483 benchmarkArgs .split ()
1482- outs ,errs = runPipeline ([profilerCommand ])
1483- milliSeconds = getMilliseconds (outs )
1484- nanoSeconds = milliSeconds * 1e6
1484+ outs , noerr = runPipeline ([profilerCommand ])
1485+ nanoSeconds = np .nan
1486+ if noerr :
1487+ milliSeconds = getMilliseconds (outs )
1488+ nanoSeconds = milliSeconds * 1e6
1489+
14851490 return config .tableEntry (nanoSeconds )
14861491
14871492def runConfigWithMLIR (config : PerfConfiguration , paths : Paths , arch , rocmlir_gen_flags , debug = True ):
@@ -1496,7 +1501,12 @@ def runConfigWithMLIR(config: PerfConfiguration, paths: Paths, arch, rocmlir_gen
14961501 mlir_cpu_runner_args = [f'--shared-libs={ paths .mlir_paths .libmlir_rocm_runtime_path } ,{ paths .mlir_paths .libconv_validation_wrappers_path } ,{ paths .mlir_paths .libmlir_runtime_utils_path } ,{ paths .mlir_paths .libmlir_c_runner_utils_path } ' , '--entry-point-result=void' ]
14971502 profilerCommand = [ROCPROF ] + getMetricArgsForRocprof (arch ) + ['--kernel-trace' , '--stats' , '-o' , BENCHMARKING_RESULT_FILE_NAME , '--' ,paths .mlir_paths .cpu_runner_path ] + mlir_cpu_runner_args
14981503
1499- runPipeline ([rocmlirGenCommand .split (), rocmlirDriverCommand , profilerCommand ])
1504+ outs , noerr = runPipeline ([rocmlirGenCommand .split (), rocmlirDriverCommand , profilerCommand ])
1505+ nanoSeconds = np .nan
1506+ if noerr :
1507+ nanoSeconds = getNanoSeconds (getProfilerOutputPath (arch , BENCHMARKING_STATS_FILE_NAME ))
1508+
1509+ return nanoSeconds
15001510
15011511# Benchmarking function.
15021512def benchmarkMLIR (commandLine , confClass , paths : Paths , arch , numCU , tuningDb : MaybeTuningDb , rocmlir_gen_flags ):
@@ -1508,9 +1518,7 @@ def benchmarkMLIR(commandLine, confClass, paths: Paths, arch, numCU, tuningDb: M
15081518 else : # Tuning DB present but doesn't contain config, return N/A
15091519 return config .tableEntry (np .nan )
15101520
1511- runConfigWithMLIR (config , paths , arch , rocmlir_gen_flags )
1512- # get nanoseconds from rocprof output.
1513- nanoSeconds = getNanoSeconds (getProfilerOutputPath (arch , BENCHMARKING_STATS_FILE_NAME ))
1521+ nanoSeconds = runConfigWithMLIR (config , paths , arch , rocmlir_gen_flags )
15141522 return config .tableEntry (nanoSeconds )
15151523
15161524#Generate MLIR vs. MIOpen or rocBLAS performance results
@@ -1682,7 +1690,12 @@ def runFusionKernel(filename, rocmlirGenArgs, paths: Paths):
16821690 mlir_cpu_runner_args = [f'--shared-libs={ paths .mlir_paths .libmlir_rocm_runtime_path } ,{ paths .mlir_paths .libconv_validation_wrappers_path } ,{ paths .mlir_paths .libmlir_runtime_utils_path } ,{ paths .mlir_paths .libmlir_c_runner_utils_path } ' , '--entry-point-result=void' ]
16831691 profilerCommand = [ROCPROF ] + getMetricArgsForRocprof (chip ) + ['--kernel-trace' , '--stats' , '-o' , BENCHMARKING_RESULT_FILE_NAME ] + ['--' , paths .mlir_paths .cpu_runner_path ] + mlir_cpu_runner_args
16841692 commands .append (profilerCommand )
1685- runPipeline (commands )
1693+ outs , noerr = runPipeline (commands )
1694+ nanoSeconds = np .nan
1695+ if noerr :
1696+ nanoSeconds = getNanoSeconds (getProfilerOutputPath (arch , BENCHMARKING_STATS_FILE_NAME ))
1697+
1698+ return nanoSeconds
16861699
16871700# Generate fusion vs. gemm/conv performance results
16881701def benchmarkFusionKernels (test_dir , paths : Paths , arch , numCU , tuningDb : MaybeTuningDb ):
@@ -1747,18 +1760,14 @@ def benchmarkFusionKernels(test_dir, paths: Paths, arch, numCU, tuningDb: MaybeT
17471760
17481761 # Run fusion test
17491762 rocmlirGenArgs = ['-ph' , '-fut=' + futName + '_wrapper' , '--perf_config=' + bestPerf , '-' ]
1750- runFusionKernel (filename , rocmlirGenArgs , paths )
1751- # Get nanoseconds of fusion test
1752- nanoSeconds = getNanoSeconds (getProfilerOutputPath (arch , BENCHMARKING_STATS_FILE_NAME ))
1763+ nanoSeconds = runFusionKernel (filename , rocmlirGenArgs , paths )
17531764 oneEntry = config .tableEntry (nanoSeconds )
17541765 # Keep the best performance
17551766 if testVector in perfResults and oneEntry ['TFlops' ] <= perfResults [testVector ]['TFlops' ]:
17561767 continue
17571768
17581769 # Run gemm or conv op with the same configuration
1759- runConfigWithMLIR (config , paths , arch , '' )
1760- # Get nanoseconds of gemm/conv
1761- nanoSeconds = getNanoSeconds (getProfilerOutputPath (arch , BENCHMARKING_STATS_FILE_NAME ))
1770+ nanoSeconds = runConfigWithMLIR (config , paths , arch , '' )
17621771 oneEntry ['MLIR TFlops' ] = config .computeTFlops (nanoSeconds )
17631772 oneEntry ['Fusion/MLIR' ] = oneEntry ['TFlops' ]/ oneEntry ['MLIR TFlops' ]
17641773 oneEntry ['FileName' ] = filename
0 commit comments