Commit 6b2df26

[Test] add xla perf test, adjust script path (#632)
1 parent: ce451a6

File tree: 11 files changed, +157 −6 lines

benchmarks/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@ add_subdirectory(relu)
 add_subdirectory(silu)
 add_subdirectory(softmax)
 add_subdirectory(transpose)
+add_subdirectory(reduce)
+add_subdirectory(kLoopFusion)
+add_subdirectory(kInputFusion)

 if(WIN32)
   set(MLIR_RUNNER_UTILS_DIR ${LLVM_BINARY_DIR}/bin)

benchmarks/bench_imex.in

Lines changed: 6 additions & 6 deletions
@@ -11,12 +11,12 @@ case "${platform}" in
   *) echo "UNKNOWN platform:${platform}" && exit -1
 esac

-MLIR_RUNNER_UTILS=/home/cchen/.local/lib/libmlir_runner_utils.so
-MLIR_C_RUNNER_UTILS=/home/cchen/.local/lib/libmlir_c_runner_utils.so
-IMEX_SYCL_RUNTIME=/home/cchen/imex/frameworks.ai.mlir.mlir-extensions/build/lib/libsycl-runtime.so
-IMEX_L0_RUNTIME=/home/cchen/imex/frameworks.ai.mlir.mlir-extensions/build/lib/liblevel-zero-runtime.so
-BENCHMARK_ROOT=/home/cchen/imex/frameworks.ai.mlir.mlir-extensions/build/benchmarks
-IMEX_RUNNER=/home/cchen/imex/frameworks.ai.mlir.mlir-extensions/build/bin/imex-runner.py
+MLIR_RUNNER_UTILS=@LLVM_LIBRARY_DIR@/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS=@LLVM_LIBRARY_DIR@/libmlir_c_runner_utils.so
+IMEX_SYCL_RUNTIME=@IMEX_LIB_DIR@/libsycl-runtime.so
+IMEX_L0_RUNTIME=@IMEX_LIB_DIR@/liblevel-zero-runtime.so
+BENCHMARK_ROOT=@IMEX_BINARY_DIR@/benchmarks
+IMEX_RUNNER=@IMEX_BINARY_DIR@/bin/imex-runner.py

 # -l: using level-zero runtime
 # -s: using sycl runtime
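
The six deleted assignments pointed into one developer's home directory; the replacement @LLVM_LIBRARY_DIR@, @IMEX_LIB_DIR@ and @IMEX_BINARY_DIR@ tokens are CMake configure-time placeholders, so the script can be generated for any build tree. A minimal sketch of how such a template is expanded (the set() value below is an assumption for illustration, not part of this commit):

# Sketch only: expand bench_imex.in into the build tree, substituting the @...@ placeholders.
set(IMEX_LIB_DIR "${IMEX_BINARY_DIR}/lib")   # assumed location of the runtime libraries
configure_file(bench_imex.in ${IMEX_BINARY_DIR}/benchmarks/bench_imex @ONLY)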

benchmarks/kInputFusion/CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+file(STRINGS kInputFusion.dtypes.in test_dtypes)
+list(APPEND test_shapes "512x1024")
+
+foreach(shape ${test_shapes})
+  foreach(dtype ${test_dtypes})
+    configure_file(kInputFusion.mlir.in ${IMEX_BINARY_DIR}/benchmarks/kInputFusion/kInputFusion_${shape}_${dtype}.mlir @ONLY)
+  endforeach()
+endforeach()
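
With the single "512x1024" shape and the three dtypes read from kInputFusion.dtypes.in, the loop above generates kInputFusion_512x1024_f32.mlir, kInputFusion_512x1024_f16.mlir and kInputFusion_512x1024_bf16.mlir under ${IMEX_BINARY_DIR}/benchmarks/kInputFusion/. Only @dtype@ is a placeholder in the template shown below; the shape is baked into the template and appears only in the generated file name. A hypothetical extension of the pattern (the extra shape is an example, not in this commit):

# Extra dtypes would go into kInputFusion.dtypes.in (one per line); extra shapes here.
list(APPEND test_shapes "1024x2048")   # changes only the output file name for this template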

benchmarks/kInputFusion/kInputFusion.dtypes.in

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+f32
+f16
+bf16

benchmarks/kInputFusion/kInputFusion.mlir.in

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+#map2 = affine_map<(d0, d1) -> (d1, d0)>
+#map3 = affine_map<(d0, d1) -> (d0)>
+module {
+  func.func @fusion(%arg4: tensor<512x1024x@dtype@>, %arg5: tensor<512x1024x@dtype@>) -> (tensor<1024x@dtype@> {lmhlo.written}) {
+    %0 = tensor.empty() : tensor<512x1024x@dtype@>
+    %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg5 : tensor<512x1024x@dtype@>) outs(%0 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %out: @dtype@):
+      %9 = math.log %in : @dtype@
+      linalg.yield %9 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    %2 = tensor.empty() : tensor<512x1024x@dtype@>
+    %3 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg4 : tensor<512x1024x@dtype@>) outs(%2 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %out: @dtype@):
+      %9 = math.absf %in : @dtype@
+      linalg.yield %9 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    %4 = tensor.empty() : tensor<512x1024x@dtype@>
+    %5 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%1, %3 : tensor<512x1024x@dtype@>, tensor<512x1024x@dtype@>) outs(%4 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %in_7: @dtype@, %out: @dtype@):
+      %9 = arith.addf %in, %in_7 : @dtype@
+      linalg.yield %9 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    %cst = arith.constant dense<0.000000e+00> : tensor<@dtype@>
+    %cst_6 = arith.constant 0.000000e+00 : @dtype@
+    %6 = tensor.empty() : tensor<1024x@dtype@>
+    %7 = linalg.fill ins(%cst_6 : @dtype@) outs(%6 : tensor<1024x@dtype@>) -> tensor<1024x@dtype@>
+    %8 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "reduction"]} ins(%5 : tensor<512x1024x@dtype@>) outs(%7 : tensor<1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %out: @dtype@):
+      %from_elements = tensor.from_elements %out : tensor<@dtype@>
+      %from_elements_7 = tensor.from_elements %in : tensor<@dtype@>
+      %extracted = tensor.extract %from_elements[] : tensor<@dtype@>
+      %extracted_8 = tensor.extract %from_elements_7[] : tensor<@dtype@>
+      %9 = arith.addf %extracted, %extracted_8 : @dtype@
+      %from_elements_9 = tensor.from_elements %9 : tensor<@dtype@>
+      %extracted_10 = tensor.extract %from_elements_9[] : tensor<@dtype@>
+      linalg.yield %extracted_10 : @dtype@
+    } -> tensor<1024x@dtype@>
+    return %8 : tensor<1024x@dtype@>
+    //%expanded = tensor.expand_shape %8 [[0, 1]] : tensor<1024x@dtype@> into tensor<1x1024x@dtype@>
+    //return %expanded : tensor<1x1024x@dtype@>
+  }
+  func.func @main() {
+    %0 = arith.constant dense<3.3> : tensor<512x1024x@dtype@>
+    %1 = arith.constant dense<1.0> : tensor<512x1024x@dtype@>
+    %2 = call @fusion(%0, %1) : (tensor<512x1024x@dtype@>, tensor<512x1024x@dtype@>) -> tensor<1024x@dtype@>
+    return
+  }
+}
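
Read as a formula, @fusion above is an element-wise stage followed by a reduction: with a = %arg4 and b = %arg5, both 512x1024, it computes roughly

    out[j] = sum over i in [0, 512) of ( log(b[i][j]) + |a[i][j]| ),   j in [0, 1024)

The iterator_types make d1 the reduction iterator, and #map2 routes it over the 512-row axis of the input while #map3 keeps only d0 for the 1024-element output. The tensor.from_elements / tensor.extract round-trips in the reduction body appear to be leftovers of the XLA/mhlo lowering this test was captured from; they are effectively a plain scalar arith.addf accumulation.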

benchmarks/kLoopFusion/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+file(STRINGS kLoopFusion.dtypes.in test_dtypes)
+list(APPEND test_shapes "512x1024")
+
+foreach(shape ${test_shapes})
+  foreach(dtype ${test_dtypes})
+    configure_file(kLoopFusion.mlir.in ${IMEX_BINARY_DIR}/benchmarks/kLoopFusion/kLoopFusion_${shape}_${dtype}.mlir @ONLY)
+  endforeach()
+endforeach()

benchmarks/kLoopFusion/kLoopFusion.dtypes.in

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+f32
+f16
+bf16

benchmarks/kLoopFusion/kLoopFusion.mlir.in

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @fusion(%arg3: tensor<512x1024x@dtype@>, %arg4: tensor<512x1024x@dtype@>) -> (tensor<512x1024x@dtype@> {lmhlo.written}) {
+    %0 = tensor.empty() : tensor<512x1024x@dtype@>
+    %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<512x1024x@dtype@>) outs(%0 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %out: @dtype@):
+      %8 = math.log %in : @dtype@
+      linalg.yield %8 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    %2 = tensor.empty() : tensor<512x1024x@dtype@>
+    %3 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg4 : tensor<512x1024x@dtype@>) outs(%2 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %out: @dtype@):
+      %8 = math.absf %in : @dtype@
+      linalg.yield %8 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    %4 = tensor.empty() : tensor<512x1024x@dtype@>
+    %5 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%1, %3 : tensor<512x1024x@dtype@>, tensor<512x1024x@dtype@>) outs(%4 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %in_4: @dtype@, %out: @dtype@):
+      %8 = arith.addf %in, %in_4 : @dtype@
+      linalg.yield %8 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    %6 = tensor.empty() : tensor<512x1024x@dtype@>
+    %7 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%5, %arg3 : tensor<512x1024x@dtype@>, tensor<512x1024x@dtype@>) outs(%6 : tensor<512x1024x@dtype@>) {
+    ^bb0(%in: @dtype@, %in_4: @dtype@, %out: @dtype@):
+      %8 = arith.subf %in, %in_4 : @dtype@
+      linalg.yield %8 : @dtype@
+    } -> tensor<512x1024x@dtype@>
+    return %7 : tensor<512x1024x@dtype@>
+  }
+  func.func @main() {
+    %0 = arith.constant dense<3.3> : tensor<512x1024x@dtype@>
+    %1 = arith.constant dense<1.0> : tensor<512x1024x@dtype@>
+    %2 = call @fusion(%0, %1) : (tensor<512x1024x@dtype@>, tensor<512x1024x@dtype@>) -> tensor<512x1024x@dtype@>
+    return
+  }
+}
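
kLoopFusion chains four element-wise linalg.generic ops over the same 512x1024 shape; with a = %arg3 and b = %arg4 it computes, per element,

    out[i][j] = ( log(a[i][j]) + |b[i][j]| ) - a[i][j]

Unlike kInputFusion there is no reduction: the output keeps the input shape, so this case measures pure element-wise (loop) fusion, while kInputFusion additionally covers a reduction-rooted (input) fusion pattern.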

benchmarks/reduce/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+file(STRINGS reduce.dtypes.in test_dtypes)
+list(APPEND test_shapes "1x16x512x512")
+
+foreach(shape ${test_shapes})
+  foreach(dtype ${test_dtypes})
+    configure_file(reduce.mlir.in ${IMEX_BINARY_DIR}/benchmarks/reduce/reduce_${shape}_${dtype}.mlir @ONLY)
+  endforeach()
+endforeach()

benchmarks/reduce/reduce.dtypes.in

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+f32
+f16
+bf16
