// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s

// CHECK-LABEL: llvm
// Static-shape f16 matmul (B transposed); DLTI attribute supplies the GPU
// tile size the pipeline tiles by.
module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
  func.func @matmul_f16(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf16>
    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf16>
    %2 = tensor.empty() : tensor<4096x4096xf16>
    // Zero-initialize the accumulator before the matmul.
    %cst = arith.constant 0.000000e+00 : f16
    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf16>, tensor<4096x4096xf16>) outs(%3 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf16>, memref<4096x4096xf16>) -> ()
    return
  }
}

// -----
// CHECK-LABEL: llvm
// Corner case: 521 is not a multiple of the 32-element tile size, exercising
// the pipeline's handling of partial/boundary tiles.
module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
  func.func @corner_shape_matmul_f16(%arg0: memref<521x521xf16>, %arg1: memref<521x521xf16>, %arg2: memref<521x521xf16>) {
    %0 = bufferization.to_tensor %arg0 restrict : memref<521x521xf16>
    %1 = bufferization.to_tensor %arg1 restrict : memref<521x521xf16>
    %2 = tensor.empty() : tensor<521x521xf16>
    // Zero-initialize the accumulator before the matmul.
    %cst = arith.constant 0.000000e+00 : f16
    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<521x521xf16>) -> tensor<521x521xf16>
    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<521x521xf16>, tensor<521x521xf16>) outs(%3 : tensor<521x521xf16>) -> tensor<521x521xf16>
    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<521x521xf16>, memref<521x521xf16>) -> ()
    return
  }
}

// -----
// CHECK-LABEL: llvm
// Dynamic-shape variant: the LHS has unknown dims, so the output's leading
// dim is taken from tensor.dim at runtime.
module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
  func.func @dynamic_matmul_f16(%arg0: memref<?x?xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<?x1024xf16>) {
    %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf16>
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %0, %c0 : tensor<?x?xf16>
    %c1 = arith.constant 1 : index
    %dim_0 = tensor.dim %0, %c1 : tensor<?x?xf16>
    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
    // Output rows follow the dynamic row count of the LHS.
    %2 = tensor.empty(%dim) : tensor<?x1024xf16>
    %cst = arith.constant 0.000000e+00 : f16
    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<?x?xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<?x1024xf16>, memref<?x1024xf16>) -> ()
    return
  }
}

// -----
// CHECK-LABEL: llvm
// Same static-shape matmul pattern with bf16 element type.
module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
  func.func @matmul_bf16(%arg0: memref<4096x4096xbf16>, %arg1: memref<4096x4096xbf16>, %arg2: memref<4096x4096xbf16>) {
    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xbf16>
    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xbf16>
    %2 = tensor.empty() : tensor<4096x4096xbf16>
    // Zero-initialize the accumulator before the matmul.
    %cst = arith.constant 0.000000e+00 : bf16
    %3 = linalg.fill ins(%cst : bf16) outs(%2 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xbf16>, tensor<4096x4096xbf16>) outs(%3 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xbf16>, memref<4096x4096xbf16>) -> ()
    return
  }
}

// -----
// CHECK-LABEL: llvm
// Same static-shape matmul pattern with f32 element type.
module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
  func.func @matmul_f32(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf32>
    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf32>
    %2 = tensor.empty() : tensor<4096x4096xf32>
    // Zero-initialize the accumulator before the matmul.
    %cst = arith.constant 0.000000e+00 : f32
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf32>, tensor<4096x4096xf32>) outs(%3 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf32>, memref<4096x4096xf32>) -> ()
    return
  }
}