1- // RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow),cse,canonicalize)" %s | FileCheck %s
1+ // RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow{fission-multi-trip}),cse,canonicalize)" %s | FileCheck %s --check-prefixes=CHECK-ALL,MULTI
2+ // RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow),cse)" %s | FileCheck %s --check-prefixes=CHECK-ALL,SINGLE
23
3- // CHECK-LABEL: @fission_global_read_to_private_write
4- // CHECK-SAME: %[[ARG0:.*]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>
5- // CHECK-SAME: %[[ARG1:.*]]: index
6- // CHECK-SAME: %[[ARG2:.*]]: i1
7- // CHECK-SAME: %[[ARG3:.*]]: vector<1x1x1x8xbf16>
8- // CHECK-SAME: %[[ARG4:.*]]: memref<1x1x1x8xbf16, #gpu.address_space<private>>
4+ // CHECK-ALL-LABEL: @fission_global_read_to_private_write
5+ // CHECK-ALL-SAME: %[[ARG0:.*]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>
6+ // CHECK-ALL-SAME: %[[ARG1:.*]]: index
7+ // CHECK-ALL-SAME: %[[ARG2:.*]]: i1
8+ // CHECK-ALL-SAME: %[[ARG3:.*]]: vector<1x1x1x8xbf16>
9+ // CHECK-ALL-SAME: %[[ARG4:.*]]: memref<1x1x1x8xbf16, #gpu.address_space<private>>
910func.func @fission_global_read_to_private_write(%arg0: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %arg1: index, %arg2: i1, %arg3: vector<1x1x1x8xbf16>, %arg4: memref<1x1x1x8xbf16, #gpu.address_space<private>>) {
1011 %c0 = arith.constant 0 : index
1112 %c1 = arith.constant 1 : index
@@ -17,23 +18,26 @@ func.func @fission_global_read_to_private_write(%arg0: memref<1x?x?x8xbf16, #amd
1718 }
1819 return
1920}
20- // CHECK: %[[ALLOCA:.*]] = memref.alloca(%[[ARG1]])
21- // CHECK: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
22- // CHECK: %[[read:.*]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
23- // CHECK: vector.transfer_write %[[read]], %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
24- // CHECK: }
25- // CHECK: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
26- // CHECK: %[[read:.*]] = vector.transfer_read %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
27- // CHECK: %[[select:.*]] = arith.select %[[ARG2]], %[[read]], %[[ARG3]]
28- // CHECK: vector.transfer_write %[[select]], %arg4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
29- // CHECK: }
21+ // MULTI: %[[ALLOCA:.*]] = memref.alloca(%[[ARG1]])
22+ // MULTI: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
23+ // MULTI: %[[read:.*]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
24+ // MULTI: vector.transfer_write %[[read]], %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
25+ // MULTI: }
26+ // MULTI: scf.for %[[ITER:.*]] = %c0 to %[[ARG1]] step %c1 {
27+ // MULTI: %[[read:.*]] = vector.transfer_read %[[ALLOCA]][%[[ITER]], %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]}
28+ // MULTI: %[[select:.*]] = arith.select %[[ARG2]], %[[read]], %[[ARG3]]
29+ // MULTI: vector.transfer_write %[[select]], %arg4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]}
30+ // MULTI: }
31+
32+ // SINGLE: scf.for
33+ // SINGLE-NOT: scf.for
3034
3135// -----
3236
33- // CHECK-LABEL: @fission_global_read_to_workgroup_write
34- // CHECK-SAME: %[[ARG0:.*]]: index
35- // CHECK-SAME: %[[ARG1:.*]]: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
36- // CHECK-SAME: %[[ARG2:.*]]: memref<1x4xf32, #gpu.address_space<workgroup>>
37+ // CHECK-ALL-LABEL: @fission_global_read_to_workgroup_write
38+ // CHECK-ALL-SAME: %[[ARG0:.*]]: index
39+ // CHECK-ALL-SAME: %[[ARG1:.*]]: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
40+ // CHECK-ALL-SAME: %[[ARG2:.*]]: memref<1x4xf32, #gpu.address_space<workgroup>>
3741func.func @fission_global_read_to_workgroup_write(%arg0: index, %arg1: memref<?x?xf32, #amdgpu.address_space<fat_raw_buffer>>, %arg2: memref<1x4xf32, #gpu.address_space<workgroup>>) {
3842 %c0 = arith.constant 0 : index
3943 %c16 = arith.constant 16 : index
@@ -45,28 +49,31 @@ func.func @fission_global_read_to_workgroup_write(%arg0: index, %arg1: memref<?x
4549 }
4650 return
4751}
48- // CHECK: %[[SUB:.*]] = arith.subi %c16, %[[ARG0]]
49- // CHECK: %[[DIV:.*]] = arith.ceildivui %[[SUB]], %c128
50- // CHECK: %[[ALLOCA:.*]] = memref.alloca(%[[DIV]])
51- // CHECK: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
52- // CHECK: %[[READ:.*]] = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]}
53- // CHECK: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
54- // CHECK: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
55- // CHECK: vector.transfer_write %[[READ]], %[[ALLOCA]][%[[DIV]], %c0, %c0] {in_bounds = [true, true]}
56- // CHECK: }
57- // CHECK: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
58- // CHECK: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
59- // CHECK: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
60- // CHECK: %[[READ:.*]] = vector.transfer_read %[[ALLOCA]][%[[DIV]], %c0, %c0], %cst {in_bounds = [true, true]}
61- // CHECK: vector.transfer_write %[[READ]], %arg2[%c0, %c0] {in_bounds = [true, true]}
62- // CHECK: }
52+ // MULTI: %[[SUB:.*]] = arith.subi %c16, %[[ARG0]]
53+ // MULTI: %[[DIV:.*]] = arith.ceildivui %[[SUB]], %c128
54+ // MULTI: %[[ALLOCA:.*]] = memref.alloca(%[[DIV]])
55+ // MULTI: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
56+ // MULTI: %[[READ:.*]] = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]}
57+ // MULTI: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
58+ // MULTI: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
59+ // MULTI: vector.transfer_write %[[READ]], %[[ALLOCA]][%[[DIV]], %c0, %c0] {in_bounds = [true, true]}
60+ // MULTI: }
61+ // MULTI: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
62+ // MULTI: %[[SUB:.*]] = arith.subi %[[ITER]], %[[ARG0]]
63+ // MULTI: %[[DIV:.*]] = arith.divui %[[SUB]], %c128
64+ // MULTI: %[[READ:.*]] = vector.transfer_read %[[ALLOCA]][%[[DIV]], %c0, %c0], %cst {in_bounds = [true, true]}
65+ // MULTI: vector.transfer_write %[[READ]], %arg2[%c0, %c0] {in_bounds = [true, true]}
66+ // MULTI: }
67+
68+ // SINGLE: scf.for
69+ // SINGLE-NOT: scf.for
6370
6471// -----
6572
66- // CHECK-LABEL: @no_fission_global_read_to_global_write
67- // CHECK-SAME: %[[ARG0:.*]]: memref<1x?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
68- // CHECK-SAME: %[[ARG1:.*]]: memref<1x?x?xf32, #gpu.address_space<global>>
69- // CHECK-SAME: %[[ARG2:.*]]: index
73+ // CHECK-ALL-LABEL: @no_fission_global_read_to_global_write
74+ // CHECK-ALL-SAME: %[[ARG0:.*]]: memref<1x?x?xf32, #amdgpu.address_space<fat_raw_buffer>>
75+ // CHECK-ALL-SAME: %[[ARG1:.*]]: memref<1x?x?xf32, #gpu.address_space<global>>
76+ // CHECK-ALL-SAME: %[[ARG2:.*]]: index
7077func.func @no_fission_global_read_to_global_write(%arg0: memref<1x?x?xf32, #amdgpu.address_space<fat_raw_buffer>>, %arg1: memref<1x?x?xf32, #gpu.address_space<global>>, %arg2: index) {
7178 %c0 = arith.constant 0 : index
7279 %c1 = arith.constant 1 : index
@@ -77,8 +84,28 @@ func.func @no_fission_global_read_to_global_write(%arg0: memref<1x?x?xf32, #amdg
7784 }
7885 return
7986}
80- // CHECK: scf.for %[[ITER:.*]] = %c0 to %[[ARG2]] step %c1 {
81- // CHECK: %[[READ:.*]] = vector.transfer_read
82- // CHECK: vector.transfer_write %[[READ]], %arg1[%[[ITER]], %c0, %c0] {in_bounds = [true, true, true]}
83- // CHECK: }
84- // CHECK-NOT: scf.for
87+ // MULTI: scf.for %[[ITER:.*]] = %c0 to %[[ARG2]] step %c1 {
88+ // MULTI: %[[READ:.*]] = vector.transfer_read
89+ // MULTI: vector.transfer_write %[[READ]], %arg1[%[[ITER]], %c0, %c0] {in_bounds = [true, true, true]}
90+ // MULTI: }
91+ // MULTI-NOT: scf.for
92+
93+ // SINGLE: scf.for
94+ // SINGLE-NOT: scf.for
95+
96+ // -----
97+
98+ // CHECK-ALL-LABEL: @fission_unit_trip
99+ func.func @fission_unit_trip(%arg0: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %arg1: index, %arg2: i1, %arg3: vector<1x1x1x8xbf16>, %arg4: memref<1x1x1x8xbf16, #gpu.address_space<private>>) {
100+ %c0 = arith.constant 0 : index
101+ %c1 = arith.constant 1 : index
102+ %cst = arith.constant 0.000000e+00 : bf16
103+ %ub = affine.min affine_map<(d0) -> (1, d0)>(%arg1)
104+ scf.for %arg5 = %c0 to %ub step %c1 {
105+ %read = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, vector<1x1x1x8xbf16>
106+ %select = arith.select %arg2, %read, %arg3 : vector<1x1x1x8xbf16>
107+ vector.transfer_write %select, %arg4[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xbf16>, memref<1x1x1x8xbf16, #gpu.address_space<private>>
108+ }
109+ return
110+ }
111+ // CHECK-ALL-COUNT-2: scf.for
0 commit comments