@@ -62,111 +62,3 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return
   }
 }
-
-// -----
-
-#blocked1 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
-  // CHECK-LABEL: atomic_add_f16
-  tt.func @atomic_add_f16(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: tensor<256xi1, #blocked1>, %arg2: tensor<256xf16, #blocked1>) {
-    %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked1>
-    %base_ptr = tt.splat %arg0 : !tt.ptr<f16> -> tensor<256x!tt.ptr<f16>, #blocked1>
-    %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr<f16>, #blocked1>, tensor<256xi32, #blocked1>
-    // CHECK: llvm.cond_br
-    // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xf16>
-    %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr<f16>, #blocked1>, tensor<256xf16, #blocked1>, tensor<256xi1, #blocked1>) -> tensor<256xf16, #blocked1>
-    tt.return
-  }
-}
-
-// -----
-
-#blocked2 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
-  // CHECK-LABEL: atomic_add_bf16
-  tt.func @atomic_add_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: tensor<256xi1, #blocked2>, %arg2: tensor<256xbf16, #blocked2>) {
-    %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked2>
-    %base_ptr = tt.splat %arg0 : !tt.ptr<bf16> -> tensor<256x!tt.ptr<bf16>, #blocked2>
-    %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr<bf16>, #blocked2>, tensor<256xi32, #blocked2>
-    // CHECK: llvm.cond_br
-    // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xbf16>
-    %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr<bf16>, #blocked2>, tensor<256xbf16, #blocked2>, tensor<256xi1, #blocked2>) -> tensor<256xbf16, #blocked2>
-    tt.return
-  }
-}
-
-// -----
-
-#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
-  // CHECK-LABEL: reduce_dpp_max
-  tt.func @reduce_dpp_max(%arg0: tensor<64xf32, #blocked3>) {
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 280, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 276, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 274, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 273, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 322, 10, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 323, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK: llvm.amdgcn.readlane
-    %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %1 : f32
-    }) : (tensor<64xf32, #blocked3>) -> f32
-    tt.return
-  }
-}
-
-// -----
-
-#blocked4 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
-  // CHECK-LABEL: reduce_xor_max
-  tt.func @reduce_xor_max(%arg0: tensor<32xf32, #blocked4>) {
-    // CHECK: rocdl.ds_swizzle
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 280, 15, 12, false : i32
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 264, 15, 3, false : i32
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 276, 15, 10, false : i32
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 260, 15, 5, false : i32
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 78, 15, 15, false : i32
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 177, 15, 15, false : i32
-    %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %1 : f32
-    }) : (tensor<32xf32, #blocked4>) -> f32
-    tt.return
-  }
-}