Skip to content

Commit a2b6602

Browse files
authored
[NVPTX] expand trunc/ext on v2i32 (#161715)
#153478 made v2i32 legal on newer GPUs, but we cannot lower all operations yet. Expand the `trunc`/`ext` operations until we implement efficient lowering.
1 parent 6048c2f commit a2b6602

File tree

2 files changed

+150
-0
lines changed

2 files changed

+150
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
638638
// No support for these operations with v2f32/v2i32
639639
setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
640640
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
641+
642+
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Expand);
643+
setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
644+
MVT::v2i32, Expand);
645+
641646
// Need custom lowering in case the index is dynamic.
642647
if (STI.hasF32x2Instructions())
643648
setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},

llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,5 +115,150 @@ define ptx_kernel void @inlineasm(ptr %p) {
115115
store <2 x float> %mul, ptr %p, align 8
116116
ret void
117117
}
118+
119+
define ptx_kernel void @trunc_v2i32(<2 x i32> %0) {
120+
; CHECK-SM90A-LABEL: trunc_v2i32(
121+
; CHECK-SM90A: {
122+
; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
123+
; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
124+
; CHECK-SM90A-EMPTY:
125+
; CHECK-SM90A-NEXT: // %bb.0:
126+
; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0];
127+
; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U;
128+
; CHECK-SM90A-NEXT: mov.b32 %r4, 0;
129+
; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U;
130+
; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
131+
; CHECK-SM90A-NEXT: mov.b64 %rd1, 0;
132+
; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6;
133+
; CHECK-SM90A-NEXT: ret;
134+
;
135+
; CHECK-SM100-LABEL: trunc_v2i32(
136+
; CHECK-SM100: {
137+
; CHECK-SM100-NEXT: .reg .b32 %r<7>;
138+
; CHECK-SM100-NEXT: .reg .b64 %rd<3>;
139+
; CHECK-SM100-EMPTY:
140+
; CHECK-SM100-NEXT: // %bb.0:
141+
; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0];
142+
; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1;
143+
; CHECK-SM100-NEXT: mov.b32 %r3, 0;
144+
; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U;
145+
; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U;
146+
; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U;
147+
; CHECK-SM100-NEXT: mov.b64 %rd2, 0;
148+
; CHECK-SM100-NEXT: st.b32 [%rd2], %r6;
149+
; CHECK-SM100-NEXT: ret;
150+
%2 = trunc <2 x i32> %0 to <2 x i8>
151+
%3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
152+
store <4 x i8> %3, ptr null, align 4
153+
ret void
154+
}
155+
156+
define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) {
157+
; CHECK-SM90A-LABEL: zextend_to_v2i32(
158+
; CHECK-SM90A: {
159+
; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
160+
; CHECK-SM90A-NEXT: .reg .b32 %r<4>;
161+
; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
162+
; CHECK-SM90A-EMPTY:
163+
; CHECK-SM90A-NEXT: // %bb.0:
164+
; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
165+
; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
166+
; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
167+
; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2;
168+
; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
169+
; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3;
170+
; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
171+
; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2;
172+
; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
173+
; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
174+
; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
175+
; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
176+
; CHECK-SM90A-NEXT: ret;
177+
;
178+
; CHECK-SM100-LABEL: zextend_to_v2i32(
179+
; CHECK-SM100: {
180+
; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
181+
; CHECK-SM100-NEXT: .reg .b32 %r<5>;
182+
; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
183+
; CHECK-SM100-EMPTY:
184+
; CHECK-SM100-NEXT: // %bb.0:
185+
; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
186+
; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
187+
; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
188+
; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1;
189+
; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2};
190+
; CHECK-SM100-NEXT: mov.b32 %r4, 0;
191+
; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4};
192+
; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
193+
; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
194+
; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
195+
; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
196+
; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
197+
; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
198+
; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
199+
; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
200+
; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
201+
; CHECK-SM100-NEXT: ret;
202+
%2 = zext <2 x i8> %0 to <2 x i32>
203+
%3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
204+
store <4 x i32> %3, ptr null, align 4
205+
ret void
206+
}
207+
208+
define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) {
209+
; CHECK-SM90A-LABEL: sextend_to_v2i32(
210+
; CHECK-SM90A: {
211+
; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
212+
; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
213+
; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
214+
; CHECK-SM90A-EMPTY:
215+
; CHECK-SM90A-NEXT: // %bb.0:
216+
; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
217+
; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
218+
; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
219+
; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2;
220+
; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2;
221+
; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4;
222+
; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
223+
; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5;
224+
; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
225+
; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3;
226+
; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
227+
; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
228+
; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
229+
; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
230+
; CHECK-SM90A-NEXT: ret;
231+
;
232+
; CHECK-SM100-LABEL: sextend_to_v2i32(
233+
; CHECK-SM100: {
234+
; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
235+
; CHECK-SM100-NEXT: .reg .b32 %r<7>;
236+
; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
237+
; CHECK-SM100-EMPTY:
238+
; CHECK-SM100-NEXT: // %bb.0:
239+
; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
240+
; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
241+
; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
242+
; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2;
243+
; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1;
244+
; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4;
245+
; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3};
246+
; CHECK-SM100-NEXT: mov.b32 %r6, 0;
247+
; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6};
248+
; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
249+
; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
250+
; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
251+
; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
252+
; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
253+
; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
254+
; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
255+
; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
256+
; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
257+
; CHECK-SM100-NEXT: ret;
258+
%2 = sext <2 x i8> %0 to <2 x i32>
259+
%3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
260+
store <4 x i32> %3, ptr null, align 4
261+
ret void
262+
}
118263
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
119264
; CHECK: {{.*}}

0 commit comments

Comments
 (0)