-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Create hi-half of 64-bit ashr with mov of -1 #146569
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=finalize-isel -o - %s | FileCheck %s | ||
|
|
||
| ;; Test that reduction of: | ||
| ;; | ||
| ;; DST = ashr i64 X, Y | ||
| ;; | ||
| ;; where Y is in the range [32-63] to: | ||
| ;; | ||
| ;; DST = [ashr i32 HI(X), (Y & 0x1F), ashr i32 HI(X), 31] | ||
| ;; | ||
| ;; preserves flags | ||
|
|
||
| define i64 @ashr_exact(i64 %arg0, i64 %shift_amt) { | ||
| ; CHECK-LABEL: name: ashr_exact | ||
| ; CHECK: bb.0 (%ir-block.0): | ||
| ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 | ||
| ; CHECK-NEXT: {{ $}} | ||
| ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 | ||
| ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 | ||
| ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]] | ||
| ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 | ||
| ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 | ||
| ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF | ||
| ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] | ||
| ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 | ||
| ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 | ||
| ; CHECK-NEXT: [[V_ASHRREV_I32_e64_:%[0-9]+]]:vgpr_32 = exact V_ASHRREV_I32_e64 killed [[COPY5]], [[COPY3]], implicit $exec | ||
| ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 31 | ||
| ; CHECK-NEXT: [[V_ASHRREV_I32_e64_1:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 killed [[S_MOV_B32_]], [[COPY3]], implicit $exec | ||
| ; CHECK-NEXT: $vgpr0 = COPY [[V_ASHRREV_I32_e64_]] | ||
| ; CHECK-NEXT: $vgpr1 = COPY [[V_ASHRREV_I32_e64_1]] | ||
| ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 | ||
| %or = or i64 %shift_amt, 32 | ||
| %ashr = ashr exact i64 %arg0, %or | ||
| ret i64 %ashr | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: why pal?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was derived from ashr64_reduce.ll. There's no particular reason why pal was chosen.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because amdhsa errors on the shader calling conventions but it's useful to use them for the return in SGPRs |
||
|
|
||
| ; Test that negative 64-bit values shifted by [32-63] bits have | ||
| ; a hi-result created by moving an all-ones constant. | ||
|
|
||
| ; FIXME: Range metadata is invalidated when i64 types are legalized to v2i32 types. | ||
| ; We could call performSraCombine before legalization, but other optimizations only work | ||
| ; with 64-bit sra. | ||
| define i64 @scalar_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: scalar_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dword v4, v[2:3] | ||
| ; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1 | ||
| ; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v5 | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v5 | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load i64, ptr %arg0.ptr, !range !0, !noundef !{} | ||
| %shift.amt = load i64, ptr %arg1.ptr, !range !1, !noundef !{} | ||
| %ashr = ashr i64 %val, %shift.amt | ||
| ret i64 %ashr | ||
| } | ||
|
|
||
| define <2 x i64> @v2_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: v2_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] | ||
| ; CHECK-NEXT: v_mov_b32_e32 v1, -1 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, -1 | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7 | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load <2 x i64>, ptr %arg0.ptr, !range !2, !noundef !{} | ||
| %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !3, !noundef !{} | ||
| %ashr = ashr <2 x i64> %val, %shift.amt | ||
| ret <2 x i64> %ashr | ||
| } | ||
|
|
||
| define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: v3_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dword v4, v[0:1] offset:20 | ||
| ; CHECK-NEXT: flat_load_dword v6, v[2:3] offset:16 | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] | ||
| ; CHECK-NEXT: v_mov_b32_e32 v1, -1 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, -1 | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v4, v6, v4 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v5, -1 | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{} | ||
| %shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !5, !noundef !{} | ||
| %ashr = ashr <3 x i64> %val, %shift.amt | ||
| ret <3 x i64> %ashr | ||
| } | ||
|
|
||
| define <4 x i64> @v4_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: v4_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1] | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[11:14], v[0:1] offset:16 | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[15:18], v[2:3] offset:16 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v1, -1 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v3, -1 | ||
| ; CHECK-NEXT: v_mov_b32_e32 v5, -1 | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_mov_b32_e32 v7, -1 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v8 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v10 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v4, v15, v12 | ||
| ; CHECK-NEXT: v_ashrrev_i32_e32 v6, v17, v14 | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load <4 x i64>, ptr %arg0.ptr, !range !6, !noundef !{} | ||
| %shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !7, !noundef !{} | ||
| %ashr = ashr <4 x i64> %val, %shift.amt | ||
| ret <4 x i64> %ashr | ||
| } | ||
|
|
||
| ; Ranges used when transformation is valid | ||
| !0 = !{i64 -6000000000, i64 0} | ||
| !1 = !{i64 32, i64 64} | ||
LU-JOHN marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| !2 = !{i64 -7000000000, i64 -1000} | ||
| !3 = !{i64 38, i64 64} | ||
| !4 = !{i64 -8000000000, i64 -2001} | ||
| !5 = !{i64 38, i64 60} | ||
| !6 = !{i64 -9000000000, i64 -3002} | ||
| !7 = !{i64 38, i64 50} | ||
|
|
||
| ; Test that negative 64-bit values shifted by [2?-63] bits do NOT have | ||
| ; a hi-result created by moving an all-ones constant. | ||
|
|
||
| define i64 @no_transform_scalar_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: no_transform_scalar_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1] | ||
| ; CHECK-NEXT: flat_load_dword v6, v[2:3] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[0:1], v6, v[4:5] | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load i64, ptr %arg0.ptr, !range !8, !noundef !{} | ||
| %shift.amt = load i64, ptr %arg1.ptr, !range !9, !noundef !{} | ||
| %ashr = ashr i64 %val, %shift.amt | ||
| ret i64 %ashr | ||
| } | ||
|
|
||
| define <2 x i64> @no_transform_v2_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: no_transform_v2_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[0:1], v8, v[4:5] | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[2:3], v10, v[6:7] | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load <2 x i64>, ptr %arg0.ptr, !range !10, !noundef !{} | ||
| %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !11, !noundef !{} | ||
| %ashr = ashr <2 x i64> %val, %shift.amt | ||
| ret <2 x i64> %ashr | ||
| } | ||
|
|
||
| define <3 x i64> @no_transform_v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: no_transform_v3_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1] | ||
| ; CHECK-NEXT: flat_load_dwordx2 v[11:12], v[0:1] offset:16 | ||
| ; CHECK-NEXT: flat_load_dword v5, v[2:3] offset:16 | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[0:1], v4, v[7:8] | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[2:3], v6, v[9:10] | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[4:5], v5, v[11:12] | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load <3 x i64>, ptr %arg0.ptr, !range !12, !noundef !{} | ||
| %shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !13, !noundef !{} | ||
| %ashr = ashr <3 x i64> %val, %shift.amt | ||
| ret <3 x i64> %ashr | ||
| } | ||
|
|
||
| define <4 x i64> @no_transform_v4_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { | ||
| ; CHECK-LABEL: no_transform_v4_ashr_metadata: | ||
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1] | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[11:14], v[0:1] offset:16 | ||
| ; CHECK-NEXT: flat_load_dwordx4 v[15:18], v[2:3] offset:16 | ||
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[0:1], v4, v[7:8] | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[2:3], v6, v[9:10] | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[4:5], v15, v[11:12] | ||
| ; CHECK-NEXT: v_ashrrev_i64 v[6:7], v17, v[13:14] | ||
| ; CHECK-NEXT: s_setpc_b64 s[30:31] | ||
| %val = load <4 x i64>, ptr %arg0.ptr, !range !14, !noundef !{} | ||
| %shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !15, !noundef !{} | ||
| %ashr = ashr <4 x i64> %val, %shift.amt | ||
| ret <4 x i64> %ashr | ||
| } | ||
|
|
||
| ; Ranges used when transformation is invalid | ||
| !8 = !{i64 -10000000000, i64 0} | ||
| !9 = !{i64 29, i64 64} | ||
| !10 = !{i64 -11000000000, i64 -1000} | ||
| !11 = !{i64 28, i64 64} | ||
| !12 = !{i64 -12000000000, i64 -2001} | ||
| !13 = !{i64 27, i64 60} | ||
| !14 = !{i64 -13000000000, i64 -3002} | ||
| !15 = !{i64 26, i64 50} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this preserve flags?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ashr for the hi-half cannot preserve an exact flag, but the ashr for the lo-half has been updated to preserve the exact flag.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't seem to preserve the flags in the previous changes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it was already an issue with the previous commit.