AMDGPU: Use v_mov_b32 to implement divergent zext i32->i64 #168166
Conversation
Some cases rely on SIFixSGPRCopies to rewrite VALU reg_sequences that have SGPR inputs to use all-VGPR inputs, but this doesn't always happen when the reg_sequence is not otherwise invalid. Make sure we use a VGPR up front here so we don't rely on a later pass to fix it.
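For illustration, here is a minimal sketch (not taken from this PR's test files) of the kind of divergent zext the new pattern is meant to cover: the workitem ID is divergent, so the zero in the high 32 bits should be materialized with v_mov_b32 into a VReg_64 REG_SEQUENCE rather than with s_mov_b32.

```llvm
; Hypothetical example, not from this PR's tests: %tid is divergent,
; so the zext i32 -> i64 must be selected with the VGPR pattern.
define amdgpu_kernel void @divergent_zext_i32_to_i64(ptr addrspace(1) %out) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %ext = zext i32 %tid to i64
  store i64 %ext, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
```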
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: Some cases are relying on SIFixSGPRCopies to force VALU reg_sequence inputs with SGPR inputs to use all VGPR inputs, but this doesn't always happen if the reg_sequence isn't invalid. Make sure we use a vgpr up-front here so we don't rely on something later.

Patch is 241.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168166.diff

21 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6dd4b1d7bd000..b7256b81ee826 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2941,10 +2941,15 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (zext i32:$src)),
+ (i64 (UniformUnaryFrag<zext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
+def : GCNPat <
+ (i64 (zext i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
def : GCNPat <
(i64 (anyext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index ff74d1f71616d..88e3c86c791de 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2549,17 +2549,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1164_ITERATIVE-LABEL: add_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
@@ -2606,7 +2606,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1132_ITERATIVE-LABEL: add_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2614,8 +2614,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
@@ -2659,8 +2659,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264_ITERATIVE-LABEL: add_i64_varying:
; GFX1264_ITERATIVE: ; %bb.0: ; %entry
-; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2668,8 +2668,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -2714,7 +2714,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1232_ITERATIVE-LABEL: add_i64_varying:
; GFX1232_ITERATIVE: ; %bb.0: ; %entry
-; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2723,8 +2723,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
@@ -6930,15 +6930,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[8:9], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
@@ -7087,8 +7087,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
; GFX1264_ITERATIVE: ; %bb.0: ; %entry
-; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7096,8 +7096,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -7142,7 +7142,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
; GFX1232_ITERATIVE: ; %bb.0: ; %entry
-; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7151,8 +7151,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index f5ca24f59a286..12517c2bc1b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2181,17 +2181,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: add_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
@@ -2233,7 +2233,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: add_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2241,8 +2241,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
@@ -2982,19 +2982,20 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
;
; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5594,17 +5595,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
@@ -5646,7 +5647,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5654,8 +5655,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
@@ -7063,17 +7064,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: and_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -7113,7 +7114,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: and_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7121,8 +7122,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -8411,17 +8412,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: or_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -8461,7 +8462,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: or_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -8469,8 +8470,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT...
[truncated]
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/4/builds/10490. Here is the relevant piece of the build log for reference:
