Skip to content

Conversation

@arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 15, 2025

Some cases are relying on SIFixSGPRCopies to force VALU
reg_sequence inputs with SGPR inputs to use all VGPR inputs,
but this doesn't always happen if the reg_sequence isn't
invalid. Make sure we use a vgpr up-front here so we don't
rely on something later.

Some cases are relying on SIFixSGPRCopies to force VALU
reg_sequence inputs with SGPR inputs to use all VGPR inputs,
but this doesn't always happen if the reg_sequence isn't
invalid. Make sure we use a vgpr up-front here so we don't
rely on something later.
Copy link
Contributor Author

arsenm commented Nov 15, 2025

@llvmbot
Copy link
Member

llvmbot commented Nov 15, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Some cases are relying on SIFixSGPRCopies to force VALU
reg_sequence inputs with SGPR inputs to use all VGPR inputs,
but this doesn't always happen if the reg_sequence isn't
invalid. Make sure we use a vgpr up-front here so we don't
rely on something later.


Patch is 241.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168166.diff

21 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+6-1)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+24-24)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+75-74)
  • (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (-2)
  • (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+311-322)
  • (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+12-12)
  • (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (-2)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll (+39-19)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll (+39-19)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll (+45-22)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll (+45-22)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll (-8)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (-1)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (-24)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (-16)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (-6)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (-6)
  • (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+626-651)
  • (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (-1)
  • (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (-2)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6dd4b1d7bd000..b7256b81ee826 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2941,10 +2941,15 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (i64 (zext i32:$src)),
+  (i64 (UniformUnaryFrag<zext> i32:$src)),
   (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
 >;
 
+def : GCNPat <
+  (i64 (zext i32:$src)),
+  (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
 def : GCNPat <
   (i64 (anyext i32:$src)),
   (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index ff74d1f71616d..88e3c86c791de 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2549,17 +2549,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s2
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s2
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s2
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
@@ -2606,7 +2606,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1132_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2614,8 +2614,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v3, s1
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
@@ -2659,8 +2659,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1264_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2668,8 +2668,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[0:1]
 ; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s8
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v3, s8
 ; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s8
 ; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s8
 ; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
@@ -2714,7 +2714,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1232_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2723,8 +2723,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
@@ -6930,15 +6930,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1164_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v1, s2
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v5, s9, s2
@@ -7087,8 +7087,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1264_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -7096,8 +7096,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[0:1]
 ; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s8
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v3, s8
 ; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s8
 ; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s8
 ; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
@@ -7142,7 +7142,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -7151,8 +7151,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s1
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index f5ca24f59a286..12517c2bc1b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2181,17 +2181,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s6
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s6
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s6
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
@@ -2233,7 +2233,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2241,8 +2241,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
@@ -2982,19 +2982,20 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s4, s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v1, s4
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s5
 ; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[4:5]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5594,17 +5595,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s6
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s6
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s6
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
@@ -5646,7 +5647,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -5654,8 +5655,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
@@ -7063,17 +7064,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164_ITERATIVE-LABEL: and_i64_varying:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s8
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v3, s8
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s8
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s8
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
@@ -7113,7 +7114,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132_ITERATIVE-LABEL: and_i64_varying:
 ; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -7121,8 +7122,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
@@ -8411,17 +8412,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164_ITERATIVE-LABEL: or_i64_varying:
 ; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s8
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v3, s8
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s8
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s8
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
@@ -8461,7 +8462,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132_ITERATIVE-LABEL: or_i64_varying:
 ; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -8469,8 +8470,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT...
[truncated]

@arsenm arsenm marked this pull request as ready for review November 15, 2025 02:10
@arsenm arsenm merged commit 0fa6a67 into main Nov 15, 2025
14 checks passed
@arsenm arsenm deleted the users/arsenm/use-v-mov-b32-divergent-zext-i64 branch November 15, 2025 04:19
@llvm-ci
Copy link
Collaborator

llvm-ci commented Nov 15, 2025

LLVM Buildbot has detected a new failure on builder clang-aarch64-sve-vls-2stage running on linaro-g3-02 while building llvm at step 12 "ninja check 2".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/4/builds/10490

Here is the relevant piece of the build log for the reference
Step 12 (ninja check 2) failure: stage 2 checked (failure)
******************** TEST 'AddressSanitizer-aarch64-linux-dynamic :: TestCases/Posix/halt_on_error-signals.c' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/./bin/clang  -fsanitize=address -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer -fno-optimize-sibling-calls -gline-tables-only   -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta   -shared-libasan -fsanitize-recover=address -pthread /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c -o /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp # RUN: at line 3
+ /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/./bin/clang -fsanitize=address -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer -fno-optimize-sibling-calls -gline-tables-only -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta -shared-libasan -fsanitize-recover=address -pthread /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c -o /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp
In file included from /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c:11:
In file included from /usr/include/stdio.h:27:
In file included from /usr/include/aarch64-linux-gnu/bits/libc-header-start.h:33:
/usr/include/features.h:194:3: warning: "_BSD_SOURCE and _SVID_SOURCE are deprecated, use _DEFAULT_SOURCE" [-W#warnings]
  194 | # warning "_BSD_SOURCE and _SVID_SOURCE are deprecated, use _DEFAULT_SOURCE"
      |   ^
1 warning generated.
env ASAN_OPTIONS=halt_on_error=false:suppress_equal_pcs=false  /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp 100 >/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp.log 2>&1 || true # RUN: at line 5
+ env ASAN_OPTIONS=halt_on_error=false:suppress_equal_pcs=false /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp 100
+ true
FileCheck --check-prefix=CHECK-COLLISION /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c </home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp.log || FileCheck --check-prefix=CHECK-NO-COLLISION /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c </home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/runtimes/runtimes-bins/compiler-rt/test/asan/AARCH64LinuxDynamicConfig/TestCases/Posix/Output/halt_on_error-signals.c.tmp.log # RUN: at line 7
+ FileCheck --check-prefix=CHECK-COLLISION /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c
/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c:29:22: error: CHECK-COLLISION: expected string not found in input
 // CHECK-COLLISION: AddressSanitizer: nested bug in the same thread, aborting
                     ^
<stdin>:1:1: note: scanning from here
=================================================================
^
<stdin>:58:85: note: possible intended match here
AddressSanitizer/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/./bin/llvm-symbolizer: nested bug in the same thread, aborting.
                                                                                    ^

Input file: <stdin>
Check file: /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c

-dump-input=help explains the following input dump.

Input was:
<<<<<<
            1: ================================================================= 
check:29'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
            2: ==1571805==ERROR: AddressSanitizer: use-after-poison on address 0xb598e47f4160 at pc 0xb598e47e1c20 bp 0xddd900ffe620 sp 0xddd900ffe618 
check:29'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            3: WRITE of size 1 at 0xb598e47f4160 thread T1 
check:29'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            4:  #0 0xb598e47e1c1c in error /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c:32:12 
check:29'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            5:  #1 0xb598e47e1fc8 in receiver /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/test/asan/TestCases/Posix/halt_on_error-signals.c:66:5 
check:29'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            6:  #2 0xe1d904b1c714 in asan_thread_start(void*) /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/compiler-rt/lib/asan/asan_interceptors.cpp:246:28 
check:29'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            .
            .
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants