Skip to content

Conversation

@JanekvO
Copy link
Contributor

@JanekvO JanekvO commented Aug 19, 2025

Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#154115)

Expecting to see more tests affected in the aforementioned PR once it is rebased on top of this PR.

@llvmbot
Copy link
Member

llvmbot commented Aug 19, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Janek van Oirschot (JanekvO)

Changes

Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#154115)

Expecting to see more tests affected in the aforementioned PR once it is rebased on top of this PR.


Patch is 2.25 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154363.diff

37 Files Affected:

  • (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+3519)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+1211)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+2519)
  • (modified) llvm/test/CodeGen/AMDGPU/bypass-div.ll (+1270)
  • (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+907)
  • (modified) llvm/test/CodeGen/AMDGPU/dagcombine-select.ll (+148-72)
  • (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+4154)
  • (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+3548)
  • (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+1034)
  • (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+605)
  • (modified) llvm/test/CodeGen/AMDGPU/fceil64.ll (+88)
  • (modified) llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll (+48-23)
  • (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+2236)
  • (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+1675)
  • (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+1213)
  • (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+1213)
  • (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+1984)
  • (modified) llvm/test/CodeGen/AMDGPU/imm.ll (+761)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+1089)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+200)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll (+44-21)
  • (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+448)
  • (modified) llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll (+33)
  • (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+2686-2)
  • (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+1931)
  • (modified) llvm/test/CodeGen/AMDGPU/shift-i128.ll (+550)
  • (modified) llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll (+38)
  • (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+559)
  • (modified) llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll (+155)
  • (modified) llvm/test/CodeGen/AMDGPU/srem.ll ()
  • (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+2087)
  • (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+1523)
  • (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (+222)
  • (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+1471)
  • (modified) llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll (+134-1)
  • (modified) llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll (+47)
  • (modified) llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll (+53)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index c7385e4324e2c..c6ad5c93fb7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
 
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @udiv_i32(
@@ -98,6 +99,37 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s4, 0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -191,6 +223,35 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s4, 0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s4, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_sub_i32 s4, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -312,6 +373,42 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s4, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT:    s_sub_i32 s5, 0, s4
+; GFX942-NEXT:    s_xor_b32 s3, s2, s3
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT:    s_mul_i32 s6, s5, s4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s6
+; GFX942-NEXT:    s_add_i32 s7, s5, 1
+; GFX942-NEXT:    s_sub_i32 s6, s2, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX942-NEXT:    s_add_i32 s6, s5, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s5
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -423,6 +520,40 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s5, 0, s3
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -492,6 +623,29 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX942-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_short v3, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -567,6 +721,31 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_short v3, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -648,6 +827,31 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT:    s_sext_i32_i16 s2, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -735,6 +939,33 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s6, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_sext_i32_i16 s2, s6
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s5, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX942-NEXT:    v_sub_u32_e32 v0, s6, v0
+; GFX942-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -798,6 +1029,25 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -869,6 +1119,28 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT:    s_lshr_b32 s3, s2, 8
+; GFX942-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -950,6 +1222,31 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_i32 s3, s2, 0x80008
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT:    s_sext_i32_i8 s2, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -1039,6 +1336,34 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_i32 s2, s6, 0x80008
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX942-NEXT:    s_sext_i32_i8 s3, s6
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s3
+; GFX942-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_lshr_b32 s4, s6, 8
+; GFX942-NEXT:    s_or_b32 s5, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v1|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_add_u32_e32 v1, s2, v3
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX942-NEXT:    v_sub_u32_e32 v1, s6, v1
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -1367,6 +1692,99 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX942-NEXT:    s_sub_i32 s2, 0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s14
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s2, s2, s3
+; GFX942-NEXT:    s_mul_hi_u32 s2, s3, s2
+; GFX942-NEXT:    s_add_i32 s3, s3, s2
+; GFX942-NEXT:    s_mul_hi_u32 s2, s8, s3
+; GFX942-NEXT:    s_mul_i32 s3, s2, s12
+; GFX942-NEXT:    s_sub_i32 s3, s8, s3
+; GFX942-NEXT:    s_add_i32 s5, s2, 1
+; GFX942-NEXT:    s_sub_i32 s6, s3, s12
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s12
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX942-NEXT:    s_add_i32 s5, s2, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s12
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_sub_i32 s3, 0, s13
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s9, s4
+; GFX942-NEXT:    s_mul_i32 s4, s3, s13
+; GFX942-NEXT:    s_sub_i32 s4, s9, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, 1
+; GFX942-NEXT:    s_sub_i32 s6, s4, s13
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX942-NEXT:    s_sub_i32 s4, 0, s14
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s10, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s14
+; GFX942-NEXT:    s_sub_i32 s5, s10, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s7, s5, s14
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s14
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s14
+; GFX942-NEXT:    s_cselect_b...
[truncated]

@JanekvO
Copy link
Contributor Author

JanekvO commented Aug 19, 2025

I've rebased locally on top of the changes in #154115, and it affects about 28 of the 37 tests I'm enabling gfx942 for here; let me know if I should cut the number of tests updated in this PR down to only those 28.

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not every test that changes needs to run with 942, this can be more targeted.

I was hoping the original commit had a more targeted test, but 7900334 looks like a bunch of random test changes. The int_to_fp*s removed FIXMEs so those are probably more interesting.

@JanekvO
Copy link
Contributor Author

JanekvO commented Aug 20, 2025

The int_to_fp*s removed FIXMEs so those are probably more interesting.

Sorry, which FIXMEs? Also, I have only enabled gfx942 for the int_to_fp* tests that were affected by the bitcast combine, so let me know if I should do so for the other int_to_fp* tests as well.

@arsenm
Copy link
Contributor

arsenm commented Aug 20, 2025

The int_to_fp*s removed FIXMEs so those are probably more interesting.

Sorry, which FIXMEs? Also, I have only enabled gfx942 for the int_to_fp* tests that were affected by the bitcast combine, so let me know if I should do so for the other int_to_fp* tests as well.

The ones in the test diffs from 7900334

@JanekvO JanekvO merged commit 40e1510 into llvm:main Aug 20, 2025
9 checks passed
@JanekvO JanekvO deleted the add-gfx942-tests branch August 20, 2025 14:46
@llvm-ci
Copy link
Collaborator

llvm-ci commented Aug 20, 2025

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime-2 running on rocm-worker-hw-02 while building llvm at step 8 "Add check check-llvm".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/11822

Here is the relevant piece of the build log for the reference
Step 8 (Add check check-llvm) failure: test (failure)
******************** TEST 'LLVM :: Transforms/SLPVectorizer/X86/debug-info-salvage.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr=+avx2 < /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/llvm/test/Transforms/SLPVectorizer/X86/debug-info-salvage.ll | /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/FileCheck /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/llvm/test/Transforms/SLPVectorizer/X86/debug-info-salvage.ll # RUN: at line 2
+ /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr=+avx2
+ /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/FileCheck /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/llvm/test/Transforms/SLPVectorizer/X86/debug-info-salvage.ll
opt: /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.src/llvm/include/llvm/ADT/DenseMap.h:701: bool llvm::DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT>::LookupBucketFor(const LookupKeyT&, BucketT*&) [with LookupKeyT = unsigned int; DerivedT = llvm::DenseMap<unsigned int, unsigned int, llvm::DenseMapInfo<unsigned int>, llvm::detail::DenseMapPair<unsigned int, unsigned int> >; KeyT = unsigned int; ValueT = unsigned int; KeyInfoT = llvm::DenseMapInfo<unsigned int>; BucketT = llvm::detail::DenseMapPair<unsigned int, unsigned int>]: Assertion `!KeyInfoT::isEqual(Val, EmptyKey) && !KeyInfoT::isEqual(Val, TombstoneKey) && "Empty/Tombstone value shouldn't be inserted into map!"' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr=+avx2
1.	Running pass "function(slp-vectorizer)" on module "<stdin>"
2.	Running pass "slp-vectorizer" on function "test"
 #0 0x000075091a5fae70 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/libLLVMSupport.so.22.0git+0x1fae70)
 #1 0x000075091a5f7b1f llvm::sys::RunSignalHandlers() (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/libLLVMSupport.so.22.0git+0x1f7b1f)
 #2 0x000075091a5f7c72 SignalHandler(int, siginfo_t*, void*) Signals.cpp:0:0
 #3 0x000075091a042520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 #4 0x000075091a0969fc __pthread_kill_implementation ./nptl/pthread_kill.c:44:76
 #5 0x000075091a0969fc __pthread_kill_internal ./nptl/pthread_kill.c:78:10
 #6 0x000075091a0969fc pthread_kill ./nptl/pthread_kill.c:89:10
 #7 0x000075091a042476 gsignal ./signal/../sysdeps/posix/raise.c:27:6
 #8 0x000075091a0287f3 abort ./stdlib/abort.c:81:7
 #9 0x000075091a02871b _nl_load_domain ./intl/loadmsgcat.c:1177:9
#10 0x000075091a039e96 (/lib/x86_64-linux-gnu/libc.so.6+0x39e96)
#11 0x000075091675adae (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x15adae)
#12 0x00007509167b2afe llvm::slpvectorizer::BoUpSLP::VLOperands::getBestLaneToStartReordering() const (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1b2afe)
#13 0x00007509167c0297 llvm::slpvectorizer::BoUpSLP::VLOperands::reorder() (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1c0297)
#14 0x00007509167fd8e8 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fd8e8)
#15 0x00007509167fc978 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fc978)
#16 0x00007509167fcdb9 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fcdb9)
#17 0x00007509167fcdb9 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fcdb9)
#18 0x00007509167fcdb9 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fcdb9)
#19 0x00007509167fcdb9 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fcdb9)
#20 0x00007509167fcdb9 llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fcdb9)
#21 0x00007509167fc10a llvm::slpvectorizer::BoUpSLP::buildTreeRec(llvm::ArrayRef<llvm::Value*>, unsigned int, llvm::slpvectorizer::BoUpSLP::EdgeInfo const&, unsigned int) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1fc10a)
#22 0x00007509167ff260 llvm::slpvectorizer::BoUpSLP::buildTree(llvm::ArrayRef<llvm::Value*>) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x1ff260)
#23 0x00007509168417aa llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x2417aa)
#24 0x0000750916843604 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::'lambda'(std::map<long, unsigned int, std::less<long>, std::allocator<std::pair<long const, unsigned int>>> const&)::operator()(std::map<long, unsigned int, std::less<long>, std::allocator<std::pair<long const, unsigned int>>> const&) const SLPVectorizer.cpp:0:0
#25 0x00007509168454f0 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x2454f0)
#26 0x0000750916845d69 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x245d69)
#27 0x0000750916847325 llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (.part.0) SLPVectorizer.cpp:0:0
#28 0x0000750916847f21 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMVectorize.so.22.0git+0x247f21)
#29 0x0000750917289a16 llvm::detail::PassModel<llvm::Function, llvm::SLPVectorizerPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMPasses.so.22.0git+0x89a16)
#30 0x00007509142fadd2 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMCore.so.22.0git+0x2fadd2)
#31 0x00007509190af5e6 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMX86CodeGen.so.22.0git+0xaf5e6)
#32 0x00007509142fb499 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMCore.so.22.0git+0x2fb499)
#33 0x000075091a75a2c6 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/libLLVMOptDriver.so.22.0git+0x182c6)
#34 0x00007509142fcd25 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/../lib/libLLVMCore.so.22.0git+0x2fcd25)
#35 0x000075091a7661f2 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef<llvm::PassPlugin>, llvm::ArrayRef<std::function<void (llvm::PassBuilder&)>>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool, bool) (/home/botworker/builds/openmp-offload-amdgpu-runtime-2/llvm.build/bin/../lib/libLLVMOptDriver.so.22.0git+0x241f2)
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants