diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 22f6a5fde546a..8f560c42082f9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -997,6 +997,7 @@ class LegalizationArtifactCombiner {
     // Recognize UnmergeSrc that can be unmerged to DstTy directly.
     // Types have to be either both vector or both non-vector types.
+    // In case of vector types, the scalar elements need to match.
     // Merge-like opcodes are combined one at the time. First one creates new
     // unmerge, following should use the same unmerge (builder performs CSE).
     //
     // %0:_(EltTy), %1, ... = G_UNMERGE_VALUES %UnmergeSrc:_(Ty)
     // %Dst:_(DstTy) = G_merge_like_opcode %0:_(EltTy), %1, ...
@@ -1005,7 +1006,9 @@ class LegalizationArtifactCombiner {
     // %AnotherDst:_(DstTy) = G_merge_like_opcode %2:_(EltTy), %3
     //
     // %Dst:_(DstTy), %AnotherDst = G_UNMERGE_VALUES %UnmergeSrc
-    if ((DstTy.isVector() == UnmergeSrcTy.isVector()) &&
+    if (((!DstTy.isVector() && !UnmergeSrcTy.isVector()) ||
+         (DstTy.isVector() && UnmergeSrcTy.isVector() &&
+          DstTy.getScalarType() == UnmergeSrcTy.getScalarType())) &&
         (Elt0UnmergeIdx % NumMIElts == 0) &&
         getCoverTy(UnmergeSrcTy, DstTy) == UnmergeSrcTy) {
       if (!isSequenceFromUnmerge(MI, 0, Unmerge, Elt0UnmergeIdx, NumMIElts,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 8134eb3ca2afc..51d0b225b2a27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6506,3 +6506,47 @@ entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
   ret <5 x double> %insert
 }
+
+; Found by fuzzer, reduced with llvm-reduce.
+define void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
+; GPRIDX-LABEL: insert_very_small_from_very_large:
+; GPRIDX:       ; %bb.0: ; %bb
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_and_b32_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_and_b32_e32 v0, 3, v0
+; GPRIDX-NEXT:    flat_store_byte v[16:17], v0
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: insert_very_small_from_very_large:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT:    flat_store_byte v[16:17], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: insert_very_small_from_very_large:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b16 v0.l, 1, v0.l
+; GFX11-NEXT:    v_and_b16 v0.l, v0.l, 1
+; GFX11-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    flat_store_b8 v[16:17], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %a = bitcast <32 x i16> %L3 to i512
+  %b = trunc i512 %a to i8
+  %c = trunc i8 %b to i2
+  %d = bitcast i2 %c to <2 x i1>
+  %insert = insertelement <2 x i1> %d, i1 false, i32 0
+  store <2 x i1> %insert, ptr %ptr, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index fdc1dd6cce8e1..53b2542cf9a7e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -2166,14 +2166,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v6i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x i8>, ptr addrspace(7) %p
@@ -3630,10 +3630,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load volatile <6 x i8>, ptr addrspace(7) %p