[SelectionDAG] Split vector types for atomic load #165818

jofrn · 2025-10-31T02:26:18Z

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

llvmbot · 2025-10-31T02:26:47Z

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-selectiondag

Author: None (jofrn)

Changes

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

Patch is 24.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165818.diff

3 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (+1)
(modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+37)
(modified) llvm/test/CodeGen/X86/atomic-load-store.ll (+473)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_STEP_VECTOR(N, Lo, Hi);
     break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD:
+    SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+    break;
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+                                               SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+      EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+                                  Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
                                         SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9ea21cae97f32..286799f36e80a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -565,6 +565,180 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
   ret <2 x float> %ret
 }
 
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movl (%rdi), %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <2 x half>, ptr %x acquire, align 4
+  ret <2 x half> %ret
+}
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    movl (%rdi), %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    movl (%rdi), %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movl (%rdi), %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %dx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE-O0-NEXT:    movw %ax, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %dx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    movl (%rdi), %eax
+; CHECK-AVX-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX-O0-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT:    shrl $16, %eax
+; CHECK-AVX-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    movl (%rdi), %eax
+; CHECK-AVX512-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX512-O0-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX512-O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+  ret <2 x bfloat> %ret
+}
 define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O3-LABEL: atomic_vec1_ptr:
 ; CHECK-O3:       # %bb.0:
@@ -1205,6 +1379,305 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
   ret <4 x i16> %ret
 }
 
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movl %eax, %ecx
+; CHECK-O3-NEXT:    shrl $16, %ecx
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    movq %rax, %rcx
+; CHECK-O3-NEXT:    shrq $32, %rcx
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT:    shrq $48, %rax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O3-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O3-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT:    shrq $48, %rax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movq (%rdi), %rax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm2
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-O0-NEXT:    movq %rax, %rcx
+; CHECK-O0-NEXT:    shrq $32, %rcx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT:    shrq $48, %rax
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm3
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-O0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm2
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O0-NEXT:    movw %ax, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-SSE-O0-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O0-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT:    shrq $48, %rax
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm3
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE-O0-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <4 x half>, ptr %x acquire, align 8
+  ret <4 x half> %ret
+}
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movq %rax, %rcx
+; CHECK-O3-NEXT:    movq %rax, %rdx
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    shrq $32, %rcx
+; CHECK-O3-NEXT:    shrq $48, %rdx
+; CHECK-O3-NEXT:    pinsrw $0, %edx, %xmm1
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-O3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O3-NEXT:    movq %rax, %rdx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O3-NEXT:    shrq $48, %rdx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %edx, %xmm1
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE-O3-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    movq (%rdi), %rax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX-O3-NEXT:    shrq $48, %rcx
+; CHECK-AVX-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX-O3-NEXT:    shrq $32, %rcx
+; CHECK-AVX-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    movq (%rdi), %rax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT:    shrq $48, %rcx
+; CHECK-AVX512-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT:    shrq $32, %rcx
+; CHECK-AVX512-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movq (%rdi), %rax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    movq %rax, %rsi
+; CHECK-O0-NEXT:    shrq $32, %rsi
+; CHECK-O0-NEXT:    # kill: def $si killed $si killed $rsi
+; CHECK-O0-NEXT:    shrq $48, %rax
+; CHECK-O0-NEXT:    movw %ax, %di
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %di, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %si, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} x...
[truncated]

jofrn · 2025-10-31T05:13:23Z

Warning

This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
Learn more

This stack of pull requests is managed by Graphite. Learn more about stacking.

RKSimon

please regenerate the test checks

RKSimon · 2025-10-31T16:19:04Z

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);


Why not use SplitInteger?

arsenm · 2025-10-31T19:52:55Z

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");


Wouldn't this just be a matter of passing through the extension type instead of hardcoding NON_EXTLOAD below?

arsenm · 2025-10-31T19:53:20Z

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));


getIntPtrConstant is the wrong type to use, getShiftAmountConstant

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half.

Apply suggested changes

llvmbot added backend:X86 llvm:SelectionDAG SelectionDAGISel as well labels Oct 31, 2025

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 81c146e to a0038c0 Compare October 31, 2025 05:12

jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch from 226394d to 55ec858 Compare October 31, 2025 05:12

RKSimon requested changes Oct 31, 2025

View reviewed changes

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from a0038c0 to 5c2428c Compare October 31, 2025 16:05

RKSimon reviewed Oct 31, 2025

View reviewed changes

arsenm reviewed Oct 31, 2025

View reviewed changes

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 5c2428c to 9411926 Compare November 6, 2025 16:24

jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch from 55ec858 to b92b6da Compare November 6, 2025 17:45

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 9411926 to 1434bcf Compare November 6, 2025 17:45

jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 1434bcf to 8466578 Compare November 6, 2025 18:19

Apply suggested changes

64c6ae5

Apply suggested changes

arsenm approved these changes Nov 7, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[SelectionDAG] Split vector types for atomic load #165818

[SelectionDAG] Split vector types for atomic load #165818

jofrn commented Oct 31, 2025

Uh oh!

llvmbot commented Oct 31, 2025 •

edited

Loading

Uh oh!

jofrn commented Oct 31, 2025 •

edited

Loading

Uh oh!

RKSimon left a comment

Uh oh!

RKSimon Oct 31, 2025

Uh oh!

arsenm Oct 31, 2025

Uh oh!

arsenm Oct 31, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

		assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
		"Extended load during type legalization!");

[SelectionDAG] Split vector types for atomic load #165818

Are you sure you want to change the base?

[SelectionDAG] Split vector types for atomic load #165818

Conversation

jofrn commented Oct 31, 2025

Uh oh!

llvmbot commented Oct 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jofrn commented Oct 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon Oct 31, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 31, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm Oct 31, 2025

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

llvmbot commented Oct 31, 2025 •

edited

Loading

jofrn commented Oct 31, 2025 •

edited

Loading