Skip to content

Conversation

@jofrn
Copy link
Contributor

@jofrn jofrn commented Oct 31, 2025

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with element type bfloat or half.

@llvmbot llvmbot added backend:X86 llvm:SelectionDAG SelectionDAGISel as well labels Oct 31, 2025
@llvmbot
Copy link
Member

llvmbot commented Oct 31, 2025

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-selectiondag

Author: None (jofrn)

Changes

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with element type bfloat or half.


Patch is 24.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165818.diff

3 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h (+1)
  • (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+37)
  • (modified) llvm/test/CodeGen/X86/atomic-load-store.ll (+473)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_STEP_VECTOR(N, Lo, Hi);
     break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD:
+    SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+    break;
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+                                               SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+      EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+                                  Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi =
+      DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+                  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
                                         SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9ea21cae97f32..286799f36e80a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -565,6 +565,180 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
   ret <2 x float> %ret
 }
 
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movl (%rdi), %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <2 x half>, ptr %x acquire, align 4
+  ret <2 x half> %ret
+}
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    movl (%rdi), %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    movl (%rdi), %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movl (%rdi), %eax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %dx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE-O0-NEXT:    movw %ax, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %dx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    movl (%rdi), %eax
+; CHECK-AVX-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX-O0-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT:    shrl $16, %eax
+; CHECK-AVX-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    movl (%rdi), %eax
+; CHECK-AVX512-O0-NEXT:    movw %ax, %cx
+; CHECK-AVX512-O0-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O0-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-AVX512-O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+  ret <2 x bfloat> %ret
+}
 define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O3-LABEL: atomic_vec1_ptr:
 ; CHECK-O3:       # %bb.0:
@@ -1205,6 +1379,305 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
   ret <4 x i16> %ret
 }
 
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_half:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movl %eax, %ecx
+; CHECK-O3-NEXT:    shrl $16, %ecx
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    movq %rax, %rcx
+; CHECK-O3-NEXT:    shrq $32, %rcx
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT:    shrq $48, %rax
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O3-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O3-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT:    shrq $48, %rax
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_half:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movq (%rdi), %rax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm2
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-O0-NEXT:    movq %rax, %rcx
+; CHECK-O0-NEXT:    shrq $32, %rcx
+; CHECK-O0-NEXT:    movw %cx, %dx
+; CHECK-O0-NEXT:    # implicit-def: $ecx
+; CHECK-O0-NEXT:    movw %dx, %cx
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT:    shrq $48, %rax
+; CHECK-O0-NEXT:    movw %ax, %cx
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %cx, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm3
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-O0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-O0-NEXT:    retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE-O0:       # %bb.0:
+; CHECK-SSE-O0-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O0-NEXT:    movl %eax, %ecx
+; CHECK-SSE-O0-NEXT:    shrl $16, %ecx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm2
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O0-NEXT:    movw %ax, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm0
+; CHECK-SSE-O0-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O0-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O0-NEXT:    movw %cx, %dx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT:    movw %dx, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT:    shrq $48, %rax
+; CHECK-SSE-O0-NEXT:    movw %ax, %cx
+; CHECK-SSE-O0-NEXT:    # implicit-def: $eax
+; CHECK-SSE-O0-NEXT:    movw %cx, %ax
+; CHECK-SSE-O0-NEXT:    # implicit-def: $xmm3
+; CHECK-SSE-O0-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE-O0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE-O0-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX-O0:       # %bb.0:
+; CHECK-AVX-O0-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    vmovq (%rdi), %xmm0
+; CHECK-AVX512-O0-NEXT:    retq
+  %ret = load atomic <4 x half>, ptr %x acquire, align 8
+  ret <4 x half> %ret
+}
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movq %rax, %rcx
+; CHECK-O3-NEXT:    movq %rax, %rdx
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-O3-NEXT:    shrl $16, %eax
+; CHECK-O3-NEXT:    shrq $32, %rcx
+; CHECK-O3-NEXT:    shrq $48, %rdx
+; CHECK-O3-NEXT:    pinsrw $0, %edx, %xmm1
+; CHECK-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-O3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT:    retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE-O3:       # %bb.0:
+; CHECK-SSE-O3-NEXT:    movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT:    movq %rax, %rcx
+; CHECK-SSE-O3-NEXT:    movq %rax, %rdx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-SSE-O3-NEXT:    shrl $16, %eax
+; CHECK-SSE-O3-NEXT:    shrq $32, %rcx
+; CHECK-SSE-O3-NEXT:    shrq $48, %rdx
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %edx, %xmm1
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT:    pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE-O3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE-O3-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT:    retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O3:       # %bb.0:
+; CHECK-AVX-O3-NEXT:    movq (%rdi), %rax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX-O3-NEXT:    shrq $48, %rcx
+; CHECK-AVX-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX-O3-NEXT:    shrq $32, %rcx
+; CHECK-AVX-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    movq (%rdi), %rax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT:    shrq $48, %rcx
+; CHECK-AVX512-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT:    shrq $32, %rcx
+; CHECK-AVX512-O3-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    shrl $16, %eax
+; CHECK-AVX512-O3-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT:    retq
+;
+; CHECK-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-O0:       # %bb.0:
+; CHECK-O0-NEXT:    movq (%rdi), %rax
+; CHECK-O0-NEXT:    movl %eax, %ecx
+; CHECK-O0-NEXT:    shrl $16, %ecx
+; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT:    movw %ax, %dx
+; CHECK-O0-NEXT:    movq %rax, %rsi
+; CHECK-O0-NEXT:    shrq $32, %rsi
+; CHECK-O0-NEXT:    # kill: def $si killed $si killed $rsi
+; CHECK-O0-NEXT:    shrq $48, %rax
+; CHECK-O0-NEXT:    movw %ax, %di
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %di, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm0
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT:    # implicit-def: $eax
+; CHECK-O0-NEXT:    movw %si, %ax
+; CHECK-O0-NEXT:    # implicit-def: $xmm1
+; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT:    punpcklwd {{.*#+}} x...
[truncated]

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please regenerate the test checks

@jofrn jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from a0038c0 to 5c2428c Compare October 31, 2025 16:05
SDValue ExtractHi =
DAG.getNode(ISD::SRL, dl, IntVT, ALD,
DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use SplitInteger?

Comment on lines 2210 to 2211
assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
"Extended load during type legalization!");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this just be a matter of passing through the extension type instead of hardcoding NON_EXTLOAD below?

SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
SDValue ExtractHi =
DAG.getNode(ISD::SRL, dl, IntVT, ALD,
DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getIntPtrConstant produces the wrong type for a shift amount; use getShiftAmountConstant instead.

@jofrn jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 5c2428c to 9411926 Compare November 6, 2025 16:24
@jofrn jofrn force-pushed the users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load branch from 55ec858 to b92b6da Compare November 6, 2025 17:45
@jofrn jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 9411926 to 1434bcf Compare November 6, 2025 17:45
Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with element type bfloat or half.
@jofrn jofrn force-pushed the users/jofrn/gt/07-15-_selectiondag_split_vector_types_for_atomic_load branch from 1434bcf to 8466578 Compare November 6, 2025 18:19
Apply suggested changes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:X86 llvm:SelectionDAG SelectionDAGISel as well

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants