-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[SelectionDAG] Split vector types for atomic load #165818
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/jofrn/gt/07-15-_x86_remove_extra_mov_after_widening_atomic_load
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag Author: None (jofrn) Changes: Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with element types bfloat and half. Patch is 24.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165818.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD:
+ SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+ break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
}
+/// Split the vector result of an ATOMIC_LOAD into Lo/Hi. One integer-typed
+/// atomic load covering the whole vector is issued, so memory is still read
+/// by a single atomic access; the halves are then carved out of that integer.
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+ SDLoc dl(LD);
+ EVT VT = LD->getValueType(0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ EVT MemIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+ SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+ Ptr, LD->getMemOperand());
+
+ // Use SplitInteger so the shift amount has the target's shift-amount type
+ // and is sized by the low half rather than assumed to be half of VT.
+ EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+ EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+ SDValue ExtractLo, ExtractHi;
+ SplitInteger(ALD, LoIntVT, HiIntVT, ExtractLo, ExtractHi);
+ Lo = DAG.getBitcast(LoVT, ExtractLo);
+ Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9ea21cae97f32..286799f36e80a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -565,6 +565,180 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
ret <2 x float> %ret
}
+define <2 x half> @atomic_vec2_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: shrl $16, %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movl (%rdi), %eax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: movw %cx, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT: movw %ax, %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_half:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512-O0-NEXT: retq
+ %ret = load atomic <2 x half>, ptr %x acquire, align 4
+ ret <2 x half> %ret
+}
+define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: shrl $16, %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: shrl $16, %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: shrl $16, %eax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movl (%rdi), %eax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT: movw %ax, %dx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %dx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movl (%rdi), %eax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-SSE-O0-NEXT: movw %ax, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %dx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movl (%rdi), %eax
+; CHECK-AVX-O0-NEXT: movw %ax, %cx
+; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: shrl $16, %eax
+; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec2_bfloat:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-O0-NEXT: movw %ax, %cx
+; CHECK-AVX512-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT: shrl $16, %eax
+; CHECK-AVX512-O0-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-AVX512-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O0-NEXT: retq
+ %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
+ ret <2 x bfloat> %ret
+}
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK-O3-LABEL: atomic_vec1_ptr:
; CHECK-O3: # %bb.0:
@@ -1205,6 +1379,305 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
ret <4 x i16> %ret
}
+define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: movl %eax, %ecx
+; CHECK-O3-NEXT: shrl $16, %ecx
+; CHECK-O3-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: movq %rax, %rcx
+; CHECK-O3-NEXT: shrq $32, %rcx
+; CHECK-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT: shrq $48, %rax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT: movl %eax, %ecx
+; CHECK-SSE-O3-NEXT: shrl $16, %ecx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT: shrq $48, %rax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq (%rdi), %rax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: movw %cx, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm2
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-O0-NEXT: movw %ax, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm0
+; CHECK-O0-NEXT: movq %rax, %rcx
+; CHECK-O0-NEXT: shrq $32, %rcx
+; CHECK-O0-NEXT: movw %cx, %dx
+; CHECK-O0-NEXT: # implicit-def: $ecx
+; CHECK-O0-NEXT: movw %dx, %cx
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-O0-NEXT: shrq $48, %rax
+; CHECK-O0-NEXT: movw %ax, %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm3
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE-O0-NEXT: movl %eax, %ecx
+; CHECK-SSE-O0-NEXT: shrl $16, %ecx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm2
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O0-NEXT: movw %ax, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm0
+; CHECK-SSE-O0-NEXT: movq %rax, %rcx
+; CHECK-SSE-O0-NEXT: shrq $32, %rcx
+; CHECK-SSE-O0-NEXT: movw %cx, %dx
+; CHECK-SSE-O0-NEXT: # implicit-def: $ecx
+; CHECK-SSE-O0-NEXT: movw %dx, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1
+; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1
+; CHECK-SSE-O0-NEXT: shrq $48, %rax
+; CHECK-SSE-O0-NEXT: movw %ax, %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm3
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-SSE-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec4_half:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: vmovq (%rdi), %xmm0
+; CHECK-AVX512-O0-NEXT: retq
+ %ret = load atomic <4 x half>, ptr %x acquire, align 8
+ ret <4 x half> %ret
+}
+define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: movq %rax, %rcx
+; CHECK-O3-NEXT: movq %rax, %rdx
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-O3-NEXT: shrl $16, %eax
+; CHECK-O3-NEXT: shrq $32, %rcx
+; CHECK-O3-NEXT: shrq $48, %rdx
+; CHECK-O3-NEXT: pinsrw $0, %edx, %xmm1
+; CHECK-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE-O3-NEXT: movq %rax, %rcx
+; CHECK-SSE-O3-NEXT: movq %rax, %rdx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-SSE-O3-NEXT: shrl $16, %eax
+; CHECK-SSE-O3-NEXT: shrq $32, %rcx
+; CHECK-SSE-O3-NEXT: shrq $48, %rdx
+; CHECK-SSE-O3-NEXT: pinsrw $0, %edx, %xmm1
+; CHECK-SSE-O3-NEXT: pinsrw $0, %ecx, %xmm2
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm3
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; CHECK-SSE-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movq (%rdi), %rax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX-O3-NEXT: shrq $48, %rcx
+; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX-O3-NEXT: shrq $32, %rcx
+; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: shrl $16, %eax
+; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec4_bfloat:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: movq (%rdi), %rax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT: shrq $48, %rcx
+; CHECK-AVX512-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: movq %rax, %rcx
+; CHECK-AVX512-O3-NEXT: shrq $32, %rcx
+; CHECK-AVX512-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: shrl $16, %eax
+; CHECK-AVX512-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-AVX512-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movq (%rdi), %rax
+; CHECK-O0-NEXT: movl %eax, %ecx
+; CHECK-O0-NEXT: shrl $16, %ecx
+; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx
+; CHECK-O0-NEXT: movw %ax, %dx
+; CHECK-O0-NEXT: movq %rax, %rsi
+; CHECK-O0-NEXT: shrq $32, %rsi
+; CHECK-O0-NEXT: # kill: def $si killed $si killed $rsi
+; CHECK-O0-NEXT: shrq $48, %rax
+; CHECK-O0-NEXT: movw %ax, %di
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %di, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %si, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm1
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-O0-NEXT: punpcklwd {{.*#+}} x...
[truncated]
|
81c146e to
a0038c0
Compare
226394d to
55ec858
Compare
|
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please regenerate the test checks
a0038c0 to
5c2428c
Compare
| SDValue ExtractHi = | ||
| DAG.getNode(ISD::SRL, dl, IntVT, ALD, | ||
| DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl)); | ||
| ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not use SplitInteger?
| assert(LD->getExtensionType() == ISD::NON_EXTLOAD && | ||
| "Extended load during type legalization!"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't this just be a matter of passing through the extension type instead of hardcoding NON_EXTLOAD below?
| SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD); | ||
| SDValue ExtractHi = | ||
| DAG.getNode(ISD::SRL, dl, IntVT, ALD, | ||
| DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getIntPtrConstant produces the wrong type for a shift amount; use getShiftAmountConstant instead.
5c2428c to
9411926
Compare
55ec858 to
b92b6da
Compare
9411926 to
1434bcf
Compare
Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with element types bfloat and half.
1434bcf to
8466578
Compare
Apply suggested changes

Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with element types bfloat and half.