
Commit 236379e

[DAGCombiner][X86] Enable bitcast-load optimization through freeze
1 parent 4bf5ab4

7 files changed: +207 −116 lines

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 21 additions & 6 deletions
@@ -16693,21 +16693,31 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   }
 
   // fold (conv (load x)) -> (load (conv*)x)
+  // fold (conv (freeze (load x))) -> (freeze (load (conv*)x))
   // If the resultant load doesn't need a higher alignment than the original!
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+  // Peek through freeze to find the load.
+  SDValue N0Load = N0;
+  bool HasFreeze = false;
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) {
+    N0Load = N0.getOperand(0);
+    HasFreeze = true;
+  }
+
+  if (ISD::isNormalLoad(N0Load.getNode()) && N0Load.hasOneUse() &&
       // Do not remove the cast if the types differ in endian layout.
-      TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
+      TLI.hasBigEndianPartOrdering(N0Load.getValueType(),
+                                   DAG.getDataLayout()) ==
           TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
       // If the load is volatile, we only want to change the load type if the
       // resulting load is legal. Otherwise we might increase the number of
       // memory accesses. We don't care if the original type was legal or not
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
-      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
+      ((!LegalOperations && cast<LoadSDNode>(N0Load)->isSimple()) ||
        TLI.isOperationLegal(ISD::LOAD, VT))) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0Load);
 
-    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+    if (TLI.isLoadBitCastBeneficial(N0Load.getValueType(), VT, DAG,
                                     *LN0->getMemOperand())) {
       // If the range metadata type does not match the new memory
       // operation type, remove the range metadata.
@@ -16721,7 +16731,12 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getMemOperand());
-      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+      DAG.ReplaceAllUsesOfValueWith(N0Load.getValue(1), Load.getValue(1));
+
+      // If there was a freeze, wrap the load with freeze again.
+      if (HasFreeze)
+        Load = DAG.getFreeze(Load);
+
       return Load;
     }
   }
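
The combine itself is unchanged; the new code only peeks through a single-use FREEZE node to find the underlying load, retypes the load exactly as before, and then re-applies the freeze on top of the new load so poison is still frozen. In IR terms, the newly enabled rewrite corresponds to this minimal sketch (hand-written to mirror the AArch64 test added below; the combine actually runs on SelectionDAG nodes rather than on IR):

; before: a freeze sits between the load and the bitcast
  %load    = load <4 x i8>, ptr %p, align 4
  %freeze  = freeze <4 x i8> %load
  %bitcast = bitcast <4 x i8> %freeze to i32

; after: the load is retyped to i32 and the freeze is re-applied on top
  %load.i32 = load i32, ptr %p, align 4
  %frozen   = freeze i32 %load.i32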

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -3448,6 +3448,20 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
+  // With low alignment, don't convert integer vectors to large scalar loads,
+  // because otherwise they get broken into many small scalar loads.
+  if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
+      BitcastVT.isInteger()) {
+    const DataLayout &DL = DAG.getDataLayout();
+    unsigned MinAlign = DL.getPointerSize();
+    // Aligned well, will legalize into a clean sequence of loads.
+    if (MMO.getAlign() >= MinAlign)
+      return true;
+    // Aligned poorly for a large enough scalar.
+    if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
+      return false;
+  }
+
   // If both types are legal vectors, it's always ok to convert them.
   if (LoadVT.isVector() && BitcastVT.isVector() &&
       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
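
On x86-64, DL.getPointerSize() is 8 (bytes) and DL.getPointerSizeInBits() is 64, so the new heuristic reads: an integer-vector load aligned to at least 8 bytes is always worth retyping, while an under-aligned one is only retyped when the resulting scalar is at most 2 * 64 = 128 bits wide. As a hypothetical illustration (the value names and the explicit align 4 are not from the commit; the new tests later in this commit use default alignment), a poorly aligned load feeding a bitcast to i192 would now stay in vector form:

; align 4 is below the 8-byte pointer size, and 192 bits exceeds
; 2 * 64 bits, so isLoadBitCastBeneficial returns false here
  %vec  = load <3 x i64>, ptr %p, align 4
  %cast = bitcast <3 x i64> %vec to i192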
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

; Test that bitcast(freeze(load)) is optimized to freeze(load) with appropriate type
define i32 @freeze_bitcast_load(ptr %p) {
; CHECK-LABEL: freeze_bitcast_load:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    lsr w0, w8, #8
; CHECK-NEXT:    ret
  %load = load <4 x i8>, ptr %p, align 4
  %freeze = freeze <4 x i8> %load
  %bitcast = bitcast <4 x i8> %freeze to i32
  %result = lshr i32 %bitcast, 8
  ret i32 %result
}
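
The single ldr w8 is the point of this test: the <4 x i8> load is retyped to one 32-bit load even though a freeze sits between it and the bitcast, which is exactly the pattern the DAGCombiner change above newly handles.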

llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll

Lines changed: 1 addition & 1 deletion
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]

llvm/test/CodeGen/X86/avx10_2bf16-arith.ll

Lines changed: 2 additions & 2 deletions
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s

; Test that we don't bitcast loads from integer vectors to large illegal scalars.
; The threshold is 2*pointer_size (128 bits on x86-64).

; i192 is just above the threshold - should NOT bitcast even through freeze
define i192 @load_v3i64_to_i192_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v3i64_to_i192_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq (%rdi), %rax
; CHECK-NEXT:    movq 8(%rdi), %rdx
; CHECK-NEXT:    movq 16(%rdi), %rcx
; CHECK-NEXT:    retq
  %vec = load <3 x i64>, ptr %p
  %freeze = freeze <3 x i64> %vec
  %result = bitcast <3 x i64> %freeze to i192
  ret i192 %result
}

; i128 is at the threshold - should allow bitcast through freeze
define i128 @load_v2i64_to_i128_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v2i64_to_i128_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq (%rdi), %rax
; CHECK-NEXT:    movq 8(%rdi), %rdx
; CHECK-NEXT:    retq
  %vec = load <2 x i64>, ptr %p
  %freeze = freeze <2 x i64> %vec
  %result = bitcast <2 x i64> %freeze to i128
  ret i128 %result
}

; Test with i256 (way above threshold)
define i256 @load_v4i64_to_i256_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v4i64_to_i256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    movaps (%rsi), %xmm0
; CHECK-NEXT:    movaps 16(%rsi), %xmm1
; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
; CHECK-NEXT:    movaps %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %p
  %freeze = freeze <4 x i64> %vec
  %result = bitcast <4 x i64> %freeze to i256
  ret i256 %result
}

; Test with i160 (also above threshold)
define i160 @load_v5i32_to_i160_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v5i32_to_i160_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq (%rdi), %rax
; CHECK-NEXT:    movq 8(%rdi), %rdx
; CHECK-NEXT:    movl 16(%rdi), %ecx
; CHECK-NEXT:    movl %ecx, %ecx
; CHECK-NEXT:    retq
  %vec = load <5 x i32>, ptr %p
  %freeze = freeze <5 x i32> %vec
  %result = bitcast <5 x i32> %freeze to i160
  ret i160 %result
}

; FP vectors should still be allowed to bitcast through freeze
define double @load_v2f32_to_f64_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v2f32_to_f64_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    retq
  %vec = load <2 x float>, ptr %p
  %freeze = freeze <2 x float> %vec
  %result = bitcast <2 x float> %freeze to double
  ret double %result
}
