
Commit 236379e

[DAGCombiner][X86] Enable bitcast-load optimization through freeze
1 parent 4bf5ab4

7 files changed: +207 −116 lines

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 21 additions & 6 deletions
@@ -16693,21 +16693,31 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   }
 
   // fold (conv (load x)) -> (load (conv*)x)
+  // fold (conv (freeze (load x))) -> (freeze (load (conv*)x))
   // If the resultant load doesn't need a higher alignment than the original!
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+  // Peek through freeze to find the load.
+  SDValue N0Load = N0;
+  bool HasFreeze = false;
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) {
+    N0Load = N0.getOperand(0);
+    HasFreeze = true;
+  }
+
+  if (ISD::isNormalLoad(N0Load.getNode()) && N0Load.hasOneUse() &&
       // Do not remove the cast if the types differ in endian layout.
-      TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
+      TLI.hasBigEndianPartOrdering(N0Load.getValueType(),
+                                   DAG.getDataLayout()) ==
           TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
       // If the load is volatile, we only want to change the load type if the
       // resulting load is legal. Otherwise we might increase the number of
       // memory accesses. We don't care if the original type was legal or not
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
-      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
+      ((!LegalOperations && cast<LoadSDNode>(N0Load)->isSimple()) ||
        TLI.isOperationLegal(ISD::LOAD, VT))) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0Load);
 
-    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+    if (TLI.isLoadBitCastBeneficial(N0Load.getValueType(), VT, DAG,
                                     *LN0->getMemOperand())) {
       // If the range metadata type does not match the new memory
       // operation type, remove the range metadata.
@@ -16721,7 +16731,12 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getMemOperand());
-      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+      DAG.ReplaceAllUsesOfValueWith(N0Load.getValue(1), Load.getValue(1));
+
+      // If there was a freeze, wrap the load with freeze again.
+      if (HasFreeze)
+        Load = DAG.getFreeze(Load);
+
       return Load;
     }
   }
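
The combine itself is unchanged; the new code only peeks through a single-use FREEZE node to find the underlying load, retypes the load exactly as before, and then re-applies the freeze on top of the new load so poison is still frozen. In IR terms, the newly enabled rewrite corresponds to this minimal sketch (hand-written to mirror the AArch64 test added below; the combine actually runs on SelectionDAG nodes rather than on IR):

; before: a freeze sits between the load and the bitcast
  %load    = load <4 x i8>, ptr %p, align 4
  %freeze  = freeze <4 x i8> %load
  %bitcast = bitcast <4 x i8> %freeze to i32

; after: the load is retyped to i32 and the freeze is re-applied on top
  %load.i32 = load i32, ptr %p, align 4
  %frozen   = freeze i32 %load.i32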

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -3448,6 +3448,20 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
+  // With low alignment, don't convert integer vectors to large scalar loads,
+  // because otherwise they get broken into many small scalar loads.
+  if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
+      BitcastVT.isInteger()) {
+    const DataLayout &DL = DAG.getDataLayout();
+    unsigned MinAlign = DL.getPointerSize();
+    // Aligned well, will legalize into a clean sequence of loads.
+    if (MMO.getAlign() >= MinAlign)
+      return true;
+    // Aligned poorly for a large enough scalar.
+    if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
+      return false;
+  }
+
   // If both types are legal vectors, it's always ok to convert them.
   if (LoadVT.isVector() && BitcastVT.isVector() &&
       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
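
On x86-64, DL.getPointerSize() is 8 (bytes) and DL.getPointerSizeInBits() is 64, so the new heuristic reads: an integer-vector load aligned to at least 8 bytes is always worth retyping, while an under-aligned one is only retyped when the resulting scalar is at most 2 * 64 = 128 bits wide. As a hypothetical illustration (the value names and the explicit align 4 are not from the commit; the new tests later in this commit use default alignment), a poorly aligned load feeding a bitcast to i192 would now stay in vector form:

; align 4 is below the 8-byte pointer size, and 192 bits exceeds
; 2 * 64 bits, so isLoadBitCastBeneficial returns false here
  %vec  = load <3 x i64>, ptr %p, align 4
  %cast = bitcast <3 x i64> %vec to i192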
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

; Test that bitcast(freeze(load)) is optimized to freeze(load) with appropriate type
define i32 @freeze_bitcast_load(ptr %p) {
; CHECK-LABEL: freeze_bitcast_load:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    lsr w0, w8, #8
; CHECK-NEXT:    ret
  %load = load <4 x i8>, ptr %p, align 4
  %freeze = freeze <4 x i8> %load
  %bitcast = bitcast <4 x i8> %freeze to i32
  %result = lshr i32 %bitcast, 8
  ret i32 %result
}
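
The single ldr w8 is the point of this test: the <4 x i8> load is retyped to one 32-bit load even though a freeze sits between it and the bitcast, which is exactly the pattern the DAGCombiner change above newly handles.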

llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll

Lines changed: 1 addition & 1 deletion
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]

llvm/test/CodeGen/X86/avx10_2bf16-arith.ll

Lines changed: 2 additions & 2 deletions
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s

; Test that we don't bitcast loads from integer vectors to large illegal scalars.
; The threshold is 2*pointer_size (128 bits on x86-64).

; i192 is just above the threshold - should NOT bitcast even through freeze
define i192 @load_v3i64_to_i192_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v3i64_to_i192_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq (%rdi), %rax
; CHECK-NEXT:    movq 8(%rdi), %rdx
; CHECK-NEXT:    movq 16(%rdi), %rcx
; CHECK-NEXT:    retq
  %vec = load <3 x i64>, ptr %p
  %freeze = freeze <3 x i64> %vec
  %result = bitcast <3 x i64> %freeze to i192
  ret i192 %result
}

; i128 is at the threshold - should allow bitcast through freeze
define i128 @load_v2i64_to_i128_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v2i64_to_i128_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq (%rdi), %rax
; CHECK-NEXT:    movq 8(%rdi), %rdx
; CHECK-NEXT:    retq
  %vec = load <2 x i64>, ptr %p
  %freeze = freeze <2 x i64> %vec
  %result = bitcast <2 x i64> %freeze to i128
  ret i128 %result
}

; Test with i256 (way above threshold)
define i256 @load_v4i64_to_i256_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v4i64_to_i256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    movaps (%rsi), %xmm0
; CHECK-NEXT:    movaps 16(%rsi), %xmm1
; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
; CHECK-NEXT:    movaps %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %vec = load <4 x i64>, ptr %p
  %freeze = freeze <4 x i64> %vec
  %result = bitcast <4 x i64> %freeze to i256
  ret i256 %result
}

; Test with i160 (also above threshold)
define i160 @load_v5i32_to_i160_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v5i32_to_i160_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq (%rdi), %rax
; CHECK-NEXT:    movq 8(%rdi), %rdx
; CHECK-NEXT:    movl 16(%rdi), %ecx
; CHECK-NEXT:    movl %ecx, %ecx
; CHECK-NEXT:    retq
  %vec = load <5 x i32>, ptr %p
  %freeze = freeze <5 x i32> %vec
  %result = bitcast <5 x i32> %freeze to i160
  ret i160 %result
}

; FP vectors should still be allowed to bitcast through freeze
define double @load_v2f32_to_f64_freeze(ptr %p) nounwind {
; CHECK-LABEL: load_v2f32_to_f64_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    retq
  %vec = load <2 x float>, ptr %p
  %freeze = freeze <2 x float> %vec
  %result = bitcast <2 x float> %freeze to double
  ret double %result
}
