[X86] Eliminate redundant zero-extension instructions

FranciscoThiesen · FranciscoThiesen · commit 622f767197a0 · 2025-09-30T09:52:34.000-07:00
This pass eliminates redundant MOVZX32rr8 instructions when the source
register is a sub-register of the destination and the destination's upper
bits are already known to be zero.

For example, in loops processing byte values:
```
  movzbl (%rdi), %ecx  ; ECX upper 24 bits are zero
  ...
  movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0
```

The optimization:
- Runs post-register allocation in the X86 backend pipeline
- Analyzes backward through basic blocks to verify upper bits are zero
- Handles cross-block analysis by checking predecessor definitions
- Only eliminates when provably safe (not heuristic)

This commonly occurs in loops that process byte values, saving one
instruction per loop iteration and reducing code size by 3 bytes.
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
@@ -47,6 +47,7 @@ set(sources
   X86FixupVectorConstants.cpp
   X86AvoidStoreForwardingBlocks.cpp
   X86DynAllocaExpander.cpp
+  X86EliminateRedundantZeroExtend.cpp
   X86FixupSetCC.cpp
   X86FlagsCopyLowering.cpp
   X86FloatingPoint.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
@@ -127,6 +127,10 @@ FunctionPass *createX86CmovConverterPass();
 /// the upper portions of registers, and to save code size.
 FunctionPass *createX86FixupBWInsts();
 
+/// Return a Machine IR pass that eliminates redundant zero-extension
+/// instructions where the upper bits are already known to be zero.
+FunctionPass *createX86EliminateRedundantZeroExtend();
+
 /// Return a Machine IR pass that reassigns instruction chains from one domain
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
diff --git a/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
@@ -0,0 +1,292 @@
+//===-- X86EliminateRedundantZeroExtend.cpp - Eliminate Redundant ZExt ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass eliminates redundant zero-extension instructions where the source
+/// register is a sub-register of the destination and the destination's upper
+/// bits are known to be zero.
+///
+/// For example:
+///   movzbl (%rdi), %ecx  ; ECX = zero-extend byte, upper 24 bits are zero
+///   ...
+///   movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0
+///
+/// This pattern commonly occurs in loops processing byte values.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-eliminate-zext"
+#define PASS_NAME "X86 Eliminate Redundant Zero Extension"
+
+namespace {
+class EliminateRedundantZeroExtend : public MachineFunctionPass {
+public:
+  static char ID;
+  EliminateRedundantZeroExtend() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return PASS_NAME; }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setNoVRegs();
+  }
+
+private:
+  const X86InstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+
+  /// Check if the register's upper bits are known to be zero at this point.
+  /// This checks backward from MI to find the most recent definition of Reg.
+  bool hasZeroUpperBits(Register Reg, const MachineInstr &MI,
+                        const MachineBasicBlock &MBB) const;
+
+  /// Try to eliminate a redundant MOVZX instruction.
+  bool tryEliminateRedundantZeroExtend(MachineInstr &MI,
+                                       MachineBasicBlock &MBB) const;
+};
+
+char EliminateRedundantZeroExtend::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86EliminateRedundantZeroExtend() {
+  return new EliminateRedundantZeroExtend();
+}
+
+bool EliminateRedundantZeroExtend::hasZeroUpperBits(
+    Register Reg, const MachineInstr &MI, const MachineBasicBlock &MBB) const {
+  // Walk backward from MI to find the most recent definition of Reg
+  MachineBasicBlock::const_reverse_iterator I = ++MI.getReverseIterator();
+  MachineBasicBlock::const_reverse_iterator E = MBB.rend();
+  for (; I != E; ++I) {
+    const MachineInstr &Inst = *I;
+
+    // Check if this instruction defines Reg
+    for (const MachineOperand &MO : Inst.operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+
+      Register DefReg = MO.getReg();
+      if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+        // Found a definition - check if it zeros upper bits
+        unsigned Opc = Inst.getOpcode();
+        switch (Opc) {
+        // These instructions zero-extend to 32 bits
+        case X86::MOVZX32rm8:
+        case X86::MOVZX32rr8:
+        case X86::MOVZX32rm16:
+        case X86::MOVZX32rr16:
+          return true;
+        // XOR with self zeros the register
+        case X86::XOR32rr:
+          if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+            return true;
+          return false;
+        // MOV32r0 explicitly zeros
+        case X86::MOV32r0:
+          return true;
+        // ADD, SUB on 32-bit register (implicitly zero-extends to 64-bit)
+        case X86::ADD32rr:
+        case X86::ADD32ri:
+        case X86::ADD32rm:
+        case X86::SUB32rr:
+        case X86::SUB32ri:
+        case X86::SUB32rm:
+        case X86::LEA32r:
+          return true;
+        default:
+          // Any other definition might set upper bits, so not safe
+          return false;
+        }
+      }
+
+      // Check if this instruction modifies Reg (partial write or implicit use)
+      if (TRI->regsOverlap(DefReg, Reg)) {
+        // Partial register update - upper bits are unknown
+        return false;
+      }
+    }
+
+    // Check for implicit defs
+    for (const MachineOperand &MO : Inst.implicit_operands()) {
+      if (MO.isReg() && MO.isDef() && TRI->regsOverlap(MO.getReg(), Reg)) {
+        return false;
+      }
+    }
+  }
+
+  // Didn't find a definition in this block - check predecessors
+  // If all predecessors define Reg with zero upper bits, it's safe
+  if (MBB.pred_empty())
+    return false;
+
+  // Check all predecessor blocks
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    bool FoundZeroExtend = false;
+
+    // SAFETY CHECK: If the sub-register is live-in to the predecessor,
+    // we make the CONSERVATIVE assumption that the parent register was
+    // zero-extended in an earlier block.
+    //
+    // This is safe because:
+    // 1. After register allocation, if $cl is live-in but $ecx is not,
+    //    it means only the low 8 bits are meaningful
+    // 2. The register allocator ensures no other code modifies $ecx between
+    //    the zero-extension and this point (otherwise $ecx would be live)
+    // 3. Any write to $ch or upper bits would show as a def of $ecx, which
+    //    would be found in our backward scan below and handled correctly
+    //
+    // However, this is still conservative - we should verify the actual
+    // definition to be completely safe.
+    Register SubReg8 = TRI->getSubReg(Reg, X86::sub_8bit);
+    Register SubReg16 = TRI->getSubReg(Reg, X86::sub_16bit);
+    bool SubRegLiveIn = (SubReg8 && Pred->isLiveIn(SubReg8)) ||
+                        (SubReg16 && Pred->isLiveIn(SubReg16));
+
+    if (SubRegLiveIn) {
+      // Sub-register is live-in. We'll verify this is safe by checking
+      // that no instructions in this block modify the parent register
+      // before we reach the end (where control flows to our block).
+      // If we find any such modification, we'll conservatively bail out.
+      bool SafeToAssume = true;
+      for (const MachineInstr &Inst : *Pred) {
+        for (const MachineOperand &MO : Inst.operands()) {
+          if (MO.isReg() && MO.isDef()) {
+            Register DefReg = MO.getReg();
+            // Check if this modifies Reg or overlaps with it (partial write)
+            if ((DefReg == Reg || TRI->regsOverlap(DefReg, Reg)) &&
+                DefReg != SubReg8 && DefReg != SubReg16) {
+              // Found a write to the parent register or overlapping register
+              // that's not just the sub-register we expect
+              SafeToAssume = false;
+              break;
+            }
+          }
+        }
+        if (!SafeToAssume)
+          break;
+      }
+
+      if (SafeToAssume) {
+        FoundZeroExtend = true;
+        goto next_predecessor;
+      }
+    }
+
+    // Walk backward through predecessor to find last definition of Reg
+    for (const MachineInstr &Inst : llvm::reverse(*Pred)) {
+      // Check if this instruction defines Reg
+      for (const MachineOperand &MO : Inst.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+
+        Register DefReg = MO.getReg();
+        if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+          // Found a definition - check if it zeros upper bits
+          unsigned Opc = Inst.getOpcode();
+          switch (Opc) {
+          case X86::MOVZX32rm8:
+          case X86::MOVZX32rr8:
+          case X86::MOVZX32rm16:
+          case X86::MOVZX32rr16:
+          case X86::MOV32r0:
+          case X86::ADD32rr:
+          case X86::ADD32ri:
+          case X86::ADD32rm:
+          case X86::SUB32rr:
+          case X86::SUB32ri:
+          case X86::SUB32rm:
+          case X86::LEA32r:
+            FoundZeroExtend = true;
+            break;
+          case X86::XOR32rr:
+            if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+              FoundZeroExtend = true;
+            break;
+          default:
+            // Found a definition that doesn't zero upper bits
+            return false;
+          }
+          // Found the definition in this predecessor
+          goto next_predecessor;
+        }
+
+        // Check for partial register updates
+        if (TRI->regsOverlap(DefReg, Reg)) {
+          return false;
+        }
+      }
+    }
+
+  next_predecessor:
+    // If we didn't find a zero-extending definition in this predecessor, fail
+    if (!FoundZeroExtend)
+      return false;
+  }
+
+  // All predecessors have zero-extending definitions
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::tryEliminateRedundantZeroExtend(
+    MachineInstr &MI, MachineBasicBlock &MBB) const {
+  unsigned Opc = MI.getOpcode();
+
+  // Only handle MOVZX32rr8 for now (can extend to MOVZX32rr16 later)
+  if (Opc != X86::MOVZX32rr8)
+    return false;
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // Check if source is a sub-register of destination
+  // e.g., CL is sub-register of ECX
+  if (!TRI->isSubRegister(DstReg, SrcReg))
+    return false;
+
+  // Check if destination's upper bits are already zero
+  if (!hasZeroUpperBits(DstReg, MI, MBB))
+    return false;
+
+  // The MOVZX is redundant! Since SrcReg is part of DstReg and DstReg's
+  // upper bits are already zero, this instruction does nothing.
+  LLVM_DEBUG(dbgs() << "Eliminating redundant zero-extend: " << MI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Iterate through instructions - use a worklist to handle erasures
+    SmallVector<MachineInstr *, 4> ToErase;
+
+    for (MachineInstr &MI : MBB) {
+      if (tryEliminateRedundantZeroExtend(MI, MBB)) {
+        Changed = true;
+        // Note: MI is already erased in tryEliminateRedundantZeroExtend
+        break; // Restart iteration for this block
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -558,6 +558,7 @@ void X86PassConfig::addPreEmitPass() {
 
   if (getOptLevel() != CodeGenOptLevel::None) {
     addPass(createX86FixupBWInsts());
+    addPass(createX86EliminateRedundantZeroExtend());
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
     addPass(createX86FixupInstTuning());
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test that redundant MOVZX instructions are eliminated when the source
+; register is a sub-register of the destination and the destination's upper
+; bits are already known to be zero.
+
+; This is the original countholes test case from GitHub issue that demonstrates
+; the redundant movzbl %cl, %ecx in the loop
+define i32 @countholes(ptr %s) {
+; CHECK-LABEL: countholes:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpb $48, %cl
+; CHECK-NEXT:    jb .LBB0_3
+; CHECK-NEXT:  # %bb.1: # %while.body.preheader
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    leaq pre_table(%rip), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %while.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addl $-48, %ecx
+; CHECK-NEXT:    addl (%rdx,%rcx,4), %eax
+; CHECK-NEXT:    movzbl (%rdi), %ecx
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    cmpb $47, %cl
+; CHECK-NEXT:    ja .LBB0_2
+; CHECK-NEXT:  .LBB0_3: # %cleanup
+; CHECK-NEXT:    retq
+entry:
+  %c.0 = load i8, ptr %s, align 1
+  %conv = zext i8 %c.0 to i32
+  %cmp = icmp ult i8 %c.0, 48
+  br i1 %cmp, label %cleanup, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %s.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %s, %while.body.preheader ]
+  %c.010 = phi i8 [ %c.1, %while.body ], [ %c.0, %while.body.preheader ]
+  %tot.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv3 = zext i8 %c.010 to i64
+  %sub = add nsw i64 %conv3, -48
+  %arrayidx = getelementptr inbounds [10 x i32], ptr @pre_table, i64 0, i64 %sub
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add i32 %0, %tot.09
+  %incdec.ptr = getelementptr inbounds i8, ptr %s.addr.011, i64 1
+  %c.1 = load i8, ptr %incdec.ptr, align 1
+  %cmp1 = icmp ult i8 %c.1, 48
+  br i1 %cmp1, label %cleanup.loopexit, label %while.body
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ 0, %entry ], [ %add, %cleanup.loopexit ]
+  ret i32 %retval.0
+}
+
+@pre_table = internal constant [10 x i32] [i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 2, i32 1], align 4