Skip to content

Conversation

@daniel-zabawa
Copy link
Contributor

Add patterns to select 16b imulzu with -mapx-features=zu, including folding of zero-extends of the result. IsDesirableToPromoteOp is changed to leave 16b multiplies by a constant un-promoted, as imulzu will not cause partial-write stalls.

Add patterns to select 16b imulzu with -mapx-features=zu, including
folding of zero-extends of the result. IsDesirableToPromoteOp is
changed to leave 16b multiplies by a constant un-promoted, as imulzu
will not cause partial-write stalls.
@llvmbot
Copy link
Member

llvmbot commented Nov 19, 2024

@llvm/pr-subscribers-backend-x86

Author: Daniel Zabawa (daniel-zabawa)

Changes

Add patterns to select 16b imulzu with -mapx-features=zu, including folding of zero-extends of the result. IsDesirableToPromoteOp is changed to leave 16b multiplies by a constant un-promoted, as imulzu will not cause partial-write stalls.


Full diff: https://github.com/llvm/llvm-project/pull/116806.diff

4 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+6)
  • (modified) llvm/lib/Target/X86/X86InstrCompiler.td (+30-4)
  • (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+2)
  • (added) llvm/test/CodeGen/X86/apx/imulzu.ll (+238)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 34bc5d76c15cea..7476c268e5acbc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58919,6 +58919,12 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
     if (IsFoldableAtomicRMW(N0, Op) ||
         (Commute && IsFoldableAtomicRMW(N1, Op)))
       return false;
+    // When ZU is enabled, we prefer to not promote for MUL by a constant,
+    // since a 16b imulzu will not incur partial-write stalls, and may be
+    // able to fold away a zero-extend of the 16b result.
+    if (Subtarget.hasZU() && Op.getOpcode() == ISD::MUL &&
+        (isa<ConstantSDNode>(N0) || isa<ConstantSDNode>(N1)))
+      return false;
   }
   }
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a05c3f028442c0..885c9e98fce4a5 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2184,17 +2184,43 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
 defm : EFLAGSDefiningPats<"", NoNDD>;
 defm : EFLAGSDefiningPats<"_ND", HasNDD>;
 
+let Predicates = [HasZU] in {
+  // zext (mul reg/mem, imm) -> imulzu
+  def : Pat<(i32 (zext (i16 (mul GR16:$src1, imm:$src2)))),
+            (SUBREG_TO_REG (i32 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
+  def : Pat<(i32 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
+            (SUBREG_TO_REG (i32 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
+  def : Pat<(i64 (zext (i16 (mul GR16:$src1, imm:$src2)))),
+            (SUBREG_TO_REG (i64 0), (IMULZU16rri GR16:$src1, imm:$src2), sub_16bit)>;
+  def : Pat<(i64 (zext (i16 (mul (loadi16 addr:$src1), imm:$src2)))),
+            (SUBREG_TO_REG (i64 0), (IMULZU16rmi addr:$src1, imm:$src2), sub_16bit)>;
+  
+  // (mul (reg/mem), imm) -> imulzu
+  // Note this pattern doesn't explicitly require the zero-upper behaviour of imulzu,
+  // but instead avoids the zero-extend of the reg/mem operand that would be 
+  // required if the multiply were promoted to 32b to avoid partial-write stalls.
+  // The imulzu here simply doesn't incur any partial-write stalls.
+  def : Pat<(mul GR16:$src1, imm:$src2),
+            (IMULZU16rri GR16:$src1, imm:$src2)>;
+  def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+            (IMULZU16rmi addr:$src1, imm:$src2)>;
+}
+
 // mul reg, imm
-def : Pat<(mul GR16:$src1, imm:$src2),
-          (IMUL16rri GR16:$src1, imm:$src2)>;
+let Predicates = [NoZU] in {
+  def : Pat<(mul GR16:$src1, imm:$src2),
+            (IMUL16rri GR16:$src1, imm:$src2)>;
+}
 def : Pat<(mul GR32:$src1, imm:$src2),
           (IMUL32rri GR32:$src1, imm:$src2)>;
 def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
           (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // reg = mul mem, imm
-def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
-          (IMUL16rmi addr:$src1, imm:$src2)>;
+let Predicates = [NoZU] in {
+  def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+            (IMUL16rmi addr:$src1, imm:$src2)>;
+}
 def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
           (IMUL32rmi addr:$src1, imm:$src2)>;
 def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7fb566fba51818..02cb4556ba13ba 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -45,6 +45,8 @@ def NoEGPR       : Predicate<"!Subtarget->hasEGPR()">;
 // entries, so that the NDD variant can be selected first to benefit RA.
 def HasNDD       : Predicate<"Subtarget->hasNDD()">;
 def NoNDD        : Predicate<"!Subtarget->hasNDD()">;
+def HasZU        : Predicate<"Subtarget->hasZU()">;
+def NoZU         : Predicate<"!Subtarget->hasZU()">;
 def HasCF        : Predicate<"Subtarget->hasCF()">;
 def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
diff --git a/llvm/test/CodeGen/X86/apx/imulzu.ll b/llvm/test/CodeGen/X86/apx/imulzu.ll
new file mode 100644
index 00000000000000..5b598ebf35ab89
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/imulzu.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+zu | FileCheck %s --check-prefix=ZU
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=NOZU
+
+; Test generation of 16b imulzu when -mattr=+zu is specified.
+; The mulzu_* tests check for basic generation, which will fold away a zero-extend of the
+; result if present.
+; The following tests are modifications of selected test/CodeGen/X86/imul.ll tests with
+; 16b multiplies, to check that common strength reductions in ISel are still performed
+; when -mattr=+zu is in effect.
+;
+; FIXME: several cases from imul.ll covering DAG combines, in particular those using LEA,
+; are not ported as X86's IsDesirableToPromoteOp has no way to accurately identify when
+; promotion will permit a better sequence than an unpromoted imulzu.
+; These cases should be added when they are implemented.
+
+define i32 @mulzu_16_32(i16 %A) {
+; ZU-LABEL: mulzu_16_32:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_32:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i32
+    ret i32 %r
+}
+
+define i64 @mulzu_16_64(i16 %A) {
+; ZU-LABEL: mulzu_16_64:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_64:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i64
+    ret i64 %r
+}
+
+define i32 @mulzu_16_32_mem(ptr %P) {
+; ZU-LABEL: mulzu_16_32_mem:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_32_mem:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movzwl (%rdi), %eax
+; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %P, i64 0
+    %A = load i16, ptr %gep
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i32
+    ret i32 %r
+}
+
+define i64 @mulzu_16_64_mem(ptr %P) {
+; ZU-LABEL: mulzu_16_64_mem:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_64_mem:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movzwl (%rdi), %eax
+; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    movzwl %ax, %eax
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %P, i64 0
+    %A = load i16, ptr %gep
+    %mul = mul i16 %A, 1234
+    %r = zext i16 %mul to i64
+    ret i64 %r
+}
+
+define void @mulzu_16_store(i16 %A, ptr %R) {
+; ZU-LABEL: mulzu_16_store:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, %di, %ax # imm = 0x4D2
+; ZU-NEXT:    movw %ax, (%rsi)
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_store:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    imull $1234, %edi, %eax # imm = 0x4D2
+; NOZU-NEXT:    movw %ax, (%rsi)
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %R, i64 0
+    %mul = mul i16 %A, 1234
+    store i16 %mul, ptr %gep
+    ret void
+}
+
+define void @mulzu_16_store_mem(ptr %P, ptr %R) {
+; ZU-LABEL: mulzu_16_store_mem:
+; ZU:       # %bb.0:
+; ZU-NEXT:    imulzuw $1234, (%rdi), %ax # imm = 0x4D2
+; ZU-NEXT:    movw %ax, (%rsi)
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulzu_16_store_mem:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movzwl (%rdi), %eax
+; NOZU-NEXT:    imull $1234, %eax, %eax # imm = 0x4D2
+; NOZU-NEXT:    movw %ax, (%rsi)
+; NOZU-NEXT:    retq
+    %gep = getelementptr i16, ptr %P, i64 0
+    %gep1 = getelementptr i16, ptr %R, i64 0
+    %A = load i16, ptr %gep
+    %mul = mul i16 %A, 1234
+    store i16 %mul, ptr %gep1
+    ret void
+}
+
+; Tests ported from test/CodeGen/X86/imul.ll follow from this point.
+
+define i16 @mul4_16(i16 %A) {
+;
+; ZU-LABEL: mul4_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; ZU-NEXT:    leal (,%rdi,4), %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOZU-NEXT:    leal (,%rdi,4), %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4
+    ret i16 %mul
+}
+
+define i16 @mul4096_16(i16 %A) {
+;
+; ZU-LABEL: mul4096_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    movl %edi, %eax
+; ZU-NEXT:    shll $12, %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4096_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movl %edi, %eax
+; NOZU-NEXT:    shll $12, %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4096
+    ret i16 %mul
+}
+
+define i16 @mulmin4096_16(i16 %A) {
+;
+; ZU-LABEL: mulmin4096_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    movl %edi, %eax
+; ZU-NEXT:    shll $12, %eax
+; ZU-NEXT:    negl %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mulmin4096_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movl %edi, %eax
+; NOZU-NEXT:    shll $12, %eax
+; NOZU-NEXT:    negl %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, -4096
+    ret i16 %mul
+}
+
+define i16 @mul4_16_minsize(i16 %A) minsize {
+;
+; ZU-LABEL: mul4_16_minsize:
+; ZU:       # %bb.0:
+; ZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; ZU-NEXT:    leal (,%rdi,4), %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4_16_minsize:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOZU-NEXT:    leal (,%rdi,4), %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4
+    ret i16 %mul
+}
+
+define i16 @mul0_16(i16 %A) {
+;
+; ZU-LABEL: mul0_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    xorl %eax, %eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul0_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    xorl %eax, %eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 0
+    ret i16 %mul
+}
+
+define i16 @mul4294967295_16(i16 %A) {
+;
+; ZU-LABEL: mul4294967295_16:
+; ZU:       # %bb.0:
+; ZU-NEXT:    movl %edi, %eax
+; ZU-NEXT:    negl %eax
+; ZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; ZU-NEXT:    retq
+;
+; NOZU-LABEL: mul4294967295_16:
+; NOZU:       # %bb.0:
+; NOZU-NEXT:    movl %edi, %eax
+; NOZU-NEXT:    negl %eax
+; NOZU-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOZU-NEXT:    retq
+    %mul = mul i16 %A, 4294967295
+    ret i16 %mul
+}

Copy link
Contributor

@phoebewang phoebewang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM with some nits.

@daniel-zabawa
Copy link
Contributor Author

Addressed the latest comments. Please merge on my behalf if the build completes normally.

@phoebewang phoebewang merged commit c1a3960 into llvm:main Nov 26, 2024
8 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants