Address comments.

fzou1 · fzou1 · commit e28c3230bbb3 · 2024-10-30T09:53:20.000+08:00
diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h
@@ -1,4 +1,4 @@
-/*===---------- amxfp8intrin.h - AMX intrinsics -*- C++ -*------------===
+/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -15,9 +15,68 @@
 #define __AMXFP8INTRIN_H
 #ifdef __x86_64__
 
+
+/// Compute the dot product of brain-float8 (BF8) or hybrid-float8 (HF8)
+///    floating-point pairs in tiles \a a and \a b, accumulate the
+///    intermediate single-precision (32-bit) floating-point elements with
+///    elements in \a dst, and store the 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPBF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbf8ps __builtin_ia32_tdpbf8ps
+
+/// \code
+/// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps
+
+/// \code
+/// void _tile_dphbf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dphbf8ps __builtin_ia32_tdphbf8ps
+
+/// \code
+/// void _tile_dphf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPHF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dphf8ps __builtin_ia32_tdphf8ps
 
 #endif /* __x86_64__ */
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
@@ -271,8 +271,8 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
                                          "Support AMX-COMPLEX instructions",
                                          [FeatureAMXTILE]>;
 def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
-                                         "Support AMX-FP8 instructions",
-                                         [FeatureAMXTILE]>;
+                                     "Support AMX-FP8 instructions",
+                                     [FeatureAMXTILE]>;
 def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true",
                                         "Support CMPCCXADD instructions">;
 def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true",
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37410,7 +37410,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTDPBUSD:
   case X86::PTDPBUUD:
   case X86::PTDPBF16PS:
-  case X86::PTDPFP16PS: {
+  case X86::PTDPFP16PS:
+  case X86::PTDPBF8PS:
+  case X86::PTDPBHF8PS:
+  case X86::PTDPHBF8PS:
+  case X86::PTDPHF8PS: {
     unsigned Opc;
     switch (MI.getOpcode()) {
     // clang-format off
@@ -37421,6 +37425,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
     case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
+    case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
+    case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
+    case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
+    case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
     // clang-format on
     }
 
@@ -37503,38 +37511,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent(); // The pseudo is gone now.
     return BB;
   }
-  case X86::PTDPBF8PS:
-  case X86::PTDPBHF8PS:
-  case X86::PTDPHBF8PS:
-  case X86::PTDPHF8PS: {
-    const DebugLoc &DL = MI.getDebugLoc();
-    unsigned Opc;
-    switch (MI.getOpcode()) {
-    default:
-      llvm_unreachable("Unexpected instruction!");
-    case X86::PTDPBF8PS:
-      Opc = X86::TDPBF8PS;
-      break;
-    case X86::PTDPBHF8PS:
-      Opc = X86::TDPBHF8PS;
-      break;
-    case X86::PTDPHBF8PS:
-      Opc = X86::TDPHBF8PS;
-      break;
-    case X86::PTDPHF8PS:
-      Opc = X86::TDPHF8PS;
-      break;
-    }
-
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
-
-    MI.eraseFromParent();
-    return BB;
-  }
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -274,9 +274,9 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
     let Constraints = "$src1 = $dst" in {
       class AMX_FP8_BASE<bits<8> Opcode, string Opstr> :
         I<Opcode, MRMSrcReg4VOp3, (outs TILE:$dst),
-        (ins TILE:$src1, TILE:$src2, TILE:$src3),
-        !strconcat(Opstr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-        []>, VEX, VVVV;
+          (ins TILE:$src1, TILE:$src2, TILE:$src3),
+          !strconcat(Opstr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          []>, VEX, VVVV;
     }
 
     def TDPBF8PS : AMX_FP8_BASE<0xfd, "tdpbf8ps">, T_MAP5, PS;
@@ -287,22 +287,22 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
       // To be translated to the actual instructions in X86ISelLowering.cpp
-      def PTDPBF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                       u8imm:$src2, u8imm:$src3),
-                                       [(int_x86_tdpbf8ps timm:$src1,
-                                         timm:$src2, timm:$src3)]>;
-      def PTDPBHF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                        u8imm:$src2, u8imm:$src3),
-                                        [(int_x86_tdpbhf8ps timm:$src1,
-                                          timm:$src2, timm:$src3)]>;
-      def PTDPHBF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                        u8imm:$src2, u8imm:$src3),
-                                        [(int_x86_tdphbf8ps timm:$src1,
-                                          timm:$src2, timm:$src3)]>;
-      def PTDPHF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                       u8imm:$src2, u8imm:$src3),
-                                       [(int_x86_tdphf8ps timm:$src1,
-                                         timm:$src2, timm:$src3)]>;
+      def PTDPBF8PS : PseudoI<(outs),
+                              (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                              [(int_x86_tdpbf8ps timm:$src1, timm:$src2,
+                                timm:$src3)]>;
+      def PTDPBHF8PS : PseudoI<(outs),
+                               (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                               [(int_x86_tdpbhf8ps timm:$src1, timm:$src2,
+                                 timm:$src3)]>;
+      def PTDPHBF8PS : PseudoI<(outs),
+                               (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                               [(int_x86_tdphbf8ps timm:$src1, timm:$src2,
+                                 timm:$src3)]>;
+      def PTDPHF8PS : PseudoI<(outs),
+                              (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                              [(int_x86_tdphf8ps timm:$src1, timm:$src2,
+                                timm:$src3)]>;
     }
   }
 }