Commit 0e8a414
[CUDA, NVPTX] Added basic __bf16 support for NVPTX.
Recent Clang changes expose __bf16 types for SSE2-enabled host compilations, which makes those types visible during GPU-side compilation, where Sema currently rejects them with an error that __bf16 is not supported.

Considering that __bf16 is a storage-only type, enabling it for NVPTX whenever it is enabled on the host should pose no correctness issues.

Recent NVIDIA GPUs have introduced bf16 support, so we will likely grow better support for __bf16 on NVPTX going forward.

Differential Revision: https://reviews.llvm.org/D136311
1 parent fd5a2bf commit 0e8a414
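
In practice, this lets device-side code that merely moves __bf16 values around compile cleanly, while arithmetic on __bf16 remains rejected (see the SemaCUDA test below). A minimal sketch of the kind of kernel this enables; the function name and loop are illustrative and not part of the commit:

// Illustrative sketch only (not from this commit): __bf16 as a storage-only
// type on NVPTX -- values may be loaded, stored, and copied, but not used
// in arithmetic.
__device__ void copy_bf16(__bf16 *dst, const __bf16 *src, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[i];           // lowered to ld.b16 / st.b16 on NVPTX
  // dst[0] = src[0] + src[0]; // still a Sema error: __bf16 arithmetic
  //                           // is not enabled by this change
}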

11 files changed: +303, -131 lines

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 3 additions & 0 deletions

@@ -52,6 +52,9 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
   VLASupported = false;
   AddrSpaceMap = &NVPTXAddrSpaceMap;
   UseAddrSpaceMapMangling = true;
+  // __bf16 is always available as a load/store only type.
+  BFloat16Width = BFloat16Align = 16;
+  BFloat16Format = &llvm::APFloat::BFloat();
 
   // Define available target features
   // These must be defined in sorted order!

clang/lib/Basic/Targets/NVPTX.h

Lines changed: 2 additions & 0 deletions

@@ -177,6 +177,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
   }
 
   bool hasBitIntType() const override { return true; }
+  bool hasBFloat16Type() const override { return true; }
+  const char *getBFloat16Mangling() const override { return "u6__bf16"; };
 };
 } // namespace targets
 } // namespace clang
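
The getBFloat16Mangling() override above is what produces the u6__bf16 pieces visible in the CodeGen test's CHECK-LABEL lines. For example (signature taken from the test; the annotation is just a reading aid):

__device__ void test_arg(__bf16 *out, __bf16 in);
// Itanium-mangled as: _Z8test_argPu6__bf16u6__bf16
//   Pu6__bf16 = pointer to __bf16 (vendor-extended type mangling)
//   u6__bf16  = __bf16 passed by value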

clang/test/CodeGenCUDA/bf16.cu

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+// REQUIRES: nvptx-registered-target
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "nvptx64-nvidia-cuda" \
+// RUN:    -fcuda-is-device "-aux-target-cpu" "x86-64" -S -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: .visible .func _Z8test_argPu6__bf16u6__bf16(
+// CHECK:       .param .b64 _Z8test_argPu6__bf16u6__bf16_param_0,
+// CHECK:       .param .b16 _Z8test_argPu6__bf16u6__bf16_param_1
+//
+__device__ void test_arg(__bf16 *out, __bf16 in) {
+// CHECK:       ld.param.b16 %{{h.*}}, [_Z8test_argPu6__bf16u6__bf16_param_1];
+  __bf16 bf16 = in;
+  *out = bf16;
+// CHECK:       st.b16
+// CHECK:       ret;
+}
+
+
+// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z8test_retu6__bf16(
+// CHECK:       .param .b16 _Z8test_retu6__bf16_param_0
+__device__ __bf16 test_ret(__bf16 in) {
+// CHECK:       ld.param.b16 %h{{.*}}, [_Z8test_retu6__bf16_param_0];
+  return in;
+// CHECK:       st.param.b16 [func_retval0+0], %h
+// CHECK:       ret;
+}
+
+// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z9test_callu6__bf16(
+// CHECK:       .param .b16 _Z9test_callu6__bf16_param_0
+__device__ __bf16 test_call(__bf16 in) {
+// CHECK:       ld.param.b16 %h{{.*}}, [_Z9test_callu6__bf16_param_0];
+// CHECK:       st.param.b16 [param0+0], %h2;
+// CHECK:       .param .b32 retval0;
+// CHECK:       call.uni (retval0),
+// CHECK-NEXT:  _Z8test_retu6__bf16,
+// CHECK-NEXT:  (
+// CHECK-NEXT:  param0
+// CHECK-NEXT:  );
+// CHECK:       ld.param.b16 %h{{.*}}, [retval0+0];
+  return test_ret(in);
+// CHECK:       st.param.b16 [func_retval0+0], %h
+// CHECK:       ret;
+}

clang/test/SemaCUDA/bf16.cu

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+// REQUIRES: nvptx-registered-target
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 "-triple" "x86_64-unknown-linux-gnu" "-aux-triple" "nvptx64-nvidia-cuda" \
+// RUN:    "-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
+// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "nvptx64-nvidia-cuda" \
+// RUN:    -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
+
+#include "Inputs/cuda.h"
+
+__device__ void test(bool b, __bf16 *out, __bf16 in) {
+  __bf16 bf16 = in; // No error on using the type itself.
+
+  bf16 + bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+  bf16 - bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+  bf16 * bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+  bf16 / bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
+
+  __fp16 fp16;
+
+  bf16 + fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 + bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 - fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 - bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 * fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 * bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 / fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
+  fp16 / bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
+  bf16 = fp16; // scalar-error {{assigning to '__bf16' from incompatible type '__fp16'}}
+  fp16 = bf16; // scalar-error {{assigning to '__fp16' from incompatible type '__bf16'}}
+  bf16 + (b ? fp16 : bf16); // scalar-error {{incompatible operand types ('__fp16' and '__bf16')}}
+  *out = bf16;
+}

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

Lines changed: 1 addition & 0 deletions

@@ -1831,6 +1831,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
     break;
 
   case Type::HalfTyID:
+  case Type::BFloatTyID:
   case Type::FloatTyID:
   case Type::DoubleTyID:
     AddIntToBuffer(cast<ConstantFP>(CPV)->getValueAPF().bitcastToAPInt());

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 28 additions & 27 deletions

@@ -823,8 +823,10 @@ static Optional<unsigned> pickOpcodeForVT(
   case MVT::i64:
     return Opcode_i64;
   case MVT::f16:
+  case MVT::bf16:
     return Opcode_f16;
   case MVT::v2f16:
+  case MVT::v2bf16:
     return Opcode_f16x2;
   case MVT::f32:
     return Opcode_f32;
@@ -835,6 +837,21 @@ static Optional<unsigned> pickOpcodeForVT(
   }
 }
 
+static int getLdStRegType(EVT VT) {
+  if (VT.isFloatingPoint())
+    switch (VT.getSimpleVT().SimpleTy) {
+    case MVT::f16:
+    case MVT::bf16:
+    case MVT::v2f16:
+    case MVT::v2bf16:
+      return NVPTX::PTXLdStInstCode::Untyped;
+    default:
+      return NVPTX::PTXLdStInstCode::Float;
+    }
+  else
+    return NVPTX::PTXLdStInstCode::Unsigned;
+}
+
 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDLoc dl(N);
   MemSDNode *LD = cast<MemSDNode>(N);
@@ -891,19 +908,16 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   // Vector Setting
   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
   if (SimpleVT.isVector()) {
-    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
-    // v2f16 is loaded using ld.b32
+    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
+           "Unexpected vector type");
+    // v2f16/v2bf16 is loaded using ld.b32
     fromTypeWidth = 32;
   }
 
   if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
     fromType = NVPTX::PTXLdStInstCode::Signed;
-  else if (ScalarVT.isFloatingPoint())
-    // f16 uses .b16 as its storage type.
-    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
-                                             : NVPTX::PTXLdStInstCode::Float;
   else
-    fromType = NVPTX::PTXLdStInstCode::Unsigned;
+    fromType = getLdStRegType(ScalarVT);
 
   // Create the machine instruction DAG
   SDValue Chain = N->getOperand(0);
@@ -1033,11 +1047,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       N->getOperand(N->getNumOperands() - 1))->getZExtValue();
   if (ExtensionType == ISD::SEXTLOAD)
     FromType = NVPTX::PTXLdStInstCode::Signed;
-  else if (ScalarVT.isFloatingPoint())
-    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
-                                             : NVPTX::PTXLdStInstCode::Float;
   else
-    FromType = NVPTX::PTXLdStInstCode::Unsigned;
+    FromType = getLdStRegType(ScalarVT);
 
   unsigned VecType;
 
@@ -1057,7 +1068,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
     // v8f16 is a special case. PTX doesn't have ld.v8.f16
     // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
-    if (EltVT == MVT::v2f16) {
+    if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
      assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
      EltVT = MVT::i32;
      FromType = NVPTX::PTXLdStInstCode::Untyped;
@@ -1745,18 +1756,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   MVT ScalarVT = SimpleVT.getScalarType();
   unsigned toTypeWidth = ScalarVT.getSizeInBits();
   if (SimpleVT.isVector()) {
-    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
+    assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) &&
+           "Unexpected vector type");
     // v2f16 is stored using st.b32
     toTypeWidth = 32;
   }
 
-  unsigned int toType;
-  if (ScalarVT.isFloatingPoint())
-    // f16 uses .b16 as its storage type.
-    toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
-                                           : NVPTX::PTXLdStInstCode::Float;
-  else
-    toType = NVPTX::PTXLdStInstCode::Unsigned;
+  unsigned int toType = getLdStRegType(ScalarVT);
 
   // Create the machine instruction DAG
   SDValue Chain = ST->getChain();
@@ -1896,12 +1902,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   assert(StoreVT.isSimple() && "Store value is not simple");
   MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
-  unsigned ToType;
-  if (ScalarVT.isFloatingPoint())
-    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
-                                           : NVPTX::PTXLdStInstCode::Float;
-  else
-    ToType = NVPTX::PTXLdStInstCode::Unsigned;
+  unsigned ToType = getLdStRegType(ScalarVT);
 
   SmallVector<SDValue, 12> StOps;
   SDValue N2;
@@ -1929,7 +1930,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
    // v8f16 is a special case. PTX doesn't have st.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // store them with st.v4.b32.
-    if (EltVT == MVT::v2f16) {
+    if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
      assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
      EltVT = MVT::i32;
      ToType = NVPTX::PTXLdStInstCode::Untyped;

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 17 additions & 6 deletions

@@ -133,6 +133,9 @@ static bool IsPTXVectorType(MVT VT) {
   case MVT::v2f16:
   case MVT::v4f16:
   case MVT::v8f16: // <4 x f16x2>
+  case MVT::v2bf16:
+  case MVT::v4bf16:
+  case MVT::v8bf16: // <4 x bf16x2>
   case MVT::v2f32:
   case MVT::v4f32:
   case MVT::v2f64:
@@ -190,8 +193,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
     // Vectors with an even number of f16 elements will be passed to
     // us as an array of v2f16 elements. We must match this so we
     // stay in sync with Ins/Outs.
-    if (EltVT == MVT::f16 && NumElts % 2 == 0) {
-      EltVT = MVT::v2f16;
+    if ((EltVT == MVT::f16 || EltVT == MVT::bf16) && NumElts % 2 == 0) {
+      EltVT = EltVT == MVT::f16 ? MVT::v2f16 : MVT::v2bf16;
       NumElts /= 2;
     }
     for (unsigned j = 0; j != NumElts; ++j) {
@@ -400,6 +403,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
   addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
   addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
+  addRegisterClass(MVT::bf16, &NVPTX::Float16RegsRegClass);
+  addRegisterClass(MVT::v2bf16, &NVPTX::Float16x2RegsRegClass);
 
   // Conversion to/from FP16/FP16x2 is always legal.
   setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
@@ -495,6 +500,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
 
   // TRAP can be lowered to PTX trap
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -2334,14 +2340,17 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
   case MVT::v2i32:
   case MVT::v2i64:
   case MVT::v2f16:
+  case MVT::v2bf16:
   case MVT::v2f32:
   case MVT::v2f64:
   case MVT::v4i8:
   case MVT::v4i16:
   case MVT::v4i32:
   case MVT::v4f16:
+  case MVT::v4bf16:
   case MVT::v4f32:
   case MVT::v8f16: // <4 x f16x2>
+  case MVT::v8bf16: // <4 x bf16x2>
     // This is a "native" vector type
     break;
   }
@@ -2386,7 +2395,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     // v8f16 is a special case. PTX doesn't have st.v8.f16
     // instruction. Instead, we split the vector into v2f16 chunks and
     // store them with st.v4.b32.
-    assert(EltVT == MVT::f16 && "Wrong type for the vector.");
+    assert((EltVT == MVT::f16 || EltVT == MVT::bf16) &&
+           "Wrong type for the vector.");
     Opcode = NVPTXISD::StoreV4;
     StoreF16x2 = true;
     break;
@@ -4987,11 +4997,12 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     // v8f16 is a special case. PTX doesn't have ld.v8.f16
     // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
-    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+    assert((EltVT == MVT::f16 || EltVT == MVT::bf16) &&
+           "Unsupported v8 vector type.");
     LoadF16x2 = true;
     Opcode = NVPTXISD::LoadV4;
-    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
-                     MVT::Other};
+    EVT VVT = (EltVT == MVT::f16) ? MVT::v2f16 : MVT::v2bf16;
+    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
     LdResVTs = DAG.getVTList(ListVTs);
     break;
   }
