intel
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h‎
Lines changed: 21 additions & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 31 additions & 34 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 31 additions & 34 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp‎
Lines changed: 0 additions & 20 deletions b/‎lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp‎
Lines changed: 0 additions & 20 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp‎
Lines changed: 9 additions & 0 deletions b/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎python/test/unit/language/test_core.py‎
Lines changed: 2 additions & 2 deletions b/‎python/test/unit/language/test_core.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎test/Conversion/tma_to_llvm.mlir‎
Lines changed: 9 additions & 6 deletions b/‎test/Conversion/tma_to_llvm.mlir‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎test/Conversion/tritongpu_to_llvm.mlir‎
Lines changed: 6 additions & 9 deletions b/‎test/Conversion/tritongpu_to_llvm.mlir‎
Lines changed: 6 additions & 9 deletions
diff --git a/‎test/Conversion/tritongpu_to_llvm_blackwell.mlir‎
Lines changed: 1 addition & 1 deletion b/‎test/Conversion/tritongpu_to_llvm_blackwell.mlir‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎test/Conversion/tritongpu_to_llvm_hopper.mlir‎
Lines changed: 5 additions & 5 deletions b/‎test/Conversion/tritongpu_to_llvm_hopper.mlir‎
Lines changed: 5 additions & 5 deletions
@@ -208,6 +208,27 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
   ModuleAxisInfoAnalysis &axisAnalysisPass;
 };
 
+// Trivial case where we map elementwise to an existing LLVM operator
+template <typename SourceOp, typename DestOp>
+struct ElementwiseOpConversion
+    : public ElementwiseOpConversionBase<
+          SourceOp, ElementwiseOpConversion<SourceOp, DestOp>> {
+  using Base =
+      ElementwiseOpConversionBase<SourceOp,
+                                  ElementwiseOpConversion<SourceOp, DestOp>>;
+  using Base::Base;
+  using OpAdaptor = typename Base::OpAdaptor;
+
+  // An interface to support variant DestOp builder.
+  SmallVector<DestOp> createDestOps(SourceOp op, OpAdaptor adaptor,
+                                    ConversionPatternRewriter &rewriter,
+                                    Type elemTy, MultipleOperandsRange operands,
+                                    Location loc) const {
+    return {rewriter.create<DestOp>(loc, elemTy, operands[0],
+                                    adaptor.getAttributes().getValue())};
+  }
+};
+
 } // namespace gpu
 
 } // namespace mlir::triton
 
@@ -29,6 +29,32 @@
 using namespace mlir;
 using namespace mlir::triton;
 
+namespace mlir::LLVM {
+using namespace mlir::triton;
+
+Value createConstantI1(Location loc, OpBuilder &rewriter, bool v);
+Value createConstantI32(Location loc, OpBuilder &rewriter, int32_t v);
+Value createConstantI64(Location loc, OpBuilder &rewriter, int64_t v);
+Value createConstantF16(Location loc, OpBuilder &rewriter, float v);
+Value createConstantBF16(Location loc, OpBuilder &rewriter, float v);
+Value createConstantF32(Location loc, OpBuilder &rewriter, float v);
+Value createConstantF64(Location loc, OpBuilder &rewriter, double v);
+Value createNaNConstant(Location loc, OpBuilder &rewriter, Type type);
+Value createIndexConstant(OpBuilder &builder, Location loc,
+                          const TypeConverter *converter, int64_t value);
+Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width,
+                                int64_t value);
+
+LLVM::CallOp createLLVMCallOp(OpBuilder &builder, Location loc,
+                              LLVMFuncOp funcOp, ValueRange args);
+LLVM::CallIntrinsicOp
+createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic,
+                          TypeRange types, ValueRange args);
+} // namespace mlir::LLVM
+
+// Is v an integer or floating-point scalar constant equal to 0?
+bool isConstantZero(Value v);
+
 namespace mlir::triton {
 
 // Returns CTA level thread idx
@@ -248,26 +274,15 @@ struct TritonLLVMOpBuilder {
   Value i1_val(int64_t val) { return int_val(1, val); }
   Value true_val() { return int_val(1, true); }
   Value false_val() { return int_val(1, false); }
-  Value f16_val(float v) {
-    auto type = type::f16Ty(builder->getContext());
-    return builder->create<LLVM::ConstantOp>(loc, type,
-                                             builder->getF16FloatAttr(v));
-  }
-  Value f32_val(float v) {
-    auto type = type::f32Ty(builder->getContext());
-    return builder->create<LLVM::ConstantOp>(loc, type,
-                                             builder->getF32FloatAttr(v));
-  }
-  Value f64_val(double v) {
-    auto type = type::f64Ty(builder->getContext());
-    return builder->create<LLVM::ConstantOp>(loc, type,
-                                             builder->getF64FloatAttr(v));
-  }
+  Value f16_val(float v) { return LLVM::createConstantF16(loc, *builder, v); }
+  Value bf16_val(float v) { return LLVM::createConstantBF16(loc, *builder, v); }
+  Value f32_val(float v) { return LLVM::createConstantF32(loc, *builder, v); }
+  Value f64_val(double v) { return LLVM::createConstantF64(loc, *builder, v); }
   Value i8_val(int64_t val) { return int_val(8, val); }
   Value i16_val(int64_t val) { return int_val(16, val); }
   Value i32_val(int64_t val) { return int_val(32, val); }
   Value i64_val(int64_t val) { return int_val(64, val); }
-  Value tid_val() { return getThreadId(*this->builder, loc); }
+  Value tid_val() { return getThreadId(*builder, loc); }
 
   Location loc;
   OpBuilder *builder;
@@ -375,24 +390,6 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
 namespace LLVM {
 using namespace mlir::triton;
 
-Value createConstantI1(Location loc, OpBuilder &rewriter, bool v);
-Value createConstantI32(Location loc, OpBuilder &rewriter, int32_t v);
-Value createConstantI64(Location loc, OpBuilder &rewriter, int64_t v);
-Value createConstantF16(Location loc, OpBuilder &rewriter, float v);
-Value createConstantF32(Location loc, OpBuilder &rewriter, float v);
-Value createConstantF64(Location loc, OpBuilder &rewriter, double v);
-Value createNaNConstant(Location loc, OpBuilder &rewriter, Type type);
-Value createIndexConstant(OpBuilder &builder, Location loc,
-                          const TypeConverter *converter, int64_t value);
-Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width,
-                                int64_t value);
-
-LLVM::CallOp createLLVMCallOp(OpBuilder &builder, Location loc,
-                              LLVMFuncOp funcOp, ValueRange args);
-LLVM::CallIntrinsicOp
-createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic,
-                          TypeRange types, ValueRange args);
-
 // Is v an integer or floating-point scalar constant equal to 0?
 bool isConstantZero(Value v);
 
 
@@ -215,26 +215,6 @@ struct ExternElementwiseOpConversion
   }
 };
 
-template <typename SourceOp, typename DestOp>
-struct ElementwiseOpConversion
-    : public ElementwiseOpConversionBase<
-          SourceOp, ElementwiseOpConversion<SourceOp, DestOp>> {
-  using Base =
-      ElementwiseOpConversionBase<SourceOp,
-                                  ElementwiseOpConversion<SourceOp, DestOp>>;
-  using Base::Base;
-  using OpAdaptor = typename Base::OpAdaptor;
-
-  // An interface to support variant DestOp builder.
-  SmallVector<DestOp> createDestOps(SourceOp op, OpAdaptor adaptor,
-                                    ConversionPatternRewriter &rewriter,
-                                    Type elemTy, MultipleOperandsRange operands,
-                                    Location loc) const {
-    return {rewriter.create<DestOp>(loc, elemTy, operands[0],
-                                    adaptor.getAttributes().getValue())};
-  }
-};
-
 struct ElementwiseInlineAsmOpConversion
     : public ConvertOpToLLVMPattern<ElementwiseInlineAsmOp> {
   using Base = ConvertOpToLLVMPattern<ElementwiseInlineAsmOp>;
 
@@ -505,6 +505,15 @@ Value createConstantF16(Location loc, OpBuilder &rewriter, float v) {
                                            rewriter.getF16FloatAttr(v));
 }
 
+Value createConstantBF16(Location loc, OpBuilder &rewriter, float v) {
+  APFloat apf(v);
+  bool ignored;
+  apf.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &ignored);
+  auto type = type::bf16Ty(rewriter.getContext());
+  auto attr = FloatAttr::get(type, apf);
+  return rewriter.create<LLVM::ConstantOp>(loc, type, attr);
+}
+
 Value createConstantF32(Location loc, OpBuilder &rewriter, float v) {
   auto type = type::f32Ty(rewriter.getContext());
   return rewriter.create<LLVM::ConstantOp>(loc, type,
 
@@ -6466,7 +6466,7 @@ def kernel(x_ptr, min_ptr, max_ptr, out_ptr, ref_ptr, N, BLOCK_SIZE: tl.constexp
 # Test for symmetric clamp(x, -limit, limit), as it may go through optimized
 # codegen in the backends
 @pytest.mark.interpreter
-@pytest.mark.parametrize("dtype", ['float16', 'float32'])
+@pytest.mark.parametrize("dtype", ['bfloat16', 'float16', 'float32'])
 def test_clamp_symmetric(dtype, device):
 
     @triton.jit
@@ -6545,7 +6545,7 @@ def test_tl_range(device):
             if capability[0] >= 8:
                 ptx = pgm.asm['ptx']
                 # check that the loop got pipelined with the right number of stages.
-                assert 'cp.async.wait_group 0x6' in ptx
+                assert 'cp.async.wait_group 6' in ptx
 
 
 @triton.jit(noinline=True)
 
@@ -22,8 +22,9 @@ tt.func @tma_gather_simple(%arg0: !tt.ptr<i8>, %arg1: !ttg.memdesc<1xi64, #share
   // CHECK: [[WIDX:%.*]] = lshr i32 [[TIDX]], 5
   // CHECK: [[WARP_ID:%.*]] = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 [[WIDX]],
 
-  // CHECK: [[ELECT:%.*]] = tail call i1 asm "elect.sync
-  // CHECK: [[PRED:%.*]] = and i1 %5, [[ELECT]]
+  // CHECK: [[ELECT:%.*]] = tail call { i32, i1 } @llvm.nvvm.elect.sync
+  // CHECK: [[ELECT_PRED:%.*]] = extractvalue { i32, i1 } [[ELECT]], 1
+  // CHECK: [[PRED:%.*]] = and i1 %5, [[ELECT_PRED]]
 
   // CHECK: [[IDX0:%.*]] = extractvalue {{.*}} %2, 0
   // CHECK: [[IDX1:%.*]] = extractvalue {{.*}} %2, 1
@@ -142,8 +143,9 @@ tt.func @tma_gather_redundant_warps(%arg0: !tt.ptr<i8>, %arg1: !ttg.memdesc<1xi6
   // CHECK: [[WARP_SELECT:%.*]] = and i32 [[WARP_ID]], 2
   // CHECK: [[WARP_PRED:%.*]] = icmp eq i32 [[WARP_SELECT]], 0
   // CHECK: [[PRED_TMP:%.*]] = and i1 %5, [[WARP_PRED]]
-  // CHECK: [[ELECT:%.*]] = tail call i1 asm "elect.sync
-  // CHECK: [[PRED:%.*]] = and i1 [[ELECT]], [[PRED_TMP]]
+  // CHECK: [[ELECT:%.*]] = tail call { i32, i1 } @llvm.nvvm.elect.sync
+  // CHECK: [[ELECT_PRED:%.*]] = extractvalue { i32, i1 } [[ELECT]], 1
+  // CHECK: [[PRED:%.*]] = and i1 [[ELECT_PRED]], [[PRED_TMP]]
 
   // CHECK-COUNT-8: cp.async.bulk.tensor{{.*}}(i1 [[PRED]],
   ttng.async_tma_gather %arg0[%arg2, %arg3] %arg4, %arg1, %arg5 : !tt.ptr<i8>, tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked2}>>, i32, !ttg.memdesc<1xi64, #shared, #smem, mutable>, !ttg.memdesc<32x128xbf16, #shared1, #smem, mutable>, i1
@@ -158,14 +160,15 @@ tt.func @tma_scatter(%arg0: !tt.ptr<i8>, %arg1: tensor<32xi32, #ttg.slice<{dim =
   // with `async_tma_gather`, so we don't need to re-test the indexing logic.
 
   // CHECK: [[BASE_PTR:%.*]] = extractvalue {{.*}} %3, 0
-  // CHECK: [[PRED:%.*]] = tail call i1 asm "elect.sync
+  // CHECK: [[ELECT:%.*]] = tail call { i32, i1 } @llvm.nvvm.elect.sync
+  // CHECK: [[PRED:%.*]] = extractvalue { i32, i1 } [[ELECT]], 1
 
   // CHECK: [[PTR:%.*]] = getelementptr {{.*}} [[BASE_PTR]]
   // CHECK-NEXT: "@$0 cp.async.bulk.tensor.2d.tile::scatter4.global.shared::cta.bulk_group [$1, {$2, $3, $4, $5, $6}], [$7];"
   // CHECK-SAME: (i1 [[PRED]], ptr addrspace(1) %0, i32 %2, i32 {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}}, ptr addrspace(3) [[PTR]])
   ttng.async_tma_scatter %arg0[%arg1, %arg2] %arg3 : !tt.ptr<i8>, tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, i32, !ttg.memdesc<32x128xbf16, #shared1, #smem, mutable>
 
-  // CHECK: asm sideeffect "cp.async.bulk.commit_group ;", ""()
+  // CHECK: call void @llvm.nvvm.cp.async.commit.group()
 
   // CHECK-NEXT: ret void
   tt.return
 
@@ -479,7 +479,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: basic_program_id
   tt.func @basic_program_id() {
-    // CHECK: llvm.inline_asm asm_dialect = att operand_attrs = [] "mov.u32 $0, %ctaid.x;", "=r"  : () -> i32
+    // CHECK: llvm.call_intrinsic "llvm.nvvm.read.ptx.sreg.ctaid.x"() : () -> i32
     %0 = tt.get_program_id x : i32
     tt.return
   }
@@ -553,7 +553,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: basic_async_wait
   tt.func @basic_async_wait() {
-    // CHECK: cp.async.wait_group 0x4
+    // CHECK: nvvm.cp.async.wait.group 4
     ttg.async_wait {num = 4: i32}
     tt.return
   }
@@ -588,7 +588,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
     // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8
     // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8
     // CHECK: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x8, 0x8
-    // CHECK: cp.async.commit_group
+    // CHECK: nvvm.cp.async.commit.group
     %73 = ttg.async_copy_global_to_local %66, %subview : tensor<64x!tt.ptr<i64>, #slice1d0> -> !ttg.memdesc<64xi64, #shared1D, #smem, mutable>
     ttg.async_commit_group %73
     tt.return
@@ -628,8 +628,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
     // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "cp.async.cg.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x10, 0x10;"
     // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "cp.async.cg.shared.global [ ${{.*}} + 16 ], [ ${{.*}} + 0 ], 0x10, 0x10;"
-    // CHECK: llvm.inline_asm has_side_effects asm_dialect = att
-    // CHECK-SAME: cp.async.commit_group
+    // CHECK: nvvm.cp.async.commit.group
     %a = ttg.async_copy_global_to_local %a_ptr, %tensor : tensor<16x64x!tt.ptr<f32>, #AL> -> !ttg.memdesc<16x64xf32, #A, #smem, mutable>
     ttg.async_commit_group
     tt.return
@@ -675,8 +674,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4
     // CHECK: llvm.inline_asm
     // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: cp.async.commit_group
+    // CHECK: nvvm.cp.async.commit.group
     %a = ttg.async_copy_global_to_local %a_ptr, %tensor : tensor<16x32x!tt.ptr<f32>, #AL> -> !ttg.memdesc<16x32xf32, #A, #smem, mutable>
     ttg.async_commit_group
     tt.return
@@ -732,8 +730,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4
     // CHECK: llvm.inline_asm
     // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4
-    // CHECK: llvm.inline_asm
-    // CHECK-SAME: cp.async.commit_group
+    // CHECK: nvvm.cp.async.commit.group
     %a = ttg.async_copy_global_to_local %a_ptr, %tensor : tensor<32x32x!tt.ptr<f32>, #AL> -> !ttg.memdesc<32x32xf32, #A, #smem, mutable>
     ttg.async_commit_group
     tt.return
 
@@ -12,7 +12,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
   // CHECK: %[[P0:.+]] = llvm.icmp "eq" %[[WID]], %[[C0]] : i32
   // CHECK: %[[P1:.+]] = llvm.and %{{.*}}, %[[P0]]  : i1
   // CHECK: llvm.cond_br %[[P1]]
-  // CHECK: %[[E:.+]] = llvm.inline_asm asm_dialect = att operand_attrs = [] "elect.sync _|$0, 0xffffffff;", "=b"  : () -> i1
+  // CHECK: %[[E:.+]] = nvvm.elect.sync -> i1
   // CHECK-COUNT-8: @$5 tcgen05.mma.cta_group::1.kind::f16 [ $0 + 0 ], $1, $2, $3, $4;", "r,l,l,r,b,b" %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %[[E]]
   //         CHECK: @$0 tcgen05.commit.cta_group::1.mbarrier::arrive::one.b64 [$1];", "b,l" %[[E]]
   tt.func @tc_gen5_mma(%a: !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>,
 
@@ -182,7 +182,7 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
     %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
     %neg_limit = arith.subf %cst, %limit : tensor<1024xf32, #blocked>
 
-    // CHECK: "min.xorsign.abs.f32 $0, $1, $2;", "=f,f,f"
+    // CHECK-COUNT-8: nvvm.fmin.xorsign.abs.f
     %12 = tt.clampf %x, %neg_limit, %limit, propagateNan = none : tensor<1024xf32, #blocked>
     tt.return
   }
@@ -208,12 +208,12 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 #shared = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: cvt_mma_to_dot_fp8
-// CHECK: prmt.b32
-// CHECK: prmt.b32
+// CHECK: nvvm.prmt
+// CHECK: nvvm.prmt
 // CHECK: nvvm.shfl.sync
 // CHECK: nvvm.shfl.sync
-// CHECK: prmt.b32
-// CHECK: prmt.b32
+// CHECK: nvvm.prmt
+// CHECK: nvvm.prmt
   tt.func @cvt_mma_to_dot_fp8(%a: tensor<128x64xf8E5M2, #mma>) {
     %opA = ttg.convert_layout %a : tensor<128x64xf8E5M2, #mma> -> tensor<128x64xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
     tt.return