Commit 2e05786

[Dialect] Actually enable TMEM layout check and fix all the tests (#7723)

I also removed unnecessary `noinline=false` from the tests.

1 parent dad2bab

68 files changed (+355, -351 lines)

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 1 addition & 0 deletions
@@ -516,6 +516,7 @@ static LogicalResult verifyTMEMOperand(Operation *op, RankedTensorType type,
              << " layout is not TMEM compatible";
       for (Attribute layout : layouts)
         diag.attachNote() << "potential TMEM layout: " << layout;
+      return diag;
     }
   }
   return success();
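
The added `return diag;` is the whole library-side fix: MLIR's `emitOpError()` yields an `InFlightDiagnostic` that only converts into a failed `LogicalResult` if it is returned; without the return, the function falls through to `success()` and the op is never actually rejected. Below is a minimal, self-contained sketch of that verifier pattern, not the actual Triton source: the surrounding checks are elided, and `verifyTMEMOperandSketch` and `layoutIsTMEMCompatible` are placeholder names for illustration only.

// Sketch only: mirrors the shape of verifyTMEMOperand with placeholder inputs.
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Operation.h"

using namespace mlir;

static LogicalResult verifyTMEMOperandSketch(Operation *op, RankedTensorType type,
                                             ArrayRef<Attribute> layouts,
                                             bool layoutIsTMEMCompatible) {
  if (!layoutIsTMEMCompatible) {
    // Build the error plus one note per candidate layout.
    InFlightDiagnostic diag = op->emitOpError()
                              << type << " layout is not TMEM compatible";
    for (Attribute layout : layouts)
      diag.attachNote() << "potential TMEM layout: " << layout;
    // Returning the diagnostic converts it to failure(); if this return is
    // omitted, control falls through to success() and verification passes.
    return diag;
  }
  return success();
}

The test hunks that follow are the mechanical cleanups mentioned in the commit message: dropping the redundant `noinline = false` attribute from the affected `tt.func` declarations.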

test/Analysis/amd/test-alignment.mlir

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 #mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
 
-tt.func public @kernel(%arg0: tensor<256x64xf16, #mma> {tt.contiguity=256 : i32, tt.divisibility=6: i32, tt.constancy=1: i32}) attributes {noinline = false} {
+tt.func public @kernel(%arg0: tensor<256x64xf16, #mma> {tt.contiguity=256 : i32, tt.divisibility=6: i32, tt.constancy=1: i32}) {
   // expeted-remark @below {{contiguity = [128, 32], divisibility = [6, 6], constancy = [1, 1], constant_value = <none>}}
   %0 = amdgpu.extract_slice %arg0 [128, 32] : tensor<256x64xf16, #mma> to tensor<128x32xf16, #mma>
   tt.return

test/Conversion/amd/buffer_atomic_cas.mlir

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 #blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
   // CHECK-LABEL: buffer_atomic_cas_i64
-  tt.func public @buffer_atomic_cas_i64(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+  tt.func public @buffer_atomic_cas_i64(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
     // CHECK: %[[cas_val:.*]] = llvm.mlir.constant(2 : i64) : i64
     // CHECK: %[[cas_val_cast:.*]] = llvm.bitcast %[[cas_val]] : i64 to i64
     // CHECK: %[[cas_val_insert:.*]] = llvm.insertvalue %[[cas_val_cast]], %{{.*}}[1] : !llvm.struct<(i64, i64)>

test/Conversion/amd/buffer_load_store.mlir

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 #blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
   // CHECK-LABEL: strided_buffer_load_and_store
-  tt.func public @strided_buffer_load_and_store(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+  tt.func public @strided_buffer_load_and_store(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
     %cst = arith.constant dense<2> : tensor<1024xi32, #blocked>
     %0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
     %1 = arith.muli %0, %cst : tensor<1024xi32, #blocked>

test/Conversion/amd/builtin_func_to_llvm.mlir

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_fast_expf(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_fast_expf(%arg0: tensor<64xf32, #blocked>) {
     // LLVM_FTZ: llvm.amdgcn.exp2.f32
     // LLVM_NO_FTZ: llvm.exp2.f32
     %0 = tt.extern_elementwise %arg0 {libname = "libdevice", libpath = "", pure = true, symbol = "__triton_hip_fast_expf"} : (tensor<64xf32, #blocked>) -> tensor<64xf32, #blocked>

test/Conversion/amd/dedup-by-constancy.mlir

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 // In the future, we can reduce the icmp to 2 in such case.
 #mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [1, 1], instrShape = [32, 32], isTransposed = false}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @dedup_by_constancy_mfma(%arg0: i32 {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+  tt.func public @dedup_by_constancy_mfma(%arg0: i32 {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
     %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #mma}>>
     %1 = tt.splat %arg0 : i32 -> tensor<32xi32, #ttg.slice<{dim = 1, parent = #mma}>>
     %2 = arith.cmpi slt, %0, %1 : tensor<32xi32, #ttg.slice<{dim = 1, parent = #mma}>>

test/Conversion/amd/load_store.mlir

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 #mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}>
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
   // CHECK-LABEL: global_store_mfma_vec16
-  tt.func public @global_store_mfma_vec16(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+  tt.func public @global_store_mfma_vec16(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
     %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
     %cst_1 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>

test/Conversion/amd/math-denorm-handling.mlir

Lines changed: 6 additions & 6 deletions
@@ -4,7 +4,7 @@
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_exp2(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_exp2(%arg0: tensor<64xf32, #blocked>) {
     // LLVM_FTZ: llvm.amdgcn.exp2.f32
     // LLVM_NO_FTZ: llvm.exp2.f32
     %0 = math.exp2 %arg0 : tensor<64xf32, #blocked>
@@ -16,7 +16,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_exp(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_exp(%arg0: tensor<64xf32, #blocked>) {
     // LLVM_FTZ: llvm.exp2.f32
     // LLVM_NO_FTZ: llvm.exp2.f32
     %0 = math.exp %arg0 : tensor<64xf32, #blocked>
@@ -28,7 +28,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_rsqrt(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_rsqrt(%arg0: tensor<64xf32, #blocked>) {
     // LLVM_FTZ: llvm.amdgcn.rsq.f32
     // LLVM_NO_FTZ: _ocml_rsqrt_f32
     %0 = math.rsqrt %arg0 : tensor<64xf32, #blocked>
@@ -40,7 +40,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_sqrt_f32(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_sqrt_f32(%arg0: tensor<64xf32, #blocked>) {
     // LLVM_FTZ-LABEL: test_sqrt_f32
     // LLVM_FTZ-NOT: llvm.fcmp "ogt"
     // LLVM_FTZ: llvm.amdgcn.sqrt.f32
@@ -63,7 +63,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_sqrt_rn_f32(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} {
+  tt.func public @test_sqrt_rn_f32(%arg0: tensor<64xf32, #blocked>) {
     // LLVM_FTZ-LABEL: test_sqrt_rn_f32
     // LLVM_FTZ: llvm.amdgcn.rsq.f32
     // LLVM_FTZ: llvm.fmul
@@ -89,7 +89,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_sqrt_rn_f64(%arg0: tensor<64xf64, #blocked>) attributes {noinline = false} {
+  tt.func public @test_sqrt_rn_f64(%arg0: tensor<64xf64, #blocked>) {
     // COMMON-LABEL: test_sqrt_rn_f64
     // COMMON: llvm.intr.sqrt
     %0 = tt.precise_sqrt %arg0 : tensor<64xf64, #blocked>

test/Conversion/amd/mfma-shortcut.mlir

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 #mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
   // GFX950-LABEL: mfma_linear_permlane_swap
-  tt.func public @mfma_linear_permlane_swap(%arg0: tensor<128x128xf16, #mma>) attributes {noinline = false} {
+  tt.func public @mfma_linear_permlane_swap(%arg0: tensor<128x128xf16, #mma>) {
     // GFX950-COUNT-16: llvm.call_intrinsic "llvm.amdgcn.permlane32.swap"
     %1 = ttg.convert_layout %arg0: tensor<128x128xf16, #mma> -> tensor<128x128xf16, #linear>
     tt.return

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 3 additions & 3 deletions
@@ -474,7 +474,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 // -----
 #blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @atomic_kernel_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+  tt.func public @atomic_kernel_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) release
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) acquire
     %cst = arith.constant dense<true> : tensor<1024xi1, #blocked>
@@ -495,7 +495,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 
 #blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @atomic_kernel_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) attributes {noinline = false} {
+  tt.func public @atomic_kernel_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) release
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
@@ -518,7 +518,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 
 #blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @atomic_kernel_fp32(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+  tt.func public @atomic_kernel_fp32(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) release
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
     // CHECK: llvm.atomicrmw {{.*}}, {{.*}} syncscope({{"agent"}}) monotonic
