[MLIR][NVVM] [NFC] Update test cmd-lines and doc links (#128207)

durga4github · web-flow · commit 86cb0bd3a237 · 2025-02-24T12:55:11.000+05:30
For the NVVM Dialect tests under Target/LLVMIR/nvvm/ dir,
we verify the lowering to the intrinsics using mlir-translate.
Remove the -verify-diagnostics option from the cmd-line
for these tests since all the verifier checks are tested through
the nvvmir-invalid.mlir file. Similarly, remove the split-input-file
option which is not relevant here.

Update a few remaining links in the NVVMOps.td file.
All the reference links follow the same style now.

Rename the tcgen05-barriers.mlir file to tcgen05-commit.mlir
and move the wait/fence tests to a separate file.

Signed-off-by: Durgadoss R &lt;durgadossr@nvidia.com&gt;
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2648,8 +2648,7 @@ def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
     the amount specified by `nCols` and writes the destination
     address to the `addr` argument. The `nCols` operand specifies the
     number of columns to be allocated and it must be a power-of-two.
-    [For more information, refer to the PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
   }];
 
   let arguments = (ins
@@ -2679,8 +2678,7 @@ def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
     specified by `tmemAddr`, which must be from a previous tensor
     memory allocation. The `nCols` operand specifies the number
     of columns to be de-allocated, and it must be a power-of-two.
-    [For more information, refer to the PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
   }];
 
   let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols,
@@ -2708,8 +2706,7 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm
     of the executing thread is relinquishing the right to allocate
     Tensor Memory. So, it is illegal for a CTA to perform `tcgen05.alloc`
     after any of its constituent threads execute `tcgen05.relinquish_alloc_permit`.
-    [For more information, refer to the PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
   }];
 
   let arguments = (ins
@@ -2733,8 +2730,7 @@ def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
     The `tcgen05.fence<after>` orders all subsequent async tcgen05 operations
     with respect to the prior tcgen05 and execution ordering operations.
 
-    [For more information refer to the PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
   }];
 
   let arguments = (ins Tcgen05FenceKindAttr:$kind);
@@ -2756,8 +2752,7 @@ def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
     have completed. Similarly, the `tcgen05.wait<store>` causes the executing
     thread to block until all prior `tcgen05.st` operations issued by the
     executing thread have completed.
-    [For more information refer PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
   }];
 
   let arguments = (ins Tcgen05WaitKindAttr:$kind);
@@ -2782,8 +2777,7 @@ def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
     when present, specifies the destination CTAs in the cluster such
     that each bit position in the 16-bit `multicastMask` operand
     corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
-    [For more information refer PTX ISA]
-    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
   }];
 
   let arguments = (ins
diff --git a/mlir/test/Target/LLVMIR/nvvm/cvt_tf32.mlir b/mlir/test/Target/LLVMIR/nvvm/cvt_tf32.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir %s  -split-input-file --verify-diagnostics | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
 // CHECK-LABEL: @convert_float_to_tf32_rna
 llvm.func @convert_float_to_tf32_rna(%src : f32) -> i32 {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt -split-input-file -verify-diagnostics %s
-// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --check-prefix=CHECK-LLVM
 
 // CHECK-LABEL: @llvm_nvvm_tcgen05_alloc
 llvm.func @llvm_nvvm_tcgen05_alloc(%addr : !llvm.ptr, %ncols : i32) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-commit.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-commit.mlir
@@ -1,27 +1,4 @@
-// RUN: mlir-opt -split-input-file -verify-diagnostics %s
-// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM
-
-// CHECK-LABEL: @llvm_nvvm_tcgen05_fence
-llvm.func @llvm_nvvm_tcgen05_fence() {
-  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync()
-  nvvm.tcgen05.fence #nvvm.tcgen05_fence<before>
-
-  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync()
-  nvvm.tcgen05.fence #nvvm.tcgen05_fence<after>
-
-  llvm.return
-}
-
-// CHECK-LABEL: @llvm_nvvm_tcgen05_wait
-llvm.func @llvm_nvvm_tcgen05_wait() {
-  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld()
-  nvvm.tcgen05.wait #nvvm.tcgen05_wait<load>
-
-  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.st()
-  nvvm.tcgen05.wait #nvvm.tcgen05_wait<store>
-
-  llvm.return
-}
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --check-prefix=CHECK-LLVM
 
 // CHECK-LABEL: @llvm_nvvm_tcgen05_commit_generic
 llvm.func @llvm_nvvm_tcgen05_commit_generic(%barrier : !llvm.ptr, %cta_mask : i16) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-cp.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-cp.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
 // CHECK-LABEL: @nvvm_tcgen05_cp_128x256b
 llvm.func @nvvm_tcgen05_cp_128x256b(%taddr : !llvm.ptr<6>, %smem_desc : i64) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-fence-wait.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-fence-wait.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --check-prefix=CHECK-LLVM
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_fence
+llvm.func @llvm_nvvm_tcgen05_fence() {
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync()
+  nvvm.tcgen05.fence #nvvm.tcgen05_fence<before>
+
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync()
+  nvvm.tcgen05.fence #nvvm.tcgen05_fence<after>
+
+  llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_wait
+llvm.func @llvm_nvvm_tcgen05_wait() {
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld()
+  nvvm.tcgen05.wait #nvvm.tcgen05_wait<load>
+
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.st()
+  nvvm.tcgen05.wait #nvvm.tcgen05_wait<store>
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-shift.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-shift.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
 // CHECK-LABEL: @llvm_nvvm_tcgen05_shift
 llvm.func @llvm_nvvm_tcgen05_shift(%taddr : !llvm.ptr<6>) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt -split-input-file -verify-diagnostics %s
-// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
 // CHECK-LABEL: @llvm_nvvm_cp_async_bulk_global_to_shared_cluster
 llvm.func @llvm_nvvm_cp_async_bulk_global_to_shared_cluster(%dst : !llvm.ptr<3>, %src : !llvm.ptr<1>, %mbar : !llvm.ptr<3>, %size : i32, %mc : i16, %ch : i64) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir %s  -split-input-file --verify-diagnostics | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
 // CHECK-LABEL: @tma_prefetch_1d
 llvm.func @tma_prefetch_1d(%tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-translate -mlir-to-llvmir -split-input-file --verify-diagnostics %s | FileCheck %s
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
 // CHECK-LABEL: define void @tma_store_reduce_1d(
 llvm.func @tma_store_reduce_1d(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// RUN: mlir-translate -mlir-to-llvmir %s -split-input-file --verify-diagnostics \| FileCheck %s`
	`1`	`+// RUN: mlir-translate -mlir-to-llvmir %s \| FileCheck %s`
`2`	`2`
`3`	`3`	`// CHECK-LABEL: @convert_float_to_tf32_rna`
`4`	`4`	`llvm.func @convert_float_to_tf32_rna(%src : f32) -> i32 {`