
Commit 46c97f9

[ROCMTarget] Make all pingpong arithmetic nsw and nuw (iree-org#21248)
This is safe because all of the arithmetic is either thread-id-related arithmetic, which is known to be well within bounds given the required workgroup size, or pointer arithmetic, where wrapping would be UB.
1 parent 6f6e577 commit 46c97f9
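
For context on why the flags are sound: the thread ids feeding these ops are delinearized against small bases, so every component is tightly bounded and the products and sums stay far below the width of index. A minimal standalone sketch of the annotated pattern (hypothetical @overflow_flag_sketch using a plain func.func rather than the spec's util.func; not taken from the patch):

func.func private @overflow_flag_sketch(%id: index) -> index {
  // Delinearizing a linear thread id into (256, 8) bounds %delin#1 to [0, 8).
  %c8 = arith.constant 8 : index
  %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
  // Without flags the backend must assume the multiply may wrap:
  //   %vec = arith.muli %delin#1, %c8 : index
  // Given the bound above, the product is at most 56, so neither signed nor
  // unsigned wrapping can occur and both flags hold.
  %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
  return %vec : index
}

The same reasoning covers the addi chains below: each term is a bounded lane or subgroup offset, so the running sums never approach the index width, presumably allowing tighter address computation downstream.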

File tree

1 file changed: +63 / -63 lines


compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir

Lines changed: 63 additions & 63 deletions
@@ -78,14 +78,14 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_

  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c8 : index
+   %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
    %lhs_thread_local = tensor.extract_slice %lhs_init [%delin#0, %vec] [1, 8] [1, 1] : !block_in to tensor<1x8xf16>
    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x8xf16>, vector<1x8xf16>
    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
  } {mapping = [#gpu.thread<linear_dim_0>]}
  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c8 : index
+   %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 8] [1, 1] : !block_in to tensor<1x8xf16>
    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x8xf16>, vector<1x8xf16>
    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -97,21 +97,21 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
  %0 = tensor.empty() : tensor<16x16x16x16xf32>
  %1 = scf.forall (%id) in (512) shared_outs(%out = %0) -> tensor<16x16x16x16xf32> {
    %ids:4 = affine.delinearize_index %id into (2, 4, 4, 16) : index, index, index, index
-   %inner_id = arith.muli %ids#2, %c4 : index
-   %m_outer_id = arith.muli %ids#0, %c8 : index
-   %n_outer_id = arith.muli %ids#1, %c4 : index
+   %inner_id = arith.muli %ids#2, %c4 overflow<nsw, nuw> : index
+   %m_outer_id = arith.muli %ids#0, %c8 overflow<nsw, nuw> : index
+   %n_outer_id = arith.muli %ids#1, %c4 overflow<nsw, nuw> : index
    %delin:2 = affine.delinearize_index %id into (64, 8) : index, index
    %wt:3 = affine.delinearize_index %id into (8, 8, 8) : index, index, index

    // Inner 64 loads 8 threads x 8 elements.
-   %gko = arith.muli %wt#2, %c8 : index
+   %gko = arith.muli %wt#2, %c8 overflow<nsw, nuw> : index
    // Each subgroup loads 32 contiguous rows out of 256.
-   %bpo = arith.muli %wt#0, %c32 : index
+   %bpo = arith.muli %wt#0, %c32 overflow<nsw, nuw> : index
    // Base index is remaining outer 8 lanes + subgroup base.
-   %glb0 = arith.addi %wt#1, %bpo : index
-   %glb1 = arith.addi %glb0, %c8 : index
-   %glb2 = arith.addi %glb1, %c8 : index
-   %glb3 = arith.addi %glb2, %c8 : index
+   %glb0 = arith.addi %wt#1, %bpo overflow<nsw, nuw> : index
+   %glb1 = arith.addi %glb0, %c8 overflow<nsw, nuw> : index
+   %glb2 = arith.addi %glb1, %c8 overflow<nsw, nuw> : index
+   %glb3 = arith.addi %glb2, %c8 overflow<nsw, nuw> : index

    %2 = arith.constant dense<0.0> : vector<8x4x1x4xf32>

@@ -299,14 +299,14 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in

  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c8 : index
+   %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 8] [1, 1, 1] : !exp_block_in to tensor<1x1x8xf16>
    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x8xf16>, vector<1x8xf16>
    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
  } {mapping = [#gpu.thread<linear_dim_0>]}
  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c8 : index
+   %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 8] [1, 1] : !block_in to tensor<1x8xf16>
    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x8xf16>, vector<1x8xf16>
    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -318,21 +318,21 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
  %0 = tensor.empty() : tensor<1x16x16x16x16xf32>
  %1 = scf.forall (%id) in (512) shared_outs(%out = %0) -> tensor<1x16x16x16x16xf32> {
    %ids:4 = affine.delinearize_index %id into (2, 4, 4, 16) : index, index, index, index
-   %inner_id = arith.muli %ids#2, %c4 : index
-   %m_outer_id = arith.muli %ids#0, %c8 : index
-   %n_outer_id = arith.muli %ids#1, %c4 : index
+   %inner_id = arith.muli %ids#2, %c4 overflow<nsw, nuw> : index
+   %m_outer_id = arith.muli %ids#0, %c8 overflow<nsw, nuw> : index
+   %n_outer_id = arith.muli %ids#1, %c4 overflow<nsw, nuw> : index
    %delin:2 = affine.delinearize_index %id into (64, 8) : index, index
    %wt:3 = affine.delinearize_index %id into (8, 8, 8) : index, index, index

    // Inner 64 loads 8 threads x 8 elements.
-   %gko = arith.muli %wt#2, %c8 : index
+   %gko = arith.muli %wt#2, %c8 overflow<nsw, nuw> : index
    // Each subgroup loads 32 contiguous rows out of 256.
-   %bpo = arith.muli %wt#0, %c32 : index
+   %bpo = arith.muli %wt#0, %c32 overflow<nsw, nuw> : index
    // Base index is remaining outer 8 lanes + subgroup base.
-   %glb0 = arith.addi %wt#1, %bpo : index
-   %glb1 = arith.addi %glb0, %c8 : index
-   %glb2 = arith.addi %glb1, %c8 : index
-   %glb3 = arith.addi %glb2, %c8 : index
+   %glb0 = arith.addi %wt#1, %bpo overflow<nsw, nuw> : index
+   %glb1 = arith.addi %glb0, %c8 overflow<nsw, nuw> : index
+   %glb2 = arith.addi %glb1, %c8 overflow<nsw, nuw> : index
+   %glb3 = arith.addi %glb2, %c8 overflow<nsw, nuw> : index

    %2 = arith.constant dense<0.0> : vector<8x4x1x4xf32>

@@ -525,14 +525,14 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas

  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c16 : index
+   %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
  } {mapping = [#gpu.thread<linear_dim_0>]}
  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c16 : index
+   %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
@@ -544,22 +544,22 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
  %0 = tensor.empty() : tensor<1x16x16x16x16xf32>
  %1 = scf.forall (%id) in (512) shared_outs(%out = %0) -> tensor<1x16x16x16x16xf32> {
    %ids:4 = affine.delinearize_index %id into (2, 4, 4, 16) : index, index, index, index
-   %inner_id = arith.muli %ids#2, %c8 : index
-   %inner_id_acc = arith.muli %ids#2, %c4 : index
-   %m_outer_id = arith.muli %ids#0, %c8 : index
-   %n_outer_id = arith.muli %ids#1, %c4 : index
+   %inner_id = arith.muli %ids#2, %c8 overflow<nsw, nuw> : index
+   %inner_id_acc = arith.muli %ids#2, %c4 overflow<nsw, nuw> : index
+   %m_outer_id = arith.muli %ids#0, %c8 overflow<nsw, nuw> : index
+   %n_outer_id = arith.muli %ids#1, %c4 overflow<nsw, nuw> : index
    %delin:2 = affine.delinearize_index %id into (64, 8) : index, index
    %wt:3 = affine.delinearize_index %id into (8, 8, 8) : index, index, index

    // Inner 64 loads 8 threads x 16 elements.
-   %gko = arith.muli %wt#2, %c16 : index
+   %gko = arith.muli %wt#2, %c16 overflow<nsw, nuw> : index
    // Each subgroup loads 32 contiguous rows out of 256.
-   %bpo = arith.muli %wt#0, %c32 : index
+   %bpo = arith.muli %wt#0, %c32 overflow<nsw, nuw> : index
    // Base index is remaining outer 8 lanes + subgroup base.
-   %glb0 = arith.addi %wt#1, %bpo : index
-   %glb1 = arith.addi %glb0, %c8 : index
-   %glb2 = arith.addi %glb1, %c8 : index
-   %glb3 = arith.addi %glb2, %c8 : index
+   %glb0 = arith.addi %wt#1, %bpo overflow<nsw, nuw> : index
+   %glb1 = arith.addi %glb0, %c8 overflow<nsw, nuw> : index
+   %glb2 = arith.addi %glb1, %c8 overflow<nsw, nuw> : index
+   %glb3 = arith.addi %glb2, %c8 overflow<nsw, nuw> : index

    %2 = arith.constant dense<0.0> : vector<8x4x1x4xf32>

@@ -751,14 +751,14 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !

  scf.forall (%id) in (1024) {
    %delin:2 = affine.delinearize_index %id into (128, 8) : index, index
-   %vec = arith.muli %delin#1, %c8 : index
+   %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 8] [1, 1, 1] : !mexp_block_in to tensor<1x1x8xf16>
    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x8xf16>, vector<1x8xf16>
    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x8xf16>, !mshared
  } {mapping = [#gpu.thread<linear_dim_0>]}
  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c8 : index
+   %vec = arith.muli %delin#1, %c8 overflow<nsw, nuw> : index
    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 8] [1, 1] : !block_in to tensor<1x8xf16>
    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x8xf16>, vector<1x8xf16>
    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -770,25 +770,25 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
  %0 = tensor.empty() : tensor<1x8x16x16x16xf32>
  %1 = scf.forall (%id) in (512) shared_outs(%out = %0) -> tensor<1x8x16x16x16xf32> {
    %ids:4 = affine.delinearize_index %id into (2, 4, 4, 16) : index, index, index, index
-   %inner_id = arith.muli %ids#2, %c4 : index
-   %m_outer_id = arith.muli %ids#0, %c4 : index
-   %n_outer_id = arith.muli %ids#1, %c4 : index
+   %inner_id = arith.muli %ids#2, %c4 overflow<nsw, nuw> : index
+   %m_outer_id = arith.muli %ids#0, %c4 overflow<nsw, nuw> : index
+   %n_outer_id = arith.muli %ids#1, %c4 overflow<nsw, nuw> : index
    %delin:2 = affine.delinearize_index %id into (64, 8) : index, index
    %wt:3 = affine.delinearize_index %id into (8, 8, 8) : index, index, index

    // Inner 64 loads 8 threads x 8 elements.
-   %gko = arith.muli %wt#2, %c8 : index
+   %gko = arith.muli %wt#2, %c8 overflow<nsw, nuw> : index
    // RHS indexing. Each subgroup loads 32 contiguous rows out of 256.
-   %bpo = arith.muli %wt#0, %c32 : index
+   %bpo = arith.muli %wt#0, %c32 overflow<nsw, nuw> : index
    // Base index is remaining outer 8 lanes + subgroup base.
-   %glb0 = arith.addi %wt#1, %bpo : index
-   %glb1 = arith.addi %glb0, %c8 : index
-   %glb2 = arith.addi %glb1, %c8 : index
-   %glb3 = arith.addi %glb2, %c8 : index
+   %glb0 = arith.addi %wt#1, %bpo overflow<nsw, nuw> : index
+   %glb1 = arith.addi %glb0, %c8 overflow<nsw, nuw> : index
+   %glb2 = arith.addi %glb1, %c8 overflow<nsw, nuw> : index
+   %glb3 = arith.addi %glb2, %c8 overflow<nsw, nuw> : index
    // LHS indexing.
-   %bpo_lhs = arith.muli %wt#0, %c16 : index
-   %glb0_lhs = arith.addi %wt#1, %bpo_lhs : index
-   %glb1_lhs = arith.addi %glb0_lhs, %c8 : index
+   %bpo_lhs = arith.muli %wt#0, %c16 overflow<nsw, nuw> : index
+   %glb0_lhs = arith.addi %wt#1, %bpo_lhs overflow<nsw, nuw> : index
+   %glb1_lhs = arith.addi %glb0_lhs, %c8 overflow<nsw, nuw> : index

    %2 = arith.constant dense<0.0> : vector<4x4x1x4xf32>

@@ -944,14 +944,14 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b

  scf.forall (%id) in (1024) {
    %delin:2 = affine.delinearize_index %id into (128, 8) : index, index
-   %vec = arith.muli %delin#1, %c16 : index
+   %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
  } {mapping = [#gpu.thread<linear_dim_0>]}
  scf.forall (%id) in (2048) {
    %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
-   %vec = arith.muli %delin#1, %c16 : index
+   %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
@@ -963,26 +963,26 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
  %0 = tensor.empty() : tensor<1x8x16x16x16xf32>
  %1 = scf.forall (%id) in (512) shared_outs(%out = %0) -> tensor<1x8x16x16x16xf32> {
    %ids:4 = affine.delinearize_index %id into (2, 4, 4, 16) : index, index, index, index
-   %inner_id = arith.muli %ids#2, %c8 : index
-   %inner_id_acc = arith.muli %ids#2, %c4 : index
-   %m_outer_id = arith.muli %ids#0, %c4 : index
-   %n_outer_id = arith.muli %ids#1, %c4 : index
+   %inner_id = arith.muli %ids#2, %c8 overflow<nsw, nuw> : index
+   %inner_id_acc = arith.muli %ids#2, %c4 overflow<nsw, nuw> : index
+   %m_outer_id = arith.muli %ids#0, %c4 overflow<nsw, nuw> : index
+   %n_outer_id = arith.muli %ids#1, %c4 overflow<nsw, nuw> : index
    %delin:2 = affine.delinearize_index %id into (64, 8) : index, index
    %wt:3 = affine.delinearize_index %id into (8, 8, 8) : index, index, index

    // Inner 64 loads 8 threads x 16 elements.
-   %gko = arith.muli %wt#2, %c16 : index
+   %gko = arith.muli %wt#2, %c16 overflow<nsw, nuw> : index
    // RHS indexing. Each subgroup loads 32 contiguous rows out of 256.
-   %bpo = arith.muli %wt#0, %c32 : index
+   %bpo = arith.muli %wt#0, %c32 overflow<nsw, nuw> : index
    // Base index is remaining outer 8 lanes + subgroup base.
-   %glb0 = arith.addi %wt#1, %bpo : index
-   %glb1 = arith.addi %glb0, %c8 : index
-   %glb2 = arith.addi %glb1, %c8 : index
-   %glb3 = arith.addi %glb2, %c8 : index
+   %glb0 = arith.addi %wt#1, %bpo overflow<nsw, nuw> : index
+   %glb1 = arith.addi %glb0, %c8 overflow<nsw, nuw> : index
+   %glb2 = arith.addi %glb1, %c8 overflow<nsw, nuw> : index
+   %glb3 = arith.addi %glb2, %c8 overflow<nsw, nuw> : index
    // LHS indexing.
-   %bpo_lhs = arith.muli %wt#0, %c16 : index
-   %glb0_lhs = arith.addi %wt#1, %bpo_lhs : index
-   %glb1_lhs = arith.addi %glb0_lhs, %c8 : index
+   %bpo_lhs = arith.muli %wt#0, %c16 overflow<nsw, nuw> : index
+   %glb0_lhs = arith.addi %wt#1, %bpo_lhs overflow<nsw, nuw> : index
+   %glb1_lhs = arith.addi %glb0_lhs, %c8 overflow<nsw, nuw> : index

    %2 = arith.constant dense<0.0> : vector<4x4x1x4xf32>
