@@ -2614,8 +2614,6 @@ def kernel(X, Y, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, NUM_PID_N: tl.
 @pytest.mark.parametrize("axis", [0, 1])
 @pytest.mark.parametrize("add_overflow_check", [False, True])
 def test_scan_layouts(M, N, src_layout, axis, add_overflow_check, device, tmp_path: pathlib.Path):
-    if add_overflow_check is True and is_hip():
-        pytest.skip("overflow check disabled on HIP while fixing issues")

     overflow_check = """
         %17 = arith.extsi %arg2 : i32 to i64
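This hunk removes the HIP-only skip, so the add_overflow_check=True cases of test_scan_layouts now run on HIP as well. As a rough sketch of how such a flag is usually consumed (the helper name and the "// OVERFLOW_CHECK" placeholder below are illustrative assumptions, not code from this file), the extra IR is spliced into the generated kernel only when the parameter is set:

def splice_overflow_check(base_ir: str, overflow_check: str, add_overflow_check: bool) -> str:
    # Replace a placeholder comment in the kernel IR with the overflow-check
    # block when the test is parametrized with add_overflow_check=True;
    # otherwise drop the placeholder and keep the plain kernel.
    snippet = overflow_check if add_overflow_check else ""
    return base_ir.replace("// OVERFLOW_CHECK", snippet)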
@@ -2708,8 +2706,6 @@ def test_reduce_layouts(M, N, src_layout, axis, epilogue_kind, dtype_str, add_ov
         pytest.skip("Skipping because tensor shape is smaller than M(f)maLayout instr_shape")
     if is_hip() and isinstance(src_layout, MfmaLayout) and ((M, N) == (128, 128)):
         pytest.skip("Skipping test because it runs out of shared memory")
-    if add_overflow_check is True and is_hip():
-        pytest.skip("overflow check disabled on HIP while fixing issues")
     if reduce_op == "sum" and dtype_str == "float16" and M * N > 1024:
         pytest.xfail("Skipping sum reduction on float16 due to accuracy issues")

@@ -5489,21 +5485,11 @@ def test_convertmma2mma(M, N, mma_pair, dtype, device, tmp_path: pathlib.Path):
         pytest.skip("Skip testing MMAv3 on devices with CC < 9")

     num_warps = np.cumprod(src_layout.warps_per_cta)[-1]
-    # TODO(Keren): Remove the intermediate layout once we have resolved the redundantDataMask issue for WGMMA
-    warps_per_cta = src_layout.warps_per_cta
-    interm = BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [warps_per_cta[0], warps_per_cta[1]], [0, 1], [1, 1],
-                           [1, 1], [0, 1])

     def do_test(src_layout, dst_layout):
         layouts = f"""
         #src = {src_layout}
         #dst = {dst_layout}
-        #interm = {interm}
-        """
-
-        conversion = f"""
-        %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst>
-        %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst>
         """

         ir = layouts + f"""
@@ -5513,6 +5499,7 @@ def do_test(src_layout, dst_layout):
         %0 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>>
         %1 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>>
         %2 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<{M}x{N}x!tt.ptr<f16>, #src>
+        %3 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<{M}x{N}x!tt.ptr<f16>, #dst>
         %4 = tt.expand_dims %0 {{axis = 1 : i32}} : tensor<{M}xi32, #triton_gpu.slice<{{dim = 1, parent = #src}}>> -> tensor<{M}x1xi32, #src>
         %5 = arith.muli %4, %cst : tensor<{M}x1xi32, #src>
         %6 = tt.expand_dims %1 {{axis = 0 : i32}} : tensor<{N}xi32, #triton_gpu.slice<{{dim = 0, parent = #src}}>> -> tensor<1x{N}xi32, #src>
@@ -5521,12 +5508,10 @@ def do_test(src_layout, dst_layout):
         %9 = arith.addi %8, %7 : tensor<{M}x{N}xi32, #src>
         %10 = tt.addptr %2, %9 : tensor<{M}x{N}x!tt.ptr<f16>, #src>, tensor<{M}x{N}xi32, #src>
         %11 = tt.load %10 : tensor<{M}x{N}x!tt.ptr<f16>, #src>
-        %3 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<{M}x{N}x!tt.ptr<f16>, #interm>
-        """ + conversion + f"""
-        %15 = triton_gpu.convert_layout %12 : tensor<{M}x{N}xi32, #dst> -> tensor<{M}x{N}xi32, #interm>
-        %16 = triton_gpu.convert_layout %13 : tensor<{M}x{N}xf16, #dst> -> tensor<{M}x{N}xf16, #interm>
-        %17 = tt.addptr %3, %15 : tensor<{M}x{N}x!tt.ptr<f16>, #interm>, tensor<{M}x{N}xi32, #interm>
-        tt.store %17, %16 : tensor<{M}x{N}x!tt.ptr<f16>, #interm>
+        %12 = triton_gpu.convert_layout %9 : tensor<{M}x{N}xi32, #src> -> tensor<{M}x{N}xi32, #dst>
+        %13 = triton_gpu.convert_layout %11 : tensor<{M}x{N}xf16, #src> -> tensor<{M}x{N}xf16, #dst>
+        %14 = tt.addptr %3, %12 : tensor<{M}x{N}x!tt.ptr<f16>, #dst>, tensor<{M}x{N}xi32, #dst>
+        tt.store %14, %13 : tensor<{M}x{N}x!tt.ptr<f16>, #dst>
         tt.return
     }}
 }}
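With the intermediate #interm layout gone, the kernel is a direct src -> dst layout round trip: load f16 through %arg0 in #src, convert to #dst, and store through %arg1. A minimal sketch of how such generated TTGIR is typically exercised, assuming triton.compile accepts a path to a .ttgir file and that the compiled kernel is launched by grid indexing (both taken from common Triton test patterns, not verbatim from this file):

import torch
import triton

def run_layout_roundtrip(ir: str, M: int, N: int, tmp_path):
    # Write the generated TTGIR to a temporary file and compile it directly.
    temp_file = tmp_path / "test_convertmma2mma.ttgir"
    temp_file.write_text(ir)
    kernel = triton.compile(str(temp_file))  # assumed: compile from a .ttgir path

    x = torch.randn((M, N), dtype=torch.float16, device="cuda")
    z = torch.empty_like(x)
    kernel[(1, 1, 1)](x.data_ptr(), z.data_ptr())

    # The kernel only converts layouts between the load and the store,
    # so the output must match the input exactly.
    torch.testing.assert_close(x, z, rtol=0, atol=0)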