@@ -940,10 +940,14 @@ def libdevice_kernel():
     a = ttgl.full([4, 32], 1, ttgl.float32, layout)
     b = ttgl.full([4, 32], 2, ttgl.float32, layout)
     c = ttgl.full([4, 32], 4, ttgl.float32, layout)
+
     libdevice.abs(a)
     libdevice.fast_dividef(a, b)
     libdevice.fma(a, b, c)
 
+    libdevice.isnan(a)
+    libdevice.isinf(a)
+
 
 @pytest.mark.parametrize("target", ALL_TARGETS)
 def test_libdevice(target):
@@ -962,6 +966,14 @@ def test_libdevice(target):
     %0 = tt.extern_elementwise %cst_0 {libname = "", libpath = "", pure = true, symbol = "..."} : (tensor<4x32xf32, #blocked>) -> tensor<4x32xf32, #blocked>
     %1 = tt.extern_elementwise %cst_0, %cst_2 {libname = "", libpath = "", pure = true, symbol = "..."} : (tensor<4x32xf32, #blocked>, tensor<4x32xf32, #blocked>) -> tensor<4x32xf32, #blocked>
     %2 = tt.extern_elementwise %cst_0, %cst_2, %cst_4 {libname = "", libpath = "", pure = true, symbol = "..."} : (tensor<4x32xf32, #blocked>, tensor<4x32xf32, #blocked>, tensor<4x32xf32, #blocked>) -> tensor<4x32xf32, #blocked>
+    %3 = tt.extern_elementwise %cst_0 {libname = "", libpath = "", pure = true, symbol = "..."} : (tensor<4x32xf32, #blocked>) -> tensor<4x32xi32, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %cst_5 = arith.constant dense<0> : tensor<4x32xi32, #blocked>
+    %4 = arith.cmpi ne, %3, %cst_5 : tensor<4x32xi32, #blocked>
+    %5 = tt.extern_elementwise %cst_0 {libname = "", libpath = "", pure = true, symbol = "..."} : (tensor<4x32xf32, #blocked>) -> tensor<4x32xi32, #blocked>
+    %c0_i32_6 = arith.constant 0 : i32
+    %cst_7 = arith.constant dense<0> : tensor<4x32xi32, #blocked>
+    %6 = arith.cmpi ne, %5, %cst_7 : tensor<4x32xi32, #blocked>
     tt.return
   }
 }
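The expected IR also documents the lowering pattern for the new `isnan`/`isinf` calls: each becomes a `tt.extern_elementwise` returning an `i32` tensor, which is then compared `ne` against a zero splat to produce the boolean result. A minimal NumPy sketch of those semantics (the extern symbol is anonymized in the test; on NVIDIA it would typically be a libdevice function such as `__nv_isnanf`, an assumption here):

```python
import numpy as np

def isnan_like(a: np.ndarray) -> np.ndarray:
    # Stand-in for the extern elementwise call, which yields a nonzero i32 for NaN.
    raw = np.isnan(a).astype(np.int32)
    # The `arith.cmpi ne, %raw, %zero_splat` step turns the i32 tensor into booleans.
    return raw != 0

a = np.array([1.0, float("nan"), float("inf")], dtype=np.float32)
assert isnan_like(a).tolist() == [False, True, False]
```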
@@ -1926,10 +1938,10 @@ def buffer_load_store_kernel(x, y):
     mask = ttgl.full((64, 64), 1, tl.int1, layout=layout)
     other = ttgl.full((64, 64), 1.0, tl.float32, layout=layout)
     a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
-    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
 
     a = ttgl.amd.cdna4.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
-    ttgl.amd.cdna4.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+    ttgl.amd.cdna4.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
 
 
 @pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@@ -1951,9 +1963,9 @@ def test_buffer_load_store(target):
     %cst_0 = arith.constant 1.000000e+00 : f32
     %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
     %3 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
-    amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = cs : tensor<64x64xf32, #blocked>
     %4 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
-    amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = cs : tensor<64x64xf32, #blocked>
     tt.return
   }
 }
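The change swaps the store's cache modifier from `.ca` to `.cs` so loads and stores exercise different modifiers. Going by the PTX-style names that Triton borrows for these strings, `.ca` caches at all levels (data likely to be re-read) while `.cs` is the streaming, evict-first policy (write-once data); the AMD backend maps them onto its own cache policy bits, which is what the printed `cacheModifier = cs` attribute records. A small sketch of the string-to-attribute correspondence this test exercises (my reading of the test, not an exhaustive table):

```python
# Gluon-level cache string -> attribute printed on amdgpu.buffer_load/buffer_store.
CACHE_MODIFIERS = {
    '.ca': 'cacheModifier = ca',  # cache at all levels: data likely re-read
    '.cs': 'cacheModifier = cs',  # cache streaming: evict-first, write-once data
}
```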
@@ -1971,15 +1983,15 @@ def buffer_load_store_with_broadcast_kernel(x, y):
 
     mask = ttgl.full((64, 1), 1, tl.int1, layout=layout)
     a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
-    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
 
     mask = ttgl.full((1, 64), 1, tl.int1, layout=layout)
     a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
-    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
 
     other = 1.0
     a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
-    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
+    ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
 
 
 @pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@@ -2003,19 +2015,19 @@ def test_buffer_load_store_with_broadcast(target):
     %3 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
     %4 = amdgpu.buffer_load %arg0[%2], %3, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
     %5 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
-    amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = cs : tensor<64x64xf32, #blocked>
     %true_2 = arith.constant true
     %cst_3 = arith.constant dense<true> : tensor<1x64xi1, #blocked>
     %6 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
     %7 = amdgpu.buffer_load %arg0[%2], %6, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
     %8 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
-    amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = cs : tensor<64x64xf32, #blocked>
     %cst_4 = arith.constant 1.000000e+00 : f32
     %9 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
     %cst_5 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
     %10 = amdgpu.buffer_load %arg0[%2], %9, %cst_5 cacheModifier = ca : tensor<64x64xf32, #blocked>
     %11 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
-    amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = ca : tensor<64x64xf32, #blocked>
+    amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = cs : tensor<64x64xf32, #blocked>
     tt.return
   }
 }
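In the broadcast variant, the mask (and, in the last case, `other`) does not match the value shape, so the expected IR contains explicit expansions: `tt.broadcast` stretches the `64x1` and `1x64` i1 masks to `64x64`, and the scalar `other = 1.0` becomes a dense splat constant. A NumPy sketch of the same shape semantics:

```python
import numpy as np

mask_col = np.ones((64, 1), dtype=bool)            # tensor<64x1xi1, #blocked>
mask = np.broadcast_to(mask_col, (64, 64))         # tt.broadcast -> tensor<64x64xi1>
other = np.full((64, 64), 1.0, dtype=np.float32)   # scalar `other` splatted to dense<1.0>
assert mask.shape == other.shape == (64, 64)
```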
@@ -2111,3 +2123,72 @@ def kernel():
   }
 }
 """)
+
+
+@gluon.jit
+def padded_shared_layout_kernel():
+    padded_shared_layout: ttgl.constexpr = ttgl.PaddedSharedLayout(interval_padding_pairs=[[2, 1], [4, 2], [8, 4]],
+                                                                   order=[1, 0], ctas_per_cga=[1, 1],
+                                                                   cta_split_num=[1, 1], cta_order=[1, 0])
+
+    ttgl.allocate_shared_memory(ttgl.int32, [64, 64], padded_shared_layout)
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
+def test_padded_shared_layout(target):
+    # Tests the construction of PaddedSharedEncodingAttr from a Gluon PaddedSharedLayout.
+    module = run_parser(padded_shared_layout_kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#shared = #ttg.padded_shared<[2:+1, 4:+2, 8:+4] {order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @padded_shared_layout_kernel() attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<64x64xi32, #shared, #smem, mutable>
+    tt.return
+  }
+}
+""")
+
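The printed attribute `[2:+1, 4:+2, 8:+4]` reads as: insert 1 element of padding after every 2 elements, 2 more after every 4, and 4 more after every 8, with the paddings accumulating. A sketch of the resulting offset arithmetic under that reading (not the in-tree implementation, which works with power-of-two shifts):

```python
def padded_offset(offset: int, interval_padding_pairs) -> int:
    """Map an unpadded linear element offset to its padded shared-memory offset."""
    pad = sum((offset // interval) * padding for interval, padding in interval_padding_pairs)
    return offset + pad

# Element 8 has crossed four 2-boundaries, two 4-boundaries, and one 8-boundary:
assert padded_offset(8, [(2, 1), (4, 2), (8, 4)]) == 8 + 4 * 1 + 2 * 2 + 1 * 4  # = 20
```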
+
+@gluon.jit
+def infer_layout_for_padded_shared_kernel():
+    layout: ttgl.constexpr = ttgl.PaddedSharedLayout(interval_padding_pairs=[[2, 1], [4, 2], [8, 4]], order=[2, 0, 1])
+    smem = ttgl.allocate_shared_memory(ttgl.int32, [32, 4, 32], layout)
+
+    permuted = smem.permute((1, 0, 2))
+    """
+    The permutation is [1, 0, 2], which maps
+      old dim 1 -> new dim 0
+      old dim 0 -> new dim 1
+      old dim 2 -> new dim 2,
+    so inverseMapping[0] = 1, inverseMapping[1] = 0, inverseMapping[2] = 2.
+
+    The order in srcEnc is [2, 0, 1],
+    so the order in dstEnc is:
+      newOrder[0] = inverseMapping[srcEncOrder[0]] = 2
+      newOrder[1] = inverseMapping[srcEncOrder[1]] = 1
+      newOrder[2] = inverseMapping[srcEncOrder[2]] = 0
+    """
+    ttgl.static_assert(
+        permuted.layout == ttgl.PaddedSharedLayout(interval_padding_pairs=[(2, 1), (4, 2), (8, 4)], order=[2, 1, 0]))
+
+
+@pytest.mark.parametrize("target", ALL_TARGETS)
+def test_infer_layout_for_padded_shared(target):
+    # Tests the conversion from PaddedSharedEncodingAttr back to the Gluon PaddedSharedLayout object.
+    # The conversion lives in layoutToGluon; ttgl.permute ultimately exercises it.
+    module = run_parser(infer_layout_for_padded_shared_kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#shared = #ttg.padded_shared<[2:+1, 4:+2, 8:+4] {order = [2, 0, 1]}>
+#shared1 = #ttg.padded_shared<[2:+1, 4:+2, 8:+4] {order = [2, 1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @infer_layout_for_padded_shared_kernel() attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<32x4x32xi32, #shared, #smem, mutable>
+    %1 = ttg.memdesc_trans %0 {order = array<i32: 1, 0, 2>} : !ttg.memdesc<32x4x32xi32, #shared, #smem, mutable> -> !ttg.memdesc<4x32x32xi32, #shared1, #smem, mutable>
+    tt.return
+  }
+}
+""")
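The docstring's derivation in `infer_layout_for_padded_shared_kernel`, condensed into executable form (a standalone sketch, not the actual layoutToGluon code):

```python
def permuted_order(src_order, permutation):
    # permutation[new_dim] = old_dim, so invert it: inverse[old_dim] = new_dim.
    inverse = [0] * len(permutation)
    for new_dim, old_dim in enumerate(permutation):
        inverse[old_dim] = new_dim
    # Rename every dimension in the source order to its post-permute index.
    return [inverse[d] for d in src_order]

# order [2, 0, 1] permuted by (1, 0, 2) becomes [2, 1, 0], matching the static_assert.
assert permuted_order([2, 0, 1], (1, 0, 2)) == [2, 1, 0]
```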