@@ -1938,10 +1938,10 @@ def buffer_load_store_kernel(x, y):
19381938 mask = ttgl .full ((64 , 64 ), 1 , tl .int1 , layout = layout )
19391939 other = ttgl .full ((64 , 64 ), 1.0 , tl .float32 , layout = layout )
19401940 a = ttgl .amd .cdna3 .buffer_load (ptr = x , offsets = offsets , mask = mask , other = other , cache = '.ca' )
1941- ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.ca ' )
1941+ ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.cs ' )
19421942
19431943 a = ttgl .amd .cdna4 .buffer_load (ptr = x , offsets = offsets , mask = mask , other = other , cache = '.ca' )
1944- ttgl .amd .cdna4 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.ca ' )
1944+ ttgl .amd .cdna4 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.cs ' )
19451945
19461946
19471947@pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA3 , HIP_TARGET_CDNA4 ])
@@ -1963,9 +1963,9 @@ def test_buffer_load_store(target):
19631963 %cst_0 = arith.constant 1.000000e+00 : f32
19641964 %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
19651965 %3 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
1966- amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
1966+ amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = cs : tensor<64x64xf32, #blocked>
19671967 %4 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
1968- amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
1968+ amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = cs : tensor<64x64xf32, #blocked>
19691969 tt.return
19701970 }
19711971}
@@ -1983,15 +1983,15 @@ def buffer_load_store_with_broadcast_kernel(x, y):
19831983
19841984 mask = ttgl .full ((64 , 1 ), 1 , tl .int1 , layout = layout )
19851985 a = ttgl .amd .cdna3 .buffer_load (ptr = x , offsets = offsets , mask = mask , other = other , cache = '.ca' )
1986- ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.ca ' )
1986+ ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.cs ' )
19871987
19881988 mask = ttgl .full ((1 , 64 ), 1 , tl .int1 , layout = layout )
19891989 a = ttgl .amd .cdna3 .buffer_load (ptr = x , offsets = offsets , mask = mask , other = other , cache = '.ca' )
1990- ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.ca ' )
1990+ ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.cs ' )
19911991
19921992 other = 1.0
19931993 a = ttgl .amd .cdna3 .buffer_load (ptr = x , offsets = offsets , mask = mask , other = other , cache = '.ca' )
1994- ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.ca ' )
1994+ ttgl .amd .cdna3 .buffer_store (stored_value = a , ptr = y , offsets = offsets , mask = mask , cache = '.cs ' )
19951995
19961996
19971997@pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA3 , HIP_TARGET_CDNA4 ])
@@ -2015,19 +2015,19 @@ def test_buffer_load_store_with_broadcast(target):
20152015 %3 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
20162016 %4 = amdgpu.buffer_load %arg0[%2], %3, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
20172017 %5 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
2018- amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = ca : tensor<64x64xf32, #blocked>
2018+ amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = cs : tensor<64x64xf32, #blocked>
20192019 %true_2 = arith.constant true
20202020 %cst_3 = arith.constant dense<true> : tensor<1x64xi1, #blocked>
20212021 %6 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
20222022 %7 = amdgpu.buffer_load %arg0[%2], %6, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
20232023 %8 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
2024- amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = ca : tensor<64x64xf32, #blocked>
2024+ amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = cs : tensor<64x64xf32, #blocked>
20252025 %cst_4 = arith.constant 1.000000e+00 : f32
20262026 %9 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
20272027 %cst_5 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
20282028 %10 = amdgpu.buffer_load %arg0[%2], %9, %cst_5 cacheModifier = ca : tensor<64x64xf32, #blocked>
20292029 %11 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
2030- amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = ca : tensor<64x64xf32, #blocked>
2030+ amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = cs : tensor<64x64xf32, #blocked>
20312031 tt.return
20322032 }
20332033}
0 commit comments