Skip to content

Commit 6576e50

Browse files
[AMD][Gluon] Fix cache_modifier for buffer_store (#7922)
Previously, `buffer_store` converted its `cache` argument with `_str_to_load_cache_modifier`, which caused the "cs" modifier to be rejected. It now uses the proper helper, `_str_to_store_cache_modifier`.
1 parent 28eb9fb commit 6576e50

File tree

2 files changed

+11
-11
lines changed

2 files changed

+11
-11
lines changed

python/test/gluon/test_frontend.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1938,10 +1938,10 @@ def buffer_load_store_kernel(x, y):
19381938
mask = ttgl.full((64, 64), 1, tl.int1, layout=layout)
19391939
other = ttgl.full((64, 64), 1.0, tl.float32, layout=layout)
19401940
a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
1941-
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1941+
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
19421942

19431943
a = ttgl.amd.cdna4.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
1944-
ttgl.amd.cdna4.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1944+
ttgl.amd.cdna4.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
19451945

19461946

19471947
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@@ -1963,9 +1963,9 @@ def test_buffer_load_store(target):
19631963
%cst_0 = arith.constant 1.000000e+00 : f32
19641964
%cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
19651965
%3 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
1966-
amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
1966+
amdgpu.buffer_store %3, %arg1[%2], %cst cacheModifier = cs : tensor<64x64xf32, #blocked>
19671967
%4 = amdgpu.buffer_load %arg0[%2], %cst, %cst_1 cacheModifier = ca : tensor<64x64xf32, #blocked>
1968-
amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = ca : tensor<64x64xf32, #blocked>
1968+
amdgpu.buffer_store %4, %arg1[%2], %cst cacheModifier = cs : tensor<64x64xf32, #blocked>
19691969
tt.return
19701970
}
19711971
}
@@ -1983,15 +1983,15 @@ def buffer_load_store_with_broadcast_kernel(x, y):
19831983

19841984
mask = ttgl.full((64, 1), 1, tl.int1, layout=layout)
19851985
a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
1986-
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1986+
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
19871987

19881988
mask = ttgl.full((1, 64), 1, tl.int1, layout=layout)
19891989
a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
1990-
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1990+
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
19911991

19921992
other = 1.0
19931993
a = ttgl.amd.cdna3.buffer_load(ptr=x, offsets=offsets, mask=mask, other=other, cache='.ca')
1994-
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.ca')
1994+
ttgl.amd.cdna3.buffer_store(stored_value=a, ptr=y, offsets=offsets, mask=mask, cache='.cs')
19951995

19961996

19971997
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@@ -2015,19 +2015,19 @@ def test_buffer_load_store_with_broadcast(target):
20152015
%3 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
20162016
%4 = amdgpu.buffer_load %arg0[%2], %3, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
20172017
%5 = tt.broadcast %cst_1 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked>
2018-
amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = ca : tensor<64x64xf32, #blocked>
2018+
amdgpu.buffer_store %4, %arg1[%2], %5 cacheModifier = cs : tensor<64x64xf32, #blocked>
20192019
%true_2 = arith.constant true
20202020
%cst_3 = arith.constant dense<true> : tensor<1x64xi1, #blocked>
20212021
%6 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
20222022
%7 = amdgpu.buffer_load %arg0[%2], %6, %cst_0 cacheModifier = ca : tensor<64x64xf32, #blocked>
20232023
%8 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
2024-
amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = ca : tensor<64x64xf32, #blocked>
2024+
amdgpu.buffer_store %7, %arg1[%2], %8 cacheModifier = cs : tensor<64x64xf32, #blocked>
20252025
%cst_4 = arith.constant 1.000000e+00 : f32
20262026
%9 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
20272027
%cst_5 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
20282028
%10 = amdgpu.buffer_load %arg0[%2], %9, %cst_5 cacheModifier = ca : tensor<64x64xf32, #blocked>
20292029
%11 = tt.broadcast %cst_3 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked>
2030-
amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = ca : tensor<64x64xf32, #blocked>
2030+
amdgpu.buffer_store %10, %arg1[%2], %11 cacheModifier = cs : tensor<64x64xf32, #blocked>
20312031
tt.return
20322032
}
20332033
}

python/triton/experimental/gluon/language/amd/cdna3/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def buffer_store(stored_value, ptr, offsets, mask=None, cache=None, _semantic: G
7777
offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
7878

7979
mask = mask.handle if mask is not None else ir.value()
80-
cache_modifier = _semantic._str_to_load_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
80+
cache_modifier = _semantic._str_to_store_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
8181

8282
_semantic.builder.create_buffer_store(stored_value.handle, ptr.handle, offsets.handle, mask, cache_modifier)
8383

0 commit comments

Comments
 (0)