@@ -1608,7 +1608,9 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 @pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32), (8, 128)])
 def test_host_tensor_descriptor_load(dtype_str, num_ctas, M_BLOCK, N_BLOCK, device):
     if num_ctas == 2 and (not is_cuda() or torch.cuda.get_device_capability(0)[0] not in (9, 10)):
-        pytest.skip("CTAs is unsupported for these cards")
+        pytest.xfail("CTAs is unsupported for these cards")
+    if is_xpu():
+        pytest.skip("FIXME: issue #4289")

     @triton.jit(debug=True)
     def kernel(out_ptr, desc, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
@@ -1668,10 +1670,12 @@ def matmul_kernel_host_tensor_descriptor(a_desc, b_desc, c_desc):
 ])
 def test_host_tensor_descriptor_matmul(num_stages, num_ctas, BLOCK_M, BLOCK_N, BLOCK_K, device):
     if num_ctas == 2 and (not is_cuda() or torch.cuda.get_device_capability(0)[0] not in (9, 10)):
-        pytest.skip("CTAs is unsupported for these cards")
+        pytest.xfail("CTAs is unsupported for these cards")

     if is_hip() and (BLOCK_M, BLOCK_N, BLOCK_K, num_stages) == (256, 128, 32, 4):
         pytest.skip("Insufficient shared memory on HIP devices")
+    if is_xpu():
+        pytest.skip("FIXME: issue #4289")

     if is_interpreter():
         M, N, K = BLOCK_M, BLOCK_N, BLOCK_K
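
For context on the skip-to-xfail change in both hunks: pytest.skip reports a test as SKIPPED, while the imperative pytest.xfail raises immediately and reports the test as an expected failure (XFAIL), so known-unsupported configurations stay visible in test reports rather than being silently dropped. A minimal sketch of the two calls, with hypothetical test names unrelated to this file:

    import pytest

    def test_skip_example():
        # skip: reported as SKIPPED; counts as neither pass nor fail.
        pytest.skip("unsupported on this hardware")

    def test_xfail_example():
        # xfail (imperative form): raises immediately and reports the test
        # as XFAIL, an *expected* failure, keeping the known-bad
        # configuration visible in the report.
        pytest.xfail("unsupported on this hardware")
        assert False  # never reached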