@@ -85,7 +85,7 @@ def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr
8585 tma .store_wait (0 )
8686
8787
88- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 9 , reason = "Requires hopper or newer" )
88+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 9 , reason = "Requires hopper or newer" )
8989@pytest .mark .parametrize ("FAILURE" , [True , False ])
9090def test_async_tma_kernel (FAILURE , device , run_wrapper ):
9191 if run_wrapper :
@@ -141,7 +141,7 @@ def tma_interleave_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.cons
141141 tma .store_wait (0 )
142142
143143
144- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 9 , reason = "Requires hopper or newer" )
144+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 9 , reason = "Requires hopper or newer" )
145145@pytest .mark .parametrize ("FAILURE" , [True , False ])
146146def test_tma_interleave_kernel (FAILURE , device , run_wrapper ):
147147 if run_wrapper :
@@ -190,7 +190,7 @@ def async_copy_kernel(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr):
190190 ampere .async_copy .wait_group (0 )
191191
192192
193- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 9 , reason = "Requires ampere or newer" )
193+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 8 , reason = "Requires ampere or newer" )
194194@pytest .mark .parametrize ("FAILURE" , [True , False ])
195195def test_async_copy (FAILURE , device , run_wrapper ):
196196 if run_wrapper :
@@ -252,7 +252,7 @@ def tcgen5_mma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexp
252252 mbarrier .invalidate (bar .index (1 ))
253253
254254
255- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 10 , reason = "Requires blackwell or newer" )
255+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 10 , reason = "Requires blackwell or newer" )
256256@pytest .mark .parametrize ("FAILURE" , [True , False ])
257257@pytest .mark .parametrize ("MEM_ACCESS_KIND" , ["tma_cp" , "local_store" , "tmem_load" , "tmem_store" ])
258258def test_tcgen5_mma (FAILURE , MEM_ACCESS_KIND , device , run_wrapper ):
@@ -305,7 +305,7 @@ def warpgroup_mma_kernel(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr)
305305 smemA .store (ttgl .full ([XBLOCK , XBLOCK ], 42 , ttgl .float16 , blocked_layout ))
306306
307307
308- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] != 9 , reason = "Requires hopper" )
308+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] != 9 , reason = "Requires hopper" )
309309@pytest .mark .parametrize ("FAILURE" , [True , False ])
310310def test_warpgroup_mma (FAILURE , device , run_wrapper ):
311311 if run_wrapper :
@@ -353,7 +353,7 @@ def warpgroup_mma_kernel2(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr
353353 smemA .store (ttgl .full ([XBLOCK , XBLOCK ], 42 , ttgl .float16 , blocked_layout ))
354354
355355
356- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] != 9 , reason = "Requires hopper" )
356+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] != 9 , reason = "Requires hopper" )
357357@pytest .mark .parametrize ("FAILURE" , [True , False ])
358358def test_warpgroup_mma2 (FAILURE , device , run_wrapper ):
359359 if run_wrapper :
@@ -406,7 +406,7 @@ def tcgen5_mma_multibar_kernel(input_desc, XBLOCK: ttgl.constexpr, BUF_IDX: ttgl
406406 mbarrier .invalidate (bar .index (i ))
407407
408408
409- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 10 , reason = "Requires blackwell or newer" )
409+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 10 , reason = "Requires blackwell or newer" )
410410@pytest .mark .parametrize ("BUF_IDX" , [0 , 1 ])
411411@pytest .mark .parametrize ("BAR_IDX" , [0 , 1 , 2 , 3 ])
412412def test_tcgen5_mma_multibar (BUF_IDX , BAR_IDX , device , run_wrapper ):
@@ -529,7 +529,7 @@ def multibuffered_loop_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: t
529529 mbarrier .invalidate (barMMA .index (i ))
530530
531531
532- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 10 , reason = "Requires blackwell or newer" )
532+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] < 10 , reason = "Requires blackwell or newer" )
533533@pytest .mark .parametrize ("FAILURE" , [True , False ])
534534def test_multibuffered_loop (FAILURE , device , run_wrapper ):
535535 if run_wrapper :
@@ -611,7 +611,7 @@ def multibuffered_loop_wgmma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE:
611611 mbarrier .invalidate (barLoadB .index (i ))
612612
613613
614- @pytest .mark .skipif (not is_cuda () or torch .cuda .get_device_capability ()[0 ] != 9 , reason = "Requires hopper" )
614+ @pytest .mark .xfail (not is_cuda () or torch .cuda .get_device_capability ()[0 ] != 9 , reason = "Requires hopper" )
615615@pytest .mark .parametrize ("FAILURE" , [True , False ])
616616def test_multibuffered_wgmma_loop (FAILURE , device , run_wrapper ):
617617 if run_wrapper :
0 commit comments