diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py
index e5813ae1f..21d33cc43 100644
--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -639,6 +639,9 @@ def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_m
         pytest.skip("FP8 parameters are not supported in debug mode.")
     if recipe.float8_block_scaling() and not fp8_block_scaling_available:
         pytest.skip(reason_for_no_fp8_block_scaling)
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+        dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
 
     config = model_configs[model]
 
@@ -761,6 +764,9 @@ def test_gpt_full_activation_recompute(
         pytest.skip("FP8 parameters are not supported in debug mode.")
     if recipe.float8_block_scaling() and not fp8_block_scaling_available:
         pytest.skip(reason_for_no_fp8_block_scaling)
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+        dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
 
     config = model_configs[model]
     torch.compiler.reset()  # avoid cache size limit overflow
@@ -909,6 +915,10 @@ def _test_e2e_checkpointing(bs, dtype, config, checkpoint=False, steps=10, path=
 @pytest.mark.parametrize("bs", batch_sizes)
 @pytest.mark.parametrize("model", ["126m"])
 def test_gpt_checkpointing(dtype, bs, model):
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+        dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
+
     config = model_configs[model]
     outputs = _test_e2e_checkpointing(bs, dtype, config, checkpoint=False)
     outputs_checkpoint = _test_e2e_checkpointing(bs, dtype, config, checkpoint=True)
@@ -2410,6 +2420,9 @@ def test_gpt_fp8_parameters(dtype, bs, model, recipe):
         pytest.skip("FP8 parameters are not supported in debug mode.")
     if recipe.float8_block_scaling() and not fp8_block_scaling_available:
         pytest.skip(reason_for_no_fp8_block_scaling)
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+        dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
 
     config = model_configs[model]
 
@@ -2561,10 +2574,21 @@ def test_transformer_layer_hidden_states_format(dtype, bs, model):
         max_seqlen_kv=config.seq_len,
     )
 
-    torch.testing.assert_close(
-        y_bshd,
-        y_thd.reshape(bs, config.seq_len, config.hidden_size).contiguous(),
-    )
+    if IS_HIP_EXTENSION:
+        # On some GPUs CK fused attention with THD can produce larger error
+        tols = dtype_tols(dtype)
+        tols["atol"] = 1e-3
+        torch.testing.assert_close(
+            y_bshd,
+            y_thd.reshape(bs, config.seq_len, config.hidden_size).contiguous(),
+            **tols,
+        )
+    else:
+
+        torch.testing.assert_close(
+            y_bshd,
+            y_thd.reshape(bs, config.seq_len, config.hidden_size).contiguous(),
+        )
 
 
 @pytest.mark.parametrize("dtype", param_types)