@@ -2509,7 +2509,6 @@ at::Tensor _qscaled_dot_product_cpu(
 #ifdef CPU_CAPABILITY_AVX512
   if (at::native::cpublas::could_pack(dtype)) {
     at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
-    std::cout << "int8_sdpa_fused_kernel" << std::endl;
     int8_sdpa_fused_kernel(output, query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, q_zp,
@@ -2520,7 +2519,6 @@ at::Tensor _qscaled_dot_product_cpu(
     return output.transpose(1, 2);
   } else {
 #endif // CPU_CAPABILITY_AVX512
-    std::cout << "int8_sdpa_math_kernel" << std::endl;
     return int8_sdpa_math_kernel(query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, q_zp,
@@ -2536,7 +2534,6 @@ at::Tensor _qscaled_dot_product_cpu(
   // CPUBLAS_BRGEMM_F8F8F32 is defined if FP8 BRGEMM is supported in PyTorch CPUBlas.
   if (at::native::cpublas::could_pack(dtype)) {
     at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
-    std::cout << "fp8_sdpa_fused_kernel" << std::endl;
     fp8_sdpa_fused_kernel(output, query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, k_scale,
@@ -2545,7 +2542,6 @@ at::Tensor _qscaled_dot_product_cpu(
     return output.transpose(1, 2);
   } else {
 #endif // CPU_CAPABILITY_AVX512 && CPUBLAS_BRGEMM_F8F8F32
-    std::cout << "fp8_sdpa_math_kernel" << std::endl;
     return fp8_sdpa_math_kernel(query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, k_scale,
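For context, here is a minimal standalone sketch of the preprocessor-guarded dispatch these hunks touch, assuming only the structure visible in the diff: when `CPU_CAPABILITY_AVX512` is defined and the dtype can be packed, the fused kernel runs; otherwise control falls through to the reference "math" implementation. The helper names below are hypothetical stand-ins, not the actual torchao/ATen kernels.

```cpp
// Sketch of the AVX512-gated dispatch pattern. Note how the `} else {`
// and the closing `}` are each wrapped in their own #ifdef so the
// function compiles whether or not the capability macro is defined.
// could_pack_stub, run_fused_kernel, and run_math_kernel are
// hypothetical stand-ins for the real kernels.
#include <iostream>

static bool could_pack_stub() {
  // Stand-in for at::native::cpublas::could_pack(dtype).
  return true;
}

static int run_fused_kernel(int x) { return x * 2; }  // fused fast path
static int run_math_kernel(int x) { return x + x; }   // reference fallback

int dispatch(int x) {
#ifdef CPU_CAPABILITY_AVX512
  if (could_pack_stub()) {
    return run_fused_kernel(x);
  } else {
#endif
    return run_math_kernel(x);
#ifdef CPU_CAPABILITY_AVX512
  }
#endif
}

int main() {
  std::cout << dispatch(21) << std::endl;  // prints 42 on either path
}
```

Either branch returns the same mathematical result; the diff itself only drops the leftover `std::cout` debug prints from both paths, so the kernel selection logic is unchanged.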