@@ -1214,9 +1214,16 @@ void invokeComputeScalesAndQuantizeMatrixCol(
1214
1214
dim3 grid ((lda + CTA_SIZE - 1 ) / CTA_SIZE);
1215
1215
C10_CUDA_CHECK (cudaMemsetAsync (quant_ptr, 0 , lda * sizeof (T_S), stream));
1216
1216
C10_CUDA_KERNEL_LAUNCH_CHECK ();
1217
- computeFP8QuantizeScaleColwise<<<grid, block, 0 , stream>>> (
1218
- quant_ptr, input, numel, lda);
1219
- C10_CUDA_KERNEL_LAUNCH_CHECK ();
1217
+ FBGEMM_LAUNCH_KERNEL (
1218
+ (computeFP8QuantizeScaleColwise<T_S, T_IN>),
1219
+ grid,
1220
+ block,
1221
+ 0 ,
1222
+ stream,
1223
+ quant_ptr,
1224
+ input,
1225
+ numel,
1226
+ lda);
1220
1227
invokeQuantizeMatrixColwise (output, quant_ptr, input, numel, lda, stream);
1221
1228
}
1222
1229
@@ -1639,15 +1646,25 @@ void invokeFP4Quantization(
1639
1646
1640
1647
// Launch the cvt kernel.
1641
1648
if (useUE8M0) {
1642
- cvt_fp16_to_fp4<T, true ><<<grid, block, 0 , stream>>> (
1649
+ FBGEMM_LAUNCH_KERNEL (
1650
+ (cvt_fp16_to_fp4<T, true >),
1651
+ grid,
1652
+ block,
1653
+ 0 ,
1654
+ stream,
1643
1655
m,
1644
1656
n,
1645
1657
input,
1646
1658
SFScale,
1647
1659
reinterpret_cast <uint32_t *>(output),
1648
1660
reinterpret_cast <uint32_t *>(SFOuput));
1649
1661
} else {
1650
- cvt_fp16_to_fp4<T, false ><<<grid, block, 0 , stream>>> (
1662
+ FBGEMM_LAUNCH_KERNEL (
1663
+ (cvt_fp16_to_fp4<T, false >),
1664
+ grid,
1665
+ block,
1666
+ 0 ,
1667
+ stream,
1651
1668
m,
1652
1669
n,
1653
1670
input,
@@ -1924,10 +1941,17 @@ void fp4_fused_amax_quantize(
1924
1941
const dim3 block (blocksize, blocks_per_cta);
1925
1942
const int blocks = ceil_div (numel, blocksize * blocks_per_cta);
1926
1943
1927
- compute_amax_and_quantize_kernel<__nv_bfloat16, 16 , 4 >
1928
- <<<blocks, block, 0 , stream>>> (x, y, numel, blocksize, global_amax_ptr);
1929
-
1930
- C10_CUDA_KERNEL_LAUNCH_CHECK ();
1944
+ FBGEMM_LAUNCH_KERNEL (
1945
+ (compute_amax_and_quantize_kernel<__nv_bfloat16, 16 , 4 >),
1946
+ blocks,
1947
+ block,
1948
+ 0 ,
1949
+ stream,
1950
+ x,
1951
+ y,
1952
+ numel,
1953
+ blocksize,
1954
+ global_amax_ptr);
1931
1955
}
1932
1956
1933
1957
template <typename T_S, typename T_W>
@@ -1974,15 +1998,19 @@ void invokeComputeFP4GlobalAmax(
1974
1998
constexpr dim3 grid (1024 );
1975
1999
int64_t numel_scale = numel;
1976
2000
C10_CUDA_CHECK (cudaMemsetAsync (quant_ptr, 0 , sizeof (T_S), stream));
1977
- computeFP4GlobalAmax<<<grid, block, 0 , stream>>> (
2001
+ FBGEMM_LAUNCH_KERNEL (
2002
+ (computeFP4GlobalAmax<T_S, T_IN>),
2003
+ grid,
2004
+ block,
2005
+ 0 ,
2006
+ stream,
1978
2007
quant_ptr,
1979
2008
input,
1980
2009
numel_scale,
1981
2010
lda,
1982
2011
total_elements_per_slice,
1983
2012
bs,
1984
2013
scale_ub);
1985
- C10_CUDA_KERNEL_LAUNCH_CHECK ();
1986
2014
}
1987
2015
1988
2016
std::vector<at::Tensor> fake_quantize_nvfp4_per_tensor (
0 commit comments