@@ -76,7 +76,7 @@ template void invokeQuantization<__nv_bfloat16>(int8_t* dst, __nv_bfloat16 const
76
76
template <typename T>
77
77
void invokeMxFP8Quantization (int b, int m, int n, int padded_n, T const * input, int64_t * output,
78
78
int32_t * SFOuput, FP4QuantizationSFLayout layout,
79
- int multiProcessorCount, cudaStream_t stream) {
79
+ int multiProcessorCount, bool enable_pdl, cudaStream_t stream) {
80
80
// Fixed SF_VEC_SIZE as 32
81
81
static constexpr int SF_VEC_SIZE = 32 ;
82
82
@@ -95,7 +95,7 @@ void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input,
95
95
config.stream = stream;
96
96
cudaLaunchAttribute attrs[1 ];
97
97
attrs[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization;
98
- attrs[0 ].val .programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL () ;
98
+ attrs[0 ].val .programmaticStreamSerializationAllowed = enable_pdl ;
99
99
config.numAttrs = 1 ;
100
100
config.attrs = attrs;
101
101
cudaLaunchKernelEx (
@@ -168,7 +168,7 @@ INSTANTIATE_INVOKE_PER_TOKEN_QUANTIZATION(__nv_bfloat16, __nv_fp8_e4m3);
168
168
template <typename T, int SF_VEC_SIZE>
169
169
void invokeFP4Quantization (int m, int n, T const * input, float const * SFScale, int64_t * output,
170
170
int32_t * SFOuput, bool useUE8M0, FP4QuantizationSFLayout layout,
171
- int multiProcessorCount, cudaStream_t stream) {
171
+ int multiProcessorCount, bool enable_pdl, cudaStream_t stream) {
172
172
#ifdef ENABLE_FP8
173
173
if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
174
174
// Grid, Block size.
@@ -204,7 +204,7 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, i
204
204
config.stream = stream;
205
205
cudaLaunchAttribute attrs[1 ];
206
206
attrs[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization;
207
- attrs[0 ].val .programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL () ;
207
+ attrs[0 ].val .programmaticStreamSerializationAllowed = enable_pdl ;
208
208
config.numAttrs = 1 ;
209
209
config.attrs = attrs;
210
210
cudaLaunchKernelEx (&config, kernel_instance, m, n, input, SFScale,
@@ -217,7 +217,7 @@ template <typename T, int SF_VEC_SIZE>
217
217
void invokeBatchedFP4Quantization (int b, int m, int n, T const * input, float const * SFScale,
218
218
int64_t * output, int32_t * SFOuput, bool useUE8M0,
219
219
int multiProcessorCount, FP4QuantizationSFLayout layout,
220
- cudaStream_t stream) {
220
+ bool enable_pdl, cudaStream_t stream) {
221
221
#ifdef ENABLE_FP8
222
222
if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
223
223
// Grid, Block size.
@@ -253,7 +253,7 @@ void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float con
253
253
config.stream = stream;
254
254
cudaLaunchAttribute attrs[1 ];
255
255
attrs[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization;
256
- attrs[0 ].val .programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL () ;
256
+ attrs[0 ].val .programmaticStreamSerializationAllowed = enable_pdl ;
257
257
config.numAttrs = 1 ;
258
258
config.attrs = attrs;
259
259
cudaLaunchKernelEx (&config, kernel_instance, b, m, n, input, SFScale,
@@ -344,47 +344,56 @@ void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const*
344
344
template void invokeFP4Quantization<half, 16 >(int m, int n, half const * input, float const * SFScale,
345
345
int64_t * output, int32_t * SFOuput, bool useUE8M0,
346
346
FP4QuantizationSFLayout layout,
347
- int multiProcessorCount, cudaStream_t stream);
347
+ int multiProcessorCount, bool enable_pdl,
348
+ cudaStream_t stream);
348
349
template void invokeFP4Quantization<half, 32 >(int m, int n, half const * input, float const * SFScale,
349
350
int64_t * output, int32_t * SFOuput, bool useUE8M0,
350
351
FP4QuantizationSFLayout layout,
351
- int multiProcessorCount, cudaStream_t stream);
352
- template void invokeBatchedFP4Quantization<half, 16 >(
353
- int b, int m, int n, half const * input, float const * SFScale, int64_t * output, int32_t * SFOuput,
354
- bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
355
- template void invokeBatchedFP4Quantization<half, 32 >(
356
- int b, int m, int n, half const * input, float const * SFScale, int64_t * output, int32_t * SFOuput,
357
- bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
352
+ int multiProcessorCount, bool enable_pdl,
353
+ cudaStream_t stream);
354
+ template void invokeBatchedFP4Quantization<half, 16 >(int b, int m, int n, half const * input,
355
+ float const * SFScale, int64_t * output,
356
+ int32_t * SFOuput, bool useUE8M0,
357
+ int multiProcessorCount,
358
+ FP4QuantizationSFLayout layout,
359
+ bool enable_pdl, cudaStream_t stream);
360
+ template void invokeBatchedFP4Quantization<half, 32 >(int b, int m, int n, half const * input,
361
+ float const * SFScale, int64_t * output,
362
+ int32_t * SFOuput, bool useUE8M0,
363
+ int multiProcessorCount,
364
+ FP4QuantizationSFLayout layout,
365
+ bool enable_pdl, cudaStream_t stream);
358
366
template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const * input,
359
367
int64_t * output, int32_t * SFOuput,
360
368
FP4QuantizationSFLayout layout, int multiProcessorCount,
361
- cudaStream_t stream);
369
+ bool enable_pdl, cudaStream_t stream);
362
370
#ifdef ENABLE_BF16
363
371
template void invokeFP4Quantization<__nv_bfloat16, 16 >(int m, int n, __nv_bfloat16 const * input,
364
372
float const * SFScale, int64_t * output,
365
373
int32_t * SFOuput, bool useUE8M0,
366
374
FP4QuantizationSFLayout layout,
367
- int multiProcessorCount,
375
+ int multiProcessorCount, bool enable_pdl,
368
376
cudaStream_t stream);
369
377
template void invokeFP4Quantization<__nv_bfloat16, 32 >(int m, int n, __nv_bfloat16 const * input,
370
378
float const * SFScale, int64_t * output,
371
379
int32_t * SFOuput, bool useUE8M0,
372
380
FP4QuantizationSFLayout layout,
373
- int multiProcessorCount,
381
+ int multiProcessorCount, bool enable_pdl,
374
382
cudaStream_t stream);
375
383
template void invokeBatchedFP4Quantization<__nv_bfloat16, 16 >(
376
384
int b, int m, int n, __nv_bfloat16 const * input, float const * SFScale, int64_t * output,
377
385
int32_t * SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
378
- cudaStream_t stream);
386
+ bool enable_pdl, cudaStream_t stream);
379
387
template void invokeBatchedFP4Quantization<__nv_bfloat16, 32 >(
380
388
int b, int m, int n, __nv_bfloat16 const * input, float const * SFScale, int64_t * output,
381
389
int32_t * SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
382
- cudaStream_t stream);
390
+ bool enable_pdl, cudaStream_t stream);
383
391
template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n,
384
392
__nv_bfloat16 const * input, int64_t * output,
385
393
int32_t * SFOuput,
386
394
FP4QuantizationSFLayout layout,
387
- int multiProcessorCount, cudaStream_t stream);
395
+ int multiProcessorCount, bool enable_pdl,
396
+ cudaStream_t stream);
388
397
389
398
#endif
390
399
@@ -393,22 +402,22 @@ template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(int m, int n, __nv_fp8_e4
393
402
float const * SFScale, int64_t * output,
394
403
int32_t * SFOuput, bool useUE8M0,
395
404
FP4QuantizationSFLayout layout,
396
- int multiProcessorCount,
405
+ int multiProcessorCount, bool enable_pdl,
397
406
cudaStream_t stream);
398
407
template void invokeFP4Quantization<__nv_fp8_e4m3, 32 >(int m, int n, __nv_fp8_e4m3 const * input,
399
408
float const * SFScale, int64_t * output,
400
409
int32_t * SFOuput, bool useUE8M0,
401
410
FP4QuantizationSFLayout layout,
402
- int multiProcessorCount,
411
+ int multiProcessorCount, bool enable_pdl,
403
412
cudaStream_t stream);
404
413
template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 16 >(
405
414
int b, int m, int n, __nv_fp8_e4m3 const * input, float const * SFScale, int64_t * output,
406
415
int32_t * SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
407
- cudaStream_t stream);
416
+ bool enable_pdl, cudaStream_t stream);
408
417
template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 32 >(
409
418
int b, int m, int n, __nv_fp8_e4m3 const * input, float const * SFScale, int64_t * output,
410
419
int32_t * SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
411
- cudaStream_t stream);
420
+ bool enable_pdl, cudaStream_t stream);
412
421
#endif
413
422
414
423
// //////////////////////////////////////////////////////////////////////////////////////////////////
0 commit comments