@@ -301,7 +301,7 @@ __device__ __forceinline__ float GetMaxValue(float* in_data, uint32_t row_idx, u
301
301
}
302
302
max_val = max (
303
303
max_val, BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim .reduce )
304
- .Reduce <VEC_SIZE>(in_data_, MaxReduceOp{}));
304
+ .template Reduce <VEC_SIZE>(in_data_, MaxReduceOp{}));
305
305
__syncthreads ();
306
306
}
307
307
if (tx == 0 ) {
@@ -610,7 +610,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
610
610
}
611
611
float aggregate_local =
612
612
BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage->block_prim .reduce )
613
- .Sum <VEC_SIZE>(prob_greater_than_threshold);
613
+ .template Sum <VEC_SIZE>(prob_greater_than_threshold);
614
614
if (tx == 0 ) {
615
615
temp_storage->block_aggregate .value = aggregate_local;
616
616
}
@@ -623,7 +623,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
623
623
prob_greater_than_threshold, inclusive_cdf, temp_storage);
624
624
} else {
625
625
BlockScan<float , BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim .scan )
626
- .InclusiveSum <VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
626
+ .template InclusiveSum <VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
627
627
628
628
__syncthreads ();
629
629
}
@@ -639,7 +639,7 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
639
639
.SubtractLeft <VEC_SIZE>(greater_than_u, greater_than_u_diff, BoolDiffOp ());
640
640
#else
641
641
BlockAdjacentDifference<bool , BLOCK_THREADS>(temp_storage->block_prim .adj_diff )
642
- .FlagHeads <VEC_SIZE>(greater_than_u_diff, greater_than_u, BoolDiffOp (), 0 );
642
+ .template FlagHeads <VEC_SIZE>(greater_than_u_diff, greater_than_u, BoolDiffOp (), 0 );
643
643
#endif
644
644
__syncthreads ();
645
645
@@ -775,7 +775,7 @@ __global__ void SamplingFromLogitsKernel(DType* logits, IdType* output, IdType*
775
775
776
776
max_data +=
777
777
BlockReduce<DataAndIndex<DType, IdType>, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage)
778
- .Sum <VEC_SIZE>(cur_data);
778
+ .template Sum <VEC_SIZE>(cur_data);
779
779
}
780
780
if (tx == 0 ) {
781
781
output[bx] = max_data.index ;
@@ -1015,15 +1015,15 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output, IdType*
1015
1015
}
1016
1016
1017
1017
aggregate_gt_pivot_0 += BlockReduce<float , BLOCK_THREADS>(temp_storage.block_prim .reduce )
1018
- .Sum <VEC_SIZE>(probs_gt_pivot_0);
1018
+ .template Sum <VEC_SIZE>(probs_gt_pivot_0);
1019
1019
if (tx == 0 ) {
1020
1020
temp_storage.block_aggregate .value = aggregate_gt_pivot_0;
1021
1021
}
1022
1022
__syncthreads ();
1023
1023
aggregate_gt_pivot_0 = temp_storage.block_aggregate .value ;
1024
1024
1025
1025
aggregate_gt_pivot_1 += BlockReduce<float , BLOCK_THREADS>(temp_storage.block_prim .reduce )
1026
- .Sum <VEC_SIZE>(probs_gt_pivot_1);
1026
+ .template Sum <VEC_SIZE>(probs_gt_pivot_1);
1027
1027
if (tx == 0 ) {
1028
1028
temp_storage.block_aggregate .value = aggregate_gt_pivot_1;
1029
1029
}
@@ -1676,12 +1676,12 @@ __global__ void TopPRenormProbKernel(DType* probs, DType* renormed_prob, float*
1676
1676
1677
1677
aggregate_gt_pivot_0 +=
1678
1678
BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim .reduce )
1679
- .Sum <VEC_SIZE>(probs_gt_pivot_0);
1679
+ .template Sum <VEC_SIZE>(probs_gt_pivot_0);
1680
1680
__syncthreads ();
1681
1681
1682
1682
aggregate_gt_pivot_1 +=
1683
1683
BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim .reduce )
1684
- .Sum <VEC_SIZE>(probs_gt_pivot_1);
1684
+ .template Sum <VEC_SIZE>(probs_gt_pivot_1);
1685
1685
__syncthreads ();
1686
1686
}
1687
1687
min_gt_low = BlockReduce<float , BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim .reduce )
@@ -1917,12 +1917,12 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
1917
1917
1918
1918
aggregate_gt_pivot_0 += BlockReduce<ValueCount<float >, BLOCK_THREADS, REDUCE_ALGORITHM>(
1919
1919
temp_storage.block_prim .reduce_value_count )
1920
- .Sum <VEC_SIZE>(probs_gt_pivot_0_pair);
1920
+ .template Sum <VEC_SIZE>(probs_gt_pivot_0_pair);
1921
1921
__syncthreads ();
1922
1922
1923
1923
aggregate_gt_pivot_1 += BlockReduce<ValueCount<float >, BLOCK_THREADS, REDUCE_ALGORITHM>(
1924
1924
temp_storage.block_prim .reduce_value_count )
1925
- .Sum <VEC_SIZE>(probs_gt_pivot_1_pair);
1925
+ .template Sum <VEC_SIZE>(probs_gt_pivot_1_pair);
1926
1926
__syncthreads ();
1927
1927
}
1928
1928
min_gt_low =
0 commit comments