We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 1fee196 · commit 02cee70 · Copy full SHA for 02cee70
aten/src/ATen/native/cuda/Reduce.cuh
@@ -1159,7 +1159,8 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
1159
config.ctas_per_output = div_up(num_mp, 2);
1160
else if (config.ctas_per_output < 16)
1161
config.ctas_per_output = 1;
1162
- if (iter.ndim() == 3 && !reduction_on_fastest_striding_dimension)
+ bool is_channel_last = iter.tensor_base(1).is_contiguous(at::MemoryFormat::ChannelsLast);
1163
+ if (iter.ndim() == 3 && !reduction_on_fastest_striding_dimension && !is_channel_last)
1164
config.ctas_per_output = 4;
1165
#endif
1166
if (config.ctas_per_output > 1) {
0 commit comments