Commit 7145db6

Author: zhangkaihuo

Revert "SparseConv support duplicate coordinates (#44976)" (#45202) (#47699)

Reverts the SparseConv support for duplicate coordinates.

1 parent 72e1eb6 · commit 7145db6

File tree

4 files changed: +53 -168 lines


paddle/phi/kernels/funcs/sparse/scatter.cu.h

Lines changed: 2 additions & 7 deletions
@@ -79,7 +79,6 @@ __global__ void ScatterKernelV2(const T* input,
                                 const int* index_groups,
                                 const int non_zero_num,
                                 const int kernel_size,
-                                const int max_voxel,
                                 const int channels,
                                 const int buffer_counts,
                                 T* out) {
@@ -97,11 +96,10 @@ __global__ void ScatterKernelV2(const T* input,
                           &sums);
     for (int it = 0; it < buffer_counts; it++) {
       int len = index_counts[indices_i + it * non_zero_num];
-      const int group_offset = it * max_voxel * kernel_size * non_zero_num;
+      const int group_offset = it * kernel_size * non_zero_num;
       for (int j = 0; j < len; j++) {
         const int out_feature_i =
-            index_groups[indices_i * max_voxel * kernel_size + j +
-                         group_offset];
+            index_groups[indices_i * kernel_size + j + group_offset];
         LoadT vec_in;
         phi::Load<T, VecSize>(
             input + out_feature_i * channels + channels_i * VecSize, &vec_in);
@@ -123,7 +121,6 @@ void ScatterV2(const GPUContext& dev_ctx,
                const int* index_groups,
                const int non_zero_num,
                const int kernel_size,
-               const int max_voxel,
                const int channels,
                const int buffer_counts,
                T* output) {
@@ -139,7 +136,6 @@ void ScatterV2(const GPUContext& dev_ctx,
                                                       index_groups,
                                                       non_zero_num,
                                                       kernel_size,
-                                                      max_voxel,
                                                       channels,
                                                       buffer_counts,
                                                       output);
@@ -154,7 +150,6 @@ void ScatterV2(const GPUContext& dev_ctx,
                                                       index_groups,
                                                       non_zero_num,
                                                       kernel_size,
-                                                      max_voxel,
                                                       channels,
                                                       buffer_counts,
                                                       output);
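
The removed max_voxel factor was the slack reserved for duplicate input coordinates: with it gone, each row of index_groups holds at most kernel_size entries per buffer, and the per-row stride drops back to kernel_size. Below is a minimal standalone sketch of the post-revert layout; the names are illustrative and it omits Paddle's vectorized loads and multi-buffer loop, so it is a reading aid rather than the actual kernel.

// Hypothetical sketch of the post-revert scatter indexing (single buffer).
__global__ void ScatterSumSketch(const float* gathered,   // rulebook-ordered rows
                                 const int* index_counts, // entries per output row
                                 const int* index_groups, // slots, stride = kernel_size
                                 int non_zero_num,
                                 int kernel_size,
                                 int channels,
                                 float* out) {
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= non_zero_num) return;
  int len = index_counts[row];
  for (int c = 0; c < channels; ++c) {
    float sum = 0.f;
    for (int j = 0; j < len; ++j) {
      // post-revert stride: kernel_size, not kernel_size * max_voxel
      int src = index_groups[row * kernel_size + j];
      sum += gathered[src * channels + c];
    }
    out[row * channels + c] = sum;
  }
}

The real ScatterKernelV2 additionally walks buffer_counts buffers, offsetting index_counts by it * non_zero_num and index_groups by it * kernel_size * non_zero_num, exactly as in the hunk above.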

paddle/phi/kernels/sparse/gpu/conv.cu.h

Lines changed: 27 additions & 120 deletions
@@ -66,7 +66,6 @@ __global__ void GatherKernelV2(const T* inputs,
                                const int* index_groups,
                                const int non_zero_num,
                                const int kernel_size,
-                               const int max_voxel,
                                const int channels,
                                const int buffer_count,
                                T* output) {
@@ -84,11 +83,10 @@ __global__ void GatherKernelV2(const T* inputs,
 #pragma unroll
     for (int it = 0; it < buffer_count; it++) {
       int len = index_counts[indices_i + it * non_zero_num];
-      const int group_offset = it * kernel_size * max_voxel * non_zero_num;
+      const int group_offset = it * kernel_size * non_zero_num;
 #pragma unroll
       for (int j = 0; j < len; j++) {
-        int out_i = index_groups[indices_i * kernel_size * max_voxel + j +
-                                 group_offset];
+        int out_i = index_groups[indices_i * kernel_size + j + group_offset];
         phi::Store<T, VecSize>(
             in_vec, output + out_i * channels + channels_i * VecSize);
       }
@@ -130,7 +128,6 @@ inline void GatherV2(const GPUContext& dev_ctx,
                      const int* index_groups,
                      const int non_zero_num,
                      const int kernel_size,
-                     const int max_voxel,
                      const int channels,
                      const int buffer_count,
                      T* output) {
@@ -146,7 +143,6 @@ inline void GatherV2(const GPUContext& dev_ctx,
                                                     index_groups,
                                                     non_zero_num,
                                                     kernel_size,
-                                                    max_voxel,
                                                     channels,
                                                     buffer_count,
                                                     output);
@@ -161,7 +157,6 @@ inline void GatherV2(const GPUContext& dev_ctx,
                                                     index_groups,
                                                     non_zero_num,
                                                     kernel_size,
-                                                    max_voxel,
                                                     channels,
                                                     buffer_count,
                                                     output);
@@ -207,7 +202,7 @@ __global__ void UniqueKernel(const IntT* in_indexs,
 template <typename IntT>
 __global__ void GroupIndexs(const int* out_index_table,
                             const int n,
-                            const int offset,
+                            const int kernel_size,
                             IntT* out_indexs,
                             int* out_index_counts,
                             int* out_index_groups) {
@@ -219,7 +214,7 @@ __global__ void GroupIndexs(const int* out_index_table,
     // kernel_size at most
     int j = atomicAdd(out_index_counts + real_index, 1);
     // nnz * kernel_size
-    out_index_groups[real_index * offset + j] = i;
+    out_index_groups[real_index * kernel_size + j] = i;
   }
 }
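
Both GroupIndexs overloads rely on the same invariant the revert restores: once duplicate coordinates are rejected, an output index can be claimed at most kernel_size times, so a fixed-stride slot array indexed by atomicAdd can never overflow. A self-contained sketch of that grouping pattern, with illustrative names rather than Paddle's API:

// Group entry ids by key: counts[key] tracks how many slots a key has
// claimed, and each claimed slot records the entry id that landed there.
__global__ void GroupByKeySketch(const int* keys, int n, int kernel_size,
                                 int* counts,   // zero-initialized, one per key
                                 int* groups) { // kernel_size slots per key
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  int key = keys[i];
  int j = atomicAdd(counts + key, 1);  // claim the next free slot for this key
  groups[key * kernel_size + j] = i;
}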

@@ -303,36 +298,18 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
   }
 }
 
-template <typename IntT, bool save_out_index = true>
+template <typename IntT>
 __global__ void GetOutIndexTable(const IntT* indices,
                                  const IntT non_zero_num,
                                  const Dims4D dims,
-                                 int* out_index_table,
-                                 int* out_index_table2,
-                                 int* max_voxel) {
-  __shared__ int cache_max;
-  if (threadIdx.x == 0) {
-    cache_max = 0;
-  }
-  __syncthreads();
-
+                                 int* out_index_table) {
   CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
     IntT batch = indices[i];
     IntT in_z = indices[i + non_zero_num];
     IntT in_y = indices[i + 2 * non_zero_num];
     IntT in_x = indices[i + 3 * non_zero_num];
     IntT index = PointToIndex(batch, in_x, in_y, in_z, dims);
-    if (save_out_index) {
-      out_index_table[index] = i == 0 ? -1 : i;
-    }
-
-    int count = atomicAdd(out_index_table2 + index, 1);
-    atomicMax(&cache_max, count);
-  }
-
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    atomicMax(max_voxel, cache_max + 1);
+    out_index_table[index] = i == 0 ? -1 : i;
   }
 }
 
@@ -341,22 +318,10 @@ __global__ void GetOutIndexTable(int* indexs,
                                  const int non_zero_num,
                                  const Dims4D out_dims,
                                  int* out_index_table,
-                                 int* out_index_table2,
-                                 int* max_voxel,
                                  IntT* out_indices) {
-  __shared__ int cache_max;
-  if (threadIdx.x == 0) {
-    cache_max = 0;
-  }
-  __syncthreads();
-
   CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
     IntT index = static_cast<IntT>(indexs[i]);
     out_index_table[index] = i;
-
-    int count = atomicAdd(out_index_table2 + index, 1);
-    atomicMax(&cache_max, count);
-
     IntT batch, x, y, z;
     phi::funcs::sparse::IndexToPoint<Dims4D>(
         index, out_dims, &batch, &x, &y, &z);
@@ -367,11 +332,6 @@ __global__ void GetOutIndexTable(int* indexs,
     out_indices[i + non_zero_num * 3] = x;
     indexs[i] = 0;
   }
-
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    atomicMax(max_voxel, cache_max + 1);
-  }
 }
 
 template <typename IntT>
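
Both GetOutIndexTable overloads lose the same machinery in this revert: a per-voxel occupancy counter (out_index_table2) plus a block-shared running maximum that thread 0 folded into a global max_voxel, i.e. the worst-case number of input points sharing one voxel. A sketch of that removed counting pattern, assuming a zero-initialized counter table and using illustrative names:

// Sketch of the pattern the revert deletes: count duplicates per voxel,
// keep a per-block maximum in shared memory, then reduce it globally.
__global__ void MaxDuplicatesSketch(const int* voxel_ids, int n,
                                    int* per_voxel_counts,  // zero-initialized
                                    int* max_voxel) {       // global maximum
  __shared__ int cache_max;
  if (threadIdx.x == 0) cache_max = 0;
  __syncthreads();
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    // how many points were already seen in this voxel before this one
    int count = atomicAdd(per_voxel_counts + voxel_ids[i], 1);
    atomicMax(&cache_max, count);
  }
  __syncthreads();
  if (threadIdx.x == 0) atomicMax(max_voxel, cache_max + 1);
}

Dropping it also removes a device-to-host copy and a dev_ctx.Wait() from ProductRuleBook (see the host-side hunks below), at the cost of no longer tolerating duplicate input coordinates.
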
@@ -491,7 +451,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
 
 template <typename IntT>
 __global__ void GroupIndexs(const int n,
-                            const int offset,
+                            const int kernel_size,
                             const IntT* indexs,
                             int* index_counts,
                             int* index_groups) {
@@ -500,15 +460,15 @@ __global__ void GroupIndexs(const int n,
     // kernel_size at most
     int j = atomicAdd(index_counts + index, 1);
     // nnz * kernel_size
-    index_groups[index * offset + j] = i;
+    index_groups[index * kernel_size + j] = i;
   }
 }
 
 // double space to reduce atomicAdd conflict
 template <typename IntT>
 __global__ void GroupIndexsV2(const int rulebook_len,
                               const int non_zero_num,
-                              const int offset,
+                              const int kernel_size,
                               const int half_kernel_offset,
                               const IntT* indexs,
                               int* index_counts,
@@ -519,11 +479,11 @@ __global__ void GroupIndexsV2(const int rulebook_len,
         i < half_kernel_offset ? index_counts : index_counts + non_zero_num;
     int* groups_ptr = i < half_kernel_offset
                           ? index_groups
-                          : index_groups + non_zero_num * offset;
+                          : index_groups + non_zero_num * kernel_size;
     // conflict kernel_size times at most
     int j = atomicAdd(counts_ptr + index, 1);
     // nnz * kernel_size
-    groups_ptr[index * offset + j] = i;
+    groups_ptr[index * kernel_size + j] = i;
   }
 }
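
The "double space to reduce atomicAdd conflict" comment describes a contention trick that survives the revert: entries falling in the first half of the kernel window write to one counts/groups buffer and the rest to a second one, roughly halving atomicAdd conflicts on hot rows; the gather/scatter kernels above then walk both buffers via buffer_count. A minimal sketch under those assumptions, with illustrative names:

__global__ void GroupTwoBuffersSketch(const int* keys, int n, int non_zero_num,
                                      int kernel_size, int half_offset,
                                      int* counts,   // 2 * non_zero_num entries
                                      int* groups) { // 2 * nnz * kernel_size slots
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  int key = keys[i];
  // first half of the kernel window uses buffer 0, second half buffer 1
  int* counts_ptr = i < half_offset ? counts : counts + non_zero_num;
  int* groups_ptr =
      i < half_offset ? groups : groups + non_zero_num * kernel_size;
  int j = atomicAdd(counts_ptr + key, 1);
  groups_ptr[key * kernel_size + j] = i;
}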

@@ -622,10 +582,6 @@ int ProductRuleBook(const Context& dev_ctx,
   DenseTensor out_index_table = phi::Empty<int>(dev_ctx, {table_size});
   int* out_index_table_ptr = out_index_table.data<int>();
 
-  DenseTensor out_index_table2 = phi::Empty<int>(dev_ctx, {table_size + 1});
-  int* out_index_table2_ptr = out_index_table2.data<int>();
-  int* h_max_voxel = h_counter + kernel_size;
-
   if (subm) {
     DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));
     IntT* rulebook_ptr = tmp_rulebook.data<IntT>();
@@ -636,29 +592,14 @@ int ProductRuleBook(const Context& dev_ctx,
 
     phi::backends::gpu::GpuMemsetAsync(
         out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
-    phi::backends::gpu::GpuMemsetAsync(out_index_table2_ptr,
-                                       0,
-                                       sizeof(int) * (table_size + 1),
-                                       dev_ctx.stream());
 
     auto config =
         phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
-    GetOutIndexTable<IntT>
-        <<<config.block_per_grid,
-           config.thread_per_block,
-           0,
-           dev_ctx.stream()>>>(out_indices.data<IntT>(),
-                               non_zero_num,
-                               d_x_dims,
-                               out_index_table_ptr,
-                               out_index_table2_ptr,
-                               out_index_table2_ptr + table_size);
-    phi::backends::gpu::GpuMemcpyAsync(h_max_voxel,
-                                       out_index_table2_ptr + table_size,
-                                       sizeof(int),
-                                       gpuMemcpyDeviceToHost,
-                                       dev_ctx.stream());
-    dev_ctx.Wait();
+    GetOutIndexTable<IntT><<<config.block_per_grid,
+                             config.thread_per_block,
+                             0,
+                             dev_ctx.stream()>>>(
+        out_indices.data<IntT>(), non_zero_num, d_x_dims, out_index_table_ptr);
 
     size_t cache_size =
         kernel_size * 2 * sizeof(int) +
@@ -712,22 +653,6 @@ int ProductRuleBook(const Context& dev_ctx,
                                          out_rulebook_ptr);
     *rulebook = out_rulebook;
 
-    unique_value->ResizeAndAllocate(
-        {static_cast<int>(non_zero_num * h_max_voxel[0] * kernel_size)});
-    int* unique_value_ptr = unique_value->data<int>();
-    out_index->ResizeAndAllocate({static_cast<int>(rulebook_len)});
-    int* out_index_ptr = out_index->data<int>();
-    phi::backends::gpu::GpuMemsetAsync(
-        out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
-    GroupIndexs<<<config.block_per_grid,
-                  config.thread_per_block,
-                  0,
-                  dev_ctx.stream()>>>(rulebook_len,
-                                      kernel_size * h_max_voxel[0],
-                                      out_rulebook_ptr + rulebook_len,
-                                      out_index_ptr,
-                                      unique_value_ptr);
-
     return rulebook_len;
 
   } else {
@@ -811,43 +736,25 @@ int ProductRuleBook(const Context& dev_ctx,
 
     IntT* out_indices_ptr = out_indices.data<IntT>();
 
-    phi::backends::gpu::GpuMemsetAsync(
-        out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
-    phi::backends::gpu::GpuMemsetAsync(out_index_table2_ptr,
-                                       0,
-                                       sizeof(int) * (table_size + 1),
-                                       dev_ctx.stream());
-
     config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1);
-    GetOutIndexTable<IntT>
-        <<<config.block_per_grid,
-           config.thread_per_block,
-           0,
-           dev_ctx.stream()>>>(out_index_ptr,
-                               out_nnz,
-                               d_out_dims,
-                               out_index_table_ptr,
-                               out_index_table2_ptr,
-                               out_index_table2_ptr + table_size,
-                               out_indices_ptr);
-    phi::backends::gpu::GpuMemcpyAsync(h_max_voxel,
-                                       out_index_table2_ptr + table_size,
-                                       sizeof(int),
-                                       gpuMemcpyDeviceToHost,
-                                       dev_ctx.stream());
-    dev_ctx.Wait();
-
+    GetOutIndexTable<IntT><<<config.block_per_grid,
+                             config.thread_per_block,
+                             0,
+                             dev_ctx.stream()>>>(out_index_ptr,
+                                                 out_nnz,
+                                                 d_out_dims,
+                                                 out_index_table_ptr,
+                                                 out_indices_ptr);
     config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
-    unique_value->ResizeAndAllocate(
-        {static_cast<int>(out_nnz * h_max_voxel[0] * kernel_size)});
+    unique_value->ResizeAndAllocate({static_cast<int>(out_nnz * kernel_size)});
     int* unique_value_ptr = unique_value->data<int>();
 
     GroupIndexs<<<config.block_per_grid,
                   config.thread_per_block,
                   0,
                   dev_ctx.stream()>>>(out_index_table_ptr,
                                       rulebook_len,
-                                      kernel_size * h_max_voxel[0],
+                                      kernel_size,
                                       rulebook_ptr + rulebook_len,
                                       out_index_ptr,
                                       unique_value_ptr);
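
The host-side effect of the revert is visible in sizing and synchronization: unique_value shrinks from out_nnz * max_voxel * kernel_size slots to out_nnz * kernel_size, and the GpuMemcpyAsync plus dev_ctx.Wait() pair that fetched max_voxel before each grouping launch disappears, so the host no longer stalls on the stream there. A rough arithmetic sketch with hypothetical sizes, only to show the buffer-shape change:

#include <cstdio>

int main() {
  // Hypothetical values; nothing here comes from the diff itself.
  const long out_nnz = 10000;   // output non-zero count
  const long kernel_size = 27;  // 3 x 3 x 3 window
  const long max_voxel = 4;     // worst-case duplicates per voxel (pre-revert)
  printf("pre-revert  slots: %ld\n", out_nnz * max_voxel * kernel_size);  // 1080000
  printf("post-revert slots: %ld\n", out_nnz * kernel_size);             //  270000
  return 0;
}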
