Commit 37af1d1

fix model compression error (#1043)
* fix model compression error
* add doc for model compression limitation
1 parent 5ab5fa1 commit 37af1d1

File tree

4 files changed, +7 -6 lines changed


doc/train/gpu-limitations.md

Lines changed: 2 additions & 1 deletion
@@ -3,4 +3,5 @@ If you use deepmd-kit in a GPU environment, the acceptable value range of some v
 1. The number of atom type of a given system must be less than 128.
 2. The maximum distance between an atom and it's neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters.
 3. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is actually limited by the GPU memory size currently, usually within 1000,000 atoms even at the model compression mode.
-4. The total sel value of training parameters(in model/descriptor section) must be less than 4096.
+4. The total sel value of training parameters(in model/descriptor section) must be less than 4096.
+5. The size of the last layer of embedding net must be less than 1024 during the model compression process.
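The limits in items 4 and 5 refer to values set in the model/descriptor section of the training input: the sel list and (for compression) the last entry of the descriptor's neuron list, which is the embedding net's last layer size. A hypothetical pre-flight check, not part of deepmd-kit, sketching how those two limits could be verified up front:

```cpp
// Hypothetical helper (illustrative only): checks the summed sel and the
// embedding-net last-layer limits described in the documentation above.
#include <numeric>
#include <stdexcept>
#include <vector>

void check_descriptor_limits(const std::vector<int>& sel,     // model/descriptor/sel
                             const std::vector<int>& neuron)  // model/descriptor/neuron
{
  const int sel_total = std::accumulate(sel.begin(), sel.end(), 0);
  if (sel_total >= 4096)
    throw std::invalid_argument("total sel must be less than 4096");
  if (!neuron.empty() && neuron.back() >= 1024)
    throw std::invalid_argument(
        "last embedding-net layer must be less than 1024 for model compression");
}
```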

source/lib/src/cuda/tabulate.cu

Lines changed: 2 additions & 2 deletions
@@ -135,8 +135,8 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
   bool unloop = false;
   FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy
   for (int ii = 0; ii < MTILE; ii++) {
-    if (thread_idx < last_layer_size) {
-      iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx];
+    for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) {
+      iteratorA[ii * last_layer_size + jj] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj];
     }
   }
   __syncthreads();
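The change replaces a per-thread guard with a block-stride loop, so every element of each dy row is staged into shared memory even when last_layer_size exceeds the number of threads in the block. A minimal, self-contained sketch of the same pattern; the kernel and variable names here are illustrative and not taken from deepmd-kit:

```cuda
// Illustrative kernel: each block copies one row of `src` (row_len elements)
// into dynamic shared memory with a block-stride loop, then consumes it.
__global__ void copy_row_block_stride(const float* src, float* dst, int row_len) {
  extern __shared__ float buf[];
  const int row = blockIdx.x;
  // Block-stride loop: covers the whole row even when row_len > blockDim.x,
  // which is the case a plain `if (threadIdx.x < row_len)` guard cannot handle.
  for (int jj = threadIdx.x; jj < row_len; jj += blockDim.x) {
    buf[jj] = src[row * row_len + jj];
  }
  __syncthreads();
  // Use buf[]; here we simply write it back to show the staged row is complete.
  for (int jj = threadIdx.x; jj < row_len; jj += blockDim.x) {
    dst[row * row_len + jj] = buf[jj];
  }
}

// Example launch: one block per row, dynamic shared memory sized to the row.
// copy_row_block_stride<<<nrows, 128, row_len * sizeof(float)>>>(src, dst, row_len);
```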

source/lib/src/rocm/tabulate.hip.cu

Lines changed: 2 additions & 3 deletions
@@ -6,7 +6,6 @@
 #define TPB 256
 #define WARP_SIZE 64
 #define FULL_MASK 0xffffffff
-#include "gpu_rocm.h"
 
 template <typename FPTYPE>
 __forceinline__ __device__
@@ -140,8 +139,8 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
   bool unloop = false;
   FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy
   for (int ii = 0; ii < MTILE; ii++) {
-    if (thread_idx < last_layer_size) {
-      iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx];
+    for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) {
+      iteratorA[ii * last_layer_size + jj] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj];
     }
   }
   __syncthreads();

source/op/tabulate_multi_device.cc

Lines changed: 1 addition & 0 deletions
@@ -222,6 +222,7 @@ class TabulateFusionGradGradOp : public OpKernel {
           dz_dy,
           table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size);
 #endif // TENSORFLOW_USE_ROCM
+      OP_REQUIRES (context, (last_layer_size <= 1024), errors::InvalidArgument ("In the process of model compression, the size of the last layer of embedding net must be less than 1024!"));
     }
     else if (device == "CPU") {
       deepmd::tabulate_fusion_grad_grad_cpu(
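The added OP_REQUIRES makes an oversized last_layer_size fail fast with a clear InvalidArgument error instead of reaching a GPU kernel that cannot cover it. A rough sketch of this validation pattern inside a TensorFlow custom op's Compute method; the surrounding op and the `last_layer_size_` member are stand-ins, only the OP_REQUIRES / errors::InvalidArgument usage mirrors the commit:

```cpp
// Sketch only (hypothetical op): validate a size before dispatching to the
// GPU path. A CUDA/ROCm block typically holds at most 1024 threads, so larger
// last layers are rejected up front during model compression.
void Compute(OpKernelContext* context) override {
  OP_REQUIRES(context, last_layer_size_ <= 1024,
              errors::InvalidArgument(
                  "In the process of model compression, the size of the last "
                  "layer of embedding net must be less than 1024!"));
  // ... existing GPU/CPU dispatch continues here ...
}
```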
