
Commit 10dffc6

Merge pull request #13618 from typhoonzero/revert_13530

Revert "Some trivial optimization (#13530)"

2 parents 46f2554 + a4f7696

10 files changed: +44 −116 lines

paddle/fluid/framework/op_info.h

Lines changed: 6 additions & 11 deletions

@@ -38,31 +38,27 @@ struct OpInfo {
   OpAttrChecker* checker_{nullptr};
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
-  std::string op_type_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
   }
 
   const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
-                            op_type_);
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
     PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator %s Proto must be initialized in op info",
-                   op_type_);
+                   "Operator Proto must be initialized in op info");
     return *proto_;
   }
 
   const OpCreator& Creator() const {
-    PADDLE_ENFORCE_NOT_NULL(
-        creator_, "Operator %s Creator has not been registered", op_type_);
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
     return creator_;
   }
 
   const GradOpMakerFN& GradOpMaker() const {
     PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
-                            "Operator %s GradOpMaker has not been registered.",
-                            op_type_);
+                            "Operator GradOpMaker has not been registered.");
    return grad_op_maker_;
  }
 
@@ -77,9 +73,8 @@ class OpInfoMap {
     return map_.find(op_type) != map_.end();
   }
 
-  void Insert(const std::string& type, OpInfo info) {
+  void Insert(const std::string& type, const OpInfo& info) {
     PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
-    info.op_type_ = type;
     map_.insert({type, info});
   }
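
For context on the Insert signature change above: taking OpInfo by value let the reverted code stamp the op type onto its local copy before storing it, while the restored signature takes a const reference and stores the struct unchanged. A minimal sketch of the two patterns, using a hypothetical Info struct and std::map rather than the real OpInfoMap internals:

#include <map>
#include <string>
#include <utility>

struct Info {
  std::string op_type_;  // stand-in for OpInfo's removed field
};

// Reverted pattern: pass by value, mutate the local copy, then store it.
void InsertByValue(std::map<std::string, Info>* m, const std::string& type,
                   Info info) {
  info.op_type_ = type;  // legal: `info` is this function's own copy
  m->insert({type, std::move(info)});
}

// Restored pattern: pass by const reference, store the struct unchanged.
void InsertByRef(std::map<std::string, Info>* m, const std::string& type,
                 const Info& info) {
  m->insert({type, info});
}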

paddle/fluid/operators/read_op.cc

Lines changed: 0 additions & 2 deletions

@@ -45,12 +45,10 @@ class ReadInferVarType : public framework::VarTypeInference {
     framework::VarDesc* reader = block->FindVarRecursive(reader_name);
     auto dtypes = reader->GetDataTypes();
     PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
-    auto lod_levels = reader->GetLoDLevels();
     for (size_t i = 0; i < dtypes.size(); ++i) {
       framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
       out.SetType(framework::proto::VarType::LOD_TENSOR);
       out.SetDataType(dtypes[i]);
-      out.SetLoDLevel(lod_levels[i]);
     }
   }
 };

paddle/fluid/operators/sgd_op.cu

Lines changed: 20 additions & 21 deletions

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
+#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sgd_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -33,21 +33,22 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
   }
 }
 
-template <typename T>
+template <typename T, int block_size>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                        const int64_t* rows,
                                        const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel, int64_t limit) {
-  for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
-    const T* selected_rows_ptr = selected_rows + i * row_numel;
-    T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
-    for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
-      // Since index in rows of SelectedRows can be duplicate, we have to use
-      // Atomic Operation to avoid concurrent write error.
-      paddle::platform::CudaAtomicAdd(
-          tensor_out_ptr + index,
-          -1.0 * learning_rate[0] * selected_rows_ptr[index]);
-    }
+                                       int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we have to use
+    // Atomic Operation to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(
+        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
   }
 }
 }  // namespace
 
@@ -96,15 +97,13 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       auto* in_data = in_value.data<T>();
       auto* out_data = param_out->data<T>();
 
-      const int kThreadsPerBlock = 256;
-      int thread_x = kThreadsPerBlock;
-      int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
-      int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-
-      SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
-                               ctx.cuda_device_context().stream()>>>(
+      const int block_size = 256;
+      dim3 threads(block_size, 1);
+      dim3 grid(1, in_rows.size());
+      SparseSGDFunctorKernel<
+          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
           in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
-          out_data, in_row_numel, in_rows.size());
+          out_data, in_row_numel);
 
     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");
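
Both the removed grid-stride kernel and the restored two-dimensional-grid kernel keep the CudaAtomicAdd, because a SelectedRows gradient may list the same row index more than once. A minimal standalone sketch of that hazard, using plain atomicAdd rather than paddle's CudaAtomicAdd wrapper and a hypothetical ScatterAddRows kernel:

#include <cstdint>

// One block per source row; `rows` may contain duplicates (e.g. {0, 2, 0}),
// so two blocks can target the same destination row concurrently. atomicAdd
// keeps the accumulated sum exact where a plain `+=` would race.
__global__ void ScatterAddRows(const float* src, const int64_t* rows,
                               float* dst, int64_t row_numel) {
  const float* src_row = src + blockIdx.x * row_numel;
  float* dst_row = dst + rows[blockIdx.x] * row_numel;
  for (int64_t i = threadIdx.x; i < row_numel; i += blockDim.x) {
    atomicAdd(dst_row + i, src_row[i]);
  }
}

// Launch sketch: one block per selected row, 256 threads striding over it.
// ScatterAddRows<<<num_rows, 256>>>(src, rows, dst, row_numel);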

paddle/fluid/operators/shrink_rnn_memory_op.cc

Lines changed: 8 additions & 21 deletions

@@ -52,26 +52,16 @@ class ShrinkRNNMemoryOp : public ArrayOp {
     size_t height = dst_num_rows;
 
     // do shrink for the top level LoD
-
     if (x_tensor.lod().size() > 0 &&
         x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
-      if (x_tensor.lod().size() > 1) {  // MultiLevel LoD
-        auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(
-            x_tensor.lod(), 0, dst_num_rows, 0);
-        height = lod_offset.second.second;
-        auto out_lod = out_tensor.mutable_lod();
-        framework::AppendLoD(out_lod, lod_offset.first);
-      } else {
-        // Shrink LoD
-        auto lod_item = x_tensor.lod()[0];
-        lod_item.resize(dst_num_rows + 1);
-        out_tensor.set_lod({lod_item});
-        const auto &const_lod_item = lod_item;
-        height = const_lod_item.back();
-      }
+      auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
+                                                              dst_num_rows, 0);
+      height = lod_offset.second.second;
+      auto out_lod = out_tensor.mutable_lod();
+      framework::AppendLoD(out_lod, lod_offset.first);
     }
 
-    if (height != 0) {
+    if (dst_num_rows != 0) {
       out_tensor.mutable_data(place, x_tensor.type());
       auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
@@ -144,11 +134,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     } else {
       auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
       auto height = dout_tensor.dims()[0];
-      if (height != 0) {
-        auto slice = dx_tensor.Slice(0, static_cast<int>(height));
-        framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
-                              &slice);
-      }
+      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
+      framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
       if (dx_tensor.dims()[0] > height) {
         auto rest_tensor = dx_tensor.Slice(
             static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));

paddle/fluid/platform/device_context.cc

Lines changed: 0 additions & 5 deletions

@@ -201,7 +201,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   compute_capability = GetCUDAComputeCapability(place_.device);
   multi_process = GetCUDAMultiProcessors(place_.device);
   max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
-  grid_max_dims_ = GpuMaxGridDim(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
@@ -240,10 +239,6 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
   return multi_process * max_threads_per_mp;
 }
 
-std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
-  return grid_max_dims_;
-}
-
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }

paddle/fluid/platform/device_context.h

Lines changed: 0 additions & 5 deletions

@@ -13,7 +13,6 @@ limitations under the License. */
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
-#include <tuple>
 #include <unordered_map>
 #include <vector>
 
@@ -92,8 +91,6 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief Return the max physical thread count in the device context */
   int GetMaxPhysicalThreadCount() const;
 
-  std::tuple<int, int, int> GetMaxGridDims() const;
-
   /*! \brief Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
 
@@ -138,8 +135,6 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cublasHandle_t cublas_handle_;
 
-  std::tuple<int, int, int> grid_max_dims_;
-
   int compute_capability;
   int multi_process;
   int max_threads_per_mp;

paddle/fluid/platform/for_range.h

Lines changed: 10 additions & 29 deletions

@@ -48,54 +48,35 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
 }
 
 template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
+__global__ static void ForRangeElemwiseOp(Function func, int limit) {
   size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
   if (idx < limit) {
     func(idx);
   }
 }
 
-template <typename Function>
-__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
-                                                   int grid_dim) {
-  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
-  while (idx < limit) {
-    func(idx);
-    idx += grid_dim;
-  }
-}
-
 template <>
 struct ForRange<CUDADeviceContext> {
   ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(limit) {}
+      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
 
   template <typename Function>
   inline void operator()(Function func) const {
     constexpr int num_threads = 1024;
     int block_size = limit_ <= num_threads ? limit_ : num_threads;
-    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims());
-
-    if (grid_size < max_grid_dim) {
-      int grid_size_int = static_cast<int>(grid_size);
-      if (grid_size == 1) {
-        ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-            func);
-      } else {
-        ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
-            func, limit_);
-      }
+    int grid_size = (limit_ + num_threads - 1) / num_threads;
+
+    if (grid_size == 1) {
+      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+          func);
     } else {
-      ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0,
-                                    dev_ctx_.stream()>>>(func, limit_,
-                                                         max_grid_dim);
+      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
+          func, limit_);
     }
   }
 
   const CUDADeviceContext& dev_ctx_;
-  size_t limit_;
+  int limit_;
 };
 
 #endif
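
The removed GridLarge path is a variant of the standard grid-stride idiom: cap the grid size and let each thread cover several indices. For reference, a minimal sketch of the conventional form of that idiom, not the reverted kernel verbatim (the reverted kernel strides by grid_dim alone, whereas the usual stride is the total thread count, gridDim.x * blockDim.x):

// Grid-stride loop: covers `limit` indices with however many blocks were
// launched, so the grid can be capped at a device or tuning limit.
template <typename Function>
__global__ void GridStrideForRange(Function func, size_t limit) {
  size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < limit; idx += stride) {
    func(idx);
  }
}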

paddle/fluid/platform/gpu_info.cc

Lines changed: 0 additions & 17 deletions

@@ -152,22 +152,5 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
                  "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
 }
-
-std::tuple<int, int, int> GpuMaxGridDim(int id) {
-  std::tuple<int, int, int> result;
-  PADDLE_ENFORCE(
-      cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id),
-      "cudaDeviceGetAttribute failed in "
-      "cudaDevAttrMaxBlockDim");
-  PADDLE_ENFORCE(
-      cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id),
-      "cudaDeviceGetAttribute failed in "
-      "cudaDevAttrMaxBlockDim");
-  PADDLE_ENFORCE(
-      cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id),
-      "cudaDeviceGetAttribute failed in "
-      "cudaDevAttrMaxBlockDim");
-  return result;
-}
 }  // namespace platform
 }  // namespace paddle
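
One detail visible in the removed helper: despite its name, GpuMaxGridDim queries cudaDevAttrMaxBlockDimX/Y/Z, the per-block thread limits, rather than the grid limits. For comparison, a sketch of querying the grid dimensions themselves (hypothetical MaxGridDims name, plain CUDA runtime API with error checking omitted for brevity):

#include <cuda_runtime.h>
#include <tuple>

// cudaDevAttrMaxGridDimX/Y/Z are the per-launch grid limits for device `id`;
// the removed helper queried cudaDevAttrMaxBlockDimX/Y/Z instead.
std::tuple<int, int, int> MaxGridDims(int id) {
  int x = 0, y = 0, z = 0;
  cudaDeviceGetAttribute(&x, cudaDevAttrMaxGridDimX, id);
  cudaDeviceGetAttribute(&y, cudaDevAttrMaxGridDimY, id);
  cudaDeviceGetAttribute(&z, cudaDevAttrMaxGridDimZ, id);
  return std::make_tuple(x, y, z);
}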

paddle/fluid/platform/gpu_info.h

Lines changed: 0 additions & 3 deletions

@@ -19,7 +19,6 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <stddef.h>
 #include <string>
-#include <tuple>
 
 namespace paddle {
 namespace platform {
@@ -73,8 +72,6 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 
-std::tuple<int, int, int> GpuMaxGridDim(int id);
-
 }  // namespace platform
 }  // namespace paddle

python/paddle/fluid/layers/io.py

Lines changed: 0 additions & 2 deletions

@@ -311,7 +311,6 @@ def _copy_reader_var_(block, var):
     new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
     new_var.desc.set_shapes(var.desc.shapes())
     new_var.desc.set_dtypes(var.desc.dtypes())
-    new_var.desc.set_lod_levels(var.desc.lod_levels())
     new_var.persistable = True
     return new_var
 
@@ -633,7 +632,6 @@ def py_reader(capacity,
               })
 
     startup_var.desc.set_dtypes(dtypes)
-    startup_var.desc.set_lod_levels(lod_levels)
     startup_var.persistable = True
 
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
