@@ -67,7 +67,7 @@ HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
     }
   } else {
     x += offset_i * x_stride;
-    for (size_t j = 0; j < x_dim_i; ++j) {
+    for (size_t j = 0; j < out_dim_i; ++j) {
       StridedMemcpy<T>(x, x_dims, out, out_dims, i + 1, rank, x_stride,
                        out_stride, offsets);
       x += x_stride;
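
Note on this hunk: each recursive call copies one output slice, so the loop has to run out_dim_i times, not x_dim_i times; iterating over the input extent would copy slices outside the crop window and write past the end of out whenever the crop is smaller than the input along that axis. Below is a minimal, self-contained sketch of the same recursive crop copy (the names and signature are illustrative, not the operator's code), showing why the loop count must be the output dimension:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Illustrative sketch only: copy an out_dims-shaped crop starting at
    // `offsets` out of a row-major tensor of shape x_dims.
    template <typename T>
    void CropCopy(const T* x, T*& out, const std::vector<std::size_t>& x_dims,
                  const std::vector<std::size_t>& out_dims,
                  const std::vector<std::size_t>& offsets, std::size_t i) {
      // Row-major strides of the dimensions inside dimension i.
      std::size_t x_stride = 1, out_stride = 1;
      for (std::size_t k = i + 1; k < x_dims.size(); ++k) {
        x_stride *= x_dims[k];
        out_stride *= out_dims[k];
      }
      x += offsets[i] * x_stride;  // skip to the start of the crop window
      if (i == x_dims.size() - 1) {
        // Innermost dimension: copy exactly out_dims[i] contiguous elements.
        for (std::size_t j = 0; j < out_dims[i]; ++j) *out++ = *x++;
        return;
      }
      // One recursive copy per OUTPUT slice; looping x_dims[i] times (the bug
      // fixed in the hunk above) would copy slices beyond the crop and
      // overrun out.
      for (std::size_t j = 0; j < out_dims[i]; ++j) {
        CropCopy(x, out, x_dims, out_dims, offsets, i + 1);
        x += x_stride;
      }
    }

    int main() {
      // Crop the central 2x2 block (offsets {1, 1}) out of a 4x4 matrix 0..15.
      std::vector<int> x(16);
      for (int v = 0; v < 16; ++v) x[v] = v;
      std::vector<int> out(4);
      int* out_ptr = out.data();
      CropCopy(x.data(), out_ptr, {4, 4}, {2, 2}, {1, 1}, 0);
      for (int v : out) std::cout << v << ' ';  // prints: 5 6 9 10
      std::cout << '\n';
    }
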
@@ -86,8 +86,6 @@ struct RandomCropFunctor {
   int rank_;
   int64_t seed_;
 
-  size_t prod_x_dims_;
-  size_t prod_out_dims_;
   size_t prod_batchsize_dims_;
   size_t prod_x_ins_dims_;
   size_t prod_out_ins_dims_;
@@ -118,8 +116,6 @@ struct RandomCropFunctor {
         prod_out_ins_dims_ *= out_dim_i;
       }
     }
-    prod_x_dims_ = prod_batchsize_dims_ * prod_x_ins_dims_;
-    prod_out_dims_ = prod_batchsize_dims_ * prod_out_ins_dims_;
   }
 
   HOSTDEVICE void operator()(size_t ins_idx) {
@@ -146,7 +142,17 @@ template <typename DeviceContext, typename T>
 class RandomCropKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& ctx) const {
-    int64_t seed = *ctx.Input<framework::LoDTensor>("Seed")->data<int64_t>();
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    int64_t seed = 0;
+    if (platform::is_cpu_place(seed_tensor.place())) {
+      seed = *seed_tensor.data<int64_t>();
+    } else {
+      LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+                      "your program";
+      framework::LoDTensor cpu_seed;
+      framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
+      seed = *cpu_seed.data<int64_t>();
+    }
     auto shape = ctx.Attr<std::vector<int>>("shape");
     auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
     auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
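
Note on the last hunk: the old code dereferenced the Seed tensor's data pointer directly, which is only valid when that tensor lives in host memory. The new code checks the tensor's place first; if the seed sits in GPU memory, it logs a warning and does a blocking copy back to the CPU via framework::TensorCopySync before reading the value. A rough sketch of the underlying pattern in plain CUDA runtime C++ (illustrative only, not Paddle's API):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Pretend the seed was produced by an earlier op into GPU memory.
      int64_t host_seed = 42;
      int64_t* dev_seed = nullptr;
      cudaMalloc(&dev_seed, sizeof(int64_t));
      cudaMemcpy(dev_seed, &host_seed, sizeof(int64_t), cudaMemcpyHostToDevice);

      // Reading it on the host: *dev_seed would be invalid, so a synchronous
      // device-to-host copy (the job TensorCopySync to CPUPlace does in the
      // hunk above) is required first. Keeping the seed on the CPU avoids
      // this extra synchronization, hence the warning.
      int64_t seed = 0;
      cudaMemcpy(&seed, dev_seed, sizeof(int64_t), cudaMemcpyDeviceToHost);
      std::printf("seed = %lld\n", static_cast<long long>(seed));

      cudaFree(dev_seed);
      return 0;
    }
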