@@ -14,16 +14,22 @@ limitations under the License. */

#pragma once
#include <Python.h>
+ #include <algorithm>
+ #include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
+ #include "paddle/fluid/operators/math/concat_and_split.h"
+ #include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"

+ namespace py = pybind11;
+
namespace paddle {
namespace pybind {
namespace details {
@@ -191,6 +197,253 @@ inline void PyCPUTensorSetFromArray(
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}

+ template <typename T, size_t D>
+ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out,
+                    const platform::CPUDeviceContext &ctx,
+                    const std::vector<int> &axes,
+                    const std::vector<int> &starts) {
+   auto &eigen_place = *ctx.eigen_device();
+   auto place = in->place();
+   auto out_dims = out->dims();
+   auto in_dims = in->dims();
+
+   auto offsets = Eigen::array<int, D>();
+   auto extents = Eigen::array<int, D>();
+   for (size_t i = 0; i < D; ++i) {
+     offsets[i] = 0;
+     extents[i] = out_dims[i];
+   }
+   int start;
+   for (size_t i = 0; i < axes.size(); ++i) {
+     start = starts[i];
+     if (start < 0) {
+       start = (start + in_dims[axes[i]]);
+     }
+     start = std::max(start, 0);
+     offsets[axes[i]] = start;
+   }
+   auto in_t =
+       framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+           *in);
+   auto out_t =
+       framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+           *out);
+   out_t.device(eigen_place) = in_t.slice(offsets, extents);
+ }
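+
+ // A minimal usage sketch (tensor names and shapes below are assumed, for
+ // illustration only): slicing rows 1..2 of a 4x5 float tensor with axes={0}
+ // and starts={1}. The output dims give the Eigen extents, the starts give
+ // the offsets.
+ //
+ //   platform::CPUDeviceContext ctx;
+ //   framework::Tensor in, out;
+ //   in.Resize(framework::make_ddim({4, 5}));
+ //   in.mutable_data<float>(platform::CPUPlace());
+ //   out.Resize(framework::make_ddim({2, 5}));
+ //   out.mutable_data<float>(platform::CPUPlace());
+ //   _sliceCompute<float, 2>(&in, &out, ctx, {0}, {1});  // out = in[1:3, :]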
+
+ template <typename T>
+ void _concatCompute(const std::vector<paddle::framework::Tensor> &ins,
+                     paddle::framework::Tensor *out,
+                     const platform::CPUDeviceContext &ctx, int64_t axis) {
+   if (axis == 0 && ins.size() < 10) {
+     size_t output_offset = 0;
+     for (auto &in : ins) {
+       auto in_stride = framework::stride_numel(in.dims());
+       auto out_stride = framework::stride_numel(out->dims());
+       paddle::operators::StridedNumelCopyWithAxis<T>(
+           ctx, axis, out->data<T>() + output_offset, out_stride, in.data<T>(),
+           in_stride, in_stride[axis]);
+       output_offset += in_stride[axis];
+     }
+   } else {
+     paddle::operators::math::ConcatFunctor<platform::CPUDeviceContext, T>
+         concat_functor;
+     concat_functor(ctx, ins, static_cast<int>(axis), out);
+   }
+ }
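+
+ // Usage sketch (illustrative; the tensors and shapes are assumed): with
+ // axis == 0 and fewer than 10 inputs the strided-copy fast path above is
+ // taken, otherwise the generic ConcatFunctor handles the concatenation.
+ //
+ //   platform::CPUDeviceContext ctx;
+ //   std::vector<framework::Tensor> rows;  // e.g. three tensors of shape [1, 5]
+ //   framework::Tensor out;
+ //   out.Resize(framework::make_ddim({3, 5}));
+ //   out.mutable_data<float>(platform::CPUPlace());
+ //   _concatCompute<float>(rows, &out, ctx, /*axis=*/0);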
+
+ inline void _getSliceinfo(const framework::Tensor &self, py::object obj,
+                           const int64_t dim, int64_t *pstart, int64_t *pstop,
+                           int64_t *pstep, int64_t *pslicelength) {
+   auto &start = *pstart;
+   auto &stop = *pstop;
+   auto &step = *pstep;
+   auto &slicelength = *pslicelength;
+   const framework::DDim &srcDDim = self.dims();
+   if (dim < 0 || dim >= srcDDim.size()) {
+     throw py::index_error();
+   }
+   if (py::isinstance<py::slice>(obj)) {
+     size_t lstart, lstop, lstep, lslicelength;
+     py::slice s = static_cast<py::slice>(obj);
+     if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) {
+       throw py::index_error();
+     }
+     start = static_cast<int64_t>(lstart);
+     stop = static_cast<int64_t>(lstop);
+     step = static_cast<int64_t>(lstep);
+     slicelength = static_cast<int64_t>(lslicelength);
+   } else if (py::isinstance<py::int_>(obj)) {
+     start = static_cast<int64_t>(static_cast<py::int_>(obj));
+     if (std::abs(start) >= srcDDim[dim]) {
+       throw py::index_error();
+     }
+     // Normalize a negative index to its non-negative equivalent.
+     start = (start >= 0) ? start : srcDDim[dim] + start;
+     stop = start + 1;
+     step = 1;
+     slicelength = 1;
+   } else {
+     throw py::index_error();
+   }
+ }
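+
+ // Example of the mapping above (values assumed for illustration): for a
+ // tensor t whose dim 0 has size 6, the slice 1:6:2 yields start=1, stop=6,
+ // step=2, slicelength=3, while the integer index -1 yields start=5, stop=6,
+ // step=1, slicelength=1.
+ //
+ //   int64_t start, stop, step, slicelength;
+ //   _getSliceinfo(t, py::slice(1, 6, 2), 0, &start, &stop, &step, &slicelength);
+ //   _getSliceinfo(t, py::int_(-1), 0, &start, &stop, &step, &slicelength);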
+
+ inline framework::Tensor *_getTensor(const framework::Tensor &self,
+                                      const framework::DDim &ddim) {
+   framework::Tensor *output = new framework::Tensor();
+   output->Resize(ddim);
+   auto place = self.place();
+   if (platform::is_cpu_place(place)) {
+     output->mutable_data(boost::get<platform::CPUPlace>(place), self.type());
+ #ifdef PADDLE_WITH_CUDA
+   } else {
+     if (platform::is_cuda_pinned_place(place)) {
+       output->mutable_data(boost::get<platform::CUDAPinnedPlace>(place),
+                            self.type());
+     } else if (platform::is_gpu_place(place)) {
+       output->mutable_data(boost::get<platform::CUDAPlace>(place), self.type());
+     }
+ #endif
+   }
+   return output;
+ }
+
+ template <typename T>
+ void _sliceDapper(const framework::Tensor *in, framework::Tensor *out,
+                   const platform::CPUDeviceContext &ctx,
+                   const std::vector<int> &axes, const std::vector<int> &starts,
+                   int size) {
+   switch (size) {
+     case 1:
+       _sliceCompute<T, 1>(in, out, ctx, axes, starts);
+       break;
+     case 2:
+       _sliceCompute<T, 2>(in, out, ctx, axes, starts);
+       break;
+     case 3:
+       _sliceCompute<T, 3>(in, out, ctx, axes, starts);
+       break;
+     case 4:
+       _sliceCompute<T, 4>(in, out, ctx, axes, starts);
+       break;
+     case 5:
+       _sliceCompute<T, 5>(in, out, ctx, axes, starts);
+       break;
+     case 6:
+       _sliceCompute<T, 6>(in, out, ctx, axes, starts);
+       break;
+     case 7:
+       _sliceCompute<T, 7>(in, out, ctx, axes, starts);
+       break;
+     case 8:
+       _sliceCompute<T, 8>(in, out, ctx, axes, starts);
+       break;
+     case 9:
+       _sliceCompute<T, 9>(in, out, ctx, axes, starts);
+       break;
+     default:
+       PADDLE_THROW("dim size not expected, current is %d", size);
+       break;
+   }
+ }
+
+ template <typename T>
+ inline framework::Tensor *_sliceWrapper(const framework::Tensor &self,
+                                         const platform::CPUDeviceContext &ctx,
+                                         py::object obj, int dim, int64_t start,
+                                         int64_t slicelength) {
+   framework::DDim dstDDim = self.dims();
+   dstDDim[dim] = static_cast<int64_t>(slicelength);
+   std::vector<int> axes({dim});
+   std::vector<int> starts({static_cast<int>(start)});
+   framework::Tensor *output = _getTensor(self, dstDDim);
+   _sliceDapper<T>(&self, output, ctx, axes, starts, dstDDim.size());
+   return output;
+ }
+
+ template <typename T>
+ inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self,
+                                           py::object obj, int dim) {
+   platform::CPUDeviceContext ctx;
+   int64_t start, stop, step, slicelength;
+   _getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength);
+   if (step == 1 || slicelength == 1) {
+     return _sliceWrapper<T>(self, ctx, obj, dim, start, slicelength);
+   } else {
+     std::vector<framework::Tensor> ins;
+     for (auto i = 0; i < slicelength; ++i, start += step) {
+       // Take ownership of each single-step slice so it is freed once copied;
+       // the copy in `ins` shares the underlying allocation.
+       std::unique_ptr<framework::Tensor> slice(
+           _sliceWrapper<T>(self, ctx, obj, dim, start, 1));
+       ins.emplace_back(*slice);
+     }
+
+     // do the concat operation
+     framework::DDim dstDDim = self.dims();
+     dstDDim[dim] = static_cast<int64_t>(slicelength);
+     framework::Tensor *output1 = _getTensor(self, dstDDim);
+     _concatCompute<T>(ins, output1, ctx, dim);
+     return output1;
+   }
+ }
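+
+ // Illustrative behaviour (example values assumed): a stepped slice such as
+ // 0:6:2 on dim 0 is evaluated as three single-row slices (rows 0, 2 and 4)
+ // that are concatenated along that dim, while a contiguous slice (step == 1)
+ // goes straight through _sliceWrapper.
+ //
+ //   framework::Tensor *rows = _sliceAndConcat<float>(t, py::slice(0, 6, 2), 0);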
+
+ inline framework::Tensor *_sliceTensor(const framework::Tensor &self,
+                                        py::object obj, int dim) {
+   auto src_type = self.type();
+   switch (src_type) {
+     case framework::proto::VarType::FP16:
+       return _sliceAndConcat<paddle::platform::float16>(self, obj, dim);
+     case framework::proto::VarType::FP32:
+       return _sliceAndConcat<float>(self, obj, dim);
+     case framework::proto::VarType::FP64:
+       return _sliceAndConcat<double>(self, obj, dim);
+     case framework::proto::VarType::INT32:
+       return _sliceAndConcat<int>(self, obj, dim);
+     case framework::proto::VarType::INT64:
+       return _sliceAndConcat<int64_t>(self, obj, dim);
+     case framework::proto::VarType::BOOL:
+       return _sliceAndConcat<bool>(self, obj, dim);
+     case framework::proto::VarType::INT16:
+       return _sliceAndConcat<int16_t>(self, obj, dim);
+     case framework::proto::VarType::UINT8:
+       return _sliceAndConcat<uint8_t>(self, obj, dim);
+     default:
+       PADDLE_THROW("Not supported type %d", src_type);
+   }
+ }
+
+ inline framework::Tensor *_pySliceTensor(const framework::Tensor &self,
+                                          py::object obj) {
+   if (py::isinstance<py::tuple>(obj)) {
+     py::list l = static_cast<py::list>(obj);
+     std::unique_ptr<framework::Tensor> target;
+     framework::Tensor *src = const_cast<framework::Tensor *>(&self);
+     for (auto i = 0; i < static_cast<int>(l.size()); ++i) {
+       src = _sliceTensor(*src, l[i], i);
+       if (i + 1 == static_cast<int>(l.size())) {
+         return src;
+       } else {
+         target.reset(src);
+       }
+     }
+     return nullptr;
+   } else {
+     return _sliceTensor(self, obj, 0);
+   }
+ }
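+
+ // Sketch of the tuple case (example values assumed): t[1:3, 0] is applied one
+ // dimension at a time, first slicing dim 0 with 1:3, then indexing dim 1 with
+ // 0 on the intermediate result.
+ //
+ //   framework::Tensor *r =
+ //       _pySliceTensor(t, py::make_tuple(py::slice(1, 3, 1), py::int_(0)));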
+
+ inline framework::Tensor *PySliceTensor(const framework::Tensor &self,
+                                         py::object obj) {
+   if (platform::is_gpu_place(self.place())) {
+     std::unique_ptr<framework::Tensor> holder;
+     framework::Tensor src;
+     framework::TensorCopySync(self, platform::CPUPlace(), &src);
+     framework::Tensor *output = _pySliceTensor(src, obj);
+     holder.reset(output);
+     framework::Tensor *dst = _getTensor(*output, output->dims());
+     framework::TensorCopySync(*output, self.place(), dst);
+     return dst;
+   } else {
+     return _pySliceTensor(self, obj);
+   }
+ }
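+
+ // A minimal caller-side sketch (the binding name and registration are
+ // assumed, not shown in this diff): GPU tensors are copied to the CPU,
+ // sliced there, and the result is copied back to the original place; CPU
+ // tensors are sliced directly.
+ //
+ //   .def("_slice", [](const framework::Tensor &self, py::object obj) {
+ //     return PySliceTensor(self, obj);
+ //   }, py::return_value_policy::take_ownership)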
+
#ifdef PADDLE_WITH_CUDA
template <typename T>
void PyCUDATensorSetFromArray(