// Copyright (c) 2017-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <dlfcn.h>
#include <sstream>
#include <cstring>
#include "dali/core/common.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/device_guard.h"
#include "pybind11/pybind11.h"
#include "pybind11/pytypes.h"
#include "pyerrors.h" // NOLINT(build/include)
#if SHM_WRAPPER_ENABLED
#include "dali/core/os/shared_mem.h"
#endif
#include "dali/core/python_util.h"
#include "dali/core/mm/default_resources.h"
#include "dali/operators.h"
#include "dali/kernels/kernel.h"
#include "dali/operators/reader/parser/tf_feature.h"
#include "dali/pipeline/data/copy_to_external.h"
#include "dali/pipeline/data/dltensor.h"
#include "dali/pipeline/data/tensor.h"
#include "dali/pipeline/data/tensor_list.h"
#include "dali/pipeline/init.h"
#include "dali/pipeline/operator/error_reporting.h"
#include "dali/pipeline/operator/op_schema.h"
#include "dali/pipeline/operator/op_spec.h"
#include "dali/pipeline/operator/operator.h"
#include "dali/pipeline/pipeline.h"
#include "dali/pipeline/pipeline_debug.h"
#include "dali/plugin/plugin_manager.h"
#include "dali/python/python3_compat.h"
#include "dali/util/pybind.h"
#include "dali/util/user_stream.h"
namespace dali {
namespace python {
#if (CUDART_VERSION >= 10200 && CUDART_VERSION < 11100)
// add this alignment to work around a patchelf bug/feature which
// changes TLS alignment and breaks DALI interoperability with CUDA RT
alignas(0x1000) thread_local volatile bool __backend_impl_force_tls_align;
void __backend_impl_force_tls_align_fun(void) {
__backend_impl_force_tls_align = 0;
}
#else
void __backend_impl_force_tls_align_fun(void) {}
#endif
using namespace pybind11::literals; // NOLINT
/**
* @brief Override the default __module__ of Tensor classes from nvidia.dali.backend_impl
* with the user-friendly Python module.
* This definition must be provided as the first one for the Tensor classes, so that all following
* definitions can look it up when pybind generates the signatures; otherwise, the annotations
* will contain the backend_impl module path.
*/
static std::string tensor_module_impl(const py::object &object) {
(void)object;
return "nvidia.dali.tensors";
}
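// Extracts the raw pointer held by a ctypes.c_void_p-like Python object by reading its
// `value` attribute; returns nullptr when the attribute is missing or None.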
static void* ctypes_void_ptr(const py::object& object) {
auto ptr_as_int = getattr(object, "value", py::none());
if (ptr_as_int.is_none()) {
return nullptr;
}
void *ptr = PyLong_AsVoidPtr(ptr_as_int.ptr());
return ptr;
}
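// Converts a Python tuple of integers into a TensorShape.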
TensorShape<> shape_from_py(py::tuple tup) {
TensorShape<> shape;
shape.resize(tup.size());
for (size_t i = 0; i < tup.size(); ++i) {
shape[i] = tup[i].cast<int64_t>();
}
return shape;
}
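// Converts a TensorShape into a Python list of extents.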
template <int ndim>
py::list as_py_list(const TensorShape<ndim> &shape) {
py::list ret(shape.size());
for (int i = 0; i < shape.size(); i++) {
ret[i] = shape[i];
}
return ret;
}
template <typename Backend>
py::list py_shape(const Tensor<Backend> &t) {
return as_py_list(t.shape());
}
template <typename Backend>
std::vector<py::tuple> py_shape_list(const TensorList<Backend> &tl) {
std::vector<py::tuple> ret(tl.shape().size());
for (int i = 0; i < tl.shape().size(); ++i) {
ret[i] = py::tuple(as_py_list(tl.tensor_shape(i)));
}
return ret;
}
static string TensorLayoutRepr(const TensorLayout &tl) {
std::stringstream ss;
ss << "nvidia.dali.types.TensorLayout('";
escape_string(ss, tl.c_str());
ss << "')";
return ss.str();
}
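// Builds the array interface dictionary (__array_interface__ for CPU tensors,
// __cuda_array_interface__ for GPU tensors) exposing the tensor's dtype, shape and data pointer.
// Pinned host buffers are synchronized with the host before the pointer is exposed.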
template<typename Backend>
py::dict ArrayInterfaceRepr(Tensor<Backend> &t) {
py::dict d;
py::tuple tup(2);
d["typestr"] = FormatStrFromType(t.type());
// __array_interface__ expects shape to be a tuple
d["shape"] = py::tuple(py_shape<Backend>(t));
// tuple of (raw_data_pointer, if_data_is_read_only)
tup[0] = py::reinterpret_borrow<py::object>(PyLong_FromVoidPtr(t.raw_mutable_data()));
// if we make it readonly, it prevents us from sharing memory with PyTorch tensor
tup[1] = false;
d["data"] = tup;
if constexpr (std::is_same<Backend, GPUBackend>::value) {
// see https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
// this set of attributes is tagged as version 2
d["version"] = 2;
} else {
// see https://docs.scipy.org/doc/numpy/reference/arrays.interface.html
// this set of attributes is tagged as version 3
d["version"] = 3;
if (t.is_pinned()) {
if (auto &event = t.ready_event())
AccessOrder::host().wait(event); // more fine-grained synchronization
else
AccessOrder::host().wait(t.order());
}
}
d["strides"] = py::none();
return d;
}
namespace {
const uint32_t kCPUTensorColor = DomainTimeRange::kBlue1;
const uint32_t kGPUTensorColor = DomainTimeRange::knvGreen;
} // namespace
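// Converts a flat shape to the shape expected by the target type: for a TensorList, the first
// extent is treated as the number of samples and the remaining extents as a uniform sample shape;
// for a Tensor, the shape is passed through unchanged.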
template<typename SrcBackend>
const TensorListShape<> ConvertShape(const TensorShape<> &shape,
TensorList<SrcBackend> *shape_type_placeholder) {
return uniform_list_shape(shape[0], shape.last(shape.size()-1));
}
template<typename SrcBackend>
const TensorShape<> &ConvertShape(const TensorShape<> &shape,
Tensor<SrcBackend> *shape_type_placeholder) {
return shape;
}
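// Verifies that the provided strides describe densely packed (C-contiguous) data of the given
// shape and element size; raises an error for strided (non-contiguous) data.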
template<typename TStrides, typename TShape>
void CheckContiguousTensor(const TStrides &strides, int num_strides,
const TShape &shape, int num_extents, size_t element_size) {
DALI_ENFORCE(num_strides == num_extents,
"There should be exactly as many strides as there are extents in array shape.");
int64_t stride_from_shape = element_size;
int64_t stride_from_shape_collapsed = 1;
int64_t last_non_one_dim = 1;
for (int i = num_strides - 1; i >= 0; i--) {
DALI_ENFORCE(strides[i] == stride_from_shape || strides[i] == stride_from_shape_collapsed,
make_string("Strided data not supported. Dimension ", i, " has stride ", strides[i],
" whereas densely packed data of this shape would have a stride ", stride_from_shape));
stride_from_shape *= shape[i];
// for shapes like [1, 1, 5], leading unit dimensions may not contribute to the stride
if (shape[i] != 1) {
stride_from_shape_collapsed *= last_non_one_dim;
last_non_one_dim = shape[i];
}
}
}
template<typename TStrides, typename TShape>
void CheckContiguousTensor(const TStrides &strides, const TShape &shape, size_t element_size) {
CheckContiguousTensor(strides, dali::size(strides), shape, dali::size(shape), element_size);
}
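// Sets the layout of the tensor or tensor list after checking that the layout length matches
// the number of sample dimensions; an empty/None layout clears it when clear_if_none is set.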
template <typename Backend>
void SetLayout(
Tensor<Backend> &t,
const std::optional<std::string> &layout_str,
bool clear_if_none = true) {
if (layout_str && !layout_str->empty()) {
TensorLayout layout = *layout_str;
if (t.ndim() != layout.ndim()) {
throw py::value_error(make_string(
"A non-empty layout must have the same number of dimensions as the "
"number of dimensions of the Tensor.\n"
"Got: ", layout.ndim(), " (", layout, ")\n",
"Expected: ", t.ndim(), "."));
}
t.SetLayout(layout);
} else if (clear_if_none) {
t.SetLayout({});
}
}
template <typename Backend>
void SetLayout(
TensorList<Backend> &t,
const std::optional<std::string> &layout_str,
bool clear_if_none = true) {
if (layout_str && !layout_str->empty()) {
TensorLayout layout = *layout_str;
if (t.sample_dim() != layout.ndim()) {
throw py::value_error(make_string(
"A non-empty layout must have the same number of dimensions as the "
"number of dimensions of the TensorList.\n"
"Got: ", layout.ndim(), " (", layout, ")\n",
"Expected: ", t.sample_dim(), "."));
}
t.SetLayout(layout);
} else if (clear_if_none) {
t.SetLayout({});
}
}
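// Wraps the memory described by a DLPack capsule in the destination Tensor/TensorList without
// copying. The DLPack managed tensor is kept alive by the deleter of the shared data pointer.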
template<typename SrcBackend, template<typename> class SourceDataType>
void FillTensorFromDlPack(
py::capsule capsule,
SourceDataType<SrcBackend> *batch,
const std::optional<std::string> &layout) {
auto dlm_tensor_ptr = DLMTensorPtrFromCapsule(capsule);
const auto &dl_tensor = dlm_tensor_ptr->dl_tensor;
DALI_ENFORCE((std::is_same<SrcBackend, GPUBackend>::value &&
dl_tensor.device.device_type == kDLCUDA) ||
(std::is_same<SrcBackend, CPUBackend>::value &&
dl_tensor.device.device_type == kDLCPU),
"DLPack device type doesn't match Tensor type");
const TypeInfo &dali_type = TypeTable::GetTypeInfo(ToDALIType(dl_tensor.dtype));
TensorShape<> shape;
shape.resize(dl_tensor.ndim);
for (ssize_t i = 0; i < dl_tensor.ndim; ++i) {
shape[i] = dl_tensor.shape[i];
}
if (dl_tensor.strides)
CheckContiguousTensor(dl_tensor.strides, dl_tensor.ndim, dl_tensor.shape, dl_tensor.ndim, 1);
size_t bytes = volume(shape) * dali_type.size();
const auto &typed_shape = ConvertShape(shape, batch);
bool is_pinned = dl_tensor.device.device_type == kDLCUDAHost;
int device_id = CPU_ONLY_DEVICE_ID;
// according to the docs kDLCUDAHost = kDLCPU | kDLCUDA, so test it as the first option
if (dl_tensor.device.device_type == kDLCUDAHost) {
device_id = CPU_ONLY_DEVICE_ID;
} else if (dl_tensor.device.device_type == kDLCPU) {
device_id = CPU_ONLY_DEVICE_ID;
} else if (dl_tensor.device.device_type == kDLCUDA) {
device_id = dl_tensor.device.device_id;
} else {
DALI_FAIL(make_string("Not supported DLPack device type: ", dl_tensor.device.device_type, "."));
}
// The deleter is an empty lambda that just captures the dlm_tensor_ptr unique_ptr, so the
// DLPack tensor is released when the shared_ptr is destroyed.
batch->ShareData(shared_ptr<void>(dl_tensor.data,
[dlm_tensor_ptr = std::move(dlm_tensor_ptr)](void*) {}),
bytes, is_pinned, typed_shape, dali_type.id(), device_id);
SetLayout(*batch, layout);
}
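// Wraps memory exposed through __cuda_array_interface__ in the destination batch without
// copying. Validates the mandatory interface fields, honors the optional v3 `stream` entry,
// and keeps the producing Python object alive via the deleter of the shared data pointer.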
template <typename TensorType>
void FillTensorFromCudaArray(const py::object &object,
TensorType *batch,
int device_id,
const std::optional<std::string> &layout) {
auto cu_a_interface_val = getattr(object, "__cuda_array_interface__", py::none());
if (cu_a_interface_val.is_none()) {
DALI_FAIL("Provided object doesn't support cuda array interface protocol.")
}
py::dict cu_a_interface = py::cast<py::dict>(cu_a_interface_val);
DALI_ENFORCE(cu_a_interface.contains("typestr") &&
// see detail::PyUnicode_Check_Permissive implementation
(PyUnicode_Check(cu_a_interface["typestr"].ptr()) ||
PYBIND11_BYTES_CHECK(cu_a_interface["typestr"].ptr())) &&
cu_a_interface.contains("shape") &&
PyTuple_Check(cu_a_interface["shape"].ptr()) &&
cu_a_interface.contains("data") &&
PyTuple_Check(cu_a_interface["data"].ptr()) &&
cu_a_interface["data"].cast<py::tuple>().size() >= 2 &&
cu_a_interface.contains("version"),
"Provided object doesn't have required cuda array interface "
"protocol fields of necessary type.");
DALI_ENFORCE(!cu_a_interface.contains("mask") || cu_a_interface["mask"].is_none(),
"Masked tensors are not supported");
// Create the Tensor and wrap the data
TensorShape<> shape = shape_from_py(cu_a_interface["shape"].cast<py::tuple>());
std::string typestr = cu_a_interface["typestr"].cast<py::str>();
const TypeInfo &type = TypeFromFormatStr(typestr);
size_t bytes = volume(shape) * type.size();
if (cu_a_interface.contains("strides") && !cu_a_interface["strides"].is_none()) {
TensorShape<> strides = shape_from_py(cu_a_interface["strides"].cast<py::tuple>());
CheckContiguousTensor(strides, shape, type.size());
}
const auto &typed_shape = ConvertShape(shape, batch);
auto *ptr = PyLong_AsVoidPtr(cu_a_interface["data"].cast<py::tuple>()[0].ptr());
// __cuda_array_interface__ requires a concrete device, so a negative device_id is resolved
// to the current device
if (device_id < 0) {
CUDA_CALL(cudaGetDevice(&device_id));
}
batch->Reset();
if (cu_a_interface.contains("stream")) {
const auto &stream_obj = cu_a_interface["stream"];
if (!stream_obj.is_none()) {
auto stream_long_value = cu_a_interface["stream"].cast<int64_t>();
auto stream_value = PyLong_AsVoidPtr(cu_a_interface["stream"].ptr());
DALI_ENFORCE(stream_value != 0, make_string("Provided stream is not a valid CUDA stream ",
"based on CUDA Array Interface v3. `0` value is ambiguous and disallowed"));
if (stream_long_value == 1) stream_value = 0;
if (stream_long_value == 2) stream_value = CU_STREAM_PER_THREAD;
auto order = AccessOrder(cudaStream_t(stream_value));
batch->set_order(order);
}
}
// Keep a copy of the input object ref in the deleter, so its refcount is increased
// while this shared_ptr is alive (and the data should be kept alive)
batch->ShareData(shared_ptr<void>(ptr, [obj_ref = object](void *) mutable { // NOLINT
// acquire GIL ...
py::gil_scoped_acquire aqr;
{
// now move out the object stored in the closure to a local variable...
auto tmp = std::move(obj_ref);
(void)tmp;
/// ...and let it go out of scope while GIL is held
}
}),
bytes, false, typed_shape, type.id(), device_id);
SetLayout(*batch, layout);
}
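// Thin wrappers that reinterpret the element type of a Tensor / TensorList in place;
// the element size must not change.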
template <typename Backend>
void ReinterpretTensor(Tensor<Backend> &t, DALIDataType new_type) {
t.Reinterpret(new_type);
}
template <typename Backend>
void ReinterpretTensorList(TensorList<Backend> &tl, DALIDataType new_type) {
tl.Reinterpret(new_type);
}
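// Registers the Python bindings for TensorLayout, including string-style comparison operators.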
void ExposeTensorLayout(py::module &m) {
py::class_<TensorLayout> tl(m, "TensorLayout");
tl.def(py::init([](const std::string &s) {
return new TensorLayout(s);
}))
.def("__str__", &TensorLayout::str)
.def("__repr__", TensorLayoutRepr)
.def("__len__", &TensorLayout::ndim);
#define DEFINE_LAYOUT_CMP(name, expr)\
tl.def("__" #name "__", [](const TensorLayout &self, const TensorLayout *other) {\
return expr;\
})\
.def("__" #name "__", [](const TensorLayout &self, const string *other) {\
return expr;\
})
DEFINE_LAYOUT_CMP(eq, other && self == *other);
DEFINE_LAYOUT_CMP(ne, !other || self != *other); // null is not equal to non-null
DEFINE_LAYOUT_CMP(lt, !other || self < *other); // null precedes non-null
DEFINE_LAYOUT_CMP(gt, other && self > *other);
DEFINE_LAYOUT_CMP(le, !other || self <= *other);
DEFINE_LAYOUT_CMP(ge, other && self >= *other);
#undef DEFINE_LAYOUT_CMP
}
// Placeholder enum for defining __call__ on dtype member of Tensor (to be deprecated).
enum DALIDataTypePlaceholder {};
/**
* @brief Extracts attribute named `attr_name` from the python object.
*
* @param object python object.
* @param attr_name name of the requested attribute.
*/
auto ExtractPythonAttr(py::object &&object, const char *attr_name) {
return object.attr(attr_name);
}
/**
* @brief Extracts nested attribute from the python object.
*
* @param object python object.
* @param attr_name name of the next requested attribute.
* @param rest rest of the requested attributes names.
*/
template <typename... Args>
auto ExtractPythonAttr(py::object &&object, const char *attr_name, Args... rest) {
return ExtractPythonAttr(object.attr(attr_name), rest...);
}
/**
* @brief Extracts nested attribute from imported python module.
*
* @param python_module name of the python module.
* @param attr_name outer most attribute name.
* @param attr_names rest of the attribute names.
*/
template <typename... Args>
auto FromPythonTrampoline(const char *python_module, const char *attr_name, Args... attr_names) {
return ExtractPythonAttr(py::module::import(python_module).attr(attr_name), attr_names...);
}
/**
* @brief Extracts attribute from imported python module.
*
* @param python_module name of the python module.
* @param attr_name name of the attribute.
*/
auto FromPythonTrampoline(const char *python_module, const char *attr_name) {
return py::module::import(python_module).attr(attr_name);
}
/**
* @brief Copies the contents of the source DALI batch to an external buffer
*
* The function schedules a copy of the contents of src to the target destination buffer.
* The copy will be scheduled on the provided `cuda_stream` or, if left out, on an internal DALI
* stream.
* If a non-blocking copy is requested, the function will synchronize the source buffer's
* associated access order with the provided stream; otherwise, the function will wait until the
* copy completes.
*
* @tparam SourceObject a data store on GPUBackend (Tensor, TensorList)
* @param src Source batch
* @param dst_ptr Destination pointer, wrapped in a C void_ptr Python type
* @param cuda_stream CUDA stream, wrapped in a C void_ptr type
* @param non_blocking whether the function should wait on host for the copy to complete
* @param use_copy_kernel if true, the copy will be done using a kernel instead of cudaMemcpyAsync
*/
template <typename SourceObject>
void CopyToExternalImplGPU(SourceObject &src,
py::object dst_ptr, py::object cuda_stream,
bool non_blocking, bool use_copy_kernel) {
CUDAStreamLease lease;
AccessOrder copy_order;
AccessOrder wait_order = non_blocking ? src.order() : AccessOrder::host();
int device = src.device_id();
if (!cuda_stream.is_none()) {
cudaStream_t stream = static_cast<cudaStream_t>(ctypes_void_ptr(cuda_stream));
copy_order = AccessOrder(stream, device);
} else {
lease = CUDAStreamPool::instance().Get(device);
copy_order = AccessOrder(lease, device);
}
void *ptr = ctypes_void_ptr(dst_ptr);
CopyToExternal<mm::memory_kind::device>(ptr, std::nullopt, src, copy_order, use_copy_kernel);
wait_order.wait(copy_order);
}
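// Returns the requested metadata property of a tensor ("layout" or "source_info") as a Python
// object, or None when the property is empty or not recognized.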
template <typename Backend>
py::object GetTensorProperty(const Tensor<Backend> &tensor, std::string name) {
if (name == "layout") {
TensorLayout layout = tensor.GetLayout();
if (layout.empty())
return py::none();
else
return py::str(layout.c_str());
} else if (name == "source_info") {
auto &&srcinfo = tensor.GetSourceInfo();
if (srcinfo.empty())
return py::none();
else
return py::str(srcinfo);
} else {
// TODO(michalz): Make TensorMeta more flexible and have some dictionary
return py::none();
}
}
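// Returns the DLPack device descriptor for a tensor: kDLCUDA with the device id for GPU tensors,
// kDLCUDAHost for pinned host memory and kDLCPU for regular host memory.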
template <typename Backend>
DLDevice GetDLDevice(const Tensor<Backend> &tensor) {
if constexpr (std::is_same_v<Backend, GPUBackend>)
return { kDLCUDA, tensor.device_id() };
else
return { tensor.is_pinned() ? kDLCUDAHost : kDLCPU };
}
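// Converts a DALI tensor to a DLPack managed tensor. Validates that the requested DLPack device
// matches where the data resides and, if needed, synchronizes the consumer stream (or the host)
// with the tensor's ready event or associated access order.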
template <typename Backend>
DLMTensorPtr ToDLMTensor(Tensor<Backend> &tensor,
std::optional<intptr_t> stream_handle_value,
std::optional<std::pair<DLDeviceType, int>> dl_device) {
DLDevice dev;
if (dl_device.has_value())
dev = { dl_device->first, dl_device->second };
else
dev = GetDLDevice(tensor);
if (dev.device_type == kDLCUDA) {
AccessOrder target_order = cudaStreamLegacy;
if (stream_handle_value.has_value()) {
if (*stream_handle_value == -1) {
target_order = AccessOrder{};
} else {
cudaStream_t stream = cudaStream_t(*stream_handle_value);
if (stream == 0)
throw std::invalid_argument("Stream 0 is explicitly forbidden by DLPack protocol");
target_order = AccessOrder(stream, dev.device_id);
}
}
if constexpr (std::is_same_v<Backend, CPUBackend>) {
throw std::runtime_error(
"The tensor is in CPU memory and a CUDA DLPack tensor was requested");
}
if (dev.device_id != tensor.device_id())
throw std::runtime_error(make_string("Requested a DLPack tensor for GPU_", dev.device_id,
"while the tensor resides in GPU_", tensor.device_id(), " memory."));
if (target_order) {
if (tensor.ready_event()) {
target_order.wait(tensor.ready_event());
} else {
target_order.wait(tensor.order());
}
}
return GetSharedDLTensor(tensor);
} else if (dev.device_type == kDLCPU || dev.device_type == kDLCUDAHost) {
if constexpr (std::is_same_v<Backend, GPUBackend>) {
throw std::runtime_error(
"The tensor is in CUDA GPU memory and a CPU DLPack tensor was requested");
}
if (dev.device_type == kDLCUDAHost && !tensor.is_pinned())
throw std::runtime_error(
"A CUDA host (pinned) DLPack was requested, but the tensor buffer is not pinned.");
if (tensor.is_pinned() && tensor.ready_event()) {
// DLPack doesn't support stream-ordered CUDA host tensors
AccessOrder::host().wait(tensor.ready_event());
}
DLMTensorPtr dlm_tensor = GetSharedDLTensor(tensor);
// Set the device type to the desired one - if the original tensor was pinned, we can
// downgrade it to regular host memory.
dlm_tensor->dl_tensor.device = dev;
return dlm_tensor;
} else {
throw std::runtime_error(make_string(
"An unsupported DLPack device was requested: ", static_cast<int>(dev.device_type)));
}
}
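// Wraps the DLPack tensor produced by ToDLMTensor in a Python capsule; the GIL is released
// while the conversion (and any required synchronization) takes place.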
template <typename Backend>
py::capsule ToDLPack(Tensor<Backend> &tensor,
std::optional<intptr_t> stream,
std::optional<std::pair<DLDeviceType, int>> dl_device) {
return DLTensorToCapsule([&]() {
py::gil_scoped_release interpreter_unlock{};
return ToDLMTensor(tensor, stream, dl_device);
}());
}
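// Builds an AccessOrder from a Python stream object. Supports objects implementing the
// __cuda_stream__ protocol, ctypes-like objects with a `value` attribute, and plain integers;
// None maps to host order.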
AccessOrder AccessOrderFromPythonStreamObj(const py::object &cuda_stream) {
AccessOrder order;
if (!cuda_stream.is_none()) {
auto cuda_stream_interface = getattr(cuda_stream, "__cuda_stream__", py::none());
if (!cuda_stream_interface.is_none()) {
auto [version, stream_ptr] = cuda_stream_interface().cast<std::tuple<int, uintptr_t>>();
cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
order = AccessOrder(stream);
} else if (py::hasattr(cuda_stream, "value")) {
cudaStream_t stream = static_cast<cudaStream_t>(ctypes_void_ptr(cuda_stream));
order = AccessOrder(stream);
} else if (py::isinstance<py::int_>(cuda_stream)) {
cudaStream_t stream = reinterpret_cast<cudaStream_t>(py::cast<uintptr_t>(cuda_stream));
order = AccessOrder(stream);
}
} else {
order = AccessOrder::host();
}
return order;
}
/**
* Pipeline output descriptor.
*/
using OutputDesc = std::tuple<std::string /* name */,
std::string /* device */,
DALIDataType /* dtype */,
int /* ndim */,
std::string /* layout */>;
void ExposeTensor(py::module &m) {
m.def("CheckDLPackCapsule",
[](py::object &p) {
py::list list;
if (PyCapsule_CheckExact(p.ptr())) {
py::capsule capsule = py::reinterpret_borrow<py::capsule>(p);
// do not consume capsule
auto dlm_tensor_ptr = DLMTensorRawPtrFromCapsule(capsule, false);
const auto &dl_tensor = dlm_tensor_ptr->dl_tensor;
list.append(dl_tensor.device.device_type == kDLCUDA ||
dl_tensor.device.device_type == kDLCPU);
list.append(dl_tensor.device.device_type == kDLCUDA);
} else {
list.append(false);
list.append(false);
}
return py::cast<py::tuple>(list);
},
"ptr"_a,
R"code(
Check if the provided Python object represents a valid DLPack capsule.
It returns a tuple of two boolean values: one indicating whether this is a valid DLPack object,
and the other indicating whether the data resides on the GPU.
p : python object
Python object to be checked
)code");
auto tensor_cpu_binding = py::class_<Tensor<CPUBackend>>(m, "TensorCPU", py::buffer_protocol())
.def_property_readonly_static("__module__", tensor_module_impl)
.def(py::init([](py::capsule &capsule, std::optional<std::string> layout = {}) {
DomainTimeRange range("TensorCPU::init", kCPUTensorColor);
auto t = std::make_unique<Tensor<CPUBackend>>();
FillTensorFromDlPack(capsule, t.get(), layout);
return t.release();
}),
"object"_a,
"layout"_a = py::none(),
R"code(
Wrap a DLPack Tensor residing in the CPU memory.
object : DLPack object
Python DLPack object
layout : str
Layout of the data
)code")
.def(
"__dlpack_device__", [](const Tensor<CPUBackend> &tensor) {
auto dev = GetDLDevice(tensor);
return std::make_tuple(dev.device_type, dev.device_id);
},
R"code(
Returns device type and device ID in DLPack format.
)code")
.def(
"__dlpack__", ToDLPack<CPUBackend>,
"stream"_a = py::none(),
"dl_device"_a = py::none(),
R"code(
Exposes the tensor as a DLPack capsule.
Note:
When NOT using the default execution model (i.e., when ``exec_dynamic=False`` or other
parameters are incompatible with this execution mode), the pipeline outputs may be reused
and overwritten by DALI after ``release_outputs`` has been called. Make sure that the
default execution model is enabled if you want to keep the outputs indefinitely.
stream : int, None
The CUDA stream that the caller is going to use to access the buffer.
A synchronization event might be inserted, if necessary, into that stream.
Special values:
* ``None`` - any stream; wait on host
* ``-1`` - do not synchronize at all
* ``1`` - legacy default stream
* ``2`` - legacy per-thread stream
* ``>2`` - a CUDA stream handle converted to an integer
* ``0`` - forbidden value
)code")
.def_buffer([](Tensor<CPUBackend> &t) -> py::buffer_info {
DALI_ENFORCE(IsValidType(t.type()), "Cannot produce "
"buffer info for tensor w/ invalid type.");
std::vector<ssize_t> shape(t.ndim()), stride(t.ndim());
size_t dim_prod = 1;
for (int i = 0; i < t.ndim(); ++i) {
shape[i] = t.shape()[i];
// We iterate over stride backwards
stride[(t.ndim()-1) - i] = t.type_info().size()*dim_prod;
dim_prod *= t.shape()[(t.ndim()-1) - i];
}
if (t.is_pinned()) {
if (auto &event = t.ready_event())
AccessOrder::host().wait(event); // more fine-grained synchronization
else
AccessOrder::host().wait(t.order());
}
return py::buffer_info(
t.raw_mutable_data(),
t.type_info().size(),
FormatStrFromType(t.type()),
t.ndim(), shape, stride);
})
.def(py::init([](py::buffer b, std::optional<std::string> layout = {}, bool is_pinned = false) {
// We need to verify that the input data is C-contiguous
// and of a type that we can work with in the backend
__backend_impl_force_tls_align_fun();
py::buffer_info info = b.request();
std::vector<Index> i_shape;
for (auto &dim : info.shape) {
i_shape.push_back(dim);
}
size_t bytes = volume(i_shape) * info.itemsize;
// Validate the stride
CheckContiguousTensor(info.strides, info.shape, info.itemsize);
// TODO(klecki): Extend the constructor with stream and device_id
// Assume that we cannot use pinned memory in CPU_ONLY mode
int device_id = CPU_ONLY_DEVICE_ID;
if (is_pinned) {
CUDA_CALL(cudaGetDevice(&device_id));
}
// Create the Tensor and wrap the data
auto t = std::make_unique<Tensor<CPUBackend>>();
const TypeInfo &type = TypeFromFormatStr(info.format);
// Keep a copy of the input buffer ref in the deleter, so its refcount is increased
// while this shared_ptr is alive (and the data should be kept alive)
// Use dynamically allocated memory so we can call deleter inside py::gil_scoped_acquire
// scope
py::buffer *buf_tmp = new py::buffer(b);
t->ShareData(shared_ptr<void>(info.ptr, [buf_ref = buf_tmp](void *) {
py::gil_scoped_acquire aqr;
delete buf_ref;
}),
bytes, is_pinned, i_shape, type.id(), device_id);
SetLayout(*t, layout);
return t.release();
}),
"b"_a,
"layout"_a = py::none(),
"is_pinned"_a = false,
R"code(
Wrap a Tensor residing in the CPU memory.
b : object
the buffer to wrap into the TensorCPU object
layout : str
Layout of the data
is_pinned : bool
If provided memory is page-locked (pinned)
)code")
.def("shape", &py_shape<CPUBackend>,
R"code(
Shape of the tensor.
)code")
.def("ndim", &Tensor<CPUBackend>::ndim,
R"code(
Number of dimensions of the tensor.
)code")
.def("squeeze",
[](Tensor<CPUBackend> &t, py::object dim_arg) -> bool {
if (!dim_arg.is_none()) {
int dim = dim_arg.cast<int>();
return t.Squeeze(dim);
}
return t.Squeeze();
},
"dim"_a = py::none(),
R"code(
Removes single-dimensional entries from the shape of the Tensor. Returns ``True``
if the shape changed, or ``False`` if it remained unchanged.
dim : int
If specified, it represents the axis of a single dimension to be squeezed.
)code")
.def("layout", [](Tensor<CPUBackend> &t) {
return t.GetLayout().str();
})
.def("set_layout", [](Tensor<CPUBackend> &t, const std::optional<std::string> &layout) {
SetLayout(t, layout);
})
.def("source_info", &Tensor<CPUBackend>::GetSourceInfo,
R"(Gets a string descrbing the source of the data in the tensor, e.g. a name of the file
from which the data was loaded.)")
.def("get_property", GetTensorProperty<CPUBackend>)
.def("_as_gpu", [](Tensor<CPUBackend> &t) -> Tensor<GPUBackend>* {
auto ret = std::make_unique<Tensor<GPUBackend>>();
int dev = -1;
CUDA_CALL(cudaGetDevice(&dev));
ret->set_device_id(dev);
UserStream *us = UserStream::Get();
cudaStream_t s = us->GetStream(*ret);
ret->Copy(t, s);
us->Wait(*ret);
return ret.release();
},
R"code(
Returns a `TensorGPU` object that is a copy of this `TensorCPU`.
)code",
py::return_value_policy::take_ownership)
.def("as_cpu", [](Tensor<CPUBackend> &t) -> Tensor<CPUBackend>& {
return t;
},
R"code(Passthrough, since the object is already an instance of `TensorCPU`.)code",
py::return_value_policy::reference_internal)
.def("_set_stream", [](Tensor<CPUBackend> &t, py::object stream) {
t.set_order(AccessOrderFromPythonStreamObj(stream));
})
.def("_make_copy", [](const Tensor<CPUBackend> &t) {
auto dst = std::make_unique<Tensor<CPUBackend>>();
dst->set_device_id(t.device_id());
dst->set_order(t.order());
dst->set_pinned(t.is_pinned());
dst->Copy(t);
return dst;
},
py::return_value_policy::take_ownership)
.def("copy_to_external",
[](Tensor<CPUBackend> &t, py::object p) {
CopyToExternal<mm::memory_kind::host>(
ctypes_void_ptr(p), std::nullopt, t, AccessOrder::host(), false);
},
"ptr"_a,
R"code(
Copy to external pointer in the CPU memory.
ptr : ctypes.c_void_p
Destination of the copy.
)code")
.def("data_ptr", [](Tensor<CPUBackend> &t) {
return py::reinterpret_borrow<py::object>(PyLong_FromVoidPtr(t.raw_mutable_data()));
},
R"code(
Returns the address of the first element of tensor.
)code")
.def("reinterpret", ReinterpretTensor<CPUBackend>,
"new_type"_a,
R"code(
Reinterpret the contents of the tensor as a new type. The element size must not change.
)code")
.def("__str__", [](Tensor<CPUBackend> &t) {
return FromPythonTrampoline("nvidia.dali.tensors", "_tensor_to_string")(t);
})
.def("__repr__", [](Tensor<CPUBackend> &t) {
return FromPythonTrampoline("nvidia.dali.tensors", "_tensor_to_string")(t, false);
})
.def_property("__array_interface__", &ArrayInterfaceRepr<CPUBackend>, nullptr,
R"code(
Returns Array Interface representation of TensorCPU.
)code")
.def_property_readonly("dtype", [](Tensor<CPUBackend> &t) {
return static_cast<DALIDataTypePlaceholder>(t.type());
},
R"code(
Data type of the TensorCPU's elements.
:type: DALIDataType
)code");
tensor_cpu_binding.doc() = R"code(
Class representing a Tensor residing in host memory. It can be used to access individual
samples of a :class:`TensorListCPU` or used to wrap CPU memory that is intended
to be passed as an input to DALI.
It is compatible with `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_,
`NumPy Array Interface <https://numpy.org/doc/stable/reference/arrays.interface.html>`_
and `DLPack <https://github.com/dmlc/dlpack>`_.)code";
py::implicitly_convertible<py::buffer, Tensor<CPUBackend>>();
py::implicitly_convertible<py::capsule&, Tensor<CPUBackend>>();
auto tensor_gpu_binding = py::class_<Tensor<GPUBackend>>(m, "TensorGPU")
.def_property_readonly_static("__module__", tensor_module_impl)
.def(py::init([](
py::capsule &capsule,
std::optional<std::string> layout = {},
py::object stream = py::none()) {
DomainTimeRange range("TensorGPU::init from capsule", kGPUTensorColor);
auto t = std::make_unique<Tensor<GPUBackend>>();
FillTensorFromDlPack(capsule, t.get(), layout);
if (!stream.is_none()) // use a separately provided stream - there's none in the capsule
t->set_order(AccessOrderFromPythonStreamObj(stream));
return t.release();
}),
"object"_a,
"layout"_a = py::none(),
"stream"_a = py::none(),
R"code(
Wrap a DLPack Tensor residing in the GPU memory.
object : DLPack object
Python DLPack object
layout : str
Layout of the data
stream : dali.Stream, int, ctypes_void_ptr, None
Stream to associate the tensor with
)code")
.def(
"device_id", &Tensor<GPUBackend>::device_id)
.def(
"__dlpack_device__", [](const Tensor<GPUBackend> &tensor) {
auto dev = GetDLDevice(tensor);
return std::make_tuple(dev.device_type, dev.device_id);
},
R"code(
Returns device type and device ID in DLPack format.
)code")
.def(
"__dlpack__", ToDLPack<GPUBackend>,
"stream"_a = py::none(),
"dl_device"_a = py::none(),
R"code(
Exposes the tensor as a DLPack capsule.
Note:
When NOT using the default execution model (i.e., when ``exec_dynamic=False`` or other
parameters are incompatible with this execution mode), the pipeline outputs may be reused
and overwritten by DALI after ``release_outputs`` has been called. Make sure that the
default execution model is enabled if you want to keep the outputs indefinitely.
stream : int, None
The CUDA stream that the caller is going to use to access the buffer.
A synchronization event might be inserted, if necessary, into that stream.
Special values:
* ``None`` - any stream; wait on host
* ``-1`` - do not synchronize at all
* ``1`` - legacy default stream
* ``2`` - legacy per-thread stream
* ``>2`` - a CUDA stream handle converted to an integer
* ``0`` - forbidden value
)code")
.def(py::init([](const py::object &object,
const std::optional<std::string> &layout = {},
int device_id = -1) {
DomainTimeRange range("TensorGPU::init from CUDA array", kGPUTensorColor);
auto t = std::make_unique<Tensor<GPUBackend>>();
FillTensorFromCudaArray(object, t.get(), device_id, layout);
return t.release();
}),
"object"_a,
"layout"_a = py::none(),
"device_id"_a = -1,
R"code(
Wrap a Tensor residing in the GPU memory that implements CUDA Array Interface.
object : object
Python object that implements CUDA Array Interface
layout : str
Layout of the data
device_id: int
Device where this tensor resides. If not provided, the current device is used.
)code")
.def("shape", &py_shape<GPUBackend>,
R"code(
Shape of the tensor.
)code")
.def("ndim", &Tensor<GPUBackend>::ndim,
R"code(
Number of dimensions of the tensor.
)code")
.def("layout", [](Tensor<GPUBackend> &t) {
return t.GetLayout().str();
})
.def("set_layout", [](Tensor<GPUBackend> &t, const std::optional<std::string> &layout) {
SetLayout(t, layout);
})
.def("source_info", &Tensor<GPUBackend>::GetSourceInfo,
R"(Gets a string descrbing the source of the data in the tensor, e.g. a name of the file
from which the data was loaded.)")
.def("get_property", GetTensorProperty<GPUBackend>)
.def("as_cpu", [](Tensor<GPUBackend> &t) -> Tensor<CPUBackend>* {
DeviceGuard g(t.device_id());