@@ -68,7 +68,6 @@ BufferedReader::BufferedReader(
68
68
stream_ = platform::NpuStreamResourcePool::Instance ().New (dev_idx);
69
69
}
70
70
#endif
71
- is_same_place_ = false ;
72
71
cpu_buffer_.resize (buffer_size);
73
72
cuda_buffer_.resize (buffer_size);
74
73
npu_buffer_.resize (buffer_size);
@@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) {
116
115
std::vector<void *> cuda_pinned_ptrs;
117
116
cuda_pinned_ptrs.reserve (cpu.size ());
118
117
platform::RecordEvent record_event (" BufferedReader:MemoryCopy" );
119
- // NODE(chenwehiang ): When we use CUDAPinned Memory, we need call
118
+ // NOTE(chenweihang): When we use CUDAPinned Memory, we need to call
120
119
// cudaHostAlloc, that is a CUDA API, calling CUDA API need load
121
120
// cuda lib into device, it will cost hundreds of MB of GPU memory.
122
121
// If we don't set Device here, which will use CUDAPlace(0) default.
@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) {
126
125
if (platform::is_cpu_place (cpu[i].place ())) {
127
126
cuda[i].Resize (cpu[i].dims ());
128
127
cuda[i].set_layout (cpu[i].layout ());
129
- cuda_pinned_ptrs. emplace_back (
130
- cuda[i].mutable_data (cuda_pinned_place, cpu[i].type ())) ;
128
+ cuda_pinned_ptrs[i] =
129
+ cuda[i].mutable_data (cuda_pinned_place, cpu[i].type ());
131
130
auto size =
132
131
cpu[i].numel () * paddle::framework::SizeOfType (cpu[i].type ());
133
132
134
133
memory::Copy (cuda_pinned_place, cuda_pinned_ptrs[i],
135
134
BOOST_GET_CONST (platform::CPUPlace, cpu[i].place ()),
136
135
cpu[i].data <void >(), size);
136
+
137
137
cuda[i].set_lod (cpu[i].lod ());
138
138
} else {
139
- // we set same place flag & use cpu[i] directly
140
- is_same_place_ = true ;
139
+ // Here the cpu[i]'s place may be CUDAPlace, CUDAPinnedPlace, or
140
+ // others, we don't copy the memory of it to CUDAPinnedPlace, but
141
+ // we should share tensor data to cuda[i]
142
+ cuda[i].ShareDataWith (cpu[i]);
141
143
}
142
144
}
143
145
} else {
@@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
296
298
return ;
297
299
}
298
300
299
- if (platform::is_gpu_place (place_) && !is_same_place_ ) {
301
+ if (platform::is_gpu_place (place_)) {
300
302
*out = std::move (cuda_buffer_[i]);
301
- } else if (platform::is_npu_place (place_) && !is_same_place_ ) {
303
+ } else if (platform::is_npu_place (place_)) {
302
304
*out = std::move (npu_buffer_[i]);
303
305
} else {
304
306
*out = std::move (cpu_buffer_[i]);
0 commit comments