Skip to content

Commit ace64bc

Browse files
authored
Merge pull request #151 from droccom/rma-assign
[#144] RMA-based shad::transform Fixes #144 Fixes #151
2 parents 12d64ed + cdbc084 commit ace64bc

File tree

2 files changed

+193
-36
lines changed

2 files changed

+193
-36
lines changed

include/shad/core/impl/modifyng_sequence_ops.h

Lines changed: 161 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#define INCLUDE_SHAD_CORE_IMPL_MODIFYING_SEQUENCE_OPS_H
2727

2828
#include <algorithm>
29+
#include <cstring>
2930
#include <functional>
3031
#include <iterator>
3132
#include <tuple>
@@ -84,12 +85,159 @@ void fill(distributed_sequential_tag&& policy, ForwardIt first, ForwardIt last,
8485
value);
8586
}
8687

88+
namespace transform_impl {
89+
/// @brief Argument record handed, as a raw byte buffer, to the locality that
/// owns an output block: a fixed-capacity staging area of already-transformed
/// values together with the position at which they have to be written.
///
/// @tparam ForwardIt Output iterator type; it must expose a nested
/// `value_type`.
template <typename ForwardIt>
struct gen_args_t {
  /// Byte budget of the staging area (2 KiB).
  static constexpr size_t buf_bytes = size_t{2} << 10;

  /// Number of values the staging area can hold.  Clamped to at least one so
  /// that a value type larger than the budget does not produce a zero-length
  /// array (and, in the code filling the buffer, a chunk size of zero).
  static constexpr size_t buf_size =
      sizeof(typename ForwardIt::value_type) > buf_bytes
          ? size_t{1}
          : buf_bytes / sizeof(typename ForwardIt::value_type);

  /// Staging area; only the first `size` entries are meaningful.
  typename ForwardIt::value_type buf[buf_size];
  /// Output position at which `buf[0]` has to be written.
  ForwardIt w_first;
  /// Number of valid entries in `buf`.
  size_t size;
};
97+
98+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
99+
ForwardIt2 block_contiguous_kernel(rt::Locality l, ForwardIt1 first,
100+
ForwardIt1 last, ForwardIt2 d_first,
101+
UnaryOperation op) {
102+
using itr_traits1 = std::iterator_traits<ForwardIt1>;
103+
using itr_traits2 = distributed_iterator_traits<ForwardIt2>;
104+
using args_t = gen_args_t<ForwardIt2>;
105+
auto size = std::distance(first, last);
106+
auto d_last = d_first;
107+
std::advance(d_last, size);
108+
109+
// local assign
110+
if (rt::thisLocality() == l) {
111+
auto local_d_range = itr_traits2::local_range(d_first, d_last);
112+
auto loc_res = std::transform(first, last, local_d_range.begin(), op);
113+
return itr_traits2::iterator_from_local(d_first, d_last, loc_res - 1) + 1;
114+
}
115+
116+
// remote assign
117+
std::shared_ptr<uint8_t> args_buf(new uint8_t[sizeof(args_t)],
118+
std::default_delete<uint8_t[]>());
119+
auto typed_args_buf = reinterpret_cast<args_t*>(args_buf.get());
120+
auto block_last = first;
121+
rt::Handle h;
122+
while (first != last) {
123+
typed_args_buf->w_first = d_first;
124+
typed_args_buf->size =
125+
std::min(args_t::buf_size, (size_t)std::distance(first, last));
126+
std::advance(block_last, typed_args_buf->size);
127+
std::transform(first, block_last, typed_args_buf->buf, op);
128+
rt::asyncExecuteAt(
129+
h, l,
130+
[](rt::Handle&, const uint8_t* args_buf, const uint32_t) {
131+
const args_t& args = *reinterpret_cast<const args_t*>(args_buf);
132+
using val_t = typename ForwardIt2::value_type;
133+
ForwardIt2 w_last = args.w_first;
134+
std::advance(w_last, args.size);
135+
auto w_range = itr_traits2::local_range(args.w_first, w_last);
136+
std::memcpy(w_range.begin(), args.buf, sizeof(val_t) * args.size);
137+
},
138+
args_buf, sizeof(args_t));
139+
std::advance(first, typed_args_buf->size);
140+
std::advance(d_first, typed_args_buf->size);
141+
}
142+
rt::waitForCompletion(h);
143+
return d_last; // todo double check
144+
}
145+
146+
// distributed-sequential kernel for non-block-contiguous output-iterators
147+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
148+
void dseq_kernel(std::false_type, ForwardIt1 first, ForwardIt1 last,
149+
ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
150+
using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
151+
auto local_range = itr_traits1::local_range(first, last);
152+
auto begin = local_range.begin();
153+
auto end = local_range.end();
154+
*res_ptr = std::transform(begin, end, d_first, op);
155+
}
156+
157+
// distributed-sequential kernel for block-contiguous output-iterators
158+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
159+
void dseq_kernel(std::true_type, ForwardIt1 first, ForwardIt1 last,
160+
ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
161+
using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
162+
using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
163+
auto loc_range = itr_traits1::local_range(first, last);
164+
auto loc_first = loc_range.begin();
165+
auto d_last = d_first;
166+
std::advance(d_last, std::distance(loc_first, loc_range.end()));
167+
auto dmap = itr_traits2::distribution(d_first, d_last);
168+
auto loc_last = loc_first;
169+
for (auto i : dmap) {
170+
auto l = i.first;
171+
std::advance(loc_last, i.second);
172+
d_last = transform_impl::block_contiguous_kernel(l, loc_first, loc_last,
173+
d_first, op);
174+
std::advance(loc_first, i.second);
175+
std::advance(d_first, i.second);
176+
}
177+
*res_ptr = d_last;
178+
}
179+
180+
// distributed-parallel kernel for non-block-contiguous output-iterators
181+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
182+
void dpar_kernel(std::false_type, ForwardIt1 first, ForwardIt1 last,
183+
ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
184+
using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
185+
auto local_range = itr_traits1::local_range(first, last);
186+
auto begin = local_range.begin();
187+
auto end = local_range.end();
188+
auto it = itr_traits1::iterator_from_local(first, last, begin);
189+
*res_ptr = std::transform(begin, end, d_first, op);
190+
}
191+
192+
// distributed-parallel kernel for block-contiguous output-iterators
193+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
194+
void dpar_kernel(std::true_type, ForwardIt1 first, ForwardIt1 last,
195+
ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
196+
using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
197+
using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
198+
auto loc_range = itr_traits1::local_range(first, last);
199+
auto loc_first = loc_range.begin();
200+
auto first_ = itr_traits1::iterator_from_local(first, last, loc_first);
201+
std::advance(d_first, std::distance(first, first_));
202+
auto d_last = d_first;
203+
std::advance(d_last, std::distance(loc_first, loc_range.end()));
204+
auto dmap = itr_traits2::distribution(d_first, d_last);
205+
auto loc_last = loc_first;
206+
for (auto i : dmap) {
207+
auto l = i.first;
208+
std::advance(loc_last, i.second);
209+
d_last = transform_impl::block_contiguous_kernel(l, loc_first, loc_last,
210+
d_first, op);
211+
std::advance(loc_first, i.second);
212+
std::advance(d_first, i.second);
213+
}
214+
*res_ptr = d_last;
215+
}
216+
217+
// dispatchers
218+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
219+
void dseq_kernel(ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first,
220+
ForwardIt2* res_ptr, UnaryOperation op) {
221+
dseq_kernel(is_block_contiguous<ForwardIt2>::value, first, last, d_first,
222+
res_ptr, op);
223+
}
224+
225+
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
226+
void dpar_kernel(ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first,
227+
ForwardIt2* res_ptr, UnaryOperation op) {
228+
dpar_kernel(is_block_contiguous<ForwardIt2>::value, first, last, d_first,
229+
res_ptr, op);
230+
}
231+
232+
} // namespace transform_impl
233+
87234
template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
88235
ForwardIt2 transform(distributed_parallel_tag&& policy, ForwardIt1 first1,
89236
ForwardIt1 last1, ForwardIt2 d_first,
90237
UnaryOperation unary_op) {
91-
using itr_traits = distributed_iterator_traits<ForwardIt1>;
92-
auto localities = itr_traits::localities(first1, last1);
238+
using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
239+
using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
240+
auto localities = itr_traits1::localities(first1, last1);
93241
std::vector<ForwardIt2> res(localities.size(), d_first);
94242
auto res_it = res.begin();
95243
rt::Handle h;
@@ -101,16 +249,11 @@ ForwardIt2 transform(distributed_parallel_tag&& policy, ForwardIt1 first1,
101249
const std::tuple<ForwardIt1, ForwardIt1, ForwardIt2, UnaryOperation>&
102250
args,
103251
ForwardIt2* res_ptr) {
104-
auto gbegin = std::get<0>(args);
105-
auto gend = std::get<1>(args);
106-
auto local_range = itr_traits::local_range(gbegin, gend);
107-
auto begin = local_range.begin();
108-
auto end = local_range.end();
109-
auto it = itr_traits::iterator_from_local(gbegin, gend, begin);
110-
auto d_first_ = std::get<2>(args);
111-
advance_output_iterator(d_first_, gbegin, it);
252+
auto first = std::get<0>(args);
253+
auto last = std::get<1>(args);
254+
auto d_first = std::get<2>(args);
112255
auto op = std::get<3>(args);
113-
*res_ptr = std::transform(begin, end, d_first_, op);
256+
transform_impl::dpar_kernel(first, last, d_first, res_ptr, op);
114257
flush_iterator(*res_ptr);
115258
},
116259
std::make_tuple(first1, last1, d_first, unary_op), &(*res_it));
@@ -123,8 +266,9 @@ template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
123266
ForwardIt2 transform(distributed_sequential_tag&& policy, ForwardIt1 first1,
124267
ForwardIt1 last1, ForwardIt2 d_first,
125268
UnaryOperation unary_op) {
126-
using itr_traits = distributed_iterator_traits<ForwardIt1>;
127-
auto localities = itr_traits::localities(first1, last1);
269+
using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
270+
using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
271+
auto localities = itr_traits1::localities(first1, last1);
128272
ForwardIt2 res = d_first;
129273
for (auto locality = localities.begin(), end = localities.end();
130274
locality != end; ++locality) {
@@ -133,13 +277,11 @@ ForwardIt2 transform(distributed_sequential_tag&& policy, ForwardIt1 first1,
133277
[](const std::tuple<ForwardIt1, ForwardIt1, ForwardIt2, UnaryOperation>&
134278
args,
135279
ForwardIt2* res_ptr) {
136-
auto d_first_ = std::get<2>(args);
280+
auto first = std::get<0>(args);
281+
auto last = std::get<1>(args);
282+
auto d_first = std::get<2>(args);
137283
auto op = std::get<3>(args);
138-
auto local_range =
139-
itr_traits::local_range(std::get<0>(args), std::get<1>(args));
140-
auto begin = local_range.begin();
141-
auto end = local_range.end();
142-
*res_ptr = std::transform(begin, end, d_first_, op);
284+
transform_impl::dseq_kernel(first, last, d_first, res_ptr, op);
143285
flush_iterator(*res_ptr);
144286
},
145287
std::make_tuple(first1, last1, res, unary_op), &res);

include/shad/core/iterator.h

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ namespace shad {
4444
template <typename Container>
4545
class insert_iterator
4646
: public std::iterator<std::output_iterator_tag, void, void, void, void> {
47+
protected:
4748
using Iterator = typename Container::iterator;
4849
using internal_container_t = typename Container::internal_container_t;
4950

@@ -79,7 +80,7 @@ class insert_iterator
7980
insert_iterator& operator++() { return *this; }
8081
insert_iterator& operator++(int) { return *this; }
8182

82-
private:
83+
protected:
8384
typename internal_container_t::ObjectID global_id_;
8485
Iterator iterator_;
8586
internal_container_t* local_container_ptr_ = nullptr;
@@ -97,21 +98,21 @@ class insert_iterator
9798
///
9899
/// @tparam Container The type of the distributed container.
99100
template <typename Container>
100-
class buffered_insert_iterator
101-
: public std::iterator<std::output_iterator_tag, void, void, void, void> {
102-
using Iterator = typename Container::iterator;
103-
using internal_container_t = typename Container::internal_container_t;
101+
class buffered_insert_iterator : public insert_iterator<Container> {
102+
using base_t = insert_iterator<Container>;
103+
using Iterator = typename base_t::Iterator;
104+
using internal_container_t = typename base_t::internal_container_t;
104105

105106
public:
106-
using value_type = typename Container::value_type;
107-
using container_type = Container;
107+
using value_type = typename base_t::value_type;
108+
using container_type = typename base_t::container_type;
108109

109110
/// @brief Constructor.
110111
///
111112
/// @param container The container into which the iterator inserts.
112113
/// @param iterator The position at which the iterator starts to insert.
113114
buffered_insert_iterator(Container& container, Iterator iterator)
114-
: global_id_(container.global_id()) {}
115+
: base_t(container, iterator) {}
115116

116117
/// @brief The assignment operator.
117118
///
@@ -122,21 +123,22 @@ class buffered_insert_iterator
122123
///
123124
/// @return A self reference.
124125
buffered_insert_iterator& operator=(const value_type& value) {
125-
if (!local_container_ptr_ || locality_ != rt::thisLocality()) {
126-
locality_ = rt::thisLocality();
127-
local_container_ptr_ = Container::from_global_id(global_id_);
126+
if (!this->local_container_ptr_ || this->locality_ != rt::thisLocality()) {
127+
this->locality_ = rt::thisLocality();
128+
this->local_container_ptr_ = Container::from_global_id(this->global_id_);
128129
rt::Handle h;
129130
handle_ = h;
130131
}
131-
local_container_ptr_->buffered_async_insert(handle_, value);
132+
this->local_container_ptr_->buffered_async_insert(handle_, value);
132133
return *this;
133134
}
134135

135136
/// @brief Flushes pending insertions to the container.
136137
void flush() {
137-
if (local_container_ptr_ != nullptr && locality_ == rt::thisLocality()) {
138+
if (this->local_container_ptr_ != nullptr &&
139+
this->locality_ == rt::thisLocality()) {
138140
// if(!handle_.IsNull()) FIXME
139-
local_container_ptr_->buffered_async_flush(handle_);
141+
this->local_container_ptr_->buffered_async_flush(handle_);
140142
}
141143
}
142144

@@ -145,12 +147,25 @@ class buffered_insert_iterator
145147
buffered_insert_iterator& operator++(int) { return *this; }
146148

147149
private:
148-
typename internal_container_t::ObjectID global_id_;
149-
internal_container_t* local_container_ptr_ = nullptr;
150-
rt::Locality locality_;
151150
rt::Handle handle_;
152151
};
153152

153+
/// @brief Compile-time test for the block-contiguous property of an iterator
/// type.
///
/// The primary template reports every iterator type as block-contiguous;
/// specializations opt individual iterator types out.  `value` is a tag
/// object (not a `bool`) so that it can be passed directly for overload
/// selection.
///
/// @tparam It The iterator type being tested.
template <typename It>
struct is_block_contiguous {
  static constexpr auto value = std::true_type{};
};
158+
159+
template <typename Container>
160+
struct is_block_contiguous<shad::insert_iterator<Container>> {
161+
static constexpr std::false_type value{};
162+
};
163+
164+
template <typename Container>
165+
struct is_block_contiguous<shad::buffered_insert_iterator<Container>> {
166+
static constexpr std::false_type value{};
167+
};
168+
154169
} // namespace shad
155170

156171
#endif /* INCLUDE_SHAD_CORE_ITERATOR_H_ */

0 commit comments

Comments
 (0)