2626#define INCLUDE_SHAD_CORE_IMPL_MODIFYING_SEQUENCE_OPS_H
2727
2828#include < algorithm>
29+ #include < cstring>
2930#include < functional>
3031#include < iterator>
3132#include < tuple>
@@ -84,12 +85,159 @@ void fill(distributed_sequential_tag&& policy, ForwardIt first, ForwardIt last,
8485 value);
8586}
8687
88+ namespace transform_impl {
// Argument record used to ship one chunk of already-transformed values to the
// locality that owns the destination range.
//
// Members:
//   buf_size  number of `value_type` elements that fit in a 2 KiB staging
//             buffer (2 << 10 bytes divided by the element size).
//   buf       staging area holding the transformed values of the chunk.
//   w_first   output iterator at which the chunk must be written.
//   size      number of valid elements currently stored in `buf`.
//
// The record is passed around as a raw byte buffer by its user in this file,
// so the member order and names must stay as they are.
template <typename ForwardIt>
struct gen_args_t {
  static constexpr size_t buf_size =
      (2 << 10) / sizeof(typename ForwardIt::value_type);
  // With an element larger than 2 KiB the division above yields 0: `buf`
  // would become a zero-length array and any loop consuming `buf_size`
  // elements per step would never make progress. Reject that at compile time.
  static_assert(buf_size > 0,
                "gen_args_t: value_type does not fit in the 2 KiB staging "
                "buffer");
  typename ForwardIt::value_type buf[buf_size];
  ForwardIt w_first;
  size_t size;
};
97+
98+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
99+ ForwardIt2 block_contiguous_kernel (rt::Locality l, ForwardIt1 first,
100+ ForwardIt1 last, ForwardIt2 d_first,
101+ UnaryOperation op) {
102+ using itr_traits1 = std::iterator_traits<ForwardIt1>;
103+ using itr_traits2 = distributed_iterator_traits<ForwardIt2>;
104+ using args_t = gen_args_t <ForwardIt2>;
105+ auto size = std::distance (first, last);
106+ auto d_last = d_first;
107+ std::advance (d_last, size);
108+
109+ // local assign
110+ if (rt::thisLocality () == l) {
111+ auto local_d_range = itr_traits2::local_range (d_first, d_last);
112+ auto loc_res = std::transform (first, last, local_d_range.begin (), op);
113+ return itr_traits2::iterator_from_local (d_first, d_last, loc_res - 1 ) + 1 ;
114+ }
115+
116+ // remote assign
117+ std::shared_ptr<uint8_t > args_buf (new uint8_t [sizeof (args_t )],
118+ std::default_delete<uint8_t []>());
119+ auto typed_args_buf = reinterpret_cast <args_t *>(args_buf.get ());
120+ auto block_last = first;
121+ rt::Handle h;
122+ while (first != last) {
123+ typed_args_buf->w_first = d_first;
124+ typed_args_buf->size =
125+ std::min (args_t ::buf_size, (size_t )std::distance (first, last));
126+ std::advance (block_last, typed_args_buf->size );
127+ std::transform (first, block_last, typed_args_buf->buf , op);
128+ rt::asyncExecuteAt (
129+ h, l,
130+ [](rt::Handle&, const uint8_t * args_buf, const uint32_t ) {
131+ const args_t & args = *reinterpret_cast <const args_t *>(args_buf);
132+ using val_t = typename ForwardIt2::value_type;
133+ ForwardIt2 w_last = args.w_first ;
134+ std::advance (w_last, args.size );
135+ auto w_range = itr_traits2::local_range (args.w_first , w_last);
136+ std::memcpy (w_range.begin (), args.buf , sizeof (val_t ) * args.size );
137+ },
138+ args_buf, sizeof (args_t ));
139+ std::advance (first, typed_args_buf->size );
140+ std::advance (d_first, typed_args_buf->size );
141+ }
142+ rt::waitForCompletion (h);
143+ return d_last; // todo double check
144+ }
145+
146+ // distributed-sequential kernel for non-block-contiguous output-iterators
147+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
148+ void dseq_kernel (std::false_type, ForwardIt1 first, ForwardIt1 last,
149+ ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
150+ using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
151+ auto local_range = itr_traits1::local_range (first, last);
152+ auto begin = local_range.begin ();
153+ auto end = local_range.end ();
154+ *res_ptr = std::transform (begin, end, d_first, op);
155+ }
156+
157+ // distributed-sequential kernel for block-contiguous output-iterators
158+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
159+ void dseq_kernel (std::true_type, ForwardIt1 first, ForwardIt1 last,
160+ ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
161+ using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
162+ using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
163+ auto loc_range = itr_traits1::local_range (first, last);
164+ auto loc_first = loc_range.begin ();
165+ auto d_last = d_first;
166+ std::advance (d_last, std::distance (loc_first, loc_range.end ()));
167+ auto dmap = itr_traits2::distribution (d_first, d_last);
168+ auto loc_last = loc_first;
169+ for (auto i : dmap) {
170+ auto l = i.first ;
171+ std::advance (loc_last, i.second );
172+ d_last = transform_impl::block_contiguous_kernel (l, loc_first, loc_last,
173+ d_first, op);
174+ std::advance (loc_first, i.second );
175+ std::advance (d_first, i.second );
176+ }
177+ *res_ptr = d_last;
178+ }
179+
180+ // distributed-parallel kernel for non-block-contiguous output-iterators
181+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
182+ void dpar_kernel (std::false_type, ForwardIt1 first, ForwardIt1 last,
183+ ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
184+ using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
185+ auto local_range = itr_traits1::local_range (first, last);
186+ auto begin = local_range.begin ();
187+ auto end = local_range.end ();
188+ auto it = itr_traits1::iterator_from_local (first, last, begin);
189+ *res_ptr = std::transform (begin, end, d_first, op);
190+ }
191+
192+ // distributed-parallel kernel for block-contiguous output-iterators
193+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
194+ void dpar_kernel (std::true_type, ForwardIt1 first, ForwardIt1 last,
195+ ForwardIt2 d_first, ForwardIt2* res_ptr, UnaryOperation op) {
196+ using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
197+ using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
198+ auto loc_range = itr_traits1::local_range (first, last);
199+ auto loc_first = loc_range.begin ();
200+ auto first_ = itr_traits1::iterator_from_local (first, last, loc_first);
201+ std::advance (d_first, std::distance (first, first_));
202+ auto d_last = d_first;
203+ std::advance (d_last, std::distance (loc_first, loc_range.end ()));
204+ auto dmap = itr_traits2::distribution (d_first, d_last);
205+ auto loc_last = loc_first;
206+ for (auto i : dmap) {
207+ auto l = i.first ;
208+ std::advance (loc_last, i.second );
209+ d_last = transform_impl::block_contiguous_kernel (l, loc_first, loc_last,
210+ d_first, op);
211+ std::advance (loc_first, i.second );
212+ std::advance (d_first, i.second );
213+ }
214+ *res_ptr = d_last;
215+ }
216+
217+ // dispatchers
218+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
219+ void dseq_kernel (ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first,
220+ ForwardIt2* res_ptr, UnaryOperation op) {
221+ dseq_kernel (is_block_contiguous<ForwardIt2>::value, first, last, d_first,
222+ res_ptr, op);
223+ }
224+
225+ template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
226+ void dpar_kernel (ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first,
227+ ForwardIt2* res_ptr, UnaryOperation op) {
228+ dpar_kernel (is_block_contiguous<ForwardIt2>::value, first, last, d_first,
229+ res_ptr, op);
230+ }
231+
232+ } // namespace transform_impl
233+
87234template <class ForwardIt1 , class ForwardIt2 , class UnaryOperation >
88235ForwardIt2 transform (distributed_parallel_tag&& policy, ForwardIt1 first1,
89236 ForwardIt1 last1, ForwardIt2 d_first,
90237 UnaryOperation unary_op) {
91- using itr_traits = distributed_iterator_traits<ForwardIt1>;
92- auto localities = itr_traits::localities (first1, last1);
238+ using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
239+ using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
240+ auto localities = itr_traits1::localities (first1, last1);
93241 std::vector<ForwardIt2> res (localities.size (), d_first);
94242 auto res_it = res.begin ();
95243 rt::Handle h;
@@ -101,16 +249,11 @@ ForwardIt2 transform(distributed_parallel_tag&& policy, ForwardIt1 first1,
101249 const std::tuple<ForwardIt1, ForwardIt1, ForwardIt2, UnaryOperation>&
102250 args,
103251 ForwardIt2* res_ptr) {
104- auto gbegin = std::get<0 >(args);
105- auto gend = std::get<1 >(args);
106- auto local_range = itr_traits::local_range (gbegin, gend);
107- auto begin = local_range.begin ();
108- auto end = local_range.end ();
109- auto it = itr_traits::iterator_from_local (gbegin, gend, begin);
110- auto d_first_ = std::get<2 >(args);
111- advance_output_iterator (d_first_, gbegin, it);
252+ auto first = std::get<0 >(args);
253+ auto last = std::get<1 >(args);
254+ auto d_first = std::get<2 >(args);
112255 auto op = std::get<3 >(args);
113- *res_ptr = std::transform (begin, end, d_first_ , op);
256+ transform_impl::dpar_kernel (first, last, d_first, res_ptr , op);
114257 flush_iterator (*res_ptr);
115258 },
116259 std::make_tuple (first1, last1, d_first, unary_op), &(*res_it));
@@ -123,8 +266,9 @@ template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
123266ForwardIt2 transform (distributed_sequential_tag&& policy, ForwardIt1 first1,
124267 ForwardIt1 last1, ForwardIt2 d_first,
125268 UnaryOperation unary_op) {
126- using itr_traits = distributed_iterator_traits<ForwardIt1>;
127- auto localities = itr_traits::localities (first1, last1);
269+ using itr_traits1 = distributed_iterator_traits<ForwardIt1>;
270+ using itr_traits2 = distributed_random_access_iterator_trait<ForwardIt2>;
271+ auto localities = itr_traits1::localities (first1, last1);
128272 ForwardIt2 res = d_first;
129273 for (auto locality = localities.begin (), end = localities.end ();
130274 locality != end; ++locality) {
@@ -133,13 +277,11 @@ ForwardIt2 transform(distributed_sequential_tag&& policy, ForwardIt1 first1,
133277 [](const std::tuple<ForwardIt1, ForwardIt1, ForwardIt2, UnaryOperation>&
134278 args,
135279 ForwardIt2* res_ptr) {
136- auto d_first_ = std::get<2 >(args);
280+ auto first = std::get<0 >(args);
281+ auto last = std::get<1 >(args);
282+ auto d_first = std::get<2 >(args);
137283 auto op = std::get<3 >(args);
138- auto local_range =
139- itr_traits::local_range (std::get<0 >(args), std::get<1 >(args));
140- auto begin = local_range.begin ();
141- auto end = local_range.end ();
142- *res_ptr = std::transform (begin, end, d_first_, op);
284+ transform_impl::dseq_kernel (first, last, d_first, res_ptr, op);
143285 flush_iterator (*res_ptr);
144286 },
145287 std::make_tuple (first1, last1, res, unary_op), &res);
0 commit comments