@@ -216,6 +216,8 @@ op_dot::apply(const T1& X, const T2& Y)
216216
217217 if ( (A.m .n_rows == 1 ) && (B.m .n_rows == 1 ) )
218218 {
219+ arma_debug_print (" op_dot::apply(): subview_row optimisation" );
220+
219221 arma_conform_check ( (A.n_elem != B.n_elem ), " dot(): objects must have the same number of elements" );
220222
221223 const eT* A_mem = A.m .memptr ();
@@ -225,10 +227,47 @@ op_dot::apply(const T1& X, const T2& Y)
225227 }
226228 }
227229
230+ if (is_subview<T1>::value || is_subview<T2>::value)
231+ {
232+ arma_debug_print (" op_dot::apply(): subview optimisation" );
233+
234+ const sv_keep_unwrap<T1>& UA (X);
235+ const sv_keep_unwrap<T2>& UB (Y);
236+
237+ typedef typename sv_keep_unwrap<T1>::stored_type UA_M_type;
238+ typedef typename sv_keep_unwrap<T2>::stored_type UB_M_type;
239+
240+ const UA_M_type& A = UA.M ;
241+ const UB_M_type& B = UB.M ;
242+
243+ const uword A_n_rows = A.n_rows ;
244+ const uword A_n_cols = A.n_cols ;
245+
246+ if ( (A_n_rows == B.n_rows ) && (A_n_cols == B.n_cols ) )
247+ {
248+ eT acc = eT (0 );
249+
250+ for (uword c=0 ; c < A_n_cols; ++c) { acc += op_dot::direct_dot (A_n_rows, A.colptr (c), B.colptr (c)); }
251+
252+ return acc;
253+ }
254+ else
255+ {
256+ const quasi_unwrap<UA_M_type> UUA (A);
257+ const quasi_unwrap<UB_M_type> UUB (B);
258+
259+ arma_conform_check ( (UUA.M .n_elem != UUB.M .n_elem ), " dot(): objects must have the same number of elements" );
260+
261+ return op_dot::direct_dot (UUA.M .n_elem , UUA.M .memptr (), UUB.M .memptr ());
262+ }
263+ }
264+
228265 // if possible, bypass transposes of non-complex vectors
229266
230267 if ( (is_cx<eT>::no) && (resolves_to_vector<T1>::value) && (resolves_to_vector<T2>::value) && (partial_unwrap<T1>::is_fast) && (partial_unwrap<T2>::is_fast) )
231268 {
269+ arma_debug_print (" op_dot::apply(): vector optimisation" );
270+
232271 const partial_unwrap<T1> UA (X);
233272 const partial_unwrap<T2> UB (Y);
234273
@@ -250,6 +289,8 @@ op_dot::apply(const T1& X, const T2& Y)
250289
251290 if (proxy_is_mat || use_at || have_direct_mem)
252291 {
292+ arma_debug_print (" op_dot::apply(): direct_mem optimisation" );
293+
253294 const quasi_unwrap<T1> A (X);
254295 const quasi_unwrap<T2> B (Y);
255296
@@ -263,15 +304,15 @@ op_dot::apply(const T1& X, const T2& Y)
263304
264305 arma_conform_check ( (PA.get_n_elem () != PB.get_n_elem ()), " dot(): objects must have the same number of elements" );
265306
266- return op_dot::apply_proxy (PA,PB);
307+ return op_dot::apply_proxy_linear (PA,PB);
267308 }
268309
269310
270311
271312template <typename T1, typename T2>
272313inline
273314typename arma_not_cx<typename T1::elem_type>::result
274- op_dot::apply_proxy (const Proxy<T1>& PA, const Proxy<T2>& PB)
315+ op_dot::apply_proxy_linear (const Proxy<T1>& PA, const Proxy<T2>& PB)
275316 {
276317 arma_debug_sigprint ();
277318
@@ -308,7 +349,7 @@ op_dot::apply_proxy(const Proxy<T1>& PA, const Proxy<T2>& PB)
308349template <typename T1, typename T2>
309350inline
310351typename arma_cx_only<typename T1::elem_type>::result
311- op_dot::apply_proxy (const Proxy<T1>& PA, const Proxy<T2>& PB)
352+ op_dot::apply_proxy_linear (const Proxy<T1>& PA, const Proxy<T2>& PB)
312353 {
313354 arma_debug_sigprint ();
314355
0 commit comments