@@ -223,9 +223,9 @@ struct SquareMatrixOps< 2 >
223223 */
224224 template < typename DST_VECTOR, typename SYM_MATRIX_A, typename VECTOR_B >
225225 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
226- static void symAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
227- SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
228- VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
226+ static void Ri_eq_symAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
227+ SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
228+ VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
229229 {
230230 checkSizes< 2 >( dstVector );
231231 checkSizes< 3 >( symMatrixA );
@@ -247,9 +247,9 @@ struct SquareMatrixOps< 2 >
247247 */
248248 template < typename DST_VECTOR, typename SYM_MATRIX_A, typename VECTOR_B >
249249 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
250- static void plusSymAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
251- SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
252- VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
250+ static void Ri_add_symAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
251+ SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
252+ VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
253253 {
254254 checkSizes< 2 >( dstVector );
255255 checkSizes< 3 >( symMatrixA );
@@ -272,9 +272,9 @@ struct SquareMatrixOps< 2 >
272272 */
273273 template < typename DST_MATRIX, typename SYM_MATRIX_A, typename MATRIX_B >
274274 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
275- static void symAikBjk ( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
276- SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
277- MATRIX_B const & LVARRAY_RESTRICT_REF matrixB )
275+ static void Rij_eq_symAikBjk ( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
276+ SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
277+ MATRIX_B const & LVARRAY_RESTRICT_REF matrixB )
278278 {
279279 checkSizes< 2 , 2 >( dstMatrix );
280280 checkSizes< 3 >( symMatrixA );
@@ -302,9 +302,9 @@ struct SquareMatrixOps< 2 >
302302 */
303303 template < typename DST_SYM_MATRIX, typename MATRIX_A, typename SYM_MATRIX_B >
304304 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
305- static void AikSymBklAjl ( DST_SYM_MATRIX && LVARRAY_RESTRICT_REF dstSymMatrix,
306- MATRIX_A const & LVARRAY_RESTRICT_REF matrixA,
307- SYM_MATRIX_B const & LVARRAY_RESTRICT_REF symMatrixB )
305+ static void Rij_eq_AikSymBklAjl ( DST_SYM_MATRIX && LVARRAY_RESTRICT_REF dstSymMatrix,
306+ MATRIX_A const & LVARRAY_RESTRICT_REF matrixA,
307+ SYM_MATRIX_B const & LVARRAY_RESTRICT_REF symMatrixB )
308308 {
309309 checkSizes< 3 >( dstSymMatrix );
310310 checkSizes< 2 , 2 >( matrixA );
@@ -580,11 +580,30 @@ struct SquareMatrixOps< 3 >
580580 LVARRAY_HOST_DEVICE constexpr inline
581581 static auto invert ( MATRIX && matrix )
582582 {
583- std::remove_reference_t < decltype ( matrix[ 0 ][ 0 ] ) > temp[ 3 ][ 3 ];
584- auto const det = invert ( temp, matrix );
585- copy< 3 , 3 >( matrix, temp );
586-
583+ using realType = std::remove_reference_t < decltype ( matrix[ 0 ][ 0 ] ) >;
584+ #if 0
585+ realType temp[ 3 ][ 3 ];
586+ copy< 3, 3 >( temp, matrix );
587+ return invert( matrix, temp );
588+ #else
589+ // cuda kernels use a couple fewer registers in some cases with this implementation.
590+ realType const temp[3 ][3 ] =
591+ { { matrix[1 ][1 ]*matrix[2 ][2 ] - matrix[1 ][2 ]*matrix[2 ][1 ], matrix[0 ][2 ]*matrix[2 ][1 ] - matrix[0 ][1 ]*matrix[2 ][2 ], matrix[0 ][1 ]*matrix[1 ][2 ] - matrix[0 ][2 ]*matrix[1 ][1 ] },
592+ { matrix[1 ][2 ]*matrix[2 ][0 ] - matrix[1 ][0 ]*matrix[2 ][2 ], matrix[0 ][0 ]*matrix[2 ][2 ] - matrix[0 ][2 ]*matrix[2 ][0 ], matrix[0 ][2 ]*matrix[1 ][0 ] - matrix[0 ][0 ]*matrix[1 ][2 ] },
593+ { matrix[1 ][0 ]*matrix[2 ][1 ] - matrix[1 ][1 ]*matrix[2 ][0 ], matrix[0 ][1 ]*matrix[2 ][0 ] - matrix[0 ][0 ]*matrix[2 ][1 ], matrix[0 ][0 ]*matrix[1 ][1 ] - matrix[0 ][1 ]*matrix[1 ][0 ] } };
594+
595+ realType const det = matrix[0 ][0 ] * temp[0 ][0 ] + matrix[1 ][0 ] * temp[0 ][1 ] + matrix[2 ][0 ] * temp[0 ][2 ];
596+ realType const invDet = 1.0 / det;
597+
598+ for ( int i=0 ; i<3 ; ++i )
599+ {
600+ for ( int j=0 ; j<3 ; ++j )
601+ {
602+ matrix[i][j] = temp[i][j] * invDet;
603+ }
604+ }
587605 return det;
606+ #endif
588607 }
589608
590609 /* *
@@ -674,9 +693,9 @@ struct SquareMatrixOps< 3 >
674693 */
675694 template < typename DST_VECTOR, typename SYM_MATRIX_A, typename VECTOR_B >
676695 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
677- static void symAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
678- SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
679- VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
696+ static void Ri_eq_symAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
697+ SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
698+ VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
680699 {
681700 checkSizes< 3 >( dstVector );
682701 checkSizes< 6 >( symMatrixA );
@@ -705,9 +724,9 @@ struct SquareMatrixOps< 3 >
705724 */
706725 template < typename DST_VECTOR, typename SYM_MATRIX_A, typename VECTOR_B >
707726 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
708- static void plusSymAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
709- SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
710- VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
727+ static void Ri_add_symAijBj ( DST_VECTOR && LVARRAY_RESTRICT_REF dstVector,
728+ SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
729+ VECTOR_B const & LVARRAY_RESTRICT_REF vectorB )
711730 {
712731 checkSizes< 3 >( dstVector );
713732 checkSizes< 6 >( symMatrixA );
@@ -740,9 +759,9 @@ struct SquareMatrixOps< 3 >
740759 */
741760 template < typename DST_MATRIX, typename SYM_MATRIX_A, typename MATRIX_B >
742761 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
743- static void symAikBjk ( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
744- SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
745- MATRIX_B const & LVARRAY_RESTRICT_REF matrixB )
762+ static void Rij_eq_symAikBjk ( DST_MATRIX && LVARRAY_RESTRICT_REF dstMatrix,
763+ SYM_MATRIX_A const & LVARRAY_RESTRICT_REF symMatrixA,
764+ MATRIX_B const & LVARRAY_RESTRICT_REF matrixB )
746765 {
747766 checkSizes< 3 , 3 >( dstMatrix );
748767 checkSizes< 6 >( symMatrixA );
@@ -794,9 +813,9 @@ struct SquareMatrixOps< 3 >
794813 */
795814 template < typename DST_SYM_MATRIX, typename MATRIX_A, typename SYM_MATRIX_B >
796815 LVARRAY_HOST_DEVICE CONSTEXPR_WITHOUT_BOUNDS_CHECK inline
797- static void AikSymBklAjl ( DST_SYM_MATRIX && LVARRAY_RESTRICT_REF dstSymMatrix,
798- MATRIX_A const & LVARRAY_RESTRICT_REF matrixA,
799- SYM_MATRIX_B const & LVARRAY_RESTRICT_REF symMatrixB )
816+ static void Rij_eq_AikSymBklAjl ( DST_SYM_MATRIX && LVARRAY_RESTRICT_REF dstSymMatrix,
817+ MATRIX_A const & LVARRAY_RESTRICT_REF matrixA,
818+ SYM_MATRIX_B const & LVARRAY_RESTRICT_REF symMatrixB )
800819 {
801820 checkSizes< 6 >( dstSymMatrix );
802821 checkSizes< 3 , 3 >( matrixA );
0 commit comments