Speed up getf2

rjfarmer · rjfarmer · commit a5e78c23bc60 · 2023-02-03T13:01:44.000+01:00
The idea is to be slightly more cache friendly when accessing the a array.
As jj and ii are always > j, we can move the accesses to the a array so that
they occur outside of the jj and ii loops. This should minimise the number of
cache lines that need to be constantly re-read when doing the ii loop.

Testing against split_burn_big_net didn't show any improvement (or worsening),
but using MESA's one-zone burner in standalone mode showed a 30%
speed-up, while also showing no difference in the final results.
diff --git a/mtx/public/mtx_solve_routines.inc b/mtx/public/mtx_solve_routines.inc
@@ -3,10 +3,10 @@
       subroutine my_getf2(m, a, lda, ipiv, info)
          integer :: info, lda, m
          integer :: ipiv(:)
-         real(dp) :: a(:,:)
+         real(dp) :: a(:,:),aj(m)
          real(dp), parameter :: one=1, zero=0
          integer :: i, j, jp, ii, jj, n, mm
-         real(dp) :: tmp, da
+         real(dp) :: tmp, da, ajjj
          do j = 1, m
             info = 0
             jp = j - 1 + maxloc(abs(a(j:lda,j)),dim=1)
@@ -31,11 +31,13 @@
                info = j
             end if
             if( j.lt.m ) then
+               aj = a(:,j)
                !call dger( m-j, m-j, -one, a( j+1, j ), 1, a( j, j+1 ), lda, a( j+1, j+1 ), lda )
                do jj = j+1, m
+                  ajjj = a(j,jj)
                   !$omp simd
                   do ii = j+1, m
-                     a(ii,jj) = a(ii,jj) - a(ii,j)*a(j,jj)
+                     a(ii,jj) = a(ii,jj) - aj(ii)*ajjj
                   end do
                end do
             end if