55#include "demagcoef.h"
66
77
8- double Nxxdipole (double x , double y , double z ) {
8+ inline double Nxxdipole (double x , double y , double z ) {
99 double x2 = x * x ;
1010 double y2 = y * y ;
1111 double z2 = z * z ;
@@ -16,7 +16,7 @@ double Nxxdipole(double x, double y, double z) {
1616 return - (2 * x2 - y2 - z2 ) / (R * R * r );
1717}
1818
19- double Nxydipole (double x , double y , double z ) {
19+ inline double Nxydipole (double x , double y , double z ) {
2020 double R = x * x + y * y + z * z ;
2121 if (R == 0 )
2222 return 0.0 ;
@@ -54,9 +54,16 @@ void compute_dipolar_tensors(fft_demag_plan *plan) {
5454 int lenx = plan -> lenx ;
5555 int leny = plan -> leny ;
5656 int lenz = plan -> lenz ;
57- int lenxy = lenx * leny ;
58-
59-
57+ int lenxy = lenx * leny ;
58+ // Parallelising this like this
59+ // means that z should be the largest index
60+ // in order to get better performance from threading.
61+ // The data writing is not very clever here; we're
62+ // going to be invalidating the cache a lot. It would be better
63+ // maybe to split this into six seperate loops for each component
64+ // of the tensor in order that each thread is working on a smaller
65+ // memory address range
66+ #pragma omp parallel for private(j, i, x, y, z, id) schedule(dynamic, 32)
6067 for (k = 0 ; k < lenz ; k ++ ) {
6168 for (j = 0 ; j < leny ; j ++ ) {
6269 for (i = 0 ; i < lenx ; i ++ ) {
@@ -81,7 +88,7 @@ void compute_dipolar_tensors(fft_demag_plan *plan) {
8188void compute_demag_tensors (fft_demag_plan * plan ) {
8289
8390 int i , j , k , id ;
84- double x , y , z ;
91+ double x , y , z , radius_sq ;
8592
8693 int nx = plan -> nx ;
8794 int ny = plan -> ny ;
@@ -97,7 +104,7 @@ void compute_demag_tensors(fft_demag_plan *plan) {
97104
98105 double length = pow (dx * dy * dz , 1 /3.0 );
99106 double asymptotic_radius_sq = pow (26.0 * length ,2.0 );
100-
107+ #pragma omp parallel for private(j, i, id, x, y, z, radius_sq) schedule(dynamic, 32)
101108 for (k = 0 ; k < lenz ; k ++ ) {
102109 for (j = 0 ; j < leny ; j ++ ) {
103110 for (i = 0 ; i < lenx ; i ++ ) {
@@ -107,7 +114,7 @@ void compute_demag_tensors(fft_demag_plan *plan) {
107114 y = (j - ny + 1 ) * dy ;
108115 z = (k - nz + 1 ) * dz ;
109116
110- double radius_sq = x * x + y * y + z * z ;
117+ radius_sq = x * x + y * y + z * z ;
111118
112119 if (radius_sq > asymptotic_radius_sq ){
113120 //printf("%g %g %g %g %g %g\n",x,y,z,dx,dy,dz);
@@ -338,7 +345,7 @@ void compute_fields(fft_demag_plan *restrict plan, double *restrict spin, double
338345 //print_r("hz", plan->hz, plan->total_length);
339346
340347 double scale = -1.0 / plan -> total_length ;
341-
348+ #pragma omp parallel for private(j, i, id1, id2) schedule(dynamic, 32)
342349 for (k = 0 ; k < nz ; k ++ ) {
343350 for (j = 0 ; j < ny ; j ++ ) {
344351 for (i = 0 ; i < nx ; i ++ ) {
0 commit comments