Merge branch 'master' into optimising-foldrescale

Marco Barbone · Marco Barbone · commit f6f98733fec8 · 2024-05-14T10:15:45.000-04:00
diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml
@@ -38,6 +38,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Unlink gcc
+        if: runner.os == 'macOS'
+        run: |
+          brew unlink gcc
+
       - name: Setup Cpp
         uses: aminya/setup-cpp@v1
         with:
diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml
@@ -46,7 +46,8 @@ jobs:
 
     - name: Install gcc and fftw
       run: |
-        brew install gcc fftw
+        brew unlink gcc
+        brew install gcc@13 fftw
         cp make.inc.macosx_gcc-12 make.inc
         echo "FC=gfortran-13" >> make.inc
         echo "CC=gcc-13" >> make.inc
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,8 +1,9 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-* CPU plan stage prevents now caps # threads at omp_get_max_threads (being 1
-  for single-thread build); warns if this cap was activated (PR 431)
+* CPU plan stage allows any # threads, warns if > omp_get_max_threads(); or,
+  if single-threaded, fixes nthr=1 and warns on any opts.nthreads>1 attempt.
+  Sort now respects spread_opts.sort_threads, not nthreads. Supersedes PR 431.
 * new docs troubleshooting accuracy limitations due to condition number of the
   NUFFT problem.
 * new sanity check on nj and nk (<0 or too big); new err code, tester, doc.
diff --git a/docs/opts.rst b/docs/opts.rst
@@ -124,7 +124,9 @@ Diagnostic options
 Algorithm performance options
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-**nthreads**: Number of threads to use. This is capped at the number of available threads (eg, to prevent misuse of a single-threaded code). It then sets the number of threads FINUFFT will use in FFTW, bin-sorting, and spreading/interpolation steps. This number of threads also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here <c>`). Setting ``nthreads=0`` uses all threads available, usually recommended. However, for repeated small problems it can be advantageous to use a small number, even as small as 1.
+**nthreads**: (Ignored in single-threaded library builds.) If positive, sets the number of threads to use throughout the (multi-threaded build of the) library, or, if ``0``, uses the maximum number of threads available according to OpenMP. In the positive case, no cap is placed on this number. This number of threads is passed to bin-sorting (which may choose to use fewer threads), but is adhered to in FFTW and spreading/interpolation steps. This number of threads (or 1 for single-threaded builds) also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here <c>`).
+For medium-to-large transforms, ``0`` is usually recommended.
+However, for (repeated) small transforms it can be advantageous to use a small number, even as small as ``1``.
 
 **fftw**: FFTW planner flags. This number is simply passed to FFTW's planner;
 the flags are documented `here <http://www.fftw.org/fftw3_doc/Planner-Flags.html#Planner-Flags>`_.
diff --git a/src/finufft.cpp b/src/finufft.cpp
@@ -604,15 +604,22 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag,
   p->fftSign = (iflag>=0) ? 1 : -1;         // clean up flag input
 
   // choose overall # threads...
-  int maxnthr = MY_OMP_GET_MAX_THREADS();
-  int nthr = maxnthr;                       // use as many as OMP gives us
+#ifdef _OPENMP
+  int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
+  int nthr = ompmaxnthr;                    // default: use as many as OMP gives us
+  // (the above could be set, or suggested set, to 1 for small enough problems...)
   if (p->opts.nthreads>0) {
-    nthr = min(maxnthr,p->opts.nthreads);   // user override up to max avail
-    if (p->opts.nthreads > maxnthr)         // if no OMP, maxnthr=1
-      fprintf(stderr,"%s warning: user requested %d threads, but only %d threads available; enforcing nthreads=%d.\n",__func__,p->opts.nthreads,maxnthr,nthr);
+    nthr = p->opts.nthreads;                // user override, now without limit
+    if (p->opts.showwarn && (nthr > ompmaxnthr))
+      fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr);
   }
+#else
+  int nthr = 1;                             // always 1 thread (avoid segfault)
+  if (p->opts.nthreads>1)
+    fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads);
+#endif
   p->opts.nthreads = nthr;                  // store actual # thr planned for
-  // (this sets all downstream spread/interp, 1dkernel, and FFT thread counts)
+  // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
   
   // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
   if (p->opts.maxbatchsize==0) {            // logic to auto-set best batchsize
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
@@ -196,7 +196,7 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
                    ordering for the x-coords of NU pts, etc.
     returned value - whether a sort was done (1) or not (0).
 
-   Barnett 2017; split out by Melody Shih, Jun 2018.
+   Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024.
 */
 {
   CNTime timer;
@@ -211,19 +211,26 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
 
   timer.start();                 // if needed, sort all the NU pts...
   int did_sort=0;
-  int maxnthr = MY_OMP_GET_MAX_THREADS();
-  if (opts.nthreads>0)           // user override up to max avail
-    maxnthr = min(maxnthr,opts.nthreads);
-  
+  int maxnthr = MY_OMP_GET_MAX_THREADS();  // used if both below opts default
+  if (opts.nthreads>0)
+    maxnthr = opts.nthreads;         // user nthreads overrides, without limit
+  if (opts.sort_threads>0)
+    maxnthr = opts.sort_threads;     // high-priority override, also no limit
+  // At this point: maxnthr = the max threads sorting could use
+  // (we don't print warning here, since: no showwarn in spread_opts, and finufft
+  // already warned about it. spreadinterp-only advanced users will miss a warning)
   if (opts.sort==1 || (opts.sort==2 && better_to_sort)) {
     // store a good permutation ordering of all NU pts (dim=1,2 or 3)
     int sort_debug = (opts.debug>=2);    // show timing output?
-    int sort_nthr = opts.sort_threads;   // choose # threads for sorting
-    if (sort_nthr==0)   // use auto choice: when N>>M, one thread is better!
-      sort_nthr = (10*M>N) ? maxnthr : 1;      // heuristic
+    int sort_nthr = opts.sort_threads;   // 0, or user max # threads for sort
+#ifndef _OPENMP
+    sort_nthr = 1;                       // if single-threaded lib, override user
+#endif
+    if (sort_nthr==0)   // multithreaded auto choice: when N>>M, one thread is better!
+      sort_nthr = (10*M>N) ? maxnthr : 1;     // heuristic
     if (sort_nthr==1)
       bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,bin_size_x,bin_size_y,bin_size_z,sort_debug);
-    else                                      // sort_nthr>1, sets # threads
+    else                                      // sort_nthr>1, user fixes # threads (>=2)
       bin_sort_multithread(sort_indices,M,kx,ky,kz,N1,N2,N3,bin_size_x,bin_size_y,bin_size_z,sort_debug,sort_nthr);
     if (opts.debug) 
       printf("\tsorted (%d threads):\t%.3g s\n",sort_nthr,timer.elapsedsec());
@@ -268,9 +275,12 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
   int ndims = ndims_from_Ns(N1,N2,N3);
   BIGINT N=N1*N2*N3;            // output array size
   int ns=opts.nspread;          // abbrev. for w, kernel width
-  int nthr = MY_OMP_GET_MAX_THREADS();  // # threads to use to spread
+  int nthr = MY_OMP_GET_MAX_THREADS();  // guess # threads to use to spread
   if (opts.nthreads>0)
-    nthr = min(nthr,opts.nthreads);     // user override up to max avail
+    nthr = opts.nthreads;       // user override, now without limit
+#ifndef _OPENMP
+  nthr = 1;                   // single-threaded lib must override user
+#endif
   if (opts.debug)
     printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,nthr);
   
@@ -390,9 +400,12 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
   int ndims = ndims_from_Ns(N1,N2,N3);
   int ns=opts.nspread;          // abbrev. for w, kernel width
   FLT ns2 = (FLT)ns/2;          // half spread width, used as stencil shift
-  int nthr = MY_OMP_GET_MAX_THREADS();   // # threads to use to interp
+  int nthr = MY_OMP_GET_MAX_THREADS();   // guess # threads to use to interp
   if (opts.nthreads>0)
-    nthr = min(nthr,opts.nthreads);      // user override up to max avail
+    nthr = opts.nthreads;       // user override, now without limit
+#ifndef _OPENMP
+  nthr = 1;                   // single-threaded lib must override user
+#endif
   if (opts.debug)
     printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,nthr);
 
@@ -1234,7 +1247,7 @@ void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
   nbins2 = isky ? N2/bin_size_y+1 : 1;
   nbins3 = iskz ? N3/bin_size_z+1 : 1;
   BIGINT nbins = nbins1*nbins2*nbins3;
-  if (nthr==0)
+  if (nthr==0)                      // should never happen in spreadinterp use
     fprintf(stderr,"[%s] nthr (%d) must be positive!\n",__func__,nthr);
   int nt = min(M,(BIGINT)nthr);     // handle case of less points than threads
   std::vector<BIGINT> brk(nt+1);    // list of start NU pt indices per thread