Skip to content

Commit f6f9873

Browse files
author
Marco Barbone
committed
Merge branch 'master' into optimising-foldrescale
2 parents 4af801e + 79de084 commit f6f9873

File tree

6 files changed

+53
-24
lines changed

6 files changed

+53
-24
lines changed

.github/workflows/cmake_ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ jobs:
3838
steps:
3939
- uses: actions/checkout@v4
4040

41+
- name: Unlink gcc
42+
if: runner.os == 'macOS'
43+
run: |
44+
brew unlink gcc
45+
4146
- name: Setup Cpp
4247
uses: aminya/setup-cpp@v1
4348
with:

.github/workflows/python_wheel.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ jobs:
4646

4747
- name: Install gcc and fftw
4848
run: |
49-
brew install gcc fftw
49+
brew unlink gcc
50+
brew install gcc@13 fftw
5051
cp make.inc.macosx_gcc-12 make.inc
5152
echo "FC=gfortran-13" >> make.inc
5253
echo "CC=gcc-13" >> make.inc

CHANGELOG

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
List of features / changes made / release notes, in reverse chronological order.
22
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
33

4-
* CPU plan stage prevents now caps # threads at omp_get_max_threads (being 1
5-
for single-thread build); warns if this cap was activated (PR 431)
4+
* CPU plan stage allows any # threads, warns if > omp_get_max_threads(); or
5+
if single-threaded, fixes nthr=1 and warns if opts.nthreads>1 was requested.
6+
Sort now respects spread_opts.sort_threads not nthreads. Supersedes PR 431.
67
* new docs troubleshooting accuracy limitations due to condition number of the
78
NUFFT problem.
89
* new sanity check on nj and nk (<0 or too big); new err code, tester, doc.

docs/opts.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,9 @@ Diagnostic options
124124
Algorithm performance options
125125
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
126126

127-
**nthreads**: Number of threads to use. This is capped at the number of available threads (eg, to prevent misuse of a single-threaded code). It then sets the number of threads FINUFFT will use in FFTW, bin-sorting, and spreading/interpolation steps. This number of threads also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here <c>`). Setting ``nthreads=0`` uses all threads available, usually recommended. However, for repeated small problems it can be advantageous to use a small number, even as small as 1.
127+
**nthreads**: (Ignored in single-threaded library builds.) If positive, sets the number of threads to use throughout the (multi-threaded build of the) library, or if ``0``, uses the maximum number of threads available according to OpenMP. In the positive case, no cap is placed on this number. This number of threads is passed to bin-sorting (which may choose to use fewer threads), but is adhered to in FFTW and spreading/interpolation steps. This number of threads (or 1 for single-threaded builds) also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here <c>`).
128+
For medium-to-large transforms, ``0`` is usually recommended.
129+
However, for (repeated) small transforms it can be advantageous to use a small number, even as small as ``1``.
128130

129131
**fftw**: FFTW planner flags. This number is simply passed to FFTW's planner;
130132
the flags are documented `here <http://www.fftw.org/fftw3_doc/Planner-Flags.html#Planner-Flags>`_.

src/finufft.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -604,15 +604,22 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag,
604604
p->fftSign = (iflag>=0) ? 1 : -1; // clean up flag input
605605

606606
// choose overall # threads...
607-
int maxnthr = MY_OMP_GET_MAX_THREADS();
608-
int nthr = maxnthr; // use as many as OMP gives us
607+
#ifdef _OPENMP
608+
int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
609+
int nthr = ompmaxnthr; // default: use as many as OMP gives us
610+
// (the above could be set, or suggested set, to 1 for small enough problems...)
609611
if (p->opts.nthreads>0) {
610-
nthr = min(maxnthr,p->opts.nthreads); // user override up to max avail
611-
if (p->opts.nthreads > maxnthr) // if no OMP, maxnthr=1
612-
fprintf(stderr,"%s warning: user requested %d threads, but only %d threads available; enforcing nthreads=%d.\n",__func__,p->opts.nthreads,maxnthr,nthr);
612+
nthr = p->opts.nthreads; // user override, now without limit
613+
if (p->opts.showwarn && (nthr > ompmaxnthr))
614+
fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr);
613615
}
616+
#else
617+
int nthr = 1; // always 1 thread (avoid segfault)
618+
if (p->opts.nthreads>1)
619+
fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads);
620+
#endif
614621
p->opts.nthreads = nthr; // store actual # thr planned for
615-
// (this sets all downstream spread/interp, 1dkernel, and FFT thread counts)
622+
// (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
616623

617624
// choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
618625
if (p->opts.maxbatchsize==0) { // logic to auto-set best batchsize

src/spreadinterp.cpp

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
196196
ordering for the x-coords of NU pts, etc.
197197
returned value - whether a sort was done (1) or not (0).
198198
199-
Barnett 2017; split out by Melody Shih, Jun 2018.
199+
Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024.
200200
*/
201201
{
202202
CNTime timer;
@@ -211,19 +211,26 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
211211

212212
timer.start(); // if needed, sort all the NU pts...
213213
int did_sort=0;
214-
int maxnthr = MY_OMP_GET_MAX_THREADS();
215-
if (opts.nthreads>0) // user override up to max avail
216-
maxnthr = min(maxnthr,opts.nthreads);
217-
214+
int maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default
215+
if (opts.nthreads>0)
216+
maxnthr = opts.nthreads; // user nthreads overrides, without limit
217+
if (opts.sort_threads>0)
218+
maxnthr = opts.sort_threads; // high-priority override, also no limit
219+
// At this point: maxnthr = the max threads sorting could use
220+
// (we don't print warning here, since: no showwarn in spread_opts, and finufft
221+
// already warned about it. spreadinterp-only advanced users will miss a warning)
218222
if (opts.sort==1 || (opts.sort==2 && better_to_sort)) {
219223
// store a good permutation ordering of all NU pts (dim=1,2 or 3)
220224
int sort_debug = (opts.debug>=2); // show timing output?
221-
int sort_nthr = opts.sort_threads; // choose # threads for sorting
222-
if (sort_nthr==0) // use auto choice: when N>>M, one thread is better!
223-
sort_nthr = (10*M>N) ? maxnthr : 1; // heuristic
225+
int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort
226+
#ifndef _OPENMP
227+
sort_nthr = 1; // if single-threaded lib, override user
228+
#endif
229+
if (sort_nthr==0) // multithreaded auto choice: when N>>M, one thread is better!
230+
sort_nthr = (10*M>N) ? maxnthr : 1; // heuristic
224231
if (sort_nthr==1)
225232
bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,bin_size_x,bin_size_y,bin_size_z,sort_debug);
226-
else // sort_nthr>1, sets # threads
233+
else // sort_nthr>1, user fixes # threads (>=2)
227234
bin_sort_multithread(sort_indices,M,kx,ky,kz,N1,N2,N3,bin_size_x,bin_size_y,bin_size_z,sort_debug,sort_nthr);
228235
if (opts.debug)
229236
printf("\tsorted (%d threads):\t%.3g s\n",sort_nthr,timer.elapsedsec());
@@ -268,9 +275,12 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
268275
int ndims = ndims_from_Ns(N1,N2,N3);
269276
BIGINT N=N1*N2*N3; // output array size
270277
int ns=opts.nspread; // abbrev. for w, kernel width
271-
int nthr = MY_OMP_GET_MAX_THREADS(); // # threads to use to spread
278+
int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread
272279
if (opts.nthreads>0)
273-
nthr = min(nthr,opts.nthreads); // user override up to max avail
280+
nthr = opts.nthreads; // user override, now without limit
281+
#ifndef _OPENMP
282+
nthr = 1; // single-threaded lib must override user
283+
#endif
274284
if (opts.debug)
275285
printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,nthr);
276286

@@ -390,9 +400,12 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
390400
int ndims = ndims_from_Ns(N1,N2,N3);
391401
int ns=opts.nspread; // abbrev. for w, kernel width
392402
FLT ns2 = (FLT)ns/2; // half spread width, used as stencil shift
393-
int nthr = MY_OMP_GET_MAX_THREADS(); // # threads to use to interp
403+
int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp
394404
if (opts.nthreads>0)
395-
nthr = min(nthr,opts.nthreads); // user override up to max avail
405+
nthr = opts.nthreads; // user override, now without limit
406+
#ifndef _OPENMP
407+
nthr = 1; // single-threaded lib must override user
408+
#endif
396409
if (opts.debug)
397410
printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,nthr);
398411

@@ -1234,7 +1247,7 @@ void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
12341247
nbins2 = isky ? N2/bin_size_y+1 : 1;
12351248
nbins3 = iskz ? N3/bin_size_z+1 : 1;
12361249
BIGINT nbins = nbins1*nbins2*nbins3;
1237-
if (nthr==0)
1250+
if (nthr==0) // should never happen in spreadinterp use
12381251
fprintf(stderr,"[%s] nthr (%d) must be positive!\n",__func__,nthr);
12391252
int nt = min(M,(BIGINT)nthr); // handle case of less points than threads
12401253
std::vector<BIGINT> brk(nt+1); // list of start NU pt indices per thread

0 commit comments

Comments
 (0)