@@ -348,102 +348,127 @@ def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
348
348
349
349
@cython.boundscheck(False )
350
350
@cython.wraparound(False )
351
- @cython.cdivision(True )
352
351
def nancorr(
353
352
const float64_t[:, :] mat ,
354
353
bint cov = False ,
355
354
minp = None ,
356
- bint use_parallel = False ,
355
+ bint use_parallel = False
357
356
):
358
357
cdef:
359
- Py_ssize_t i, xi, yi, N, K
360
- int64_t minpv
358
+ Py_ssize_t i, j, xi, yi, N, K
359
+ bint minpv
361
360
float64_t[:, ::1 ] result
362
- uint8_t[:, :] mask
363
- int64_t nobs
364
- float64_t vx, vy, dx, dy, meanx, meany
365
- float64_t prev_meanx, prev_meany
366
- float64_t ssqdmx, ssqdmy, covxy, divisor, val
361
+ # Initialize to None since we only use in the no missing value case
362
+ float64_t[::1 ] means = None
363
+ float64_t[::1 ] ssqds = None
364
+ ndarray[uint8_t, ndim= 2 ] mask
365
+ bint no_nans
366
+ int64_t nobs = 0
367
+ float64_t mean, ssqd, val
368
+ float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
367
369
368
370
N, K = (< object > mat).shape
369
- minpv = 1 if minp is None else < int64_t> minp
370
-
371
+ if minp is None :
372
+ minpv = 1
373
+ else :
374
+ minpv = < int > minp
371
375
result = np.empty((K, K), dtype = np.float64)
372
376
mask = np.isfinite(mat).view(np.uint8)
377
+ no_nans = mask.all()
373
378
374
- # Welford's method for the variance-calculation
375
- # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
376
-
379
+ # Computing the online means and variances is expensive - so if possible we can
380
+ # precompute these and avoid repeating the computations each time we handle
381
+ # an (xi, yi) pair
382
+ if no_nans:
383
+ means = np.empty(K, dtype = np.float64)
384
+ ssqds = np.empty(K, dtype = np.float64)
385
+ with nogil:
386
+ for j in range (K):
387
+ ssqd = mean = 0
388
+ for i in range (N):
389
+ val = mat[i, j]
390
+ dx = val - mean
391
+ mean += 1 / (i + 1 ) * dx
392
+ ssqd += (val - mean) * dx
393
+ means[j] = mean
394
+ ssqds[j] = ssqd
395
+
396
+ # ONLY CHANGE: Add parallel option to the main correlation loop
377
397
if use_parallel:
378
398
for xi in prange(K, schedule = " dynamic" , nogil = True ):
379
399
for yi in range (xi + 1 ):
380
- nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
381
- for i in range (N) :
382
- if mask[i, xi] and mask[i, yi] :
400
+ covxy = 0
401
+ if no_nans :
402
+ for i in range (N) :
383
403
vx = mat[i, xi]
384
404
vy = mat[i, yi]
385
- nobs = nobs + 1
386
- dx = vx - meanx
387
- dy = vy - meany
388
- meanx = meanx + dx / nobs
389
- meany = meany + dy / nobs
390
- ssqdmx = ssqdmx + (vx - meanx) * dx
391
- ssqdmy = ssqdmy + (vy - meany) * dy
392
- covxy = covxy + (vx - meanx) * dy
393
-
405
+ covxy += (vx - means[xi]) * (vy - means[yi])
406
+ ssqdmx = ssqds[xi]
407
+ ssqdmy = ssqds[yi]
408
+ nobs = N
409
+ else :
410
+ nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
411
+ for i in range (N):
412
+ # Welford's method for the variance-calculation
413
+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
414
+ if mask[i, xi] and mask[i, yi]:
415
+ vx = mat[i, xi]
416
+ vy = mat[i, yi]
417
+ nobs = nobs + 1
418
+ dx = vx - meanx
419
+ dy = vy - meany
420
+ meanx = meanx + (1 / nobs * dx)
421
+ meany = meany + (1 / nobs * dy)
422
+ ssqdmx = ssqdmx + ((vx - meanx) * dx)
423
+ ssqdmy = ssqdmy + ((vy - meany) * dy)
424
+ covxy = covxy + ((vx - meanx) * dy)
394
425
if nobs < minpv:
395
- val = NaN
426
+ result[xi, yi] = result[yi, xi] = NaN
396
427
else :
397
- if cov:
398
- divisor = nobs - 1.0
428
+ divisor = (nobs - 1.0 ) if cov else sqrt(ssqdmx * ssqdmy)
429
+ if divisor != 0 :
430
+ result[xi, yi] = result[yi, xi] = covxy / divisor
399
431
else :
400
- divisor = sqrt(ssqdmx * ssqdmy)
401
- if divisor == 0.0 :
402
- val = NaN
403
- else :
404
- val = covxy / divisor
405
- if not cov:
406
- if val > 1.0 :
407
- val = 1.0
408
- if val < - 1.0 :
409
- val = - 1.0
410
-
411
- result[xi, yi] = result[yi, xi] = val
432
+ result[xi, yi] = result[yi, xi] = NaN
412
433
else :
413
- # Sequential version
414
434
with nogil:
415
435
for xi in range (K):
416
436
for yi in range (xi + 1 ):
417
- nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
418
- for i in range (N) :
419
- if mask[i, xi] and mask[i, yi] :
437
+ covxy = 0
438
+ if no_nans :
439
+ for i in range (N) :
420
440
vx = mat[i, xi]
421
441
vy = mat[i, yi]
422
- nobs += 1
423
- prev_meanx = meanx
424
- prev_meany = meany
425
- meanx = meanx + 1 / nobs * (vx - meanx)
426
- meany = meany + 1 / nobs * (vy - meany)
427
- ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
428
- ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
429
- covxy = covxy + (vx - meanx) * (vy - prev_meany)
430
-
442
+ covxy += (vx - means[xi]) * (vy - means[yi])
443
+ ssqdmx = ssqds[xi]
444
+ ssqdmy = ssqds[yi]
445
+ nobs = N
446
+ else :
447
+ nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
448
+ for i in range (N):
449
+ # Welford's method for the variance-calculation
450
+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
451
+ if mask[i, xi] and mask[i, yi]:
452
+ vx = mat[i, xi]
453
+ vy = mat[i, yi]
454
+ nobs += 1
455
+ dx = vx - meanx
456
+ dy = vy - meany
457
+ meanx += 1 / nobs * dx
458
+ meany += 1 / nobs * dy
459
+ ssqdmx += (vx - meanx) * dx
460
+ ssqdmy += (vy - meany) * dy
461
+ covxy += (vx - meanx) * dy
431
462
if nobs < minpv:
432
463
result[xi, yi] = result[yi, xi] = NaN
433
464
else :
434
465
divisor = (nobs - 1.0 ) if cov else sqrt(ssqdmx * ssqdmy)
435
466
if divisor != 0 :
436
- val = covxy / divisor
437
- if not cov:
438
- if val > 1.0 :
439
- val = 1.0
440
- if val < - 1.0 :
441
- val = - 1.0
442
- result[xi, yi] = result[yi, xi] = val
467
+ result[xi, yi] = result[yi, xi] = covxy / divisor
443
468
else :
444
469
result[xi, yi] = result[yi, xi] = NaN
445
470
446
- return result
471
+ return result.base
447
472
448
473
# ----------------------------------------------------------------------
449
474
# Pairwise Spearman correlation
0 commit comments