Skip to content

Commit 31b530b

Browse files
update: clean up code
1 parent 66ec017 commit 31b530b

File tree

1 file changed

+87
-62
lines changed

1 file changed

+87
-62
lines changed

pandas/_libs/algos.pyx

Lines changed: 87 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -348,102 +348,127 @@ def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
348348

349349
@cython.boundscheck(False)
350350
@cython.wraparound(False)
351-
@cython.cdivision(True)
352351
def nancorr(
353352
const float64_t[:, :] mat,
354353
bint cov=False,
355354
minp=None,
356-
bint use_parallel=False,
355+
bint use_parallel=False
357356
):
358357
cdef:
359-
Py_ssize_t i, xi, yi, N, K
360-
int64_t minpv
358+
Py_ssize_t i, j, xi, yi, N, K
359+
bint minpv
361360
float64_t[:, ::1] result
362-
uint8_t[:, :] mask
363-
int64_t nobs
364-
float64_t vx, vy, dx, dy, meanx, meany
365-
float64_t prev_meanx, prev_meany
366-
float64_t ssqdmx, ssqdmy, covxy, divisor, val
361+
# Initialize to None since we only use in the no missing value case
362+
float64_t[::1] means = None
363+
float64_t[::1] ssqds = None
364+
ndarray[uint8_t, ndim=2] mask
365+
bint no_nans
366+
int64_t nobs = 0
367+
float64_t mean, ssqd, val
368+
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
367369

368370
N, K = (<object>mat).shape
369-
minpv = 1 if minp is None else <int64_t>minp
370-
371+
if minp is None:
372+
minpv = 1
373+
else:
374+
minpv = <int>minp
371375
result = np.empty((K, K), dtype=np.float64)
372376
mask = np.isfinite(mat).view(np.uint8)
377+
no_nans = mask.all()
373378

374-
# Welford's method for the variance-calculation
375-
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
376-
379+
# Computing the online means and variances is expensive - so if possible we can
380+
# precompute these and avoid repeating the computations each time we handle
381+
# an (xi, yi) pair
382+
if no_nans:
383+
means = np.empty(K, dtype=np.float64)
384+
ssqds = np.empty(K, dtype=np.float64)
385+
with nogil:
386+
for j in range(K):
387+
ssqd = mean = 0
388+
for i in range(N):
389+
val = mat[i, j]
390+
dx = val - mean
391+
mean += 1 / (i + 1) * dx
392+
ssqd += (val - mean) * dx
393+
means[j] = mean
394+
ssqds[j] = ssqd
395+
396+
# ONLY CHANGE: Add parallel option to the main correlation loop
377397
if use_parallel:
378398
for xi in prange(K, schedule="dynamic", nogil=True):
379399
for yi in range(xi + 1):
380-
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
381-
for i in range(N):
382-
if mask[i, xi] and mask[i, yi]:
400+
covxy = 0
401+
if no_nans:
402+
for i in range(N):
383403
vx = mat[i, xi]
384404
vy = mat[i, yi]
385-
nobs = nobs + 1
386-
dx = vx - meanx
387-
dy = vy - meany
388-
meanx = meanx + dx / nobs
389-
meany = meany + dy / nobs
390-
ssqdmx = ssqdmx + (vx - meanx) * dx
391-
ssqdmy = ssqdmy + (vy - meany) * dy
392-
covxy = covxy + (vx - meanx) * dy
393-
405+
covxy += (vx - means[xi]) * (vy - means[yi])
406+
ssqdmx = ssqds[xi]
407+
ssqdmy = ssqds[yi]
408+
nobs = N
409+
else:
410+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
411+
for i in range(N):
412+
# Welford's method for the variance-calculation
413+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
414+
if mask[i, xi] and mask[i, yi]:
415+
vx = mat[i, xi]
416+
vy = mat[i, yi]
417+
nobs = nobs + 1
418+
dx = vx - meanx
419+
dy = vy - meany
420+
meanx = meanx + (1 / nobs * dx)
421+
meany = meany + (1 / nobs * dy)
422+
ssqdmx = ssqdmx + ((vx - meanx) * dx)
423+
ssqdmy = ssqdmy + ((vy - meany) * dy)
424+
covxy = covxy + ((vx - meanx) * dy)
394425
if nobs < minpv:
395-
val = NaN
426+
result[xi, yi] = result[yi, xi] = NaN
396427
else:
397-
if cov:
398-
divisor = nobs - 1.0
428+
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
429+
if divisor != 0:
430+
result[xi, yi] = result[yi, xi] = covxy / divisor
399431
else:
400-
divisor = sqrt(ssqdmx * ssqdmy)
401-
if divisor == 0.0:
402-
val = NaN
403-
else:
404-
val = covxy / divisor
405-
if not cov:
406-
if val > 1.0:
407-
val = 1.0
408-
if val < -1.0:
409-
val = -1.0
410-
411-
result[xi, yi] = result[yi, xi] = val
432+
result[xi, yi] = result[yi, xi] = NaN
412433
else:
413-
# Sequential version
414434
with nogil:
415435
for xi in range(K):
416436
for yi in range(xi + 1):
417-
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
418-
for i in range(N):
419-
if mask[i, xi] and mask[i, yi]:
437+
covxy = 0
438+
if no_nans:
439+
for i in range(N):
420440
vx = mat[i, xi]
421441
vy = mat[i, yi]
422-
nobs += 1
423-
prev_meanx = meanx
424-
prev_meany = meany
425-
meanx = meanx + 1 / nobs * (vx - meanx)
426-
meany = meany + 1 / nobs * (vy - meany)
427-
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
428-
ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
429-
covxy = covxy + (vx - meanx) * (vy - prev_meany)
430-
442+
covxy += (vx - means[xi]) * (vy - means[yi])
443+
ssqdmx = ssqds[xi]
444+
ssqdmy = ssqds[yi]
445+
nobs = N
446+
else:
447+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
448+
for i in range(N):
449+
# Welford's method for the variance-calculation
450+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
451+
if mask[i, xi] and mask[i, yi]:
452+
vx = mat[i, xi]
453+
vy = mat[i, yi]
454+
nobs += 1
455+
dx = vx - meanx
456+
dy = vy - meany
457+
meanx += 1 / nobs * dx
458+
meany += 1 / nobs * dy
459+
ssqdmx += (vx - meanx) * dx
460+
ssqdmy += (vy - meany) * dy
461+
covxy += (vx - meanx) * dy
431462
if nobs < minpv:
432463
result[xi, yi] = result[yi, xi] = NaN
433464
else:
434465
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
435466
if divisor != 0:
436-
val = covxy / divisor
437-
if not cov:
438-
if val > 1.0:
439-
val = 1.0
440-
if val < -1.0:
441-
val = -1.0
442-
result[xi, yi] = result[yi, xi] = val
467+
result[xi, yi] = result[yi, xi] = covxy / divisor
443468
else:
444469
result[xi, yi] = result[yi, xi] = NaN
445470

446-
return result
471+
return result.base
447472

448473
# ----------------------------------------------------------------------
449474
# Pairwise Spearman correlation

0 commit comments

Comments
 (0)