Skip to content

Commit 3dc9eed

Browse files
add: parallelization support for pearson correlation
1 parent 592a41a commit 3dc9eed

File tree

4 files changed

+113
-28
lines changed

4 files changed

+113
-28
lines changed

meson.build

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,18 @@ py = import('python').find_installation(pure: false)
1515
tempita = files('generate_pxi.py')
1616
versioneer = files('generate_version.py')
1717

18+
# Find OpenMP dependency
19+
openmp_dep = dependency('openmp', required: false)
20+
if not openmp_dep.found()
21+
# Try to find OpenMP using compiler flags
22+
cc = meson.get_compiler('c')
23+
if cc.has_argument('-fopenmp')
24+
openmp_dep = declare_dependency(
25+
compile_args: ['-fopenmp'],
26+
link_args: ['-fopenmp'],
27+
)
28+
endif
29+
endif
1830

1931
add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'c')
2032
add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'cpp')
@@ -28,6 +40,14 @@ add_project_arguments(
2840
language: 'cpp',
2941
)
3042

43+
# Add OpenMP compile args if available
44+
if openmp_dep.found()
45+
add_project_arguments('-DHAVE_OPENMP', language: 'c')
46+
add_project_arguments('-DHAVE_OPENMP', language: 'cpp')
47+
message('OpenMP support enabled')
48+
else
49+
message('OpenMP not found - parallel code will run sequentially')
50+
endif
3151

3252
if fs.exists('_version_meson.py')
3353
py.install_sources('_version_meson.py', subdir: 'pandas')

pandas/_libs/algos.pyx

Lines changed: 69 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
cimport cython
22
from cython cimport Py_ssize_t
3+
from cython.parallel cimport (
4+
prange,
5+
)
36
from libc.math cimport (
47
fabs,
58
sqrt,
@@ -346,61 +349,101 @@ def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
346349
@cython.boundscheck(False)
347350
@cython.wraparound(False)
348351
@cython.cdivision(True)
349-
def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
352+
def nancorr(
353+
const float64_t[:, :] mat,
354+
bint cov=False,
355+
minp=None,
356+
bint use_parallel=False,
357+
):
350358
cdef:
351359
Py_ssize_t i, xi, yi, N, K
352360
int64_t minpv
353361
float64_t[:, ::1] result
354362
uint8_t[:, :] mask
355-
int64_t nobs = 0
356-
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val
363+
int64_t nobs
364+
float64_t vx, vy, dx, dy, meanx, meany
365+
float64_t prev_meanx, prev_meany
366+
float64_t ssqdmx, ssqdmy, covxy, divisor, val
357367

358368
N, K = (<object>mat).shape
359-
if minp is None:
360-
minpv = 1
361-
else:
362-
minpv = <int64_t>minp
369+
minpv = 1 if minp is None else <int64_t>minp
363370

364371
result = np.empty((K, K), dtype=np.float64)
365372
mask = np.isfinite(mat).view(np.uint8)
366373

367-
with nogil:
368-
for xi in range(K):
374+
# Welford's method for the variance-calculation
375+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
376+
377+
if use_parallel:
378+
for xi in prange(K, schedule="dynamic", nogil=True):
369379
for yi in range(xi + 1):
370-
# Welford's method for the variance-calculation
371-
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
372380
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
373381
for i in range(N):
374382
if mask[i, xi] and mask[i, yi]:
375383
vx = mat[i, xi]
376384
vy = mat[i, yi]
377-
nobs += 1
385+
nobs = nobs + 1
378386
dx = vx - meanx
379387
dy = vy - meany
380-
meanx += 1. / nobs * dx
381-
meany += 1. / nobs * dy
382-
ssqdmx += (vx - meanx) * dx
383-
ssqdmy += (vy - meany) * dy
384-
covxy += (vx - meanx) * dy
388+
meanx = meanx + dx / nobs
389+
meany = meany + dy / nobs
390+
ssqdmx = ssqdmx + (vx - meanx) * dx
391+
ssqdmy = ssqdmy + (vy - meany) * dy
392+
covxy = covxy + (vx - meanx) * dy
385393

386394
if nobs < minpv:
387-
result[xi, yi] = result[yi, xi] = NaN
395+
val = NaN
388396
else:
389-
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
390-
391-
# clip `covxy / divisor` to ensure coeff is within bounds
392-
if divisor != 0:
397+
if cov:
398+
divisor = nobs - 1.0
399+
else:
400+
divisor = sqrt(ssqdmx * ssqdmy)
401+
if divisor == 0.0:
402+
val = NaN
403+
else:
393404
val = covxy / divisor
394405
if not cov:
395406
if val > 1.0:
396407
val = 1.0
397-
elif val < -1.0:
408+
if val < -1.0:
398409
val = -1.0
399-
result[xi, yi] = result[yi, xi] = val
400-
else:
410+
411+
result[xi, yi] = result[yi, xi] = val
412+
else:
413+
# Sequential version
414+
with nogil:
415+
for xi in range(K):
416+
for yi in range(xi + 1):
417+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
418+
for i in range(N):
419+
if mask[i, xi] and mask[i, yi]:
420+
vx = mat[i, xi]
421+
vy = mat[i, yi]
422+
nobs += 1
423+
prev_meanx = meanx
424+
prev_meany = meany
425+
meanx = meanx + 1 / nobs * (vx - meanx)
426+
meany = meany + 1 / nobs * (vy - meany)
427+
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
428+
ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
429+
covxy = covxy + (vx - meanx) * (vy - prev_meany)
430+
431+
if nobs < minpv:
401432
result[xi, yi] = result[yi, xi] = NaN
433+
else:
434+
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
435+
if divisor != 0:
436+
val = covxy / divisor
437+
if not cov:
438+
if val > 1.0:
439+
val = 1.0
440+
if val < -1.0:
441+
val = -1.0
442+
result[xi, yi] = result[yi, xi] = val
443+
else:
444+
result[xi, yi] = result[yi, xi] = NaN
402445

403-
return result.base
446+
return result
404447

405448
# ----------------------------------------------------------------------
406449
# Pairwise Spearman correlation

pandas/_libs/meson.build

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
# Add OpenMP dependency detection at the top of the file
2+
openmp_dep = dependency('openmp', required: false)
3+
if not openmp_dep.found()
4+
cc = meson.get_compiler('c')
5+
if cc.has_argument('-fopenmp')
6+
openmp_dep = declare_dependency(
7+
compile_args: ['-fopenmp'],
8+
link_args: ['-fopenmp'],
9+
)
10+
endif
11+
endif
12+
113
_algos_take_helper = custom_target(
214
'algos_take_helper_pxi',
315
output: 'algos_take_helper.pxi',
@@ -70,7 +82,7 @@ libs_sources = {
7082
# numpy include dir is implicitly included
7183
'algos': {
7284
'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper],
73-
'deps': _khash_primitive_helper_dep,
85+
'deps': [_khash_primitive_helper_dep, openmp_dep],
7486
},
7587
'arrays': {'sources': ['arrays.pyx']},
7688
'groupby': {'sources': ['groupby.pyx']},

pandas/core/frame.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11264,6 +11264,7 @@ def corr(
1126411264
method: CorrelationMethod = "pearson",
1126511265
min_periods: int = 1,
1126611266
numeric_only: bool = False,
11267+
parallel: bool = False,
1126711268
) -> DataFrame:
1126811269
"""
1126911270
Compute pairwise correlation of columns, excluding NA/null values.
@@ -11292,6 +11293,11 @@ def corr(
1129211293
.. versionchanged:: 2.0.0
1129311294
The default value of ``numeric_only`` is now ``False``.
1129411295
11296+
parallel : bool, default False
11297+
Use parallel computation for Pearson correlation.
11298+
Only effective for large matrices where parallelization overhead
11299+
is justified by compute time savings.
11300+
1129511301
Returns
1129611302
-------
1129711303
DataFrame
@@ -11332,14 +11338,18 @@ def corr(
1133211338
dogs cats
1133311339
dogs 1.0 NaN
1133411340
cats NaN 1.0
11341+
11342+
>>> # Use parallel computation for large DataFrames
11343+
>>> large_df = pd.DataFrame(np.random.randn(10000, 100))
11344+
>>> corr_matrix = large_df.corr(parallel=True)
1133511345
""" # noqa: E501
1133611346
data = self._get_numeric_data() if numeric_only else self
1133711347
cols = data.columns
1133811348
idx = cols.copy()
1133911349
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
1134011350

1134111351
if method == "pearson":
11342-
correl = libalgos.nancorr(mat, minp=min_periods)
11352+
correl = libalgos.nancorr(mat, minp=min_periods, parallel=parallel)
1134311353
elif method == "spearman":
1134411354
correl = libalgos.nancorr_spearman(mat, minp=min_periods)
1134511355
elif method == "kendall" or callable(method):

0 commit comments

Comments
 (0)