Skip to content

Commit 928635c

Browse files
add: test for parallel correlation
1 parent 6f8c859 commit 928635c

File tree

1 file changed

+124
-0
lines changed

1 file changed

+124
-0
lines changed

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,3 +507,127 @@ def test_cov_with_missing_values(self):
507507
result2 = df.dropna().cov()
508508
tm.assert_frame_equal(result1, expected)
509509
tm.assert_frame_equal(result2, expected)
510+
511+
def test_corr_parallel_functionality(self):
    """Check that ``corr`` agrees for ``use_parallel=False`` and ``True``."""
    # Fixed seed keeps the sampled data reproducible between runs.
    generator = np.random.default_rng(seed=42)
    size = 100

    # Draw order is kept as x, then the noise for y, then the noise for z.
    x = generator.normal(0, 1, size)
    # y and z mix x with fresh noise; the final asserts expect corr(X, Y)
    # within 0.2 of 0.8 and corr(X, Z) within 0.2 of -0.5.
    y = 0.8 * x + 0.6 * generator.normal(0, 1, size)
    z = -0.5 * x + 0.866 * generator.normal(0, 1, size)
    frame = DataFrame({"X": x, "Y": y, "Z": z})

    sequential = frame.corr(use_parallel=False)
    parallel = frame.corr(use_parallel=True)
    tm.assert_frame_equal(sequential, parallel)

    # Each result must independently look like a correlation matrix.
    for matrix in (sequential, parallel):
        assert matrix.shape == (3, 3)
        assert np.all(np.diag(matrix) == 1.0)

        # Symmetric: equal to its own transpose.
        tm.assert_frame_equal(matrix, matrix.T)

        assert (matrix >= -1.0).all().all()
        assert (matrix <= 1.0).all().all()

        assert abs(matrix.loc["X", "Y"] - 0.8) < 0.2
        assert abs(matrix.loc["X", "Z"] + 0.5) < 0.2
538+
539+
def test_corr_with_missing_data_parallel(self):
    """
    Check ``corr`` on columns containing NaN for both ``use_parallel`` settings.
    """
    nan = np.nan
    # Each column has exactly one missing entry, each in a different row.
    frame = DataFrame(
        {
            "A": [1.0, 2.0, nan, 4.0, 5.0],
            "B": [2.0, nan, 3.0, 4.0, 5.0],
            "C": [1.0, 2.0, 3.0, nan, 5.0],
        }
    )

    sequential = frame.corr(use_parallel=False)
    parallel = frame.corr(use_parallel=True)
    tm.assert_frame_equal(sequential, parallel)

    column_pairs = (("A", "B"), ("A", "C"), ("B", "C"))
    for matrix in (sequential, parallel):
        assert matrix.shape == (3, 3)
        assert np.all(np.diag(matrix) == 1.0)
        # Symmetric: equal to its own transpose.
        tm.assert_frame_equal(matrix, matrix.T)

        assert (matrix >= -1.0).all().all()
        assert (matrix <= 1.0).all().all()

        # Every off-diagonal entry must be a finite number (not NaN/inf).
        for left, right in column_pairs:
            assert np.isfinite(matrix.loc[left, right])
567+
568+
def test_corr_parallel_vs_sequential_large_data(self):
    """
    Compare both ``use_parallel`` settings of ``corr`` on a 1000 x 100 frame.
    """
    n_rows, n_columns = 1000, 100
    # Fixed seed keeps the sampled data reproducible between runs.
    generator = np.random.default_rng(seed=42)
    values = generator.normal(0, 1, (n_rows, n_columns))
    labels = [f"col_{i}" for i in range(n_columns)]
    frame = DataFrame(values, columns=labels)

    sequential = frame.corr(use_parallel=False)
    parallel = frame.corr(use_parallel=True)

    # This comparison passes explicit rtol/atol of 1e-14.
    tm.assert_frame_equal(sequential, parallel, rtol=1e-14, atol=1e-14)

    for matrix in (sequential, parallel):
        assert np.all(np.diag(matrix) == 1.0)

        # Symmetric: equal to its own transpose.
        tm.assert_frame_equal(matrix, matrix.T)

        assert (matrix >= -1.0).all().all()
        assert (matrix <= 1.0).all().all()
593+
594+
def test_corr_numerical_stability_edge_cases(self):
    """Exercise ``corr`` on very small, very large and nearly equal values."""

    def corr_both_ways(frame):
        # Run both settings, require them to match, and hand back the pair.
        sequential = frame.corr(use_parallel=False)
        parallel = frame.corr(use_parallel=True)
        tm.assert_frame_equal(sequential, parallel)
        return sequential, parallel

    # In both frames B is twice A; only the magnitude (1e-15 vs 1e15) differs.
    tiny = DataFrame({"A": [1e-15, 2e-15, 3e-15], "B": [2e-15, 4e-15, 6e-15]})
    huge = DataFrame({"A": [1e15, 2e15, 3e15], "B": [2e15, 4e15, 6e15]})
    for frame in (tiny, huge):
        for matrix in corr_both_ways(frame):
            assert np.all(np.diag(matrix) == 1.0)
            assert abs(matrix.loc["A", "B"] - 1.0) < 1e-10

    # B differs from A only around the sixteenth significant digit.
    nearly_equal = DataFrame(
        {
            "A": [0.1, 0.2, 0.3],
            "B": [0.1000000000000001, 0.2000000000000001, 0.3000000000000001],
        }
    )
    for matrix in corr_both_ways(nearly_equal):
        assert np.all(np.diag(matrix) == 1.0)
        # Looser bound than above: only require the value to lie in [0.999, 1].
        assert 0.999 <= matrix.loc["A", "B"] <= 1.0

0 commit comments

Comments
 (0)