@@ -507,3 +507,127 @@ def test_cov_with_missing_values(self):
507
507
result2 = df .dropna ().cov ()
508
508
tm .assert_frame_equal (result1 , expected )
509
509
tm .assert_frame_equal (result2 , expected )
510
+
511
def test_corr_parallel_functionality(self):
    """
    Check that ``corr`` agrees with and without ``use_parallel``.

    Both results must also look like a correlation matrix: 3x3, ones on
    the diagonal, symmetric, bounded by [-1, 1], and close to the
    correlations the synthetic columns were built to have.
    """
    n_samples = 100
    rng = np.random.default_rng(seed=42)

    # Draw order is part of the fixture: x first, then the noise term for
    # y, then the noise term for z.
    # The loadings are chosen so that 0.8**2 + 0.6**2 == 1 and
    # 0.5**2 + 0.866**2 ~= 1, giving target correlations of 0.8 (X, Y)
    # and -0.5 (X, Z).
    x = rng.normal(0, 1, n_samples)
    y = 0.8 * x + 0.6 * rng.normal(0, 1, n_samples)
    z = -0.5 * x + 0.866 * rng.normal(0, 1, n_samples)
    df = DataFrame({"X": x, "Y": y, "Z": z})

    sequential = df.corr(use_parallel=False)
    parallel = df.corr(use_parallel=True)

    tm.assert_frame_equal(sequential, parallel)

    for corr_matrix in (sequential, parallel):
        assert corr_matrix.shape == (3, 3)

        # A column is perfectly correlated with itself.
        diagonal = np.diag(corr_matrix)
        assert (diagonal == 1.0).all()

        # corr(a, b) == corr(b, a)
        tm.assert_frame_equal(corr_matrix, corr_matrix.T)

        above_lower_bound = corr_matrix >= -1.0
        below_upper_bound = corr_matrix <= 1.0
        assert above_lower_bound.all().all()
        assert below_upper_bound.all().all()

        # Sample estimates only need to land near the target values.
        xy = corr_matrix.loc["X", "Y"]
        xz = corr_matrix.loc["X", "Z"]
        assert abs(xy - 0.8) < 0.2
        assert abs(xz + 0.5) < 0.2
538
+
539
def test_corr_with_missing_data_parallel(self):
    """
    Check that ``corr`` agrees with and without ``use_parallel`` when the
    frame contains NaN values.
    """
    # Each column has its single NaN in a different row.
    nan = np.nan
    df = DataFrame(
        {
            "A": [1.0, 2.0, nan, 4.0, 5.0],
            "B": [2.0, nan, 3.0, 4.0, 5.0],
            "C": [1.0, 2.0, 3.0, nan, 5.0],
        }
    )

    sequential = df.corr(use_parallel=False)
    parallel = df.corr(use_parallel=True)

    tm.assert_frame_equal(sequential, parallel)

    off_diagonal_pairs = (("A", "B"), ("A", "C"), ("B", "C"))

    for corr_matrix in (sequential, parallel):
        assert corr_matrix.shape == (3, 3)
        assert (np.diag(corr_matrix) == 1.0).all()
        tm.assert_frame_equal(corr_matrix, corr_matrix.T)

        assert (corr_matrix >= -1.0).all().all()
        assert (corr_matrix <= 1.0).all().all()

        # Missing values must not leak NaN/inf into any pairwise result.
        for row_label, col_label in off_diagonal_pairs:
            assert np.isfinite(corr_matrix.loc[row_label, col_label])
567
+
568
def test_corr_parallel_vs_sequential_large_data(self):
    """
    Check that ``corr`` agrees with and without ``use_parallel`` on a
    larger frame (1000 rows x 100 columns of standard-normal draws).

    The two results are compared with a tight tolerance (1e-14), and each
    one is checked for the structural properties of a correlation matrix.
    """
    rng = np.random.default_rng(seed=42)
    n_samples = 1000
    n_cols = 100

    data = rng.normal(0, 1, (n_samples, n_cols))
    # Labels are "col_0" ... "col_99" with no padding or trailing whitespace.
    df = DataFrame(data, columns=[f"col_{i}" for i in range(n_cols)])

    result_sequential = df.corr(use_parallel=False)
    result_parallel = df.corr(use_parallel=True)

    tm.assert_frame_equal(
        result_sequential, result_parallel, rtol=1e-14, atol=1e-14
    )

    for result in [result_sequential, result_parallel]:
        # Same shape check the sibling tests perform on their small frames.
        assert result.shape == (n_cols, n_cols)

        assert (np.diag(result) == 1.0).all()

        tm.assert_frame_equal(result, result.T)

        assert (result >= -1.0).all().all()
        assert (result <= 1.0).all().all()
593
+
594
def test_corr_numerical_stability_edge_cases(self):
    """
    Check ``corr`` with and without ``use_parallel`` on inputs that stress
    floating-point precision: very small magnitudes, very large
    magnitudes, and two columns that differ only in the last digits.
    """
    # In both frames B is exactly 2 * A, so the correlation should be 1.
    perfectly_correlated = (
        {"A": [1e-15, 2e-15, 3e-15], "B": [2e-15, 4e-15, 6e-15]},
        {"A": [1e15, 2e15, 3e15], "B": [2e15, 4e15, 6e15]},
    )

    for columns in perfectly_correlated:
        frame = DataFrame(columns)

        sequential = frame.corr(use_parallel=False)
        parallel = frame.corr(use_parallel=True)

        tm.assert_frame_equal(sequential, parallel)

        for corr_matrix in (sequential, parallel):
            assert (np.diag(corr_matrix) == 1.0).all()
            assert abs(corr_matrix.loc["A", "B"] - 1.0) < 1e-10

    # Columns that are nearly, but not bit-for-bit, identical.
    nearly_equal = DataFrame(
        {
            "A": [0.1, 0.2, 0.3],
            "B": [0.1000000000000001, 0.2000000000000001, 0.3000000000000001],
        }
    )

    sequential = nearly_equal.corr(use_parallel=False)
    parallel = nearly_equal.corr(use_parallel=True)

    tm.assert_frame_equal(sequential, parallel)

    for corr_matrix in (sequential, parallel):
        assert (np.diag(corr_matrix) == 1.0).all()
        # Close to 1, and never above it.
        assert 0.999 <= corr_matrix.loc["A", "B"] <= 1.0
0 commit comments