update: docs for parallel correlation

gangula-karthik · gangula-karthik · commit 5af65e6d42fb · 2025-07-06T03:53:34.000Z
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -56,6 +56,7 @@ Other enhancements
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
 - :func:`set_option` now accepts a dictionary of options, simplifying configuration of multiple settings at once (:issue:`61093`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
+- :meth:`DataFrame.corr` now accepts a ``use_parallel`` parameter for parallel computation of Pearson correlations, potentially improving performance on large datasets (:issue:`TBD`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
@@ -641,6 +642,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
 - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
+- Performance improvement in :meth:`DataFrame.corr` when ``use_parallel=True`` is used for computing Pearson correlations on large datasets (:issue:`TBD`)
 - Performance improvement in :meth:`DataFrame.to_csv` when ``index=False`` (:issue:`59312`)
 - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
 - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
@@ -43,6 +43,7 @@ def nancorr(
     mat: npt.NDArray[np.float64],  # const float64_t[:, :]
     cov: bool = ...,
     minp: int | None = ...,
+    use_parallel: bool = ...,
 ) -> npt.NDArray[np.float64]: ...  # ndarray[float64_t, ndim=2]
 def nancorr_spearman(
     mat: npt.NDArray[np.float64],  # ndarray[float64_t, ndim=2]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -11269,6 +11269,10 @@ def corr(
         """
         Compute pairwise correlation of columns, excluding NA/null values.
 
+        This function computes the correlation matrix between all pairs of columns
+        in the DataFrame, handling missing values by excluding them from the
+        calculation on a pairwise basis.
+
         Parameters
         ----------
         method : {'pearson', 'kendall', 'spearman'} or callable
@@ -11294,9 +11298,19 @@ def corr(
                 The default value of ``numeric_only`` is now ``False``.
 
         use_parallel : bool, default False
-            Use parallel computation for Pearson correlation.
-            Only effective for large matrices where parallelization overhead
-            is justified by compute time savings.
+            Use parallel computation for Pearson correlation to potentially
+            improve performance on large datasets. This parameter is only
+            effective when ``method='pearson'`` and is ignored for other
+            correlation methods.
+
+            When ``True``, the computation will utilize multiple CPU cores
+            for calculating pairwise correlations. This can provide significant
+            performance improvements for large DataFrames (typically with
+            hundreds of columns or more) but may introduce overhead for
+            smaller datasets. The optimal threshold depends on system
+            specifications and data characteristics.
+
+            .. versionadded:: 3.0.0
 
         Returns
         -------
@@ -11317,6 +11331,17 @@ def corr(
         * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
         * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
 
+        **Parallel Computation:**
+
+        The ``use_parallel`` parameter can significantly improve performance for large
+        DataFrames by distributing the correlation computation across multiple CPU cores.
+        However, keep the following in mind:
+
+        - It only affects Pearson correlation (``method='pearson'``).
+        - Performance gains are most noticeable for DataFrames with many columns.
+        - Small datasets may see negligible improvement or even slight overhead.
+        - The optimal threshold depends on system specifications and data characteristics.
+
         Examples
         --------
         >>> def histogram_intersection(a, b):
@@ -11340,8 +11365,8 @@ def corr(
         cats   NaN   1.0
 
         >>> # Use parallel computation for large DataFrames
-        >>> large_df = pd.DataFrame(np.random.randn(10000, 100))
-        >>> corr_matrix = large_df.corr(use_parallel=True)
+        >>> large_df = pd.DataFrame(np.random.randn(1000, 50))
+        >>> corr_matrix = large_df.corr(use_parallel=True)  # doctest: +SKIP
         """  # noqa: E501
         data = self._get_numeric_data() if numeric_only else self
         cols = data.columns