New feature: issue #61691

ishaan1234 · ishaan1234 · commit 63cc559a3116 · 2025-06-25T23:51:29.000+05:30
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
@@ -0,0 +1,11 @@
+{
+  "permissions": {
+    "allow": [
+      "WebFetch(domain:github.com)",
+      "Bash(rg:*)",
+      "Bash(python:*)",
+      "Bash(find:*)"
+    ],
+    "deny": []
+  }
+}
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -93,6 +93,8 @@
     to_numeric,
     to_datetime,
     to_timedelta,
+    # diagnostics
+    check,
     # misc
     Flags,
     Grouper,
@@ -281,6 +283,7 @@
     "array",
     "arrays",
     "bdate_range",
+    "check",
     "concat",
     "crosstab",
     "cut",
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -69,6 +69,7 @@
 from pandas.core.indexes.timedeltas import timedelta_range
 from pandas.core.indexing import IndexSlice
 from pandas.core.series import Series
+from pandas.core.tools.check import check
 from pandas.core.tools.datetimes import to_datetime
 from pandas.core.tools.numeric import to_numeric
 from pandas.core.tools.timedeltas import to_timedelta
@@ -121,6 +122,7 @@
     "UInt64Dtype",
     "array",
     "bdate_range",
+    "check",
     "date_range",
     "factorize",
     "interval_range",
diff --git a/pandas/core/tools/check.py b/pandas/core/tools/check.py
@@ -0,0 +1,70 @@
+"""
+Utility function for quick dataset diagnostics.
+"""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pandas.util._decorators import set_module
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+@set_module("pandas")
+def check(df: DataFrame, round_digits: int = 2) -> DataFrame:
+    """
+    Provide a column-wise summary of DataFrame structure for quick diagnostics.
+
+    This function combines several common exploratory data analysis operations
+    into a single diagnostic summary, including unique values, non-null counts,
+    missing value counts, and missing percentages.
+
+    Parameters
+    ----------
+    df : DataFrame
+        The DataFrame to analyze.
+    round_digits : int, default 2
+        Number of decimal places to round the missing percentage to.
+
+    Returns
+    -------
+    DataFrame
+        One row per column of ``df``, with columns:
+
+        - unique: Number of unique values per column
+        - non_null: Number of non-null values per column
+        - missing: Number of missing values per column
+        - missing_pct: Percentage of missing values per column
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({
+    ...     "A": [1, 2, None, 4],
+    ...     "B": ["x", "y", "x", None],
+    ...     "C": [1.0, 2.0, 3.0, 4.0],
+    ... })
+    >>> pd.check(df)
+       unique  non_null  missing  missing_pct
+    A       3         3        1         25.0
+    B       2         3        1         25.0
+    C       4         4        0          0.0
+    """
+    from pandas import DataFrame
+
+    # ``nunique`` stays the first call on ``df`` so that non-DataFrame input
+    # fails with AttributeError before any other method is attempted.
+    unique_counts = df.nunique()
+    total_rows = len(df)
+    non_null_counts = df.count()
+    # Every cell is either null or non-null, so the missing count follows from
+    # the non-null count without a second full scan via ``isnull().sum()``.
+    missing_counts = total_rows - non_null_counts
+    missing_pct = (missing_counts / total_rows * 100).round(round_digits)
+    return DataFrame(
+        {
+            "unique": unique_counts,
+            "non_null": non_null_counts,
+            "missing": missing_counts,
+            "missing_pct": missing_pct,
+        }
+    )
diff --git a/pandas/tests/tools/test_check.py b/pandas/tests/tools/test_check.py
@@ -0,0 +1,224 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    check,
+)
+import pandas._testing as tm
+
+
+class TestCheck:
+    def test_basic_functionality(self):
+        """Summarize a mixed frame with one missing value in two of three columns."""
+        df = DataFrame({
+            'A': [1, 2, None, 4],
+            'B': ['x', 'y', 'x', None],
+            'C': [1.0, 2.0, 3.0, 4.0]
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [3, 2, 4],
+            'non_null': [3, 3, 4],
+            'missing': [1, 1, 0],
+            'missing_pct': [25.00, 25.00, 0.00]
+        }, index=['A', 'B', 'C'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_dataframe(self):
+        """check(DataFrame()) is expected to be empty with int64/float64 columns."""
+        df = DataFrame()
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [],
+            'non_null': [],
+            'missing': [],
+            'missing_pct': []
+        }).astype('int64')
+        expected['missing_pct'] = expected['missing_pct'].astype('float64')
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_all_null_column(self):
+        """An all-None column reports 0 unique, 0 non-null and 100% missing."""
+        df = DataFrame({
+            'A': [1, 2, 3],
+            'B': [None, None, None],
+            'C': ['x', 'y', 'z']
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [3, 0, 3],
+            'non_null': [3, 0, 3],
+            'missing': [0, 3, 0],
+            'missing_pct': [0.00, 100.00, 0.00]
+        }, index=['A', 'B', 'C'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_no_missing_values(self):
+        """Fully populated columns report zero missing values and 0% missing."""
+        df = DataFrame({
+            'A': [1, 2, 3, 4],
+            'B': ['w', 'x', 'y', 'z'],
+            'C': [1.1, 2.2, 3.3, 4.4]
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [4, 4, 4],
+            'non_null': [4, 4, 4],
+            'missing': [0, 0, 0],
+            'missing_pct': [0.00, 0.00, 0.00]
+        }, index=['A', 'B', 'C'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_round_digits_parameter(self):
+        """round_digits sets the rounding of missing_pct (default 2, 0 and 4)."""
+        df = DataFrame({
+            'A': [1, None, None],  # 2/3 = 66.666... % missing
+            'B': [1, 2, 3]
+        })
+        
+        # Test with default round_digits=2
+        result_default = check(df)
+        expected_default = DataFrame({
+            'unique': [1, 3],
+            'non_null': [1, 3],
+            'missing': [2, 0],
+            'missing_pct': [66.67, 0.00]
+        }, index=['A', 'B'])
+        tm.assert_frame_equal(result_default, expected_default)
+        
+        # Test with round_digits=0
+        result_zero = check(df, round_digits=0)
+        expected_zero = DataFrame({
+            'unique': [1, 3],
+            'non_null': [1, 3],
+            'missing': [2, 0],
+            'missing_pct': [67.0, 0.0]
+        }, index=['A', 'B'])
+        tm.assert_frame_equal(result_zero, expected_zero)
+        
+        # Test with round_digits=4
+        result_four = check(df, round_digits=4)
+        expected_four = DataFrame({
+            'unique': [1, 3],
+            'non_null': [1, 3],
+            'missing': [2, 0],
+            'missing_pct': [66.6667, 0.0000]
+        }, index=['A', 'B'])
+        tm.assert_frame_equal(result_four, expected_four)
+
+    def test_various_dtypes(self):
+        """Int, float, str, bool and datetime columns each with one missing value."""
+        df = DataFrame({
+            'int_col': [1, 2, None],
+            'float_col': [1.1, None, 3.3],
+            'str_col': ['a', 'b', None],
+            'bool_col': [True, False, None],
+            'datetime_col': pd.to_datetime(['2020-01-01', '2020-01-02', None])
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [2, 2, 2, 2, 2],
+            'non_null': [2, 2, 2, 2, 2],
+            'missing': [1, 1, 1, 1, 1],
+            'missing_pct': [33.33, 33.33, 33.33, 33.33, 33.33]
+        }, index=['int_col', 'float_col', 'str_col', 'bool_col', 'datetime_col'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_duplicate_values(self):
+        """Repeated values are counted once in the unique column."""
+        df = DataFrame({
+            'A': [1, 1, 2, 2, 2],
+            'B': ['x', 'x', 'x', 'y', 'y'],
+            'C': [1, 1, 1, 1, 1]  # All same value
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [2, 2, 1],
+            'non_null': [5, 5, 5],
+            'missing': [0, 0, 0],
+            'missing_pct': [0.00, 0.00, 0.00]
+        }, index=['A', 'B', 'C'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_single_row_dataframe(self):
+        """A one-row frame gives 0% or 100% missing per column."""
+        df = DataFrame({
+            'A': [1],
+            'B': [None],
+            'C': ['test']
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [1, 0, 1],
+            'non_null': [1, 0, 1],
+            'missing': [0, 1, 0],
+            'missing_pct': [0.00, 100.00, 0.00]
+        }, index=['A', 'B', 'C'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_single_column_dataframe(self):
+        """A one-column frame yields a single-row summary."""
+        df = DataFrame({
+            'A': [1, 2, None, 4]
+        })
+        
+        result = check(df)
+        
+        expected = DataFrame({
+            'unique': [3],
+            'non_null': [3],
+            'missing': [1],
+            'missing_pct': [25.00]
+        }, index=['A'])
+        
+        tm.assert_frame_equal(result, expected)
+
+    def test_non_dataframe_raises_error(self):
+        """Non-DataFrame inputs (a str and a list) raise AttributeError."""
+        with pytest.raises(AttributeError):
+            check("not a dataframe")
+        
+        with pytest.raises(AttributeError):
+            check([1, 2, 3])
+
+    def test_return_type(self):
+        """check() returns a DataFrame instance."""
+        df = DataFrame({'A': [1, 2, 3]})
+        result = check(df)
+        assert isinstance(result, DataFrame)
+
+    def test_column_order_preserved(self):
+        """Summary rows follow the input column order (Z, A, M), not sorted order."""
+        df = DataFrame({
+            'Z': [1, 2, 3],
+            'A': [4, 5, 6],
+            'M': [7, 8, 9]
+        })
+        
+        result = check(df)
+        
+        expected_index = ['Z', 'A', 'M']
+        tm.assert_index_equal(result.index, pd.Index(expected_index))