Skip to content

Commit 63cc559

Browse files
committed
New feature: issue #61691
1 parent 35b0d1d commit 63cc559

File tree

5 files changed

+310
-0
lines changed

5 files changed

+310
-0
lines changed

.claude/settings.local.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"permissions": {
3+
"allow": [
4+
"WebFetch(domain:github.com)",
5+
"Bash(rg:*)",
6+
"Bash(python:*)",
7+
"Bash(find:*)"
8+
],
9+
"deny": []
10+
}
11+
}

pandas/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@
9393
to_numeric,
9494
to_datetime,
9595
to_timedelta,
96+
# diagnostics
97+
check,
9698
# misc
9799
Flags,
98100
Grouper,
@@ -281,6 +283,7 @@
281283
"array",
282284
"arrays",
283285
"bdate_range",
286+
"check",
284287
"concat",
285288
"crosstab",
286289
"cut",

pandas/core/api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
from pandas.core.indexes.timedeltas import timedelta_range
7070
from pandas.core.indexing import IndexSlice
7171
from pandas.core.series import Series
72+
from pandas.core.tools.check import check
7273
from pandas.core.tools.datetimes import to_datetime
7374
from pandas.core.tools.numeric import to_numeric
7475
from pandas.core.tools.timedeltas import to_timedelta
@@ -121,6 +122,7 @@
121122
"UInt64Dtype",
122123
"array",
123124
"bdate_range",
125+
"check",
124126
"date_range",
125127
"factorize",
126128
"interval_range",

pandas/core/tools/check.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
Utility function for quick dataset diagnostics.
3+
"""
4+
from __future__ import annotations
5+
6+
from typing import TYPE_CHECKING
7+
8+
from pandas.util._decorators import set_module
9+
10+
if TYPE_CHECKING:
11+
from pandas import DataFrame
12+
13+
14+
@set_module("pandas")
15+
def check(df: DataFrame, round_digits: int = 2) -> DataFrame:
    """
    Provide a column-wise summary of DataFrame structure for quick diagnostics.

    This function combines several common exploratory data analysis operations
    into a single diagnostic summary, including unique values, non-null counts,
    missing value counts, and missing percentages.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyze.
    round_digits : int, default 2
        Number of decimal places to round the missing percentage to.

    Returns
    -------
    DataFrame
        A DataFrame indexed by the columns of ``df``, with columns:

        - unique: Number of unique non-null values per column
        - non_null: Number of non-null values per column
        - missing: Number of missing values per column
        - missing_pct: Percentage of missing values per column

    Notes
    -----
    For a DataFrame with zero rows, ``missing_pct`` is reported as ``0.0`` for
    every column instead of the ``NaN`` that a ``0 / 0`` division would give.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 2, None, 4],
    ...         "B": ["x", "y", "x", None],
    ...         "C": [1.0, 2.0, 3.0, 4.0],
    ...     }
    ... )
    >>> pd.check(df)
       unique  non_null  missing  missing_pct
    A       3         3        1         25.0
    B       2         3        1         25.0
    C       4         4        0          0.0
    """
    # Imported lazily: this module is loaded while ``pandas`` itself is still
    # being imported, so a top-level import would be circular.
    import pandas as pd

    total_rows = len(df)
    unique_counts = df.nunique()
    non_null_counts = df.count()

    # Derive the missing count from ``count()`` instead of scanning for nulls
    # a second time; this also guarantees non_null + missing == total_rows.
    missing_counts = total_rows - non_null_counts

    if total_rows:
        missing_pct = (missing_counts / total_rows * 100).round(round_digits)
    else:
        # No rows means nothing can be missing; avoid 0 / 0 -> NaN.
        missing_pct = missing_counts.astype("float64")

    return pd.DataFrame(
        {
            "unique": unique_counts,
            "non_null": non_null_counts,
            "missing": missing_counts,
            "missing_pct": missing_pct,
        }
    )

pandas/tests/tools/test_check.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pandas as pd
5+
from pandas import (
6+
DataFrame,
7+
check,
8+
)
9+
import pandas._testing as tm
10+
11+
12+
class TestCheck:
    """Tests for the column-wise diagnostic summary returned by ``check``."""

    @staticmethod
    def _summary(index, unique, non_null, missing, missing_pct):
        """Build the expected summary frame, one row per label in ``index``."""
        columns = {
            "unique": unique,
            "non_null": non_null,
            "missing": missing,
            "missing_pct": missing_pct,
        }
        return DataFrame(columns, index=index)

    def test_basic_functionality(self):
        """Mixed numeric/string frame with one missing value in two columns."""
        frame = DataFrame(
            {
                "A": [1, 2, None, 4],
                "B": ["x", "y", "x", None],
                "C": [1.0, 2.0, 3.0, 4.0],
            }
        )

        want = self._summary(
            ["A", "B", "C"],
            unique=[3, 2, 4],
            non_null=[3, 3, 4],
            missing=[1, 1, 0],
            missing_pct=[25.00, 25.00, 0.00],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_empty_dataframe(self):
        """A frame with no rows and no columns yields an empty summary."""
        got = check(DataFrame())

        # Integer count columns plus a float percentage column, all empty.
        blank = DataFrame(
            {"unique": [], "non_null": [], "missing": [], "missing_pct": []}
        )
        want = blank.astype(
            {
                "unique": "int64",
                "non_null": "int64",
                "missing": "int64",
                "missing_pct": "float64",
            }
        )

        tm.assert_frame_equal(got, want)

    def test_all_null_column(self):
        """A column holding only nulls reports 0 unique and 100% missing."""
        frame = DataFrame(
            {
                "A": [1, 2, 3],
                "B": [None, None, None],
                "C": ["x", "y", "z"],
            }
        )

        want = self._summary(
            ["A", "B", "C"],
            unique=[3, 0, 3],
            non_null=[3, 0, 3],
            missing=[0, 3, 0],
            missing_pct=[0.00, 100.00, 0.00],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_no_missing_values(self):
        """Fully populated columns report zero missing everywhere."""
        frame = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["w", "x", "y", "z"],
                "C": [1.1, 2.2, 3.3, 4.4],
            }
        )

        want = self._summary(
            ["A", "B", "C"],
            unique=[4, 4, 4],
            non_null=[4, 4, 4],
            missing=[0, 0, 0],
            missing_pct=[0.00, 0.00, 0.00],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_round_digits_parameter(self):
        """``round_digits`` controls the precision of ``missing_pct``."""
        # Column A is 2/3 missing, i.e. 66.666...%.
        frame = DataFrame({"A": [1, None, None], "B": [1, 2, 3]})

        def want(pct):
            return self._summary(
                ["A", "B"],
                unique=[1, 3],
                non_null=[1, 3],
                missing=[2, 0],
                missing_pct=pct,
            )

        # Default precision is two decimal places.
        tm.assert_frame_equal(check(frame), want([66.67, 0.00]))

        # Explicit precisions.
        for digits, pct in ((0, [67.0, 0.0]), (4, [66.6667, 0.0000])):
            tm.assert_frame_equal(check(frame, round_digits=digits), want(pct))

    def test_various_dtypes(self):
        """Each column type is summarised the same way."""
        frame = DataFrame(
            {
                "int_col": [1, 2, None],
                "float_col": [1.1, None, 3.3],
                "str_col": ["a", "b", None],
                "bool_col": [True, False, None],
                "datetime_col": pd.to_datetime(["2020-01-01", "2020-01-02", None]),
            }
        )
        labels = ["int_col", "float_col", "str_col", "bool_col", "datetime_col"]

        want = self._summary(
            labels,
            unique=[2, 2, 2, 2, 2],
            non_null=[2, 2, 2, 2, 2],
            missing=[1, 1, 1, 1, 1],
            missing_pct=[33.33, 33.33, 33.33, 33.33, 33.33],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_duplicate_values(self):
        """Repeated values are collapsed in the ``unique`` count only."""
        frame = DataFrame(
            {
                "A": [1, 1, 2, 2, 2],
                "B": ["x", "x", "x", "y", "y"],
                # Constant column: a single distinct value.
                "C": [1, 1, 1, 1, 1],
            }
        )

        want = self._summary(
            ["A", "B", "C"],
            unique=[2, 2, 1],
            non_null=[5, 5, 5],
            missing=[0, 0, 0],
            missing_pct=[0.00, 0.00, 0.00],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_single_row_dataframe(self):
        """A one-row frame gives either 0% or 100% missing per column."""
        frame = DataFrame({"A": [1], "B": [None], "C": ["test"]})

        want = self._summary(
            ["A", "B", "C"],
            unique=[1, 0, 1],
            non_null=[1, 0, 1],
            missing=[0, 1, 0],
            missing_pct=[0.00, 100.00, 0.00],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_single_column_dataframe(self):
        """A one-column frame produces a one-row summary."""
        frame = DataFrame({"A": [1, 2, None, 4]})

        want = self._summary(
            ["A"],
            unique=[3],
            non_null=[3],
            missing=[1],
            missing_pct=[25.00],
        )

        tm.assert_frame_equal(check(frame), want)

    def test_non_dataframe_raises_error(self):
        """Inputs without the DataFrame API raise ``AttributeError``."""
        for bad_input in ("not a dataframe", [1, 2, 3]):
            with pytest.raises(AttributeError):
                check(bad_input)

    def test_return_type(self):
        """The summary itself is a DataFrame."""
        summary = check(DataFrame({"A": [1, 2, 3]}))
        assert isinstance(summary, DataFrame)

    def test_column_order_preserved(self):
        """Summary rows follow the input column order, not sorted order."""
        frame = DataFrame(
            {
                "Z": [1, 2, 3],
                "A": [4, 5, 6],
                "M": [7, 8, 9],
            }
        )

        summary = check(frame)

        tm.assert_index_equal(summary.index, pd.Index(["Z", "A", "M"]))

0 commit comments

Comments
 (0)