Skip to content

Commit d57f40c

Browse files
SNOW-2002725: Add support for DataFrame.boxplot (#3532)
1 parent d91b13b commit d57f40c

File tree

10 files changed

+213
-18
lines changed

10 files changed

+213
-18
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ local ingestion. By default, local ingestion uses multithreading. Multiprocessin
5858
- Added support for `pd.explain_switch()` to return debugging information on hybrid execution decisions.
5959
- Support `pd.read_snowflake` when the global modin backend is `Pandas`.
6060
- Added support for `pd.to_dynamic_table`, `pd.to_iceberg`, and `pd.to_view`.
61+
- Added support for `DataFrame.boxplot`.
6162

6263
#### Improvements
6364

docs/source/modin/dataframe.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,13 @@ DataFrame
233233
DataFrame.last_valid_index
234234
DataFrame.resample
235235

236+
.. rubric:: Plotting
237+
238+
.. autosummary::
239+
:toctree: pandas_api/
240+
241+
DataFrame.boxplot
242+
236243
.. rubric:: Serialization / IO / conversion
237244

238245
.. autosummary::

docs/source/modin/supported/dataframe_supported.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ Methods
109109
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
110110
| ``bool`` | N | | |
111111
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
112-
| ``boxplot`` | N | | |
112+
| ``boxplot`` | Y | | |
113113
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
114114
| ``clip`` | N | | |
115115
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

src/snowflake/snowpark/modin/plugin/docstrings/base.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2220,15 +2220,14 @@ def pct_change():
22202220
fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
22212221
How to handle NAs **before** computing percent changes.
22222222
2223-
.. deprecated:: 2.1
2224-
All options of `fill_method` are deprecated except `fill_method=None`.
2223+
All options of `fill_method` are deprecated except `fill_method=None`.
22252224
22262225
limit : int, default None
22272226
The number of consecutive NAs to fill before stopping.
22282227
22292228
Snowpark pandas does not yet support this parameter.
22302229
2231-
.. deprecated:: 2.1
2230+
Deprecated parameter.
22322231
22332232
freq : DateOffset, timedelta, or str, optional
22342233
Increment to use from time series API (e.g. 'ME' or BDay()).
@@ -2521,7 +2520,7 @@ def resample():
25212520
Which axis to use for up- or down-sampling. For Series this parameter is unused and defaults to 0.
25222521
Snowpark pandas only supports ``axis`` 0 and DatetimeIndex.
25232522
2524-
Deprecated since version 2.0.0: Use frame.T.resample(…) instead.
2523+
Deprecated: Use frame.T.resample(…) instead.
25252524
closed : {'right', 'left'}, default None
25262525
Which side of bin interval is closed. The default is 'left' for all frequency offsets except for
25272526
'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', and 'W' which all have a default of 'right'.
@@ -2536,7 +2535,7 @@ def resample():
25362535
For PeriodIndex only, controls whether to use the start or end of rule.
25372536
Snowpark pandas does not support PeriodIndex.
25382537
2539-
Deprecated since version 2.2.0: Convert PeriodIndex to DatetimeIndex before resampling instead.
2538+
Deprecated: Convert PeriodIndex to DatetimeIndex before resampling instead.
25402539
kind : {'timestamp', 'period'}, optional, default None
25412540
Pass 'timestamp' to convert the resulting index to a DateTimeIndex
25422541
or 'period' to convert it to a PeriodIndex. By default, the input representation is retained.

src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,7 +1181,7 @@ def bfill():
11811181
downcast : dict, default is None
11821182
A dict of item->dtype of what to downcast if possible, or the string ‘infer’ which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible).
11831183
1184-
Deprecated since version 2.2.0.
1184+
Deprecated parameter.
11851185
11861186
Returns
11871187
-------
@@ -1231,7 +1231,144 @@ def bfill():
12311231

12321232
def boxplot():
12331233
"""
1234-
Make a box plot from ``DataFrame`` columns.
1234+
Make a box plot from DataFrame columns.
1235+
1236+
Make a box-and-whisker plot from DataFrame columns, optionally grouped by some other columns. A box plot is a method for graphically depicting groups of numerical data through their quartiles. The box extends from the Q1 to Q3 quartile values of the data, with a line at the median (Q2). The whiskers extend from the edges of box to show the range of the data. By default, they extend no more than 1.5 * IQR (IQR = Q3 - Q1) from the edges of the box, ending at the farthest data point within that interval. Outliers are plotted as separate dots.
1237+
1238+
For further details see Wikipedia’s entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`_.
1239+
1240+
Parameters
1241+
----------
1242+
column : str or list of str, optional
1243+
Column name or list of names, or vector. Can be any valid input to pandas.DataFrame.groupby().
1244+
1245+
by : str or array-like, optional
1246+
Column in the DataFrame to group by; it is passed to pandas.DataFrame.groupby(). One box-plot will be done per value of columns in by.
1247+
1248+
ax : object of class matplotlib.axes.Axes, optional
1249+
The matplotlib axes to be used by boxplot.
1250+
1251+
fontsize : float or str
1252+
Tick label font size in points or as a string (e.g., large).
1253+
1254+
rot : float, default 0
1255+
The rotation angle of labels (in degrees) with respect to the screen coordinate system.
1256+
1257+
grid : bool, default True
1258+
Setting this to True will show the grid.
1259+
1260+
figsize : A tuple (width, height) in inches
1261+
The size of the figure to create in matplotlib.
1262+
1263+
layout : tuple (rows, columns), optional
1264+
For example, (3, 5) will display the subplots using 3 rows and 5 columns, starting from the top-left.
1265+
1266+
return_type : {‘axes’, ‘dict’, ‘both’} or None, default ‘axes’
1267+
The kind of object to return. The default is axes.
1268+
1269+
- ‘axes’ returns the matplotlib axes the boxplot is drawn on.
1270+
1271+
- ‘dict’ returns a dictionary whose values are the matplotlib Lines of the boxplot.
1272+
1273+
- ‘both’ returns a namedtuple with the axes and dict.
1274+
1275+
- when grouping with by, a Series mapping columns to return_type is returned.
1276+
1277+
If return_type is None, a NumPy array of axes with the same shape as layout is returned.
1278+
1279+
backend : str, default None
1280+
Backend to use instead of the backend specified in the option plotting.backend. For instance, ‘matplotlib’. Alternatively, to specify the plotting.backend for the whole session, set pd.options.plotting.backend.
1281+
1282+
**kwargs
1283+
All other plotting keyword arguments to be passed to matplotlib.pyplot.boxplot().
1284+
1285+
Returns
1286+
-------
1287+
result
1288+
See Notes.
1289+
1290+
See Also
1291+
--------
1292+
Series.plot.hist
1293+
Make a histogram.
1294+
1295+
matplotlib.pyplot.boxplot
1296+
Matplotlib equivalent plot.
1297+
1298+
Notes
1299+
-----
1300+
The return type depends on the return_type parameter:
1301+
1302+
- ‘axes’ : object of class matplotlib.axes.Axes
1303+
1304+
- ‘dict’ : dict of matplotlib.lines.Line2D objects
1305+
1306+
- ‘both’ : a namedtuple with structure (ax, lines)
1307+
1308+
For data grouped with by, return a Series of the above or a numpy array:
1309+
1310+
- Series
1311+
1312+
- array (for return_type = None)
1313+
1314+
Use return_type='dict' when you want to tweak the appearance of the lines after plotting. In this case a dict containing the Lines making up the boxes, caps, fliers, medians, and whiskers is returned.
1315+
1316+
Examples
1317+
--------
1318+
Boxplots can be created for every column in the dataframe by df.boxplot() or indicating the columns to be used:
1319+
1320+
>>> np.random.seed(1234)
1321+
>>> df = pd.DataFrame(np.random.randn(10, 4),
1322+
... columns=['Col1', 'Col2', 'Col3', 'Col4'])
1323+
>>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3'])
1324+
../../_images/pandas-DataFrame-boxplot-1.png
1325+
1326+
Boxplots of variable distributions grouped by the values of a third variable can be created using the option by. For instance:
1327+
1328+
>>> df = pd.DataFrame(np.random.randn(10, 2),
1329+
... columns=['Col1', 'Col2'])
1330+
>>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A',
1331+
... 'B', 'B', 'B', 'B', 'B'])
1332+
>>> boxplot = df.boxplot(by='X')
1333+
1334+
A list of strings (e.g. ['X', 'Y']) can be passed to boxplot in order to group the data by combination of the variables in the x-axis:
1335+
1336+
>>> df = pd.DataFrame(np.random.randn(10, 3),
1337+
... columns=['Col1', 'Col2', 'Col3'])
1338+
>>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A',
1339+
... 'B', 'B', 'B', 'B', 'B'])
1340+
>>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A',
1341+
... 'B', 'A', 'B', 'A', 'B'])
1342+
>>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y'])
1343+
1344+
The layout of boxplot can be adjusted giving a tuple to layout:
1345+
1346+
>>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
1347+
... layout=(2, 1))
1348+
1349+
Additional formatting can be done to the boxplot, like suppressing the grid (grid=False), rotating the labels in the x-axis (e.g. rot=45) or changing the fontsize (e.g. fontsize=15):
1350+
1351+
>>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15)
1352+
1353+
The parameter return_type can be used to select the type of element returned by boxplot. When return_type='axes' is selected, the matplotlib axes on which the boxplot is drawn are returned:
1354+
1355+
>>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes')
1356+
>>> type(boxplot)
1357+
<class 'matplotlib.axes._axes.Axes'>
1358+
1359+
When grouping with by, a Series mapping columns to return_type is returned:
1360+
1361+
>>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
1362+
... return_type='axes')
1363+
>>> type(boxplot)  # doctest: +SKIP
1364+
<class 'pandas.Series'>
1365+
1366+
If return_type is None, a NumPy array of axes with the same shape as layout is returned:
1367+
1368+
>>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
1369+
... return_type=None)
1370+
>>> type(boxplot)
1371+
<class 'numpy.ndarray'>
12351372
"""
12361373

12371374
def combine():
@@ -1514,7 +1651,7 @@ def ffill():
15141651
downcast : dict, default is None
15151652
A dict of item->dtype of what to downcast if possible, or the string ‘infer’ which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible).
15161653
1517-
Deprecated since version 2.2.0.
1654+
Deprecated parameter.
15181655
15191656
Returns
15201657
-------
@@ -1569,8 +1706,7 @@ def fillna():
15691706
* ffill: propagate last valid observation forward to next valid.
15701707
* backfill / bfill: use next valid observation to fill gap.
15711708
1572-
.. deprecated:: 2.1.0
1573-
Use ffill or bfill instead.
1709+
Deprecated: Use ffill or bfill instead.
15741710
15751711
axis : {axes_single_arg}
15761712
Axis along which to fill missing values. For `Series`
@@ -1591,7 +1727,7 @@ def fillna():
15911727
or the string 'infer' which will try to downcast to an appropriate
15921728
equal type (e.g. float64 to int64 if possible).
15931729
1594-
.. deprecated:: 2.2.0
1730+
Deprecated parameter.
15951731
15961732
Returns
15971733
-------
@@ -1758,7 +1894,7 @@ def from_records():
17581894
data : structured ndarray, sequence of tuples or dicts, or DataFrame
17591895
Structured input data.
17601896
1761-
Deprecated since version 2.1.0: Passing a DataFrame is deprecated.
1897+
Deprecated: Passing a DataFrame is deprecated.
17621898
17631899
index : str, list of fields, array-like
17641900
Field of array to use as the index, alternately a specific set of input labels to use.

src/snowflake/snowpark/modin/plugin/docstrings/groupby.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -940,7 +940,7 @@ def pct_change():
940940
The number of consecutive NAs to fill before stopping.
941941
Snowpark pandas does not yet support this parameter.
942942
943-
Deprecated.
943+
Deprecated parameter.
944944
945945
freq : DateOffset, timedelta, or str, optional
946946
Increment to use from time series API (e.g. ‘ME’ or BDay()).

src/snowflake/snowpark/modin/plugin/docstrings/index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1649,7 +1649,7 @@ def fillna():
16491649
or the string 'infer' which will try to downcast to an appropriate
16501650
equal type (e.g. float64 to int64 if possible).
16511651
1652-
.. deprecated:: 2.1.0
1652+
Deprecated parameter.
16531653
16541654
Returns
16551655
-------

src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,6 @@ def _map(self, func: PythonFuncType, na_action: str | None = None, **kwargs):
167167
)
168168

169169

170-
@register_dataframe_not_implemented()
171170
def boxplot(
172171
self,
173172
column=None,
@@ -182,7 +181,22 @@ def boxplot(
182181
backend=None,
183182
**kwargs,
184183
): # noqa: PR01, RT01, D200
185-
pass # pragma: no cover
184+
WarningMessage.single_warning(
185+
"DataFrame.boxplot materializes data to the local machine."
186+
)
187+
return self._to_pandas().boxplot(
188+
column=column,
189+
by=by,
190+
ax=ax,
191+
fontsize=fontsize,
192+
rot=rot,
193+
grid=grid,
194+
figsize=figsize,
195+
layout=layout,
196+
return_type=return_type,
197+
backend=backend,
198+
**kwargs,
199+
)
186200

187201

188202
@register_dataframe_not_implemented()
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#
2+
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
3+
#
4+
5+
import modin.pandas as pd
6+
import matplotlib.pyplot as plt
7+
import pandas as native_pd
8+
import pytest
9+
10+
from matplotlib.testing.compare import compare_images
11+
from tests.integ.utils.sql_counter import sql_count_checker
12+
13+
14+
@sql_count_checker(query_count=1)
15+
@pytest.mark.parametrize("grid", [True, False])
16+
@pytest.mark.parametrize("column", [None, "A", ["A", "B"]])
17+
def test_boxplot(grid, column, tmp_path):
18+
data = {
19+
"A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
20+
"B": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
21+
"C": [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
22+
}
23+
native_df = native_pd.DataFrame(data)
24+
snow_df = pd.DataFrame(native_df)
25+
26+
native_fig = plt.figure()
27+
snow_fig = plt.figure()
28+
29+
native_fig.add_axes(
30+
native_df.boxplot(ax=native_fig.gca(), grid=grid, column=column)
31+
)
32+
native_file_path = f"{tmp_path}/test_boxplot_native.png"
33+
native_fig.savefig(native_file_path)
34+
35+
snow_fig.add_axes(snow_df.boxplot(ax=snow_fig.gca(), grid=grid, column=column))
36+
snow_file_path = f"{tmp_path}/test_boxplot_snow.png"
37+
snow_fig.savefig(snow_file_path)
38+
39+
assert compare_images(native_file_path, snow_file_path, 0) is None

tests/unit/modin/test_unsupported.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ def test_unsupported_general(general_method, kwargs):
7171
["at_time", {"time": ""}],
7272
["between_time", {"start_time": "", "end_time": ""}],
7373
["bool", {}],
74-
["boxplot", {}],
7574
["clip", {}],
7675
["combine", {"other": "", "func": ""}],
7776
["combine_first", {"other": ""}],

0 commit comments

Comments
 (0)