Skip to content

Commit fe3fef3

Browse files
committed
Add binning option to BivariatePlotGAM; other updates
- Changed function name BivariatePlotRawGAM to BivariatePlotGAM - Added more documentation to some function docstrings, in some cases changing existing docstrings. - Fixed fencepost problem in _interpGrid - Commented out generic labelling in pd.cut within _aggValuesToGrid. These generic labels did not serve a purpose before, and now that _aggValuesToGrid also returns the aggregated DataFrame, it is more useful if the aggregated DataFrame's multiindex has the pd.Intervals from pd.cut. - As mentioned, changed _aggValuesToGrid to also return the aggregated DataFrame in addition to the reshaped matrix. - Fixed BivariatePlotGrid passing wind_u and wind_v to _aggValuesToGrid backwards. This removed the need for a transpose within _aggValuesToGrid. - Added .copy() to X and y definition in BivariatePlotGAM, for safety.
1 parent ddea2e2 commit fe3fef3

File tree

6 files changed

+144
-42
lines changed

6 files changed

+144
-42
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import bivapp.plots as bp
2121
import cmcrameri.cm as cm
2222

2323
df = ImportOpenairDataExample()
24-
fig, axs = bp.BivariatePlotRawGAM(
24+
fig, axs = bp.BivariatePlotGAM(
2525
df["so2"],
2626
df["ws"],
2727
df["wd"],
0 Bytes
Binary file not shown.
5.1 KB
Binary file not shown.
3 Bytes
Binary file not shown.

bivapp/plots.py

Lines changed: 140 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,30 @@ def _makeFigurePretty(
136136

137137

138138
def _getWindComponents(wind_speed, wind_dir):
139-
"""Get wind components from wind speed and direction."""
139+
"""Get cartesian wind vector components from wind speed and direction.
140+
141+
This expresses the wind as a vector pointing in the direction the wind is
142+
coming from. For example, wind from the north at 1 m/s gives an output of
143+
(0, 1). An easterly 1 m/s wind returns (1, 0).
144+
145+
This might differ from other sources that might express wind as a vector
146+
point in the direction of air mass flow. If you desire that convention, simply
147+
take the negative of the output of this function.
148+
149+
Parameters
150+
----------
151+
wind_speed : array-like
152+
Wind speed data. Should be the same length as wind_dir.
153+
wind_dir : array-like
154+
Wind direction data. Should be the same length as wind_speed.
155+
Returns
156+
-------
157+
wind_u : numpy array
158+
Wind speed component in the u direction (east-west). Positive values indicate
159+
wind from the east.
160+
wind_v : numpy array
161+
Wind speed component in the v direction (north-south). Positive values indicate
162+
wind from the north."""
140163
wind_u = wind_speed * np.sin(np.deg2rad(wind_dir))
141164
wind_v = wind_speed * np.cos(np.deg2rad(wind_dir))
142165

@@ -153,28 +176,34 @@ def _interpGrid(sparse_grid, xs, ys, interpolation_method):
153176
points,
154177
method=interpolation_method,
155178
)
156-
return interp.reshape(len(xs) - 1, len(ys) - 1)
179+
return interp.reshape(len(xs), len(ys))
157180

158181

159182
def _aggValuesToGrid(wind_u, wind_v, values, wind_bins, resolution, agg_method):
183+
"""Aggregate values into a grid based on wind speed and direction components.
184+
Returns a pandas DataFrame with the aggregated values for each wind speed
185+
and direction bin, and a numpy array of the values reshaped into a grid.
186+
187+
Note that the numpy array contains no information about the bins themselves."""
188+
160189
df = pd.DataFrame([wind_u, wind_v, values]).T
161190
df.columns = ["wind_u", "wind_v", "values"]
162191
wind_u_cut = pd.cut(
163192
df["wind_u"],
164193
bins=wind_bins,
165194
include_lowest=True,
166-
labels=np.arange(0, resolution - 1, 1),
195+
# labels=np.arange(0, resolution - 1, 1),
167196
)
168197
wind_v_cut = pd.cut(
169198
df["wind_v"],
170199
bins=wind_bins,
171200
include_lowest=True,
172-
labels=np.arange(0, resolution - 1, 1),
201+
# labels=np.arange(0, resolution - 1, 1),
173202
)
174203
agged_values = df.groupby([wind_u_cut, wind_v_cut], observed=False).agg(
175204
{"values": agg_method}
176205
)
177-
return agged_values.values.reshape(resolution - 1, resolution - 1)
206+
return agged_values.values.reshape(resolution - 1, resolution - 1).T, agged_values
178207

179208

180209
def _excludeTooFar(true_points, pred_points, dist):
@@ -268,8 +297,8 @@ def BivariatePlotGrid(
268297
wind_bins = np.linspace(-wind_comp_abs_max, wind_comp_abs_max, resolution)
269298

270299
# Aggregate values into bins. For convenience, we use pandas cut and groupby methods.
271-
reshaped = _aggValuesToGrid(
272-
wind_v, wind_u, values, wind_bins, resolution, agg_method
300+
reshaped, _ = _aggValuesToGrid(
301+
wind_u, wind_v, values, wind_bins, resolution, agg_method
273302
)
274303

275304
# Interpolate, if requested
@@ -299,11 +328,13 @@ def _fitGAM(X, y, degfree, lam):
299328
return gam
300329

301330

302-
def BivariatePlotRawGAM(
331+
def BivariatePlotGAM(
303332
values,
304333
wind_speed,
305334
wind_dir,
335+
bin_data_before_gam=False,
306336
pred_res=500,
337+
agg_method="mean",
307338
positive=True,
308339
degfree=30,
309340
lam=0.6,
@@ -315,44 +346,115 @@ def BivariatePlotRawGAM(
315346
colourbar_label=None,
316347
wind_unit="m/s",
317348
):
318-
"""Fits a GAM to the raw data, similar to the R openair package. Specifically fits
319-
values ** 0.5 ~ s(wind_u) + s(wind_v), where s are smoothing splines, and returns
320-
a plot of the fitted values.
321-
322-
Note that openair fits a GAM to binned data, but this function fits a GAM to the raw data.
323-
324-
Masking method chooses how to cut GAM predictions to a reasonable boundary around the raw
325-
data. Options are:
326-
327-
"near": points within a set distance (1 m/s) of the raw data are preserved. This is similar to
328-
how openair limits data. However, the implementation here can be slow for large datasets.
329-
"ch": a convext hull is drawn around the raw data and only predictions falling within this hull
330-
are presented.
331-
"grid": raw data is gridded to a resolution of pred_res and only grid cells with raw data present
332-
are preserved. This is best used with smaller values of pred_res and will produce an image that
333-
looks like a smoothed version of the image produced by BivariatePlotGrid. Note that due to the
334-
way the gridding calculation works, this method drops the final prediction along each axis. This
335-
usually isn't a problem as the edges are often excluded after gridding anyways.
336-
337-
near_dist is only used if masking_method is "near".
349+
"""Fits a GAM the wind and values data then plots the GAM's predictions.
350+
Specifically fits the square root of `values` to a tensor product of the wind
351+
vector components $u$ and $v$.
352+
353+
Parameters
354+
----------
355+
values : array-like
356+
Values to be plotted. Should be the same length as wind_speed and wind_dir.
357+
wind_speed : array-like
358+
Wind speed data. Should be the same length as values and wind_dir.
359+
wind_dir : array-like
360+
Wind direction data. Should be the same length as values and wind_speed.
361+
bin_data_before_gam : bool, default False
362+
If True, the data will be binned/gridded before fitting the GAM. Binning the data can speed up GAM fitting
363+
but smooths resulting plot on top of the smoothing the GAM already performs.
364+
pred_res : int, default 500
365+
The resolution of the prediction grid. This is the number of bins along each wind direction.
366+
agg_method : str, default "mean"
367+
The aggregation method to use when gridding the data. Options are "mean", "median", "sum", etc.
368+
Any method accepted by pandas.DataFrame.groupby().agg can be used. This argument is ignored if
369+
bin_data_before_gam is False.
370+
positive : bool, default True
371+
If True, the predictions will be forced to be non-negative. This is useful for data that should not
372+
have negative values, such as concentrations or counts. This is done by simply setting values below 0 to 0.
373+
Note that this step happens twice: once to the raw values before optionally gridding and fitting the GAM,
374+
and again to the GAM predictions.
375+
degfree : int, default 30
376+
The degrees of freedom for the GAM. This controls the smoothness of the fit. A higher value will result in a
377+
smoother fit, while a lower value will result in a more flexible fit.
378+
lam : float, default 0.6
379+
The regularization parameter for the GAM. This controls the amount of smoothing applied to the fit.
380+
vmin : float, optional
381+
The minimum value for the colour scale. If None, the minimum value of the predictions will be used.
382+
vmax : float, optional
383+
The maximum value for the colour scale. If None, the maximum value of the predictions will be used.
384+
cmap : colormap, default cm.batlow
385+
The colormap to use for the plot. This can be any colormap accepted by matplotlib. Colourmaps from
386+
cmcrameri are recommended, as they are perceptually uniform and colourblind-friendly.
387+
masking_method : str, default "near"
388+
The method to use for masking the predictions. This is used to limit the predictions to a reasonable
389+
boundary around the raw data. This is useful for preventing predictions from extending too far from the
390+
Options are:
391+
- "near": points within a set distance (1 m/s) of the raw data are preserved. This is similar to
392+
how openair limits data. However, the implementation here can be slow for large datasets.
393+
- "ch": a convext hull is drawn around the raw data and only predictions falling within this hull
394+
are presented.
395+
- "grid": raw data is gridded to a resolution of pred_res and only grid cells with raw data present
396+
are preserved. This is best used with smaller values of pred_res and will produce an image that
397+
looks like a smoothed version of the image produced by BivariatePlotGrid. Note that due to the
398+
way the gridding calculation works, this method drops the final prediction along each axis. This
399+
usually isn't a problem as the edges are often excluded after gridding anyways.
400+
near_dist : float, default 1
401+
The distance from the raw data within which predictions are preserved when masking_method is "near".
402+
This is the distance in the units of the wind speed data (e.g., m/s).
403+
This is only used if masking_method is "near".
404+
colourbar_label : str, optional
405+
The label for the colourbar. If None, no label will be added.
406+
wind_unit : str, default "m/s"
407+
The unit of the wind speed data. This is only used to label the colourbar and axes.
408+
409+
Returns
410+
-------
411+
fig, ax : tuple
412+
A tuple containing the matplotlib figure and axes of the plot.
413+
414+
Notes
415+
-----
416+
- openair fits a GAM to binned data, though openair's approach seems to bin the data by wind speed and directions
417+
before converting wind to vector components. This function differs when setting `bin_data_before_gam=True` in that it
418+
splits the wind into vector components prior to binning, so `values` will be binned up based on a grid of $(u, v)$
419+
vector components rather than bins of wind speed and direction. In terms of outcome, both this function and openair's
420+
approach will produce similar results, and the process of binning/gridding the data serves to smooth the resulting
421+
plot and potentially reduce calculation time in fitting the GAM.
338422
"""
339423
# Get wind components
340424
wind_u, wind_v = _getWindComponents(wind_speed, wind_dir)
425+
# Find largest absolute wind speed component and define bins
426+
wind_comp_abs_max = np.max(
427+
[np.abs(np.nanmin([wind_u, wind_v])), np.nanmax([wind_u, wind_v])]
428+
)
429+
wind_bins = np.linspace(-wind_comp_abs_max, wind_comp_abs_max, pred_res)
430+
431+
if positive:
432+
values = np.where(values < 0, 0, values)
433+
434+
# Make DataFrame of wind components and values. This process differs depending
435+
# on whether we are gridding the data prior to GAM-fitting.
436+
if bin_data_before_gam:
437+
_, df = _aggValuesToGrid(
438+
wind_u, wind_v, values, wind_bins, pred_res, agg_method
439+
)
440+
df = df.reset_index()
441+
# During gridding, pd.cut returns pd.Intervals as the indices of the bins.
442+
# We need single numbers for fitting and plotting, so we'll use the midpoints.
443+
df["wind_u"] = df["wind_u"].apply(lambda x: x.mid).copy()
444+
df["wind_v"] = df["wind_v"].apply(lambda x: x.mid).copy()
445+
else:
446+
df = pd.DataFrame([wind_u, wind_v, values]).T
447+
df.columns = ["wind_u", "wind_v", "values"]
341448

342-
df = pd.DataFrame([wind_u, wind_v, values]).T
343-
df.columns = ["wind_u", "wind_v", "values"]
344449
df = df.dropna()
345450
df["values"] = df["values"] ** 0.5
346-
X = df[["wind_v", "wind_u"]]
347-
y = df["values"]
451+
X = df[["wind_v", "wind_u"]].copy()
452+
y = df["values"].copy()
348453

454+
# Fit the GAM
349455
gam = _fitGAM(X, y, degfree, lam)
350456

351-
# Find largest absolute wind speed component and define bins
352-
wind_comp_abs_max = np.max(
353-
[np.abs(np.nanmin([wind_u, wind_v])), np.nanmax([wind_u, wind_v])]
354-
)
355-
wind_bins = np.linspace(-wind_comp_abs_max, wind_comp_abs_max, pred_res)
457+
# Make grid of predictions
356458
points_x, points_y = np.meshgrid(wind_bins, wind_bins, indexing="ij")
357459
points = np.vstack([points_x.ravel(), points_y.ravel()]).T
358460
pred = (
@@ -365,7 +467,7 @@ def BivariatePlotRawGAM(
365467
pred_res, pred_res
366468
)
367469
elif masking_method == "grid":
368-
grid = _aggValuesToGrid(wind_v, wind_u, values, wind_bins, pred_res, "mean")
470+
grid, _ = _aggValuesToGrid(wind_v, wind_u, values, wind_bins, pred_res, "mean")
369471
mask = np.ma.masked_invalid(grid).mask
370472
pred = pred[:-1, :-1]
371473
elif masking_method == "ch":

bivapp/sampledata.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,9 @@ def ImportTorontoMet():
102102
met_df["wind_speed"] = met_df["wind_speed"] + np.random.uniform(
103103
-0.5, 0.5, len(met_df.index)
104104
)
105-
met_df.loc[met_df["wind_speed"] < 0, "wind_speed"] = (
106-
0 # the jittering could introduce negatives
107-
)
105+
met_df.loc[
106+
met_df["wind_speed"] < 0, "wind_speed"
107+
] = 0 # the jittering could introduce negatives
108108
met_df["wind_speed_m_s"] = met_df["wind_speed"] / 3.6
109109
met_df = met_df.drop(columns=["wind_speed"])
110110

0 commit comments

Comments
 (0)