@@ -136,7 +136,30 @@ def _makeFigurePretty(
136136
137137
138138def _getWindComponents (wind_speed , wind_dir ):
139- """Get wind components from wind speed and direction."""
139+ """Get cartesian wind vector components from wind speed and direction.
140+
141+ This expresses the wind as a vector pointing in the direction the wind is
142+ coming from. For example, wind from the north at 1 m/s gives an output of
143+ (0, 1). An easterly 1 m/s wind returns (1, 0).
144+
145+ This might differ from other sources that might express wind as a vector
146+ point in the direction of air mass flow. If you desire that convention, simply
147+ take the negative of the output of this function.
148+
149+ Parameters
150+ ----------
151+ wind_speed : array-like
152+ Wind speed data. Should be the same length as wind_dir.
153+ wind_dir : array-like
154+ Wind direction data. Should be the same length as wind_speed.
155+ Returns
156+ -------
157+ wind_u : numpy array
158+ Wind speed component in the u direction (east-west). Positive values indicate
159+ wind from the east.
160+ wind_v : numpy array
161+ Wind speed component in the v direction (north-south). Positive values indicate
162+ wind from the north."""
140163 wind_u = wind_speed * np .sin (np .deg2rad (wind_dir ))
141164 wind_v = wind_speed * np .cos (np .deg2rad (wind_dir ))
142165
@@ -153,28 +176,34 @@ def _interpGrid(sparse_grid, xs, ys, interpolation_method):
153176 points ,
154177 method = interpolation_method ,
155178 )
156- return interp .reshape (len (xs ) - 1 , len (ys ) - 1 )
179+ return interp .reshape (len (xs ), len (ys ))
157180
158181
159182def _aggValuesToGrid (wind_u , wind_v , values , wind_bins , resolution , agg_method ):
183+ """Aggregate values into a grid based on wind speed and direction components.
184+ Returns a pandas DataFrame with the aggregated values for each wind speed
185+ and direction bin, and a numpy array of the values reshaped into a grid.
186+
187+ Note that the numpy array contains no information about the bins themselves."""
188+
160189 df = pd .DataFrame ([wind_u , wind_v , values ]).T
161190 df .columns = ["wind_u" , "wind_v" , "values" ]
162191 wind_u_cut = pd .cut (
163192 df ["wind_u" ],
164193 bins = wind_bins ,
165194 include_lowest = True ,
166- labels = np .arange (0 , resolution - 1 , 1 ),
195+ # labels=np.arange(0, resolution - 1, 1),
167196 )
168197 wind_v_cut = pd .cut (
169198 df ["wind_v" ],
170199 bins = wind_bins ,
171200 include_lowest = True ,
172- labels = np .arange (0 , resolution - 1 , 1 ),
201+ # labels=np.arange(0, resolution - 1, 1),
173202 )
174203 agged_values = df .groupby ([wind_u_cut , wind_v_cut ], observed = False ).agg (
175204 {"values" : agg_method }
176205 )
177- return agged_values .values .reshape (resolution - 1 , resolution - 1 )
206+ return agged_values .values .reshape (resolution - 1 , resolution - 1 ). T , agged_values
178207
179208
180209def _excludeTooFar (true_points , pred_points , dist ):
@@ -268,8 +297,8 @@ def BivariatePlotGrid(
268297 wind_bins = np .linspace (- wind_comp_abs_max , wind_comp_abs_max , resolution )
269298
270299 # Aggregate values into bins. For convenience, we use pandas cut and groupby methods.
271- reshaped = _aggValuesToGrid (
272- wind_v , wind_u , values , wind_bins , resolution , agg_method
300+ reshaped , _ = _aggValuesToGrid (
301+ wind_u , wind_v , values , wind_bins , resolution , agg_method
273302 )
274303
275304 # Interpolate, if requested
@@ -299,11 +328,13 @@ def _fitGAM(X, y, degfree, lam):
299328 return gam
300329
301330
302- def BivariatePlotRawGAM (
331+ def BivariatePlotGAM (
303332 values ,
304333 wind_speed ,
305334 wind_dir ,
335+ bin_data_before_gam = False ,
306336 pred_res = 500 ,
337+ agg_method = "mean" ,
307338 positive = True ,
308339 degfree = 30 ,
309340 lam = 0.6 ,
@@ -315,44 +346,115 @@ def BivariatePlotRawGAM(
315346 colourbar_label = None ,
316347 wind_unit = "m/s" ,
317348):
318- """Fits a GAM to the raw data, similar to the R openair package. Specifically fits
319- values ** 0.5 ~ s(wind_u) + s(wind_v), where s are smoothing splines, and returns
320- a plot of the fitted values.
321-
322- Note that openair fits a GAM to binned data, but this function fits a GAM to the raw data.
323-
324- Masking method chooses how to cut GAM predictions to a reasonable boundary around the raw
325- data. Options are:
326-
327- "near": points within a set distance (1 m/s) of the raw data are preserved. This is similar to
328- how openair limits data. However, the implementation here can be slow for large datasets.
329- "ch": a convext hull is drawn around the raw data and only predictions falling within this hull
330- are presented.
331- "grid": raw data is gridded to a resolution of pred_res and only grid cells with raw data present
332- are preserved. This is best used with smaller values of pred_res and will produce an image that
333- looks like a smoothed version of the image produced by BivariatePlotGrid. Note that due to the
334- way the gridding calculation works, this method drops the final prediction along each axis. This
335- usually isn't a problem as the edges are often excluded after gridding anyways.
336-
337- near_dist is only used if masking_method is "near".
349+ """Fits a GAM the wind and values data then plots the GAM's predictions.
350+ Specifically fits the square root of `values` to a tensor product of the wind
351+ vector components $u$ and $v$.
352+
353+ Parameters
354+ ----------
355+ values : array-like
356+ Values to be plotted. Should be the same length as wind_speed and wind_dir.
357+ wind_speed : array-like
358+ Wind speed data. Should be the same length as values and wind_dir.
359+ wind_dir : array-like
360+ Wind direction data. Should be the same length as values and wind_speed.
361+ bin_data_before_gam : bool, default False
362+ If True, the data will be binned/gridded before fitting the GAM. Binning the data can speed up GAM fitting
363+ but smooths resulting plot on top of the smoothing the GAM already performs.
364+ pred_res : int, default 500
365+ The resolution of the prediction grid. This is the number of bins along each wind direction.
366+ agg_method : str, default "mean"
367+ The aggregation method to use when gridding the data. Options are "mean", "median", "sum", etc.
368+ Any method accepted by pandas.DataFrame.groupby().agg can be used. This argument is ignored if
369+ bin_data_before_gam is False.
370+ positive : bool, default True
371+ If True, the predictions will be forced to be non-negative. This is useful for data that should not
372+ have negative values, such as concentrations or counts. This is done by simply setting values below 0 to 0.
373+ Note that this step happens twice: once to the raw values before optionally gridding and fitting the GAM,
374+ and again to the GAM predictions.
375+ degfree : int, default 30
376+ The degrees of freedom for the GAM. This controls the smoothness of the fit. A higher value will result in a
377+ smoother fit, while a lower value will result in a more flexible fit.
378+ lam : float, default 0.6
379+ The regularization parameter for the GAM. This controls the amount of smoothing applied to the fit.
380+ vmin : float, optional
381+ The minimum value for the colour scale. If None, the minimum value of the predictions will be used.
382+ vmax : float, optional
383+ The maximum value for the colour scale. If None, the maximum value of the predictions will be used.
384+ cmap : colormap, default cm.batlow
385+ The colormap to use for the plot. This can be any colormap accepted by matplotlib. Colourmaps from
386+ cmcrameri are recommended, as they are perceptually uniform and colourblind-friendly.
387+ masking_method : str, default "near"
388+ The method to use for masking the predictions. This is used to limit the predictions to a reasonable
389+ boundary around the raw data. This is useful for preventing predictions from extending too far from the
390+ Options are:
391+ - "near": points within a set distance (1 m/s) of the raw data are preserved. This is similar to
392+ how openair limits data. However, the implementation here can be slow for large datasets.
393+ - "ch": a convext hull is drawn around the raw data and only predictions falling within this hull
394+ are presented.
395+ - "grid": raw data is gridded to a resolution of pred_res and only grid cells with raw data present
396+ are preserved. This is best used with smaller values of pred_res and will produce an image that
397+ looks like a smoothed version of the image produced by BivariatePlotGrid. Note that due to the
398+ way the gridding calculation works, this method drops the final prediction along each axis. This
399+ usually isn't a problem as the edges are often excluded after gridding anyways.
400+ near_dist : float, default 1
401+ The distance from the raw data within which predictions are preserved when masking_method is "near".
402+ This is the distance in the units of the wind speed data (e.g., m/s).
403+ This is only used if masking_method is "near".
404+ colourbar_label : str, optional
405+ The label for the colourbar. If None, no label will be added.
406+ wind_unit : str, default "m/s"
407+ The unit of the wind speed data. This is only used to label the colourbar and axes.
408+
409+ Returns
410+ -------
411+ fig, ax : tuple
412+ A tuple containing the matplotlib figure and axes of the plot.
413+
414+ Notes
415+ -----
416+ - openair fits a GAM to binned data, though openair's approach seems to bin the data by wind speed and directions
417+ before converting wind to vector components. This function differs when setting `bin_data_before_gam=True` in that it
418+ splits the wind into vector components prior to binning, so `values` will be binned up based on a grid of $(u, v)$
419+ vector components rather than bins of wind speed and direction. In terms of outcome, both this function and openair's
420+ approach will produce similar results, and the process of binning/gridding the data serves to smooth the resulting
421+ plot and potentially reduce calculation time in fitting the GAM.
338422 """
339423 # Get wind components
340424 wind_u , wind_v = _getWindComponents (wind_speed , wind_dir )
425+ # Find largest absolute wind speed component and define bins
426+ wind_comp_abs_max = np .max (
427+ [np .abs (np .nanmin ([wind_u , wind_v ])), np .nanmax ([wind_u , wind_v ])]
428+ )
429+ wind_bins = np .linspace (- wind_comp_abs_max , wind_comp_abs_max , pred_res )
430+
431+ if positive :
432+ values = np .where (values < 0 , 0 , values )
433+
434+ # Make DataFrame of wind components and values. This process differs depending
435+ # on whether we are gridding the data prior to GAM-fitting.
436+ if bin_data_before_gam :
437+ _ , df = _aggValuesToGrid (
438+ wind_u , wind_v , values , wind_bins , pred_res , agg_method
439+ )
440+ df = df .reset_index ()
441+ # During gridding, pd.cut returns pd.Intervals as the indices of the bins.
442+ # We need single numbers for fitting and plotting, so we'll use the midpoints.
443+ df ["wind_u" ] = df ["wind_u" ].apply (lambda x : x .mid ).copy ()
444+ df ["wind_v" ] = df ["wind_v" ].apply (lambda x : x .mid ).copy ()
445+ else :
446+ df = pd .DataFrame ([wind_u , wind_v , values ]).T
447+ df .columns = ["wind_u" , "wind_v" , "values" ]
341448
342- df = pd .DataFrame ([wind_u , wind_v , values ]).T
343- df .columns = ["wind_u" , "wind_v" , "values" ]
344449 df = df .dropna ()
345450 df ["values" ] = df ["values" ] ** 0.5
346- X = df [["wind_v" , "wind_u" ]]
347- y = df ["values" ]
451+ X = df [["wind_v" , "wind_u" ]]. copy ()
452+ y = df ["values" ]. copy ()
348453
454+ # Fit the GAM
349455 gam = _fitGAM (X , y , degfree , lam )
350456
351- # Find largest absolute wind speed component and define bins
352- wind_comp_abs_max = np .max (
353- [np .abs (np .nanmin ([wind_u , wind_v ])), np .nanmax ([wind_u , wind_v ])]
354- )
355- wind_bins = np .linspace (- wind_comp_abs_max , wind_comp_abs_max , pred_res )
457+ # Make grid of predictions
356458 points_x , points_y = np .meshgrid (wind_bins , wind_bins , indexing = "ij" )
357459 points = np .vstack ([points_x .ravel (), points_y .ravel ()]).T
358460 pred = (
@@ -365,7 +467,7 @@ def BivariatePlotRawGAM(
365467 pred_res , pred_res
366468 )
367469 elif masking_method == "grid" :
368- grid = _aggValuesToGrid (wind_v , wind_u , values , wind_bins , pred_res , "mean" )
470+ grid , _ = _aggValuesToGrid (wind_v , wind_u , values , wind_bins , pred_res , "mean" )
369471 mask = np .ma .masked_invalid (grid ).mask
370472 pred = pred [:- 1 , :- 1 ]
371473 elif masking_method == "ch" :
0 commit comments