1+ import calendar
12import re
2- from collections import namedtuple
3+ import time
4+ from typing import NamedTuple , List , Union , Callable
5+ import datetime
6+ from itertools import count
37
48import numpy as np
59import scipy .sparse as sp
1418from . import _discretize
1519
1620__all__ = ["EqualFreq" , "EqualWidth" , "EntropyMDL" , "DomainDiscretizer" ,
17- "decimal_binnings" , "get_bins " ]
21+ "decimal_binnings" , "time_binnings" , "short_time_units " ]
1822
1923
2024class Discretizer (Transformation ):
@@ -174,14 +178,39 @@ def _split_eq_width(self, min, max):
174178 return [min + (i + 1 ) * dif for i in range (self .n - 1 )]
175179
176180
177- BinDefinition = namedtuple ("BinDefinition" , ("start" , "nbins" , "width" ))
class BinDefinition(NamedTuple):
    """Description of one binning: thresholds plus their display labels."""
    thresholds: np.ndarray      # thresholds, including the top
    labels: List[str]           # friendly-formatted thresholds
    width: Union[float, None]   # widths, if uniform; otherwise None
    width_label: str            # friendly-formatted width (e.g. '50' or '2 weeks')


# NamedTupleMeta doesn't allow to define __new__ so we need a subclass
# Name of the class has to be the same to match the namedtuple name
# pylint: disable=function-redefined
class BinDefinition(BinDefinition):
    """
    A named tuple `(thresholds, labels, width, width_label)` with a
    constructor that can derive labels and the width label.

    Args:
        thresholds: bin boundaries, including the top boundary of the last bin
        labels (str or callable or list of str):
            a %-format string applied to each threshold (default: "%g"),
            a function called with each threshold, or ready-made labels
            that are stored as they are
        width (float or None): bin width if uniform, otherwise `None`
        width_label (str):
            label describing the width; if empty and `width` is given,
            it is set to `width` formatted with `g`
    """
    def __new__(cls, thresholds, labels="%g", width=None, width_label=""):
        if isinstance(labels, str):
            labels = [labels % x for x in thresholds]
        elif callable(labels):
            labels = [labels(x) for x in thresholds]
        if not width_label and width is not None:
            width_label = f"{width:g}"
        return super().__new__(cls, thresholds, labels, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold, i.e. the beginning of the first bin."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """Number of bins; one less than the number of thresholds."""
        return len(self.thresholds) - 1
178207
179208
180209def decimal_binnings (
181210 data , * , min_width = 0 , min_bins = 2 , max_bins = 50 ,
182- min_unique = 5 , add_unique = None ,
183- factors = (20 , 10 , 5 , 2 , 1 , 0.5 , 0.25 , 0.2 , 0. 1 , 0.05 , 0.025 , 0.02 , 0.01 ),
184- return_defs = False ):
211+ min_unique = 5 , add_unique = 0 ,
212+ factors = (0.01 , 0.02 , 0.025 , 0.05 , 0. 1 , 0.2 , 0.25 , 0.5 , 1 , 2 , 5 , 10 , 20 ),
213+ label_fmt = "%g" ):
185214 """
186215 Find a set of nice splits of data into bins
187216
@@ -218,14 +247,16 @@ def decimal_binnings(
218247 the function returns a single binning that matches that values in
219248 the data
220249 add_unique (int):
221- similar to `min_unique` except that such bins are added to the list
250+ similar to `min_unique` except that such bins are added to the list;
251+ set to 0 to disable
222252 factors (list of float):
223253 The factors with which the scaling is multiplied. Default is
224- `(20, 10, 5, 2, 1, 0.5 , 0.25, 0.2, 0. 1, 0.05, 0.025, 0.02, 0.01 )`,
254+ `(0.01, 0.02, 0.025, 0.05, 0. 1, 0.2 , 0.25, 0.5, 1, 2, 5, 10, 20 )`,
225255 so if scaling is 1000, considered bin widths are 20000, 10000,
226256 5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227- return_defs (bool): If set to `True`, the function returns a list of
228- instances of `BinDefinition`, otherwise a list of bin boundaries.
257+ label_fmt (str or Callable):
258+ A format string (default: "%g") used for threshold labels,
259+ or a function for formatting thresholds (e.g. var.str_val)
229260
230261 Returns:
231262 bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +276,16 @@ def decimal_binnings(
245276
246277 This is returned if `return_defs` is `False`.
247278 """
248- def unique_bins ():
249- if len (unique ) >= 2 :
250- # make the last bin the same width as the one before
251- last_boundary = 2 * unique [- 1 ] - unique [- 2 ]
252- else :
253- last_boundary = unique [0 ] + 1
254- return BinDefinition (
255- unique [0 ], len (unique ), np .hstack ((unique , [last_boundary ])))
279+ bins = []
256280
257- unique = np .unique (data )
258- unique = unique [np .isfinite (unique )]
259- if not unique .size :
260- raise ValueError ("no valid (non-nan) data" )
261- mn , mx = unique [0 ], unique [- 1 ]
262- if mn == mx or len (unique ) <= min_unique :
263- bins = unique_bins ()
264- if not return_defs :
265- bins = get_bins (bins )
266- return [bins ]
281+ mn , mx , unique = _min_max_unique (data )
282+ if len (unique ) <= max (min_unique , add_unique ):
283+ bins .append (BinDefinition (_unique_thresholds (unique ), label_fmt ))
284+ if len (unique ) <= min_unique :
285+ return bins
267286
268287 diff = mx - mn
269288 f10 = 10 ** - np .floor (np .log10 (diff ))
270- bins = []
271289 max_bins = min (max_bins , len (unique ))
272290 for f in factors :
273291 width = f / f10
@@ -278,40 +296,174 @@ def unique_bins():
278296 nbins = np .round ((mx_ - mn_ ) / width )
279297 if min_bins <= nbins <= max_bins \
280298 and (not bins or bins [- 1 ].nbins != nbins ):
281- bins .append (BinDefinition (mn_ , nbins , width ))
299+ bin_def = BinDefinition (mn_ + width * np .arange (nbins + 1 ),
300+ label_fmt , width )
301+ bins .append (bin_def )
302+ return bins
282303
283- if add_unique is not None and len (unique ) <= add_unique :
284- if bins and bins [- 1 ].nbins == len (unique ):
285- del bins [- 1 ]
286- bins .append (unique_bins ())
287- if len (unique ) < min_unique :
288- del bins [:- 1 ]
289304
290- if not return_defs :
291- bins = [get_bins (bin ) for bin in bins ]
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5,
                  add_unique=0):
    """
    Find a set of nice splits of time variable data into bins

    The function considers bin widths of

    - 1, 5, 10, 15, 30 seconds,
    - 1, 5, 10, 15, 30 minutes,
    - 1, 2, 3, 6, 12 hours,
    - 1 day,
    - 1, 2 weeks,
    - 1, 2, 3, 6 months,
    - 1, 2, 5, 10, 25, 50, 100 years,

    and returns those that yield between `min_bins` and `max_bins` intervals.

    Args:
        data (np.ndarray):
            vector of data points (seconds since epoch); values may repeat,
            and nans and infs are filtered out.
        min_bins (int): minimal number of bins
        max_bins (int): maximal number of bins
        min_unique (int):
            if the number of unique values is smaller or equal to this,
            the function returns a single binning with one bin per unique
            value
        add_unique (int):
            similar to `min_unique` except that such a binning is put in
            front of the regular binnings; set to 0 to disable

    Returns:
        bins (list of BinDefinition): a list of possible binnings.
            Thresholds are given in seconds since epoch; `labels` are their
            formatted counterparts and `width_label` describes the bin
            size (e.g. `2 weeks`).

    Raises:
        ValueError: if data contains no finite values
    """
    mn, mx, unique = _min_max_unique(data)
    mn, mx = time.gmtime(mn), time.gmtime(mx)
    bins = []
    if len(unique) <= max(min_unique, add_unique):
        bins.append(_unique_time_bins(unique))
    if len(unique) > min_unique:
        # n bins are delimited by n + 1 points
        bins += _time_binnings(mn, mx, min_bins + 1, max_bins + 1)
    return bins
345+
346+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Return binnings with calendar-aligned thresholds for all suitable units.

    Args:
        mn (time.struct_time): the lowest time in the data
        mx (time.struct_time): the highest time in the data
        min_pts (int): minimal number of thresholds
        max_pts (int): maximal number of thresholds

    Returns:
        list of BinDefinition: one per step that gives an acceptable number
        of thresholds; a binning with the same number of bins as the
        previously added one is skipped
    """
    # two-digit years are used unless data reaches before 1950
    yfmt = "%y " if mn.tm_year >= 1950 else "%Y "
    # (index of the stepped field in the time tuple, step, label format, unit)
    candidates = (
        [(5, x, "%H:%M:%S", "second") for x in (1, 5, 10, 15, 30)] +
        [(4, x, "%b %d %H:%M", "minute") for x in (1, 5, 10, 15, 30)] +
        [(3, x, yfmt + "%b %d %H:%M", "hour") for x in (1, 2, 3, 6, 12)] +
        [(2, 1, yfmt + "%b %d", "day")] +
        [(2, x, yfmt + "%b %d", "week") for x in (7, 14)] +
        [(1, x, yfmt + "%b", "month") for x in (1, 2, 3, 6)] +
        [(0, x, yfmt.strip(), "year") for x in (1, 2, 5, 10, 25, 50, 100)])

    bins = []
    for place, step, fmt, unit in candidates:
        times = _time_range(mn, mx, place, step, min_pts, max_pts)
        if not times:
            continue
        # pad 6-tuples to the 9 fields required by struct_time
        times = [time.struct_time(t + (0, 0, 0)) for t in times]
        thresholds = [calendar.timegm(t) for t in times]
        labels = _simplified_labels([time.strftime(fmt, t) for t in times])
        if place == 2 and step >= 7:
            # weeks are stepped in days, so the step is converted to weeks
            unit_label = f"{step // 7} week{'s' * (step > 7)}"
        else:
            unit_label = f"{step} {unit}{'s' * (step > 1)}"
        new_bins = BinDefinition(thresholds, labels, None, unit_label)
        if not bins or new_bins.nbins != bins[-1].nbins:
            bins.append(new_bins)
    return bins
372+
373+
# datetime + timedelta is not very useful here because timedelta is
# given a number of days, not years or months, so it doesn't allow
# for specifying a step of 1 month
def _time_range(start, end, place, step, min_pts, max_pts,
                _zeros=(0, 1, 1, 0, 0, 0)):
    """
    Return equally stepped, calendar-aligned time points covering an interval.

    Args:
        start: time tuple (year, month, day, hour, minute, second, ...),
            e.g. `time.struct_time`, of the lowest time to cover
        end: time tuple of the highest time to cover
        place (int): index of the stepped field (0 = year, ..., 5 = second)
        step (int): step size in units of the field at `place`; for
            `place == 2`, multiples of 7 denote weeks that start on Monday
        min_pts (int): minimal number of points
        max_pts (int): maximal number of points
        _zeros: the lowest values of fields (months and days start with 1)

    Returns:
        list of 6-tuples (year, month, day, hour, minute, second) in which
        the first point is at or before `start` and only the last point
        is after `end` truncated to `place`; `None` if the number of points
        would be below `min_pts` or above `max_pts`.
    """
    if place == 2 and step % 7 == 0:
        # weekly bins begin with the Monday on or before the start
        startd = datetime.date(*start[:3])
        startd -= datetime.timedelta(days=startd.weekday())
        start = [startd.year, startd.month, startd.day, 0, 0, 0]
    else:
        # round the stepped field down to a multiple of step (counted from
        # the field's lowest value) and reset all less significant fields
        start = list(
            tuple(start[:place])
            + ((start[place] - _zeros[place]) // step * step + _zeros[place], )
            + _zeros[place + 1:])
    end = list(tuple(end[:place + 1]) + _zeros[place + 1:])
    s = [tuple(start)]
    # the first point is already in the list, hence max_pts - 1 more at most
    for _ in range(max_pts - 1):
        start[place] += step
        if place >= 3:  # hours, minutes, seconds
            # carry overflows from seconds to minutes to hours to days
            for pos, maxval in enumerate((60, 60, 24), start=1):
                if start[-pos] >= maxval:
                    start[-pos - 1] += 1
                    start[-pos] %= maxval
        if place >= 2:
            md = calendar.monthrange(start[0], start[1])[1]
            if start[2] > md:
                start[1] += 1
                start[2] %= md
        if start[1] > 12:
            start[0] += 1
            start[1] %= 12
        s.append(tuple(start))
        if start > end:
            return s if len(s) >= min_pts else None
    return None
409+
410+
def _month_days(year, month,
                _md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
    """Return the number of days in the given month (1-12) of the year."""
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    # February gets an extra day in leap years
    return _md[month] + (month == 2 and is_leap)
415+
416+
def _simplified_labels(labels):
    """
    Shorten space-separated time labels by removing redundant parts.

    Leading parts that are common to all labels are removed. After that,
    each label except the first loses the leading parts that are the same
    as in the preceding label. The given list is not modified.

    Args:
        labels (list of str): labels, e.g. `["20 Jan 01", "20 Jan 02"]`

    Returns:
        list of str: simplified labels, e.g. `["01", "02"]`
    """
    labels = list(labels)  # the caller's list must stay intact
    to_remove = "42"
    while labels:
        parts = [lab.split() for lab in labels]
        # stop if removing the first part would leave some label empty
        if any(len(lab_parts) < 2 for lab_parts in parts):
            break
        firsts = {lab_parts[0] for lab_parts in parts}
        if len(firsts) > 1:
            break
        to_remove = firsts.pop()
        flen = len(to_remove) + 1
        labels = [lab[flen:] for lab in labels]
    # Go backwards so that each label is compared with the unmodified
    # preceding label
    for i in range(len(labels) - 1, 0, -1):
        current = labels[i].split()
        for k, (c, d) in enumerate(zip(current, labels[i - 1].split())):
            if c != d:
                labels[i] = " ".join(current[k:])
                break
    # If the last thing removed were month names and the labels continues with
    # hours, keep month name in the first label; "08 12:29" looks awkward.
    if labels and not to_remove[0].isdigit() and ":" in labels[0]:
        labels[0] = f"{to_remove} {labels[0]}"
    return labels
436+
437+
def _unique_time_bins(unique):
    """
    Return a binning in which every unique time value has its own bin.

    Args:
        unique (np.ndarray): sorted unique values in seconds since epoch

    Returns:
        BinDefinition: thresholds are the unique values followed by the
        top boundary; labels are formatted times (UTC), one per unique value
    """
    times = [time.gmtime(x) for x in unique]
    # two-digit years are used unless data reaches before 1950
    fmt = f'{"%y" if times[0].tm_year >= 1950 else "%Y"} %b %d'
    # add time of day if values differ in anything but year and month
    fmt += " %H:%M" * (len({t[2:] for t in times}) > 1)
    # NOTE(review): seconds are shown when all values are whole minutes,
    # which looks inverted; condition kept as it was -- confirm intent
    fmt += ":%S" * bool(np.all(unique % 60 == 0))
    return BinDefinition(_unique_thresholds(unique),
                         [time.strftime(fmt, t) for t in times])
445+
446+
def _unique_thresholds(unique):
    """
    Return `unique` with the top boundary of the last bin appended.

    The last bin is as wide as the one before it; if there is only
    a single value, the bin's width is 1.
    """
    if len(unique) < 2:
        top = unique[0] + 1
    else:
        top = 2 * unique[-1] - unique[-2]
    return np.hstack((unique, [top]))
454+
455+
def _min_max_unique(data):
    """
    Return the minimum, the maximum and sorted unique finite values of data.

    Raises:
        ValueError: if data contains no finite values
    """
    values = np.unique(data)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        raise ValueError("no valid (non-nan) data")
    return finite[0], finite[-1], finite
462+
463+
# Abbreviations of (plural and singular) names of time units
short_time_units = {
    "seconds": "sec",
    "minutes": "min",
    "hours": "hrs",
    "weeks": "wks",
    "months": "mon",
    "years": "yrs",
    "second": "sec",
    "minute": "min",
    "month": "mon",
}
315467
316468
317469# noinspection PyPep8Naming
0 commit comments