1+ import calendar
12import re
2- from collections import namedtuple
3+ import time
4+ from typing import NamedTuple , List , Union , Callable
5+ import datetime
6+ from itertools import count
37
48import numpy as np
59import scipy .sparse as sp
1418from . import _discretize
1519
1620__all__ = ["EqualFreq" , "EqualWidth" , "EntropyMDL" , "DomainDiscretizer" ,
17- "decimal_binnings" , "get_bins " ]
21+ "decimal_binnings" , "time_binnings" , "short_time_units " ]
1822
1923
2024class Discretizer (Transformation ):
@@ -174,14 +178,40 @@ def _split_eq_width(self, min, max):
174178 return [min + (i + 1 ) * dif for i in range (self .n - 1 )]
175179
176180
177- BinDefinition = namedtuple ("BinDefinition" , ("start" , "nbins" , "width" ))
class BinDefinition(NamedTuple):
    """A single binning: its thresholds and their display metadata."""
    thresholds: np.ndarray     # bin boundaries, including the top one
    labels: List[str]          # human-readable form of each threshold
    width: Union[float, None]  # common bin width; None if bins are not uniform
    width_label: str           # human-readable width (e.g. '50' or '2 weeks')


# NamedTupleMeta does not allow defining __new__, hence this subclass.
# It deliberately reuses the class name so that it matches the namedtuple name.
# pylint: disable=function-redefined
class BinDefinition(BinDefinition):
    """
    `BinDefinition` with defaults and automatic formatting of labels.

    Args:
        thresholds: bin boundaries, including the top one
        labels (str, callable or list of str): a %-format string applied to
            every threshold (default: "%g"), a function called on every
            threshold, or ready-made labels, which are stored unchanged
        width (float or None): common bin width, if there is one
        width_label (str): label for the width; when empty and `width` is
            given, the width is formatted with the "g" format
    """
    def __new__(cls, thresholds, labels="%g", width=None, width_label=""):
        if isinstance(labels, str):
            template = labels
            labels = [template % value for value in thresholds]
        elif callable(labels):
            formatter = labels
            labels = [formatter(value) for value in thresholds]
        if width is not None and not width_label:
            width_label = format(width, "g")
        return super().__new__(cls, thresholds, labels, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold, i.e. the lower edge of the first bin."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """Number of bins; one less than the number of thresholds."""
        return len(self.thresholds) - 1
178208
179209
180210def decimal_binnings (
181211 data , * , min_width = 0 , min_bins = 2 , max_bins = 50 ,
182- min_unique = 5 , add_unique = None ,
183- factors = (20 , 10 , 5 , 2 , 1 , 0.5 , 0.25 , 0.2 , 0. 1 , 0.05 , 0.025 , 0.02 , 0.01 ),
184- return_defs = False ):
212+ min_unique = 5 , add_unique = 0 ,
213+ factors = (0.01 , 0.02 , 0.025 , 0.05 , 0. 1 , 0.2 , 0.25 , 0.5 , 1 , 2 , 5 , 10 , 20 ),
214+ label_fmt = "%g" ):
185215 """
186216 Find a set of nice splits of data into bins
187217
@@ -218,14 +248,16 @@ def decimal_binnings(
218248 the function returns a single binning that matches that values in
219249 the data
220250 add_unique (int):
221- similar to `min_unique` except that such bins are added to the list
251+ similar to `min_unique` except that such bins are added to the list;
252+ set to 0 to disable
222253 factors (list of float):
223254 The factors with which the scaling is multiplied. Default is
224- `(20, 10, 5, 2, 1, 0.5 , 0.25, 0.2, 0. 1, 0.05, 0.025, 0.02, 0.01 )`,
255+ `(0.01, 0.02, 0.025, 0.05, 0. 1, 0.2 , 0.25, 0.5, 1, 2, 5, 10, 20 )`,
225256 so if scaling is 1000, considered bin widths are 20000, 10000,
226257 5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227- return_defs (bool): If set to `True`, the function returns a list of
228- instances of `BinDefinition`, otherwise a list of bin boundaries.
258+ label_fmt (str or Callable):
259+ A format string (default: "%g") used for threshold labels,
260+ or a function for formatting thresholds (e.g. var.str_val)
229261
230262 Returns:
231263 bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +277,16 @@ def decimal_binnings(
245277
246278 This is returned if `return_defs` is `False`.
247279 """
248- def unique_bins ():
249- if len (unique ) >= 2 :
250- # make the last bin the same width as the one before
251- last_boundary = 2 * unique [- 1 ] - unique [- 2 ]
252- else :
253- last_boundary = unique [0 ] + 1
254- return BinDefinition (
255- unique [0 ], len (unique ), np .hstack ((unique , [last_boundary ])))
280+ bins = []
256281
257- unique = np .unique (data )
258- unique = unique [np .isfinite (unique )]
259- if not unique .size :
260- raise ValueError ("no valid (non-nan) data" )
261- mn , mx = unique [0 ], unique [- 1 ]
262- if mn == mx or len (unique ) <= min_unique :
263- bins = unique_bins ()
264- if not return_defs :
265- bins = get_bins (bins )
266- return [bins ]
282+ mn , mx , unique = _min_max_unique (data )
283+ if len (unique ) <= max (min_unique , add_unique ):
284+ bins .append (BinDefinition (_unique_thresholds (unique ), label_fmt ))
285+ if len (unique ) <= min_unique :
286+ return bins
267287
268288 diff = mx - mn
269289 f10 = 10 ** - np .floor (np .log10 (diff ))
270- bins = []
271290 max_bins = min (max_bins , len (unique ))
272291 for f in factors :
273292 width = f / f10
@@ -278,40 +297,174 @@ def unique_bins():
278297 nbins = np .round ((mx_ - mn_ ) / width )
279298 if min_bins <= nbins <= max_bins \
280299 and (not bins or bins [- 1 ].nbins != nbins ):
281- bins .append (BinDefinition (mn_ , nbins , width ))
300+ bin_def = BinDefinition (mn_ + width * np .arange (nbins + 1 ),
301+ label_fmt , width )
302+ bins .append (bin_def )
303+ return bins
282304
283- if add_unique is not None and len (unique ) <= add_unique :
284- if bins and bins [- 1 ].nbins == len (unique ):
285- del bins [- 1 ]
286- bins .append (unique_bins ())
287- if len (unique ) < min_unique :
288- del bins [:- 1 ]
289305
290- if not return_defs :
291- bins = [get_bins (bin ) for bin in bins ]
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
    """
    Find a set of nice splits of time variable data into bins

    The function considers bin widths of

    - 1, 5, 10, 15, 30 seconds,
    - 1, 5, 10, 15, 30 minutes,
    - 1, 2, 3, 6, 12 hours,
    - 1 day,
    - 1, 2 weeks,
    - 1, 2, 3, 6 months,
    - 1, 2, 5, 10, 25, 50, 100 years,

    and returns those that yield between `min_bins` and `max_bins` intervals.

    Args:
        data (np.ndarray):
            vector of data points, given as seconds since the epoch (they are
            converted with `time.gmtime`); values may repeat, and nans and
            infs are filtered out.
        min_bins (int): minimal number of bins
        max_bins (int): maximal number of bins
        min_unique (int):
            if the number of unique values does not exceed `min_unique`,
            the only binning returned is the one with a bin per unique value
        add_unique (int):
            if the number of unique values does not exceed `add_unique`,
            the binning with a bin per unique value is put first in the list;
            set to 0 to disable

    Returns:
        bins (list of BinDefinition): a list of possible binnings.
            Thresholds are numbers of seconds since the epoch, labels are
            their formatted counterparts, and `width_label` describes the
            bin size (e.g. `2 weeks`) for the regular binnings.

    Raises:
        ValueError: if `data` contains no finite values
    """
    first, last, distinct = _min_max_unique(data)
    first, last = time.gmtime(first), time.gmtime(last)
    n_distinct = len(distinct)

    result = []
    if n_distinct <= max(min_unique, add_unique):
        result.append(_unique_time_bins(distinct))
    if n_distinct > min_unique:
        # n bins are delimited by n + 1 thresholds
        result.extend(_time_binnings(first, last, min_bins + 1, max_bins + 1))
    return result
346+
347+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Collect the regular time binnings that fit between `mn` and `mx`.

    Args:
        mn, mx (time.struct_time): the earliest and the latest time point
        min_pts, max_pts (int): the allowed number of thresholds

    Returns:
        list of BinDefinition: one per candidate step for which `_time_range`
        yields a result; a candidate with the same number of bins as the
        previously accepted one is skipped.
    """
    # two-digit years suffice unless the data starts before 1950
    year_fmt = "%y " if mn.tm_year >= 1950 else "%Y "
    sub_hour_steps = (1, 5, 10, 15, 30)

    # (index of the stepped field, step, label format, unit name)
    candidates = []
    candidates += [(5, n, "%H:%M:%S", "second") for n in sub_hour_steps]
    candidates += [(4, n, "%b %d %H:%M", "minute") for n in sub_hour_steps]
    candidates += [(3, n, year_fmt + "%b %d %H:%M", "hour")
                   for n in (1, 2, 3, 6, 12)]
    candidates += [(2, 1, year_fmt + "%b %d", "day")]
    candidates += [(2, n, year_fmt + "%b %d", "week") for n in (7, 14)]
    candidates += [(1, n, year_fmt + "%b", "month") for n in (1, 2, 3, 6)]
    candidates += [(0, n, year_fmt.strip(), "year")
                   for n in (1, 2, 5, 10, 25, 50, 100)]

    found = []
    for place, step, fmt, unit in candidates:
        points = _time_range(mn, mx, place, step, min_pts, max_pts)
        if not points:
            continue
        # pad (y, m, d, H, M, S) with wday, yday and isdst to get struct_time
        structs = [time.struct_time(point + (0, 0, 0)) for point in points]
        thresholds = [calendar.timegm(struct) for struct in structs]
        labels = _simplified_labels(
            [time.strftime(fmt, struct) for struct in structs])
        if place == 2 and step >= 7:
            # weeks are stepped in days, but described in weeks
            plural = "s" if step > 7 else ""
            unit_label = f"{step // 7} week{plural}"
        else:
            plural = "s" if step > 1 else ""
            unit_label = f"{step} {unit}{plural}"
        candidate = BinDefinition(thresholds, labels, None, unit_label)
        if not found or candidate.nbins != found[-1].nbins:
            found.append(candidate)
    return found
373+
374+
# datetime + timedelta is not very useful here because a timedelta is
# expressed in days (and smaller units), not in months or years, so it
# cannot represent a step of one calendar month.
def _time_range(start, end, place, step, min_pts, max_pts,
                _zeros=(0, 1, 1, 0, 0, 0)):
    """
    Return calendar points from `start` past `end`, spaced by `step`.

    Args:
        start, end (time.struct_time or sequence):
            the first six items must be year, month, day, hour, minute, second
        place (int):
            index of the field that is stepped (0 = year, ..., 5 = second)
        step (int):
            step size in units of that field; weeks are given as `place=2`
            with a step that is a multiple of 7
        min_pts, max_pts (int): the allowed number of returned points
        _zeros (tuple): the lowest value of each field; not meant to be set

    Returns:
        list of 6-tuples `(year, month, day, hour, minute, second)` or None.
        The first point is `start` rounded down to a multiple of `step`
        (for weeks: to Monday), and the last one is the first point that is
        later than `end` truncated to `place`. The result is None if that
        would take more than `max_pts` points or gives fewer than `min_pts`.
    """
    if place == 2 and step % 7 == 0:
        # Weeks begin on Monday: go *back* by the number of days since Monday,
        # so that the first threshold is never later than the data
        startd = datetime.date(*start[:3])
        startd -= datetime.timedelta(days=startd.weekday())
        start = [startd.year, startd.month, startd.day, 0, 0, 0]
    else:
        # Round the stepped field down to a multiple of step (counted from
        # the field's lowest value) and reset all lower-order fields
        start = list(
            tuple(start[:place])
            + ((start[place] - _zeros[place]) // step * step + _zeros[place], )
            + _zeros[place + 1:])
    end = list(tuple(end[:place + 1]) + _zeros[place + 1:])
    s = [tuple(start)]
    for _ in range(max_pts - 1):
        start[place] += step
        if place >= 3:  # hours, minutes, seconds
            # carry seconds into minutes, minutes into hours, hours into days
            for pos, maxval in enumerate((60, 60, 24), start=1):
                if start[-pos] >= maxval:
                    start[-pos - 1] += 1
                    start[-pos] %= maxval
        if place >= 2:
            # carry days into months
            md = _month_days(*start[:2])
            if start[2] > md:
                start[1] += 1
                start[2] %= md
        if start[1] > 12:
            # carry months into years
            start[0] += 1
            start[1] %= 12
        s.append(tuple(start))
        if start > end:
            return s if len(s) >= min_pts else None
    return None


def _month_days(year, month,
                _md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
    """Return the number of days in the month, with 29 for a leap February."""
    return _md[month] + (
        month == 2 and (year % 400 == 0 or year % 4 == 0 and year % 100 != 0))
416+
417+
def _simplified_labels(labels):
    """
    Shorten space-separated time labels by dropping redundant leading parts.

    First, leading parts that are the same in all labels (e.g. the year) are
    removed from all of them, but never the last remaining part of a label.
    Then each label except the first loses the leading parts it shares with
    the label before it.

    Args:
        labels (list of str): labels whose parts are separated by single spaces

    Returns:
        list of str: a new list of shortened labels; the argument is not
        modified. An empty list gives an empty list.
    """
    labels = list(labels)  # the loops below assign to items; keep input intact
    if not labels:
        return labels

    # Holds the last removed common part; starts as an arbitrary number so
    # that the test for a (non-numeric) month name below fails by default
    to_remove = "42"
    while True:
        split = [lab.split() for lab in labels]
        firsts = {parts[0] for parts in split if parts}
        # Stop when labels begin differently, or when removing the common
        # part would leave some label empty
        if len(firsts) != 1 or any(len(parts) < 2 for parts in split):
            break
        to_remove = firsts.pop()
        flen = len(to_remove) + 1
        labels = [lab[flen:] for lab in labels]
    # Go backwards so each label is compared with its unshortened predecessor
    for i in range(len(labels) - 1, 0, -1):
        current, previous = labels[i].split(), labels[i - 1].split()
        for k, (c, d) in enumerate(zip(current, previous)):
            if c != d:
                labels[i] = " ".join(current[k:])
                break
    # If the last thing removed were month names and the labels continues with
    # hours, keep month name in the first label; "08 12:29" looks awkward.
    if not to_remove[0].isdigit() and ":" in labels[0]:
        labels[0] = f"{to_remove} {labels[0]}"
    return labels
437+
438+
def _unique_time_bins(unique):
    """
    Return a binning with a separate bin for each of the given time points.

    Labels always contain the date. Hours and minutes are added if any time
    point is not at midnight, and seconds are added if any time point does
    not fall on a whole minute.

    Args:
        unique (np.ndarray): sorted unique time points, in seconds since epoch

    Returns:
        BinDefinition: thresholds from `_unique_thresholds` with date/time
        labels for the time points
    """
    times = [time.gmtime(x) for x in unique]
    # two-digit years suffice unless the data starts before 1950
    fmt = "%y %b %d" if times[0].tm_year >= 1950 else "%Y %b %d"
    has_seconds = any(t.tm_sec for t in times)
    has_time = has_seconds or any(t.tm_hour or t.tm_min for t in times)
    if has_time:
        fmt += " %H:%M"
    if has_seconds:
        # only ever appended after %H:%M, since has_seconds implies has_time
        fmt += ":%S"
    return BinDefinition(_unique_thresholds(unique),
                         [time.strftime(fmt, x) for x in times])
446+
447+
def _unique_thresholds(unique):
    """
    Return `unique` extended with an upper boundary for the last value.

    With two or more values, the added boundary makes the last bin as wide
    as the one before it; a single value gets a bin of width 1.
    """
    if len(unique) < 2:
        closing = unique[0] + 1
    else:
        last, before_last = unique[-1], unique[-2]
        closing = 2 * last - before_last
    return np.hstack((unique, [closing]))
455+
456+
def _min_max_unique(data):
    """
    Return the smallest and the largest finite value and all unique finite
    values of `data`, in increasing order.

    Raises:
        ValueError: if `data` contains no finite values
    """
    values = np.unique(data)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        raise ValueError("no valid (non-nan) data")
    lowest, highest = finite[0], finite[-1]
    return lowest, highest, finite
463+
464+
# Abbreviations of the time unit names that appear in width labels
short_time_units = {
    "seconds": "sec",
    "minutes": "min",
    "hours": "hrs",
    "weeks": "wks",
    "months": "mon",
    "years": "yrs",
    "second": "sec",
    "minute": "min",
    "month": "mon",
}
315468
316469
317470# noinspection PyPep8Naming
0 commit comments