1+ import calendar
12import re
2- from collections import namedtuple
3+ import time
4+ from typing import NamedTuple , List , Union , Callable
5+ import datetime
6+ from itertools import count
37
48import numpy as np
59import scipy .sparse as sp
1418from . import _discretize
1519
1620__all__ = ["EqualFreq" , "EqualWidth" , "EntropyMDL" , "DomainDiscretizer" ,
17- "decimal_binnings" , "get_bins" ]
21+ "decimal_binnings" , "time_binnings" , "short_time_units" ,
22+ "BinDefinition" ]
1823
1924
2025class Discretizer (Transformation ):
@@ -174,14 +179,52 @@ def _split_eq_width(self, min, max):
174179 return [min + (i + 1 ) * dif for i in range (self .n - 1 )]
175180
176181
177- BinDefinition = namedtuple ("BinDefinition" , ("start" , "nbins" , "width" ))
class BinDefinition(NamedTuple):
    """Record describing a single binning: thresholds and their labels."""
    thresholds: np.ndarray       # bin boundaries, including the topmost one
    labels: List[str]            # human-friendly labels of the thresholds
    short_labels: List[str]      # abbreviated labels (e.g. simplified dates)
    width: Union[float, None]    # bin width when uniform; otherwise None
    width_label: str             # human-friendly width (e.g. '50' or '2 weeks')


# NamedTupleMeta does not permit defining __new__, hence this subclass.
# It intentionally reuses the class name so it matches the namedtuple's name.
# pylint: disable=function-redefined
class BinDefinition(BinDefinition):
    """
    `BinDefinition` with a constructor that derives labels from thresholds.

    Args:
        thresholds: bin boundaries, including the top boundary
        labels (str, Callable, list or None):
            a %-format string applied to every threshold (default: "%g"),
            a function called on every threshold, or anything else
            (e.g. a ready-made list of labels), which is stored as given
        short_labels (str, Callable, list or None):
            interpreted like `labels`; if `None`, `labels` are reused
        width (float or None): bin width, if bins are uniform
        width_label (str):
            label describing the width; if empty and `width` is given,
            the width formatted with "g" is used
    """
    def __new__(cls, thresholds, labels="%g",
                short_labels=None, width=None, width_label=""):

        def make_labels(spec, fallback=None):
            # Turn a label specification into labels for `thresholds`
            if spec is None:
                return fallback
            if isinstance(spec, str):
                return [spec % value for value in thresholds]
            if isinstance(spec, Callable):
                return list(map(spec, thresholds))
            return spec

        labels = make_labels(labels)
        short_labels = make_labels(short_labels, labels)
        if width is not None and not width_label:
            width_label = format(width, "g")
        return super().__new__(
            cls, thresholds, labels, short_labels, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """The number of bins, i.e. one less than the number of thresholds."""
        return len(self.thresholds) - 1
178221
179222
180223def decimal_binnings (
181224 data , * , min_width = 0 , min_bins = 2 , max_bins = 50 ,
182- min_unique = 5 , add_unique = None ,
183- factors = (20 , 10 , 5 , 2 , 1 , 0.5 , 0.25 , 0.2 , 0. 1 , 0.05 , 0.025 , 0.02 , 0.01 ),
184- return_defs = False ):
225+ min_unique = 5 , add_unique = 0 ,
226+ factors = (0.01 , 0.02 , 0.025 , 0.05 , 0. 1 , 0.2 , 0.25 , 0.5 , 1 , 2 , 5 , 10 , 20 ),
227+ label_fmt = "%g" ):
185228 """
186229 Find a set of nice splits of data into bins
187230
@@ -218,14 +261,16 @@ def decimal_binnings(
218261 the function returns a single binning that matches that values in
219262 the data
220263 add_unique (int):
221- similar to `min_unique` except that such bins are added to the list
264+ similar to `min_unique` except that such bins are added to the list;
265+ set to 0 to disable
222266 factors (list of float):
223267 The factors with which the scaling is multiplied. Default is
224- `(20, 10, 5, 2, 1, 0.5 , 0.25, 0.2, 0. 1, 0.05, 0.025, 0.02, 0.01 )`,
268+ `(0.01, 0.02, 0.025, 0.05, 0. 1, 0.2 , 0.25, 0.5, 1, 2, 5, 10, 20 )`,
225269 so if scaling is 1000, considered bin widths are 20000, 10000,
226270 5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227- return_defs (bool): If set to `True`, the function returns a list of
228- instances of `BinDefinition`, otherwise a list of bin boundaries.
271+ label_fmt (str or Callable):
272+ A format string (default: "%g") used for threshold labels,
273+ or a function for formatting thresholds (e.g. var.str_val)
229274
230275 Returns:
231276 bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +290,16 @@ def decimal_binnings(
245290
246291 This is returned if `return_defs` is `False`.
247292 """
248- def unique_bins ():
249- if len (unique ) >= 2 :
250- # make the last bin the same width as the one before
251- last_boundary = 2 * unique [- 1 ] - unique [- 2 ]
252- else :
253- last_boundary = unique [0 ] + 1
254- return BinDefinition (
255- unique [0 ], len (unique ), np .hstack ((unique , [last_boundary ])))
293+ bins = []
256294
257- unique = np .unique (data )
258- unique = unique [np .isfinite (unique )]
259- if not unique .size :
260- raise ValueError ("no valid (non-nan) data" )
261- mn , mx = unique [0 ], unique [- 1 ]
262- if mn == mx or len (unique ) <= min_unique :
263- bins = unique_bins ()
264- if not return_defs :
265- bins = get_bins (bins )
266- return [bins ]
295+ mn , mx , unique = _min_max_unique (data )
296+ if len (unique ) <= max (min_unique , add_unique ):
297+ bins .append (BinDefinition (_unique_thresholds (unique ), label_fmt ))
298+ if len (unique ) <= min_unique :
299+ return bins
267300
268301 diff = mx - mn
269302 f10 = 10 ** - np .floor (np .log10 (diff ))
270- bins = []
271303 max_bins = min (max_bins , len (unique ))
272304 for f in factors :
273305 width = f / f10
@@ -278,40 +310,180 @@ def unique_bins():
278310 nbins = np .round ((mx_ - mn_ ) / width )
279311 if min_bins <= nbins <= max_bins \
280312 and (not bins or bins [- 1 ].nbins != nbins ):
281- bins .append (BinDefinition (mn_ , nbins , width ))
313+ bin_def = BinDefinition (mn_ + width * np .arange (nbins + 1 ),
314+ label_fmt , None , width )
315+ bins .append (bin_def )
316+ return bins
282317
283- if add_unique is not None and len (unique ) <= add_unique :
284- if bins and bins [- 1 ].nbins == len (unique ):
285- del bins [- 1 ]
286- bins .append (unique_bins ())
287- if len (unique ) < min_unique :
288- del bins [:- 1 ]
289318
290- if not return_defs :
291- bins = [get_bins (bin ) for bin in bins ]
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
    """
    Find a set of nice splits of time variable data into bins

    The function considers bin widths of

    - 1, 5, 10, 15, 30 seconds,
    - 1, 5, 10, 15, 30 minutes,
    - 1, 2, 3, 6, 12 hours,
    - 1 day,
    - 1, 2 weeks,
    - 1, 2, 3, 6 months,
    - 1, 2, 5, 10, 25, 50, 100 years,

    and returns those that yield between `min_bins` and `max_bins` intervals.

    Args:
        data (np.ndarray):
            vector of data points, given as seconds since epoch; values may
            repeat, and nans and infs are filtered out.
        min_bins (int): minimal number of bins
        max_bins (int): maximal number of bins
        min_unique (int):
            regular binnings are computed only if the data contains more
            than `min_unique` unique values
        add_unique (int):
            if the number of unique values does not exceed
            `max(min_unique, add_unique)`, a binning with one bin per unique
            value is put at the beginning of the list

    Returns:
        bins (list of BinDefinition): a list of possible binnings

    Raises:
        ValueError: if the data contains no finite values
    """
    lowest, highest, unique = _min_max_unique(data)
    lowest, highest = time.gmtime(lowest), time.gmtime(highest)
    n_unique = len(unique)
    result = []
    if n_unique <= max(min_unique, add_unique):
        result.append(_unique_time_bins(unique))
    if n_unique > min_unique:
        # n bins are delimited by n + 1 threshold points
        result.extend(
            _time_binnings(lowest, highest, min_bins + 1, max_bins + 1))
    return result
359+
360+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Construct binnings for all candidate time steps that fit the limits.

    Args:
        mn (time.struct_time): the lowest time in the data
        mx (time.struct_time): the highest time in the data
        min_pts (int): minimal number of thresholds
        max_pts (int): maximal number of thresholds

    Returns:
        bins (list of BinDefinition): binnings ordered from the finest to
        the coarsest step; a binning with the same number of bins as the
        one added just before it is skipped
    """
    year_fmt = "%y " if mn.tm_year >= 1950 else "%Y "
    # Each candidate is (index within the time tuple, step, label format,
    # unit name); weeks are expressed as steps of 7 and 14 days.
    candidates = []
    candidates += [(5, n, "%H:%M:%S", "second") for n in (1, 5, 10, 15, 30)]
    candidates += [(4, n, "%b %d %H:%M", "minute") for n in (1, 5, 10, 15, 30)]
    candidates += [(3, n, year_fmt + "%b %d %H:%M", "hour")
                   for n in (1, 2, 3, 6, 12)]
    candidates += [(2, 1, year_fmt + "%b %d", "day")]
    candidates += [(2, n, year_fmt + "%b %d", "week") for n in (7, 14)]
    candidates += [(1, n, year_fmt + "%b", "month") for n in (1, 2, 3, 6)]
    candidates += [(0, n, year_fmt.strip(), "year")
                   for n in (1, 2, 5, 10, 25, 50, 100)]

    result = []
    for place, step, fmt, unit in candidates:
        points = _time_range(mn, mx, place, step, min_pts, max_pts)
        if not points:
            continue
        # Pad the 6-tuples with (wday, yday, isdst) to get struct_time
        structs = [time.struct_time(point + (0, 0, 0)) for point in points]
        thresholds = [calendar.timegm(struct) for struct in structs]
        labels = [time.strftime(fmt, struct) for struct in structs]
        short_labels = _simplified_labels(labels)
        if place == 2 and step >= 7:
            n_weeks = step // 7
            unit_label = f"{n_weeks} week{'s' * (step > 7)}"
        else:
            unit_label = f"{step} {unit}{'s' * (step > 1)}"
        candidate = BinDefinition(
            thresholds, labels, short_labels, None, unit_label)
        if not result or candidate.nbins != result[-1].nbins:
            result.append(candidate)
    return result
388+
389+
# datetime + timedelta is not very useful here because a timedelta is
# given as a number of days (and smaller units), not years or months,
# so it doesn't allow for specifying a step of 1 month
def _time_range(start, end, place, step, min_pts, max_pts,
                _zeros=(0, 1, 1, 0, 0, 0)):
    """
    Return equally spaced time points that cover the range `start`..`end`.

    Args:
        start (time.struct_time or tuple): the lowest time to cover
        end (time.struct_time or tuple): the highest time to cover
        place (int):
            index of the stepped field within
            (year, month, day, hour, minute, second)
        step (int):
            step size in units of the field at `place`; for `place == 2`,
            multiples of 7 denote weeks
        min_pts (int): minimal number of points
        max_pts (int): maximal number of points
        _zeros (tuple): the lowest valid value of each field

    Returns:
        points (list of tuple or None):
            6-tuples (year, month, day, hour, minute, second), the last of
            which is the first point after `end` (truncated to `place`);
            `None` if this takes fewer than `min_pts` or more than
            `max_pts` points
    """
    if place == 2 and step % 7 == 0:
        # Weekly steps start on the Monday on or before `start`, so the
        # first threshold never lies after the lowest data point
        first_day = datetime.date(*start[:3])
        first_day -= datetime.timedelta(days=first_day.weekday())
        current = [first_day.year, first_day.month, first_day.day, 0, 0, 0]
    else:
        # Round the stepped field down to a multiple of step (counted from
        # the field's lowest value) and reset all finer fields
        lowest = _zeros[place]
        aligned = (start[place] - lowest) // step * step + lowest
        current = list(
            tuple(start[:place]) + (aligned, ) + _zeros[place + 1:])
    end = list(tuple(end[:place + 1]) + _zeros[place + 1:])

    points = [tuple(current)]
    for _ in range(max_pts - 1):
        current[place] += step
        if place >= 3:  # hours, minutes, seconds
            # carry overflows from seconds to minutes to hours to days
            for pos, maxval in enumerate((60, 60, 24), start=1):
                if current[-pos] >= maxval:
                    current[-pos - 1] += 1
                    current[-pos] %= maxval
        if place >= 2:
            # carry day overflow into months
            month_len = _month_days(*current[:2])
            if current[2] > month_len:
                current[1] += 1
                current[2] %= month_len
        if current[1] > 12:
            # carry month overflow into years
            current[0] += 1
            current[1] %= 12
        points.append(tuple(current))
        if current > end:
            return points if len(points) >= min_pts else None
    return None


def _month_days(year, month,
                _md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
    """Return the number of days in the given month (1-12) of the year."""
    is_leap = year % 400 == 0 or (year % 4 == 0 and year % 100 != 0)
    return _md[month] + (month == 2 and is_leap)
431+
432+
def _simplified_labels(labels):
    """
    Shorten whitespace-separated labels by removing redundant leading parts.

    First, leading words shared by all labels are removed, as long as this
    leaves at least one word in every label. Then, in every label but the
    first, the leading words equal to those of the preceding (original)
    label are dropped.

    Args:
        labels (list of str): labels, e.g. formatted dates

    Returns:
        short_labels (list of str): a new list with simplified labels;
        the argument is not modified
    """
    labels = list(labels)
    if not labels:
        return labels

    # Dummy value that starts with a digit: nothing has been removed yet
    to_remove = "42"
    while True:
        words = [lab.split() for lab in labels]
        # Stop before any label would be reduced to an empty string
        if any(len(parts) < 2 for parts in words):
            break
        firsts = {parts[0] for parts in words}
        if len(firsts) > 1:
            break
        to_remove = firsts.pop()
        labels = [lab.split(None, 1)[1] for lab in labels]

    # Go backwards so that each label is compared with the unmodified
    # preceding label
    for i in range(len(labels) - 1, 0, -1):
        current = labels[i].split()
        previous = labels[i - 1].split()
        for k, (cur, prev) in enumerate(zip(current, previous)):
            if cur != prev:
                labels[i] = " ".join(current[k:])
                break

    # If the last thing removed were month names and the labels continue with
    # hours, keep month name in the first label; "08 12:29" looks awkward.
    if not to_remove[0].isdigit() and ":" in labels[0]:
        labels[0] = f"{to_remove} {labels[0]}"
    return labels
455+
456+
def _unique_time_bins(unique):
    """
    Construct a binning with a separate bin for each unique time point.

    Args:
        unique (np.ndarray):
            sorted unique time points, given as seconds since epoch

    Returns:
        bins (BinDefinition): thresholds are the unique values plus the top
        boundary; labels are dates, followed by hours and minutes if the
        points differ in anything but year and month, and by seconds if,
        in addition, any point is not a whole minute
    """
    times = [time.gmtime(x) for x in unique]
    # two-digit years are used only when the first point is in 1950 or later
    fmt = ("%y " if times[0][0] >= 1950 else "%Y ") + "%b %d"
    if len({t[2:] for t in times}) > 1:
        fmt += " %H:%M"
        # seconds are shown only when they carry information
        if np.any(unique % 60 != 0):
            fmt += ":%S"
    labels = [time.strftime(fmt, x) for x in times]
    short_labels = _simplified_labels(labels)
    return BinDefinition(_unique_thresholds(unique), labels, short_labels)
465+
466+
def _unique_thresholds(unique):
    """
    Return thresholds that put each of the given unique values in its own bin.

    Args:
        unique (np.ndarray): sorted unique values; must not be empty

    Returns:
        thresholds (np.ndarray): the unique values followed by the top
        boundary of the last bin
    """
    if len(unique) < 2:
        top = unique[0] + 1
    else:
        # the last bin gets the same width as the one before it
        top = 2 * unique[-1] - unique[-2]
    return np.hstack((unique, [top]))
474+
475+
def _min_max_unique(data):
    """
    Return the minimum, the maximum and the sorted unique finite values.

    Args:
        data (np.ndarray): data points; nans and infs are ignored

    Returns:
        (min, max, unique): the lowest and the highest finite value, and
        an array of all unique finite values

    Raises:
        ValueError: if the data contains no finite values
    """
    values = np.unique(data)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        raise ValueError("no valid (non-nan) data")
    return finite[0], finite[-1], finite
482+
483+
# Three-letter abbreviations of names of time units
short_time_units = {
    "seconds": "sec", "minutes": "min", "hours": "hrs",
    "weeks": "wks", "months": "mon", "years": "yrs",
    "second": "sec", "minute": "min", "month": "mon",
}
315487
316488
317489# noinspection PyPep8Naming
0 commit comments