Skip to content

Commit 4b3f94c

Browse files
authored
Merge pull request #4123 from janezd/discretize-datetime-labels
[ENH] Nice binning of time variables (Distributions, SOM)
2 parents 4df8324 + 29f8b43 commit 4b3f94c

File tree

6 files changed

+1007
-88
lines changed

6 files changed

+1007
-88
lines changed

Orange/preprocess/discretize.py

Lines changed: 226 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,9 @@
1+
import calendar
12
import re
2-
from collections import namedtuple
3+
import time
4+
from typing import NamedTuple, List, Union, Callable
5+
import datetime
6+
from itertools import count
37

48
import numpy as np
59
import scipy.sparse as sp
@@ -14,7 +18,8 @@
1418
from . import _discretize
1519

1620
__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer",
17-
"decimal_binnings", "get_bins"]
21+
"decimal_binnings", "time_binnings", "short_time_units",
22+
"BinDefinition"]
1823

1924

2025
class Discretizer(Transformation):
@@ -174,14 +179,52 @@ def _split_eq_width(self, min, max):
174179
return [min + (i + 1) * dif for i in range(self.n - 1)]
175180

176181

177-
BinDefinition = namedtuple("BinDefinition", ("start", "nbins", "width"))
182+
class BinDefinition(NamedTuple):
    """
    Description of one binning: the thresholds and their labels.
    """
    thresholds: np.ndarray      # thresholds, including the top
    labels: List[str]           # friendly-formatted thresholds
    short_labels: List[str]     # shorter labels (e.g. simplified dates)
    width: Union[float, None]   # widths, if uniform; otherwise None
    width_label: str            # friendly-formatted width (e.g. '50' or '2 weeks')


# NamedTupleMeta doesn't allow to define __new__ so we need a subclass
# Name of the class has to be the same to match the namedtuple name
# pylint: disable=function-redefined
class BinDefinition(BinDefinition):
    """
    `BinDefinition` with a constructor that fills in defaults.

    Args:
        thresholds: bin boundaries, including the top one
        labels (str, Callable or list of str):
            a `%`-format string (default: "%g") applied to each threshold,
            a function called with each threshold, or ready-made labels,
            which are stored as given
        short_labels (None, str, Callable or list of str):
            same as `labels`; if `None`, the (formatted) `labels` are reused
        width (float or None): bin width if uniform, otherwise `None`
        width_label (str):
            description of the width; if empty and `width` is given,
            it is set to `width` formatted with `g`
    """
    def __new__(cls, thresholds, labels="%g",
                short_labels=None, width=None, width_label=""):

        def get_labels(fmt: Union[None, str, Callable, List[str]],
                       default=None):
            if fmt is None:
                return default
            if isinstance(fmt, str):
                return [fmt % x for x in thresholds]
            # `callable` is the idiomatic run-time check; classes from
            # `typing` are meant for annotations, not for `isinstance`
            if callable(fmt):
                return [fmt(x) for x in thresholds]
            # anything else is assumed to be a sequence of labels
            return fmt

        labels = get_labels(labels)
        short_labels = get_labels(short_labels, labels)
        if not width_label and width is not None:
            width_label = f"{width:g}"
        return super().__new__(
            cls, thresholds, labels, short_labels, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """The number of bins, i.e. one less than the number of thresholds."""
        return len(self.thresholds) - 1
178221

179222

180223
def decimal_binnings(
181224
data, *, min_width=0, min_bins=2, max_bins=50,
182-
min_unique=5, add_unique=None,
183-
factors=(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01),
184-
return_defs=False):
225+
min_unique=5, add_unique=0,
226+
factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20),
227+
label_fmt="%g"):
185228
"""
186229
Find a set of nice splits of data into bins
187230
@@ -218,14 +261,16 @@ def decimal_binnings(
218261
the function returns a single binning that matches that values in
219262
the data
220263
add_unique (int):
221-
similar to `min_unique` except that such bins are added to the list
264+
similar to `min_unique` except that such bins are added to the list;
265+
set to 0 to disable
222266
factors (list of float):
223267
The factors with which the scaling is multiplied. Default is
224-
`(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01)`,
268+
`(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20)`,
225269
so if scaling is 1000, considered bin widths are 20000, 10000,
226270
5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227-
return_defs (bool): If set to `True`, the function returns a list of
228-
instances of `BinDefinition`, otherwise a list of bin boundaries.
271+
label_fmt (str or Callable):
272+
A format string (default: "%g") used for threshold labels,
273+
or a function for formatting thresholds (e.g. var.str_val)
229274
230275
Returns:
231276
bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +290,16 @@ def decimal_binnings(
245290
246291
This is returned if `return_defs` is `False`.
247292
"""
248-
def unique_bins():
249-
if len(unique) >= 2:
250-
# make the last bin the same width as the one before
251-
last_boundary = 2 * unique[-1] - unique[-2]
252-
else:
253-
last_boundary = unique[0] + 1
254-
return BinDefinition(
255-
unique[0], len(unique), np.hstack((unique, [last_boundary])))
293+
bins = []
256294

257-
unique = np.unique(data)
258-
unique = unique[np.isfinite(unique)]
259-
if not unique.size:
260-
raise ValueError("no valid (non-nan) data")
261-
mn, mx = unique[0], unique[-1]
262-
if mn == mx or len(unique) <= min_unique:
263-
bins = unique_bins()
264-
if not return_defs:
265-
bins = get_bins(bins)
266-
return [bins]
295+
mn, mx, unique = _min_max_unique(data)
296+
if len(unique) <= max(min_unique, add_unique):
297+
bins.append(BinDefinition(_unique_thresholds(unique), label_fmt))
298+
if len(unique) <= min_unique:
299+
return bins
267300

268301
diff = mx - mn
269302
f10 = 10 ** -np.floor(np.log10(diff))
270-
bins = []
271303
max_bins = min(max_bins, len(unique))
272304
for f in factors:
273305
width = f / f10
@@ -278,40 +310,180 @@ def unique_bins():
278310
nbins = np.round((mx_ - mn_) / width)
279311
if min_bins <= nbins <= max_bins \
280312
and (not bins or bins[-1].nbins != nbins):
281-
bins.append(BinDefinition(mn_, nbins, width))
313+
bin_def = BinDefinition(mn_ + width * np.arange(nbins + 1),
314+
label_fmt, None, width)
315+
bins.append(bin_def)
316+
return bins
282317

283-
if add_unique is not None and len(unique) <= add_unique:
284-
if bins and bins[-1].nbins == len(unique):
285-
del bins[-1]
286-
bins.append(unique_bins())
287-
if len(unique) < min_unique:
288-
del bins[:-1]
289318

290-
if not return_defs:
291-
bins = [get_bins(bin) for bin in bins]
319+
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
    """
    Propose a set of nice splits of time variable data into bins

    The candidate bin widths are

    - 1, 5, 10, 15, 30 seconds,
    - 1, 5, 10, 15, 30 minutes,
    - 1, 2, 3, 6, 12 hours,
    - 1 day,
    - 1, 2 weeks,
    - 1, 2, 3, 6 months,
    - 1, 2, 5, 10, 25, 50, 100 years;

    those that give between `min_bins` and `max_bins` intervals are returned.

    Args:
        data (np.ndarray):
            vector of data points, given as seconds since epoch and
            interpreted as UTC; values may repeat, and nans and infs are
            filtered out.
        min_bins (int): minimal number of bins
        max_bins (int): maximal number of bins
        min_unique (int):
            if the number of unique values does not exceed `min_unique`,
            only the binning with unique values as thresholds is returned
        add_unique (int):
            if the number of unique values does not exceed `add_unique`,
            the binning with unique values as thresholds is put first in
            the list; the default, 0, never triggers this on its own

    Returns:
        bins (list of BinDefinition): a list of possible binnings.
            Thresholds are numbers of seconds since epoch; labels are
            formatted dates and times, and `width_label` of regular
            binnings describes the bin size (e.g. `2 weeks`).

    Raises:
        ValueError: if data contains no finite values
    """
    # NOTE(review): unlike what the previous docstring stated, `max_bins`
    # is not capped by the number of unique values here - confirm intent.
    lowest, highest, distinct = _min_max_unique(data)
    lowest, highest = time.gmtime(lowest), time.gmtime(highest)
    n_distinct = len(distinct)

    result = []
    if n_distinct <= max(min_unique, add_unique):
        result.append(_unique_time_bins(distinct))
    if n_distinct > min_unique:
        # n bins are delimited by n + 1 points
        result.extend(
            _time_binnings(lowest, highest, min_bins + 1, max_bins + 1))
    return result
359+
360+
361+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Collect binnings of the interval from `mn` to `mx` at all resolutions.

    Args:
        mn (time.struct_time): the lowest time
        mx (time.struct_time): the highest time
        min_pts (int): minimal number of thresholds
        max_pts (int): maximal number of thresholds

    Returns:
        bins (list of BinDefinition): binnings, from the finest resolution
            (seconds) to the coarsest (years); a binning is skipped if it has
            the same number of bins as the one accepted just before it.
    """
    year_fmt = "%y " if mn.tm_year >= 1950 else "%Y "
    # (index of the stepped field in the time tuple, steps, format, unit)
    resolutions = (
        (5, (1, 5, 10, 15, 30), "%H:%M:%S", "second"),
        (4, (1, 5, 10, 15, 30), "%b %d %H:%M", "minute"),
        (3, (1, 2, 3, 6, 12), year_fmt + "%b %d %H:%M", "hour"),
        (2, (1, ), year_fmt + "%b %d", "day"),
        (2, (7, 14), year_fmt + "%b %d", "week"),  # weeks are steps in days
        (1, (1, 2, 3, 6), year_fmt + "%b", "month"),
        (0, (1, 2, 5, 10, 25, 50, 100), year_fmt.strip(), "year"),
    )

    found = []
    for place, steps, fmt, unit in resolutions:
        for step in steps:
            points = _time_range(mn, mx, place, step, min_pts, max_pts)
            if not points:
                continue
            # pad 6-tuples with (wday, yday, isdst) to get struct_time
            points = [time.struct_time(pt + (0, 0, 0)) for pt in points]
            labels = [time.strftime(fmt, pt) for pt in points]
            # weekly steps are given in days; report them in weeks
            amount = step // 7 if unit == "week" else step
            plural = "s" if amount > 1 else ""
            candidate = BinDefinition(
                [calendar.timegm(pt) for pt in points],
                labels,
                _simplified_labels(labels),
                None,
                f"{amount} {unit}{plural}")
            if found and candidate.nbins == found[-1].nbins:
                continue
            found.append(candidate)
    return found
388+
389+
390+
# datetime + timedelta is not very useful here because timedelta is
391+
# given as a number of days, not years or months, so it doesn't allow
392+
# for specifying a step of 1 month
393+
def _time_range(start, end, place, step, min_pts, max_pts,
394+
_zeros=(0, 1, 1, 0, 0, 0)):
395+
if place == 2 and step % 7 == 0:
396+
startd = datetime.date(*start[:3])
397+
startd -= datetime.timedelta(days=-startd.weekday())
398+
start = [startd.year, startd.month, startd.day, 0, 0, 0]
399+
else:
400+
start = list(
401+
start[:place]
402+
+ ((start[place] - _zeros[place]) // step * step + _zeros[place], )
403+
+ _zeros[place + 1:])
404+
end = list(end[:place + 1] + _zeros[place + 1:])
405+
s = [tuple(start)]
406+
for _ in range(max_pts - 1):
407+
start[place] += step
408+
if place >= 3: # hours, minutes, seconds
409+
for pos, maxval in enumerate((60, 60, 24), start=1):
410+
if start[-pos] >= maxval:
411+
start[-pos - 1] += 1
412+
start[-pos] %= maxval
413+
if place >= 2:
414+
md = _month_days(*start[:2])
415+
if start[2] > md:
416+
start[1] += 1
417+
start[2] %= md
418+
if start[1] > 12:
419+
start[0] += 1
420+
start[1] %= 12
421+
s.append(tuple(start))
422+
if start > end:
423+
return s if len(s) >= min_pts else None
424+
return None
425+
426+
427+
def _month_days(year, month,
428+
_md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
429+
return _md[month] + (
430+
month == 2 and (year % 400 == 0 or year % 4 == 0 and year % 100 != 0))
431+
432+
433+
def _simplified_labels(labels):
434+
labels = labels[:]
435+
to_remove = "42"
436+
while True:
437+
firsts = {f for f, *_ in (lab.split() for lab in labels)}
438+
if len(firsts) > 1:
439+
break
440+
to_remove = firsts.pop()
441+
flen = len(to_remove) + 1
442+
if any(len(lab) == flen for lab in labels):
443+
break
444+
labels = [lab[flen:] for lab in labels]
445+
for i in range(len(labels) - 1, 0, -1):
446+
for k, c, d in zip(count(), labels[i].split(), labels[i - 1].split()):
447+
if c != d:
448+
labels[i] = " ".join(labels[i].split()[k:])
449+
break
450+
# If the last thing removed were month names and the labels continues with
451+
# hours, keep month name in the first label; "08 12:29" looks awkward.
452+
if not to_remove[0].isdigit() and ":" in labels[0]:
453+
labels[0] = f"{to_remove} {labels[0]}"
454+
return labels
455+
456+
457+
def _unique_time_bins(unique):
    """
    Return a binning whose thresholds are the unique time values.

    Labels always contain the year, month and day. Hours and minutes are
    added when the values differ in anything but year and month, or when
    seconds are shown. Seconds are added when some value is not a whole
    minute.

    Args:
        unique (np.ndarray): sorted unique values, in seconds since epoch

    Returns:
        bins (BinDefinition): binning with long and simplified labels
    """
    times = [time.gmtime(x) for x in unique]
    # Seconds are informative only if some value is not a whole minute
    # (the condition used to be inverted, printing ":00" everywhere and
    # hiding seconds when they differed).
    show_seconds = bool(np.any(unique % 60 != 0))
    # ":%S" makes no sense without "%H:%M" in front of it
    show_time = show_seconds or len({t[2:] for t in times}) > 1
    # NOTE(review): the double space after the year is kept on purpose;
    # `_simplified_labels` slices labels by character counts, so changing
    # the format would change (or break) the short labels.
    fmt = f'{"%y " if times[0][0] >= 1950 else "%Y "} %b %d'
    if show_time:
        fmt += " %H:%M"
    if show_seconds:
        fmt += ":%S"
    labels = [time.strftime(fmt, x) for x in times]
    short_labels = _simplified_labels(labels)
    return BinDefinition(_unique_thresholds(unique), labels, short_labels)
465+
466+
467+
def _unique_thresholds(unique):
468+
if len(unique) >= 2:
469+
# make the last bin the same width as the one before
470+
last_boundary = 2 * unique[-1] - unique[-2]
471+
else:
472+
last_boundary = unique[0] + 1
473+
return np.hstack((unique, [last_boundary]))
474+
475+
476+
def _min_max_unique(data):
477+
unique = np.unique(data)
478+
unique = unique[np.isfinite(unique)]
479+
if not unique.size:
480+
raise ValueError("no valid (non-nan) data")
481+
return unique[0], unique[-1], unique
482+
483+
484+
# Abbreviations of names of time units, in plural and (some) singular forms
short_time_units = {
    "seconds": "sec", "minutes": "min", "hours": "hrs",
    "weeks": "wks", "months": "mon", "years": "yrs",
    "second": "sec", "minute": "min", "month": "mon",
}
315487

316488

317489
# noinspection PyPep8Naming

Orange/preprocess/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)