Skip to content

Commit cc749d9

Browse files
committed
Add function for datetime binning
1 parent 0f9c0d7 commit cc749d9

File tree

4 files changed

+908
-85
lines changed

4 files changed

+908
-85
lines changed

Orange/preprocess/discretize.py

Lines changed: 204 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
import calendar
12
import re
2-
from collections import namedtuple
3+
import time
4+
from typing import NamedTuple, List, Union, Callable
5+
import datetime
6+
from itertools import count
37

48
import numpy as np
59
import scipy.sparse as sp
@@ -14,7 +18,7 @@
1418
from . import _discretize
1519

1620
__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer",
17-
"decimal_binnings", "get_bins"]
21+
"decimal_binnings", "time_binnings", "short_time_units"]
1822

1923

2024
class Discretizer(Transformation):
@@ -174,14 +178,37 @@ def _split_eq_width(self, min, max):
174178
return [min + (i + 1) * dif for i in range(self.n - 1)]
175179

176180

177-
BinDefinition = namedtuple("BinDefinition", ("start", "nbins", "width"))
181+
class BinDefinition(NamedTuple):
    """Raw record of a single binning (fields only, no defaults)."""
    thresholds: np.ndarray      # bin boundaries, including the top one
    labels: List[str]           # human-readable form of each threshold
    width: Union[float, None]   # bin width when uniform; otherwise None
    width_label: str            # human-readable width ('50', '2 weeks', ...)


class BinDefinition(BinDefinition):
    """
    Description of one binning, with convenient construction defaults.

    This class deliberately shadows the plain named tuple above so that
    `labels` and `width_label` can be derived when they are not given.

    Args:
        thresholds: bin boundaries, including the top boundary
        labels (str, callable or list of str): a %-format string applied to
            each threshold (default: "%g"), a function mapping a threshold
            to its label, or a ready-made list of labels
        width (float or None): the bin width if uniform, otherwise `None`
        width_label (str): label for the width; when empty and `width` is
            given, it is formatted with the `g` format
    """
    def __new__(cls, thresholds, labels="%g", width=None, width_label=""):
        if isinstance(labels, str):
            label_list = [labels % value for value in thresholds]
        elif callable(labels):
            label_list = [labels(value) for value in thresholds]
        else:
            # anything else is taken as the final sequence of labels
            label_list = labels
        if width is not None and not width_label:
            width_label = f"{width:g}"
        return super().__new__(cls, thresholds, label_list, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """Number of bins, i.e. one less than the number of thresholds."""
        return len(self.thresholds) - 1
178205

179206

180207
def decimal_binnings(
181208
data, *, min_width=0, min_bins=2, max_bins=50,
182-
min_unique=5, add_unique=None,
183-
factors=(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01),
184-
return_defs=False):
209+
min_unique=5, add_unique=0,
210+
factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20),
211+
label_fmt="%g"):
185212
"""
186213
Find a set of nice splits of data into bins
187214
@@ -218,14 +245,16 @@ def decimal_binnings(
218245
the function returns a single binning that matches that values in
219246
the data
220247
add_unique (int):
221-
similar to `min_unique` except that such bins are added to the list
248+
similar to `min_unique` except that such bins are added to the list;
249+
set to 0 to disable
222250
factors (list of float):
223251
The factors with which the scaling is multiplied. Default is
224-
`(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01)`,
252+
`(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20)`,
225253
so if scaling is 1000, considered bin widths are 20000, 10000,
226254
5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227-
return_defs (bool): If set to `True`, the function returns a list of
228-
instances of `BinDefinition`, otherwise a list of bin boundaries.
255+
label_fmt (str or Callable):
256+
A format string (default: "%g") used for threshold labels,
257+
or a function for formatting thresholds (e.g. var.str_val)
229258
230259
Returns:
231260
bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +274,16 @@ def decimal_binnings(
245274
246275
This is returned if `return_defs` is `False`.
247276
"""
248-
def unique_bins():
249-
if len(unique) >= 2:
250-
# make the last bin the same width as the one before
251-
last_boundary = 2 * unique[-1] - unique[-2]
252-
else:
253-
last_boundary = unique[0] + 1
254-
return BinDefinition(
255-
unique[0], len(unique), np.hstack((unique, [last_boundary])))
277+
bins = []
256278

257-
unique = np.unique(data)
258-
unique = unique[np.isfinite(unique)]
259-
if not unique.size:
260-
raise ValueError("no valid (non-nan) data")
261-
mn, mx = unique[0], unique[-1]
262-
if mn == mx or len(unique) <= min_unique:
263-
bins = unique_bins()
264-
if not return_defs:
265-
bins = get_bins(bins)
266-
return [bins]
279+
mn, mx, unique = _min_max_unique(data)
280+
if len(unique) <= max(min_unique, add_unique):
281+
bins.append(BinDefinition(_unique_thresholds(unique), label_fmt))
282+
if len(unique) <= min_unique:
283+
return bins
267284

268285
diff = mx - mn
269286
f10 = 10 ** -np.floor(np.log10(diff))
270-
bins = []
271287
max_bins = min(max_bins, len(unique))
272288
for f in factors:
273289
width = f / f10
@@ -278,40 +294,174 @@ def unique_bins():
278294
nbins = np.round((mx_ - mn_) / width)
279295
if min_bins <= nbins <= max_bins \
280296
and (not bins or bins[-1].nbins != nbins):
281-
bins.append(BinDefinition(mn_, nbins, width))
297+
bin_def = BinDefinition(mn_ + width * np.arange(nbins + 1),
298+
label_fmt, width)
299+
bins.append(bin_def)
300+
return bins
282301

283-
if add_unique is not None and len(unique) <= add_unique:
284-
if bins and bins[-1].nbins == len(unique):
285-
del bins[-1]
286-
bins.append(unique_bins())
287-
if len(unique) < min_unique:
288-
del bins[:-1]
289302

290-
if not return_defs:
291-
bins = [get_bins(bin) for bin in bins]
303+
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
    """
    Find a set of nice splits of time variable data into bins

    The function considers bin widths of

    - 1, 5, 10, 15, 30 seconds,
    - 1, 5, 10, 15, 30 minutes,
    - 1, 2, 3, 6, 12 hours,
    - 1 day,
    - 1, 2 weeks,
    - 1, 2, 3, 6 months,
    - 1, 2, 5, 10, 25, 50, 100 years,

    and returns those that yield between `min_bins` and `max_bins` intervals.

    Args:
        data (np.ndarray):
            vector of data points, interpreted as seconds since epoch (UTC);
            values may repeat, and nans and infs are filtered out.
        min_bins (int): minimal number of bins
        max_bins (int): maximal number of bins
        min_unique (int):
            if the number of unique values does not exceed this, only the
            binning with one bin per unique value is returned
        add_unique (int):
            if the number of unique values does not exceed this, the binning
            with one bin per unique value is added to the list;
            set to 0 to disable

    Returns:
        bins (list of BinDefinition): a list of possible binnings; may be
        empty if no considered width gives an acceptable number of bins.

    Raises:
        ValueError: if `data` contains no finite values
    """
    lowest, highest, unique = _min_max_unique(data)
    first, last = time.gmtime(lowest), time.gmtime(highest)
    n_unique = len(unique)

    binnings = []
    if n_unique <= max(min_unique, add_unique):
        binnings.append(_unique_time_bins(unique))
    if n_unique > min_unique:
        # n bins are delimited by n + 1 thresholds, hence the "+ 1"
        binnings.extend(_time_binnings(first, last, min_bins + 1, max_bins + 1))
    return binnings
343+
344+
345+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Collect binnings of the interval between `mn` and `mx` for all
    considered time steps, from seconds up to centuries.

    Args:
        mn (time.struct_time): the lowest time in the data
        mx (time.struct_time): the highest time in the data
        min_pts (int): minimal number of thresholds
        max_pts (int): limit on the number of thresholds

    Returns:
        bins (list of BinDefinition): acceptable binnings; a candidate with
        the same number of bins as the previously accepted one is skipped.
    """
    # two-digit years suffice for recent data; older data gets all four
    yfmt = "%y " if mn.tm_year >= 1950 else "%Y "
    # (index of the stepped time field, steps, label format, unit name)
    candidates = (
        (5, (1, 5, 10, 15, 30), "%H:%M:%S", "second"),
        (4, (1, 5, 10, 15, 30), "%b %d %H:%M", "minute"),
        (3, (1, 2, 3, 6, 12), yfmt + "%b %d %H:%M", "hour"),
        (2, (1, ), yfmt + "%b %d", "day"),
        (2, (7, 14), yfmt + "%b %d", "week"),  # weeks are stepped in days
        (1, (1, 2, 3, 6), yfmt + "%b", "month"),
        (0, (1, 2, 5, 10, 25, 50, 100), yfmt.strip(), "year"),
    )

    result = []
    for place, steps, fmt, unit in candidates:
        for step in steps:
            points = _time_range(mn, mx, place, step, min_pts, max_pts)
            if not points:
                continue
            # pad with (wday, yday, isdst) to get full time tuples
            structs = [time.struct_time(point + (0, 0, 0))
                       for point in points]
            thresholds = [calendar.timegm(struct) for struct in structs]
            labels = _simplified_labels(
                [time.strftime(fmt, struct) for struct in structs])
            if place == 2 and step >= 7:
                plural = "s" if step > 7 else ""
                unit_label = f"{step // 7} week{plural}"
            else:
                plural = "s" if step > 1 else ""
                unit_label = f"{step} {unit}{plural}"
            candidate = BinDefinition(thresholds, labels, None, unit_label)
            if result and candidate.nbins == result[-1].nbins:
                continue
            result.append(candidate)
    return result
370+
371+
372+
# datetime + deltatime is not very useful here because deltatime is
373+
# given a number of days, not years or months, so it doesn't allow
374+
# for specifying a step of 1 month
375+
def _time_range(start, end, place, step, min_pts, max_pts,
376+
_zeros=(0, 1, 1, 0, 0, 0)):
377+
if place == 2 and step % 7 == 0:
378+
startd = datetime.date(*start[:3])
379+
startd -= datetime.timedelta(days=-startd.weekday())
380+
start = [startd.year, startd.month, startd.day, 0, 0, 0]
381+
else:
382+
start = list(
383+
start[:place]
384+
+ ((start[place] - _zeros[place]) // step * step + _zeros[place], )
385+
+ _zeros[place + 1:])
386+
end = list(end[:place + 1] + _zeros[place + 1:])
387+
s = [tuple(start)]
388+
for _ in range(max_pts):
389+
start[place] += step
390+
if place >= 3: # hours, minutes, seconds
391+
for pos, maxval in enumerate((60, 60, 24), start=1):
392+
if start[-pos] >= maxval:
393+
start[-pos - 1] += 1
394+
start[-pos] %= maxval
395+
if place >= 2:
396+
md = _month_days(*start[:2])
397+
if start[2] > md:
398+
start[1] += 1
399+
start[2] %= md
400+
if start[1] > 12:
401+
start[0] += 1
402+
start[1] %= 12
403+
s.append(tuple(start))
404+
if start > end:
405+
return s if len(s) >= min_pts else None
406+
return None
407+
408+
409+
def _month_days(year, month,
410+
_md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
411+
return _md[month] + (
412+
month == 2 and (year % 400 == 0 or year % 4 == 0 and year % 100 != 0))
413+
414+
415+
def _simplified_labels(labels):
416+
to_remove = "42"
417+
while True:
418+
firsts = {f for f, *_ in (lab.split() for lab in labels)}
419+
if len(firsts) > 1:
420+
break
421+
to_remove = firsts.pop()
422+
flen = len(to_remove) + 1
423+
labels = [lab[flen:] for lab in labels]
424+
for i in range(len(labels) - 1, 0, -1):
425+
for k, c, d in zip(count(), labels[i].split(), labels[i - 1].split()):
426+
if c != d:
427+
labels[i] = " ".join(labels[i].split()[k:])
428+
break
429+
# If the last thing removed were month names and the labels continues with
430+
# hours, keep month name in the first label; "08 12:29" looks awkward.
431+
if not to_remove[0].isdigit() and ":" in labels[0]:
432+
labels[0] = f"{to_remove} {labels[0]}"
433+
return labels
434+
435+
436+
def _unique_time_bins(unique):
    """
    Return a binning in which every unique time is a threshold of its own.

    Args:
        unique (np.ndarray): sorted unique finite values, interpreted as
            seconds since epoch (UTC)

    Returns:
        bins (BinDefinition): thresholds are the unique values plus an
        extra top boundary; labels are the formatted times of the unique
        values.
        NOTE(review): this yields one label fewer than thresholds, as in
        the original code; confirm that consumers expect this.
    """
    times = [time.gmtime(x) for x in unique]
    # two-digit years suffice for recent data; older data gets all four
    fmt = ("%y" if times[0].tm_year >= 1950 else "%Y") + " %b %d"
    # Show the time of day if values differ in day or in any finer field.
    # NOTE(review): fields from index 2 are compared, as in the original;
    # confirm whether only hour/minute/second were meant.
    if len({t[2:] for t in times}) > 1:
        fmt += " %H:%M"
        # seconds are informative only if some value is off a whole minute
        if bool(np.any(np.asarray(unique) % 60 != 0)):
            fmt += ":%S"
    return BinDefinition(_unique_thresholds(unique),
                         [time.strftime(fmt, t) for t in times])
443+
444+
445+
def _unique_thresholds(unique):
446+
if len(unique) >= 2:
447+
# make the last bin the same width as the one before
448+
last_boundary = 2 * unique[-1] - unique[-2]
449+
else:
450+
last_boundary = unique[0] + 1
451+
return np.hstack((unique, [last_boundary]))
452+
453+
454+
def _min_max_unique(data):
455+
unique = np.unique(data)
456+
unique = unique[np.isfinite(unique)]
457+
if not unique.size:
458+
raise ValueError("no valid (non-nan) data")
459+
return unique[0], unique[-1], unique
460+
461+
462+
# Abbreviations of time unit names, singular and plural; units without an
# entry (e.g. "day") have no abbreviation here
short_time_units = {
    "seconds": "sec",
    "minutes": "min",
    "hours": "hrs",
    "weeks": "wks",
    "months": "mon",
    "years": "yrs",
    "second": "sec",
    "minute": "min",
    "month": "mon",
}
315465

316466

317467
# noinspection PyPep8Naming

0 commit comments

Comments
 (0)