Skip to content

Commit 97ff40b

Browse files
committed
Add function for datetime binning
1 parent 0f9c0d7 commit 97ff40b

File tree

6 files changed

+988
-87
lines changed

6 files changed

+988
-87
lines changed

Orange/preprocess/discretize.py

Lines changed: 207 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
import calendar
12
import re
2-
from collections import namedtuple
3+
import time
4+
from typing import NamedTuple, List, Union, Callable
5+
import datetime
6+
from itertools import count
37

48
import numpy as np
59
import scipy.sparse as sp
@@ -14,7 +18,7 @@
1418
from . import _discretize
1519

1620
__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer",
17-
"decimal_binnings", "get_bins"]
21+
"decimal_binnings", "time_binnings", "short_time_units"]
1822

1923

2024
class Discretizer(Transformation):
@@ -174,14 +178,40 @@ def _split_eq_width(self, min, max):
174178
return [min + (i + 1) * dif for i in range(self.n - 1)]
175179

176180

177-
BinDefinition = namedtuple("BinDefinition", ("start", "nbins", "width"))
181+
class BinDefinition(NamedTuple):
    """Record describing a single binning: its thresholds and their labels."""
    thresholds: np.ndarray      # bin boundaries, including the top one
    labels: List[str]           # human-friendly text for each threshold
    width: Union[float, None]   # common bin width; None if not uniform
    width_label: str            # human-friendly width (e.g. '50', '2 weeks')


# NamedTupleMeta does not let a class body define __new__, so the
# constructor logic lives in a subclass. The subclass deliberately reuses
# the name so that it matches the name of the underlying named tuple.
# pylint: disable=function-redefined
class BinDefinition(BinDefinition):
    """
    `BinDefinition` with a convenience constructor.

    Args:
        thresholds: bin boundaries, including the top boundary
        labels (str or Callable or list of str):
            a %-format string applied to every threshold (default "%g"),
            a function called with every threshold, or ready-made labels,
            which are stored as given
        width (float or None): bin width if uniform, otherwise `None`
        width_label (str):
            text describing the width; if empty and `width` is given,
            it is set to `width` formatted with "g"
    """
    def __new__(cls, thresholds, labels="%g", width=None, width_label=""):
        if isinstance(labels, str):
            template = labels
            labels = [template % value for value in thresholds]
        elif callable(labels):
            formatter = labels
            labels = [formatter(value) for value in thresholds]
        if width is not None and not width_label:
            width_label = f"{width:g}"
        return super().__new__(cls, thresholds, labels, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """The number of bins, i.e. one less than the number of thresholds."""
        return len(self.thresholds) - 1
178208

179209

180210
def decimal_binnings(
181211
data, *, min_width=0, min_bins=2, max_bins=50,
182-
min_unique=5, add_unique=None,
183-
factors=(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01),
184-
return_defs=False):
212+
min_unique=5, add_unique=0,
213+
factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20),
214+
label_fmt="%g"):
185215
"""
186216
Find a set of nice splits of data into bins
187217
@@ -218,14 +248,16 @@ def decimal_binnings(
218248
the function returns a single binning that matches that values in
219249
the data
220250
add_unique (int):
221-
similar to `min_unique` except that such bins are added to the list
251+
similar to `min_unique` except that such bins are added to the list;
252+
set to 0 to disable
222253
factors (list of float):
223254
The factors with which the scaling is multiplied. Default is
224-
`(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01)`,
255+
`(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20)`,
225256
so if scaling is 1000, considered bin widths are 20000, 10000,
226257
5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227-
return_defs (bool): If set to `True`, the function returns a list of
228-
instances of `BinDefinition`, otherwise a list of bin boundaries.
258+
label_fmt (str or Callable):
259+
A format string (default: "%g") used for threshold labels,
260+
or a function for formatting thresholds (e.g. var.str_val)
229261
230262
Returns:
231263
bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +277,16 @@ def decimal_binnings(
245277
246278
This is returned if `return_defs` is `False`.
247279
"""
248-
def unique_bins():
249-
if len(unique) >= 2:
250-
# make the last bin the same width as the one before
251-
last_boundary = 2 * unique[-1] - unique[-2]
252-
else:
253-
last_boundary = unique[0] + 1
254-
return BinDefinition(
255-
unique[0], len(unique), np.hstack((unique, [last_boundary])))
280+
bins = []
256281

257-
unique = np.unique(data)
258-
unique = unique[np.isfinite(unique)]
259-
if not unique.size:
260-
raise ValueError("no valid (non-nan) data")
261-
mn, mx = unique[0], unique[-1]
262-
if mn == mx or len(unique) <= min_unique:
263-
bins = unique_bins()
264-
if not return_defs:
265-
bins = get_bins(bins)
266-
return [bins]
282+
mn, mx, unique = _min_max_unique(data)
283+
if len(unique) <= max(min_unique, add_unique):
284+
bins.append(BinDefinition(_unique_thresholds(unique), label_fmt))
285+
if len(unique) <= min_unique:
286+
return bins
267287

268288
diff = mx - mn
269289
f10 = 10 ** -np.floor(np.log10(diff))
270-
bins = []
271290
max_bins = min(max_bins, len(unique))
272291
for f in factors:
273292
width = f / f10
@@ -278,40 +297,174 @@ def unique_bins():
278297
nbins = np.round((mx_ - mn_) / width)
279298
if min_bins <= nbins <= max_bins \
280299
and (not bins or bins[-1].nbins != nbins):
281-
bins.append(BinDefinition(mn_, nbins, width))
300+
bin_def = BinDefinition(mn_ + width * np.arange(nbins + 1),
301+
label_fmt, width)
302+
bins.append(bin_def)
303+
return bins
282304

283-
if add_unique is not None and len(unique) <= add_unique:
284-
if bins and bins[-1].nbins == len(unique):
285-
del bins[-1]
286-
bins.append(unique_bins())
287-
if len(unique) < min_unique:
288-
del bins[:-1]
289305

290-
if not return_defs:
291-
bins = [get_bins(bin) for bin in bins]
306+
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
307+
"""
308+
Find a set of nice splits of time variable data into bins
292309
293-
return bins
310+
The function considers bin widths of
294311
312+
- 1, 5, 10, 15, 30 seconds.
313+
- 1, 5, 10, 15, 30 minutes,
314+
- 1, 2, 3, 6, 12 hours,
315+
- 1 day,
316+
- 1, 2 weeks,
317+
- 1, 2, 3, 6 months,
318+
- 1, 2, 5, 10, 25, 50, 100 years,
319+
320+
and returns those that yield between `min_bins` and `max_bins` intervals.
295321
296-
def get_bins(bin_def: BinDefinition):
297-
"""
298-
Return a `np.ndarray` corresponding to interval
299322
Args:
300-
bin_def (BinDefinition):
301-
definition of bins. a named tuple containing the beginning of the
302-
first bin (`start`), number of bins (`nbins`) and their widths
303-
(`width`). The last value can also be a `nd.array` with `nbins + 1`
304-
elements, in which case the function returns this as a result.
323+
data (np.ndarray):
324+
vector of data points; values may repeat, and nans and infs are
325+
filtered out.
326+
min_bins (int): minimal number of bins
327+
max_bins (int):
328+
maximal number of bins; the number of bins will never exceed the
329+
number of unique values
305330
306331
Returns:
307-
bin boundaries (np.ndarray):
308-
bin boundaries including the top boundary of the last interval, hence
309-
the list size equals `bin_def.nbins + 1`. This array matches the
310-
`bin` argument of `numpy.histogram`.
332+
bin_boundaries (list): a list of possible binning.
333+
Each element of `bin_boundaries` is a tuple consisting of a label
334+
describing the bin size (e.g. `2 weeks`) and a list of thresholds.
335+
Thresholds are given as pairs
336+
(number_of_seconds_since_epoch, label).
311337
"""
312-
if isinstance(bin_def.width, np.ndarray):
313-
return bin_def.width
314-
return bin_def.start + bin_def.width * np.arange(bin_def.nbins + 1)
338+
mn, mx, unique = _min_max_unique(data)
339+
mn, mx = time.gmtime(mn), time.gmtime(mx)
340+
bins = []
341+
if len(unique) <= max(min_unique, add_unique):
342+
bins.append(_unique_time_bins(unique))
343+
if len(unique) > min_unique:
344+
bins += _time_binnings(mn, mx, min_bins + 1, max_bins + 1)
345+
return bins
346+
347+
348+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Build candidate binnings for every supported time step.

    Args:
        mn (time.struct_time): lower end of the data
        mx (time.struct_time): upper end of the data
        min_pts (int): minimal number of thresholds, passed to `_time_range`
        max_pts (int): maximal number of thresholds, passed to `_time_range`

    Returns:
        list of BinDefinition: one definition per accepted step; a step is
        skipped if `_time_range` rejects it or if it gives the same number
        of bins as the previously accepted one
    """
    # Two-digit years are used only when the data starts in 1950 or later
    year_fmt = "%y " if mn.tm_year >= 1950 else "%Y "
    day_fmt = year_fmt + "%b %d"

    # (index into the time tuple, steps, strftime format, unit name)
    families = (
        (5, (1, 5, 10, 15, 30), "%H:%M:%S", "second"),
        (4, (1, 5, 10, 15, 30), "%b %d %H:%M", "minute"),
        (3, (1, 2, 3, 6, 12), year_fmt + "%b %d %H:%M", "hour"),
        (2, (1, ), day_fmt, "day"),
        (2, (7, 14), day_fmt, "week"),
        (1, (1, 2, 3, 6), year_fmt + "%b", "month"),
        (0, (1, 2, 5, 10, 25, 50, 100), year_fmt.strip(), "year"),
    )

    result = []
    for place, steps, fmt, unit in families:
        for step in steps:
            points = _time_range(mn, mx, place, step, min_pts, max_pts)
            if not points:
                continue
            # pad 6-tuples to the 9 fields required by struct_time
            structs = [time.struct_time(pt + (0, 0, 0)) for pt in points]
            thresholds = [calendar.timegm(st) for st in structs]
            labels = _simplified_labels(
                [time.strftime(fmt, st) for st in structs])
            if place == 2 and step >= 7:
                # weeks are stepped in days, but labelled in weeks
                unit_label = f"{step // 7} week{'s' * (step > 7)}"
            else:
                unit_label = f"{step} {unit}{'s' * (step > 1)}"
            candidate = BinDefinition(thresholds, labels, None, unit_label)
            if result and candidate.nbins == result[-1].nbins:
                continue
            result.append(candidate)
    return result
373+
374+
375+
# datetime + deltatime is not very useful here because deltatime is
376+
# given a number of days, not years or months, so it doesn't allow
377+
# for specifying a step of 1 month
378+
def _time_range(start, end, place, step, min_pts, max_pts,
379+
_zeros=(0, 1, 1, 0, 0, 0)):
380+
if place == 2 and step % 7 == 0:
381+
startd = datetime.date(*start[:3])
382+
startd -= datetime.timedelta(days=-startd.weekday())
383+
start = [startd.year, startd.month, startd.day, 0, 0, 0]
384+
else:
385+
start = list(
386+
start[:place]
387+
+ ((start[place] - _zeros[place]) // step * step + _zeros[place], )
388+
+ _zeros[place + 1:])
389+
end = list(end[:place + 1] + _zeros[place + 1:])
390+
s = [tuple(start)]
391+
for _ in range(max_pts - 1):
392+
start[place] += step
393+
if place >= 3: # hours, minutes, seconds
394+
for pos, maxval in enumerate((60, 60, 24), start=1):
395+
if start[-pos] >= maxval:
396+
start[-pos - 1] += 1
397+
start[-pos] %= maxval
398+
if place >= 2:
399+
md = _month_days(*start[:2])
400+
if start[2] > md:
401+
start[1] += 1
402+
start[2] %= md
403+
if start[1] > 12:
404+
start[0] += 1
405+
start[1] %= 12
406+
s.append(tuple(start))
407+
if start > end:
408+
return s if len(s) >= min_pts else None
409+
return None
410+
411+
412+
def _month_days(year, month,
413+
_md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
414+
return _md[month] + (
415+
month == 2 and (year % 400 == 0 or year % 4 == 0 and year % 100 != 0))
416+
417+
418+
def _simplified_labels(labels):
    """
    Shorten time labels by removing redundant leading parts.

    First, space-separated parts that are the same in all labels are
    removed from the front. Then, going from the last label backwards,
    each label loses the leading parts that equal those of the preceding
    label.

    Parts are never removed if that would leave some label empty, so
    identical or single-part labels are kept as they are.

    Args:
        labels (list of str): labels whose parts are separated by
            single spaces; the list is not modified

    Returns:
        list of str: simplified labels
    """
    labels = list(labels)
    if not labels:
        return labels

    to_remove = "42"  # any string that starts with a digit
    while True:
        parts = [lab.split() for lab in labels]
        # stop before a label would run out of parts
        if any(len(lab_parts) < 2 for lab_parts in parts):
            break
        firsts = {lab_parts[0] for lab_parts in parts}
        if len(firsts) > 1:
            break
        to_remove = firsts.pop()
        flen = len(to_remove) + 1
        labels = [lab[flen:] for lab in labels]

    # Go backwards so that each label is compared with the full
    # (not yet shortened) version of its predecessor
    for i in range(len(labels) - 1, 0, -1):
        current = labels[i].split()
        previous = labels[i - 1].split()
        for k, (c, d) in enumerate(zip(current, previous)):
            if c != d:
                labels[i] = " ".join(current[k:])
                break

    # If the last thing removed were month names and the labels continues with
    # hours, keep month name in the first label; "08 12:29" looks awkward.
    if not to_remove[0].isdigit() and ":" in labels[0]:
        labels[0] = f"{to_remove} {labels[0]}"
    return labels
437+
438+
439+
def _unique_time_bins(unique):
    """
    Return a binning with one bin for each unique time value.

    Labels contain the date; the time of day is included if the values
    differ in anything finer than a month, and seconds are shown only if
    some value does not fall on a whole minute.

    Args:
        unique (np.ndarray): sorted unique values, in seconds since epoch

    Returns:
        BinDefinition: thresholds from `_unique_thresholds` and a label
        for each value in `unique`
    """
    times = [time.gmtime(x) for x in unique]
    # two-digit years only if the data starts in 1950 or later
    fmt = ("%y" if times[0][0] >= 1950 else "%Y") + " %b %d"
    # seconds matter only if some value is not a whole minute
    show_seconds = not np.all(unique % 60 == 0)
    if show_seconds or len({t[2:] for t in times}) > 1:
        fmt += " %H:%M"
    if show_seconds:
        fmt += ":%S"
    return BinDefinition(_unique_thresholds(unique),
                         [time.strftime(fmt, x) for x in times])
446+
447+
448+
def _unique_thresholds(unique):
    """
    Return `unique` extended with an upper boundary for the last bin.

    With two or more values, the added boundary makes the last bin as wide
    as the one before it; with a single value, the boundary is that value
    plus 1.
    """
    top = unique[-1]
    upper = 2 * top - unique[-2] if len(unique) >= 2 else unique[0] + 1
    return np.hstack((unique, [upper]))
455+
456+
457+
def _min_max_unique(data):
    """
    Return the smallest, the largest and all unique finite values of `data`.

    Raises:
        ValueError: if `data` contains no finite values
    """
    values = np.unique(data)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        raise ValueError("no valid (non-nan) data")
    # np.unique returns sorted values, so the extremes are at the ends
    return finite[0], finite[-1], finite
463+
464+
465+
# Three-letter abbreviations of time unit names, plural and singular
# (presumably for shortening width labels such as '2 weeks' -- confirm
# against the callers)
short_time_units = {
    "seconds": "sec",
    "minutes": "min",
    "hours": "hrs",
    "weeks": "wks",
    "months": "mon",
    "years": "yrs",
    "second": "sec",
    "minute": "min",
    "month": "mon",
}
315468

316469

317470
# noinspection PyPep8Naming

Orange/preprocess/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)