Skip to content

Commit 0b3c15d

Browse files
committed
Add function for datetime binning
1 parent 0f9c0d7 commit 0b3c15d

File tree

6 files changed

+914
-87
lines changed

6 files changed

+914
-87
lines changed

Orange/preprocess/discretize.py

Lines changed: 206 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
import calendar
12
import re
2-
from collections import namedtuple
3+
import time
4+
from typing import NamedTuple, List, Union, Callable
5+
import datetime
6+
from itertools import count
37

48
import numpy as np
59
import scipy.sparse as sp
@@ -14,7 +18,7 @@
1418
from . import _discretize
1519

1620
__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer",
17-
"decimal_binnings", "get_bins"]
21+
"decimal_binnings", "time_binnings", "short_time_units"]
1822

1923

2024
class Discretizer(Transformation):
@@ -174,14 +178,39 @@ def _split_eq_width(self, min, max):
174178
return [min + (i + 1) * dif for i in range(self.n - 1)]
175179

176180

177-
BinDefinition = namedtuple("BinDefinition", ("start", "nbins", "width"))
181+
class BinDefinition(NamedTuple):
    """Thresholds of one binning together with their display labels."""
    thresholds: np.ndarray      # thresholds, including the top
    labels: List[str]           # friendly-formatted thresholds
    width: Union[float, None]   # widths, if uniform; otherwise None
    width_label: str            # friendly-formatted width (e.g. '50' or '2 weeks')


# NamedTupleMeta doesn't allow to define __new__ so we need a subclass
# Name of the class has to be the same to match the namedtuple name
# pylint: disable=function-redefined
class BinDefinition(BinDefinition):
    """
    `BinDefinition` with a convenience constructor and derived properties.

    Args:
        thresholds: bin boundaries, including the top one
        labels (str or callable or list of str):
            a %-format string applied to every threshold (default: "%g"),
            a function that maps a threshold to its label, or a ready-made
            list of labels, which is stored as given
        width (float or None): bin width if uniform, otherwise `None`
        width_label (str):
            label for the width; if empty and `width` is given, the width
            is formatted with the `g` format
    """
    def __new__(cls, thresholds, labels="%g", width=None, width_label=""):
        if isinstance(labels, str):
            labels = [labels % x for x in thresholds]
        elif callable(labels):
            labels = [labels(x) for x in thresholds]
        if not width_label and width is not None:
            width_label = f"{width:g}"
        return super().__new__(cls, thresholds, labels, width, width_label)

    @property
    def start(self) -> float:
        """The lowest threshold."""
        return self.thresholds[0]

    @property
    def nbins(self) -> int:
        """Number of bins, i.e. one less than the number of thresholds."""
        return len(self.thresholds) - 1
178207

179208

180209
def decimal_binnings(
181210
data, *, min_width=0, min_bins=2, max_bins=50,
182-
min_unique=5, add_unique=None,
183-
factors=(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01),
184-
return_defs=False):
211+
min_unique=5, add_unique=0,
212+
factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20),
213+
label_fmt="%g"):
185214
"""
186215
Find a set of nice splits of data into bins
187216
@@ -218,14 +247,16 @@ def decimal_binnings(
218247
the function returns a single binning that matches that values in
219248
the data
220249
add_unique (int):
221-
similar to `min_unique` except that such bins are added to the list
250+
similar to `min_unique` except that such bins are added to the list;
251+
set to 0 to disable
222252
factors (list of float):
223253
The factors with which the scaling is multiplied. Default is
224-
`(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01)`,
254+
`(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20)`,
225255
so if scaling is 1000, considered bin widths are 20000, 10000,
226256
5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10.
227-
return_defs (bool): If set to `True`, the function returns a list of
228-
instances of `BinDefinition`, otherwise a list of bin boundaries.
257+
label_fmt (str or Callable):
258+
A format string (default: "%g") used for threshold labels,
259+
or a function for formatting thresholds (e.g. var.str_val)
229260
230261
Returns:
231262
bin_boundaries (list of np.ndarray): a list of bin boundaries,
@@ -245,29 +276,16 @@ def decimal_binnings(
245276
246277
This is returned if `return_defs` is `False`.
247278
"""
248-
def unique_bins():
249-
if len(unique) >= 2:
250-
# make the last bin the same width as the one before
251-
last_boundary = 2 * unique[-1] - unique[-2]
252-
else:
253-
last_boundary = unique[0] + 1
254-
return BinDefinition(
255-
unique[0], len(unique), np.hstack((unique, [last_boundary])))
279+
bins = []
256280

257-
unique = np.unique(data)
258-
unique = unique[np.isfinite(unique)]
259-
if not unique.size:
260-
raise ValueError("no valid (non-nan) data")
261-
mn, mx = unique[0], unique[-1]
262-
if mn == mx or len(unique) <= min_unique:
263-
bins = unique_bins()
264-
if not return_defs:
265-
bins = get_bins(bins)
266-
return [bins]
281+
mn, mx, unique = _min_max_unique(data)
282+
if len(unique) <= max(min_unique, add_unique):
283+
bins.append(BinDefinition(_unique_thresholds(unique), label_fmt))
284+
if len(unique) <= min_unique:
285+
return bins
267286

268287
diff = mx - mn
269288
f10 = 10 ** -np.floor(np.log10(diff))
270-
bins = []
271289
max_bins = min(max_bins, len(unique))
272290
for f in factors:
273291
width = f / f10
@@ -278,40 +296,174 @@ def unique_bins():
278296
nbins = np.round((mx_ - mn_) / width)
279297
if min_bins <= nbins <= max_bins \
280298
and (not bins or bins[-1].nbins != nbins):
281-
bins.append(BinDefinition(mn_, nbins, width))
299+
bin_def = BinDefinition(mn_ + width * np.arange(nbins + 1),
300+
label_fmt, width)
301+
bins.append(bin_def)
302+
return bins
282303

283-
if add_unique is not None and len(unique) <= add_unique:
284-
if bins and bins[-1].nbins == len(unique):
285-
del bins[-1]
286-
bins.append(unique_bins())
287-
if len(unique) < min_unique:
288-
del bins[:-1]
289304

290-
if not return_defs:
291-
bins = [get_bins(bin) for bin in bins]
305+
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
    """
    Propose binnings of time data with thresholds at round calendar units.

    The candidate bin widths are

    - 1, 5, 10, 15, 30 seconds,
    - 1, 5, 10, 15, 30 minutes,
    - 1, 2, 3, 6, 12 hours,
    - 1 day,
    - 1, 2 weeks,
    - 1, 2, 3, 6 months,
    - 1, 2, 5, 10, 25, 50, 100 years;

    only those that give between `min_bins` and `max_bins` intervals are
    kept.

    Args:
        data (np.ndarray):
            vector of time points, interpreted as seconds since epoch (UTC);
            values may repeat, and nans and infs are filtered out.
        min_bins (int): minimal number of bins
        max_bins (int): maximal number of bins
        min_unique (int):
            if the number of distinct values does not exceed `min_unique`,
            the only returned binning is the one with a bin for each
            distinct value
        add_unique (int):
            if the number of distinct values does not exceed `add_unique`,
            the binning with a bin for each distinct value is put in front
            of the calendar-based binnings; set to 0 to disable

    Returns:
        bins (list of BinDefinition):
            possible binnings; for calendar-based binnings `width` is
            `None` and `width_label` describes the bin size
            (e.g. `2 weeks`).

    Raises:
        ValueError: if data contains no finite values
    """
    lowest, highest, unique = _min_max_unique(data)
    lowest, highest = time.gmtime(lowest), time.gmtime(highest)
    n_unique = len(unique)

    proposals = []
    if n_unique <= max(min_unique, add_unique):
        proposals.append(_unique_time_bins(unique))
    if n_unique <= min_unique:
        return proposals
    # the helper counts threshold points, which is one more than bins
    return proposals + _time_binnings(
        lowest, highest, min_bins + 1, max_bins + 1)
345+
346+
347+
def _time_binnings(mn, mx, min_pts, max_pts):
    """
    Collect calendar-aligned binnings covering the span from `mn` to `mx`.

    Args:
        mn (time.struct_time): the earliest time point
        mx (time.struct_time): the latest time point
        min_pts (int): minimal number of thresholds
        max_pts (int): limit on the number of thresholds, passed on to
            `_time_range`

    Returns:
        bins (list of BinDefinition): binnings ordered from the finest to
            the coarsest unit; a binning with the same number of bins as
            the one before it is skipped
    """
    # two-digit years are used only if the data starts in 1950 or later
    yfmt = "%y " if mn.tm_year >= 1950 else "%Y "
    # (index of the stepped field in a time tuple, steps, label format, unit)
    groups = (
        (5, (1, 5, 10, 15, 30), "%H:%M:%S", "second"),
        (4, (1, 5, 10, 15, 30), "%b %d %H:%M", "minute"),
        (3, (1, 2, 3, 6, 12), yfmt + "%b %d %H:%M", "hour"),
        (2, (1, ), yfmt + "%b %d", "day"),
        (2, (7, 14), yfmt + "%b %d", "week"),
        (1, (1, 2, 3, 6), yfmt + "%b", "month"),
        (0, (1, 2, 5, 10, 25, 50, 100), yfmt.strip(), "year"),
    )

    bins = []
    for place, steps, fmt, unit in groups:
        for step in steps:
            points = _time_range(mn, mx, place, step, min_pts, max_pts)
            if not points:
                continue
            # pad the 6-tuples to the 9 fields required by struct_time
            stamps = [time.struct_time(pt + (0, 0, 0)) for pt in points]
            thresholds = [calendar.timegm(stamp) for stamp in stamps]
            labels = _simplified_labels(
                [time.strftime(fmt, stamp) for stamp in stamps])
            if place == 2 and step >= 7:
                # weeks are stepped as multiples of seven days
                unit_label = f"{step // 7} week{'s' * (step > 7)}"
            else:
                unit_label = f"{step} {unit}{'s' * (step > 1)}"
            candidate = BinDefinition(thresholds, labels, None, unit_label)
            if bins and candidate.nbins == bins[-1].nbins:
                continue
            bins.append(candidate)
    return bins
372+
373+
374+
# datetime + timedelta is not very useful here because timedelta is
375+
# given as a number of days, not years or months, so it doesn't allow
376+
# for specifying a step of 1 month
377+
def _time_range(start, end, place, step, min_pts, max_pts,
                _zeros=(0, 1, 1, 0, 0, 0)):
    """
    Generate equidistant calendar points that cover `start` to `end`.

    Args:
        start (time.struct_time or tuple): the earliest time point
        end (time.struct_time or tuple): the latest time point
        place (int): index of the stepped field in the time tuple
            (0 = year, 1 = month, 2 = day, 3 = hour, 4 = minute, 5 = second)
        step (int): step size in units of `place`; for days, multiples of 7
            are treated as weeks and aligned to Mondays
        min_pts (int): minimal number of points
        max_pts (int): maximal number of points

    Returns:
        points (list of tuple or None): tuples
            `(year, month, day, hour, minute, second)`; the first is at or
            before `start` and the last is the first point after `end`
            truncated to `place`. `None` if this would take fewer than
            `min_pts` or more than `max_pts` points.
    """
    if place == 2 and step % 7 == 0:
        # weeks: go back to the Monday on or before `start`
        startd = datetime.date(*start[:3])
        startd -= datetime.timedelta(days=startd.weekday())
        start = [startd.year, startd.month, startd.day, 0, 0, 0]
    else:
        # round the stepped field down to a multiple of `step` (counting
        # from the field's zero) and reset all finer fields
        start = list(
            start[:place]
            + ((start[place] - _zeros[place]) // step * step + _zeros[place], )
            + _zeros[place + 1:])
    end = list(end[:place + 1] + _zeros[place + 1:])
    s = [tuple(start)]
    # one point is already in the list, so at most max_pts - 1 can be added
    for _ in range(max_pts - 1):
        start[place] += step
        if place >= 3:  # hours, minutes, seconds
            # carry overflow from seconds to minutes to hours to days
            for pos, maxval in enumerate((60, 60, 24), start=1):
                if start[-pos] >= maxval:
                    start[-pos - 1] += 1
                    start[-pos] %= maxval
        if place >= 2:
            # carry overflowing days into the next month
            md = calendar.monthrange(start[0], start[1])[1]
            if start[2] > md:
                start[1] += 1
                start[2] %= md
        if start[1] > 12:
            start[0] += 1
            start[1] %= 12
        s.append(tuple(start))
        if start > end:
            return s if len(s) >= min_pts else None
    return None
409+
410+
411+
def _month_days(year, month,
                _md=(None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)):
    """
    Return the number of days in the given month.

    Args:
        year (int): the year, used to decide whether February has 29 days
        month (int): the month, from 1 to 12

    Returns:
        days (int): number of days in the month
    """
    base = _md[month]
    leap_february = False
    if month == 2:
        # divisible by 4, except centuries that are not divisible by 400
        leap_february = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    return base + leap_february
415+
416+
417+
def _simplified_labels(labels):
    """
    Shorten a sequence of space-separated labels by dropping repeated words.

    Leading words that are the same in all labels are removed. After that,
    each label except the first loses the leading words that it shares with
    the label before it.

    Args:
        labels (list of str): labels consisting of space-separated words

    Returns:
        labels (list of str): a new list with shortened labels; the argument
            is not modified
    """
    labels = list(labels)  # work on a copy; the caller's list stays intact
    # placeholder starting with a digit, so nothing is put back below if no
    # word is removed
    to_remove = "42"
    while labels:
        words = [lab.split() for lab in labels]
        firsts = {ws[0] for ws in words if ws}
        # stop when first words differ, or when removing the common word
        # would leave some label empty
        if len(firsts) != 1 or any(len(ws) < 2 for ws in words):
            break
        to_remove = firsts.pop()
        labels = [lab.split(None, 1)[1] for lab in labels]
    # go backwards so that each label is compared with the unshortened
    # label before it
    for i in range(len(labels) - 1, 0, -1):
        current, previous = labels[i].split(), labels[i - 1].split()
        for k, (c, d) in enumerate(zip(current, previous)):
            if c != d:
                labels[i] = " ".join(current[k:])
                break
    # If the last thing removed were month names and the labels continue with
    # hours, keep month name in the first label; "08 12:29" looks awkward.
    if labels and not to_remove[0].isdigit() and ":" in labels[0]:
        labels[0] = f"{to_remove} {labels[0]}"
    return labels
436+
437+
438+
def _unique_time_bins(unique):
    """
    Construct a binning with one bin for each distinct time point.

    Args:
        unique (np.ndarray): sorted distinct time points, in seconds since
            epoch (UTC)

    Returns:
        bins (BinDefinition): thresholds are the given values plus the top
            boundary; labels are the formatted time points
    """
    # keep whole time tuples: they are needed for tm_year and for strftime
    times = [time.gmtime(x) for x in unique]
    # two-digit years are used only if the data starts in 1950 or later
    fmt = f'{"%y " if times[0].tm_year >= 1950 else "%Y "} %b %d'
    # compare the fields from the day of month onwards
    fmt += " %H:%M" * (len({t[2:] for t in times}) > 1)
    # NOTE(review): seconds are shown when all values are whole minutes;
    # this looks inverted -- confirm the intent.
    # bool() is needed because str can't be multiplied by np.bool_
    fmt += ":%S" * bool(np.all(unique % 60 == 0))
    return BinDefinition(_unique_thresholds(unique),
                         [time.strftime(fmt, t) for t in times])
445+
446+
447+
def _unique_thresholds(unique):
    """
    Return the given values with a top boundary appended.

    Args:
        unique (np.ndarray): sorted distinct values; must not be empty

    Returns:
        thresholds (np.ndarray): `unique` followed by the top boundary. The
            last bin is as wide as the one before it, or 1 if there is only
            a single value.
    """
    if len(unique) < 2:
        top = unique[0] + 1
    else:
        # mirror the second-to-last value over the last one
        top = 2 * unique[-1] - unique[-2]
    return np.hstack((unique, [top]))
454+
455+
456+
def _min_max_unique(data):
    """
    Return the smallest, the largest and all distinct finite values.

    Args:
        data (np.ndarray): data points; nans and infs are ignored

    Returns:
        (min, max, unique): the minimum, the maximum and a sorted array of
            distinct finite values

    Raises:
        ValueError: if data contains no finite values
    """
    values = np.unique(data)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        raise ValueError("no valid (non-nan) data")
    return finite[0], finite[-1], finite
462+
463+
464+
# Short forms of time unit names; units without an entry have no short form
# defined here
short_time_units = {
    "seconds": "sec",
    "minutes": "min",
    "hours": "hrs",
    "weeks": "wks",
    "months": "mon",
    "years": "yrs",
    "second": "sec",
    "minute": "min",
    "month": "mon",
}
315467

316468

317469
# noinspection PyPep8Naming

Orange/preprocess/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)