Skip to content

Commit b08fc7e

Browse files
authored
Merge pull request #4226 from PrimozGodec/fix-time
[FIX] Fix guessing strategy for date and time variables
2 parents 6795cb9 + 24e98ef commit b08fc7e

File tree

3 files changed

+66
-17
lines changed

3 files changed

+66
-17
lines changed

Orange/data/io.py

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -145,30 +145,39 @@ def guess_data_type(orig_values, namask=None):
145145
"""
146146
valuemap, values = None, orig_values
147147
is_discrete = is_discrete_values(orig_values)
148+
orig_values = np.asarray(orig_values, dtype=str)
149+
if namask is None:
150+
namask = isnastr(orig_values)
148151
if is_discrete:
149152
valuemap = sorted(is_discrete)
150153
coltype = DiscreteVariable
151154
else:
152155
# try to parse as float
153-
orig_values = np.asarray(orig_values)
154-
if namask is None:
155-
namask = isnastr(orig_values)
156156
values = np.empty_like(orig_values, dtype=float)
157157
values[namask] = np.nan
158158
try:
159159
np.copyto(values, orig_values, where=~namask, casting="unsafe")
160160
except ValueError:
161-
tvar = TimeVariable('_')
162-
try:
163-
values[~namask] = [tvar.parse(i) for i in orig_values[~namask]]
164-
except ValueError:
165-
coltype = StringVariable
166-
# return original_values
167-
values = orig_values
168-
else:
169-
coltype = TimeVariable
161+
values = orig_values
162+
coltype = StringVariable
170163
else:
171164
coltype = ContinuousVariable
165+
166+
if coltype is not ContinuousVariable:
167+
# if the column is not continuous, it can still be a time variable, even if
168+
# it was previously recognized as discrete
169+
tvar = TimeVariable('_')
170+
# introducing a new variable prevents overwriting orig_values and values
171+
temp_values = np.empty_like(orig_values, dtype=float)
172+
try:
173+
temp_values[~namask] = [
174+
tvar.parse_exact_iso(i) for i in orig_values[~namask]]
175+
except ValueError:
176+
pass
177+
else:
178+
valuemap = None
179+
coltype = TimeVariable
180+
values = temp_values
172181
return valuemap, values, coltype
173182

174183

Orange/data/tests/test_io.py

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
import unittest
22
import numpy as np
33

4-
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable
4+
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
5+
TimeVariable
56
from Orange.data.io import guess_data_type
67

78

@@ -68,3 +69,27 @@ def test_guess_data_type_string(self):
6869
self.assertEqual(StringVariable, coltype)
6970
self.assertIsNone(valuemap)
7071
np.testing.assert_array_equal(in_values, values)
72+
73+
def test_guess_data_type_time(self):
74+
in_values = ["2019-10-10", "2019-10-10", "2019-10-10", "2019-10-01"]
75+
valuemap, _, coltype = guess_data_type(in_values)
76+
self.assertEqual(TimeVariable, coltype)
77+
self.assertIsNone(valuemap)
78+
79+
in_values = ["2019-10-10T12:08:51", "2019-10-10T12:08:51",
80+
"2019-10-10T12:08:51", "2019-10-01T12:08:51"]
81+
valuemap, _, coltype = guess_data_type(in_values)
82+
self.assertEqual(TimeVariable, coltype)
83+
self.assertIsNone(valuemap)
84+
85+
in_values = ["2019-10-10 12:08:51", "2019-10-10 12:08:51",
86+
"2019-10-10 12:08:51", "2019-10-01 12:08:51"]
87+
valuemap, _, coltype = guess_data_type(in_values)
88+
self.assertEqual(TimeVariable, coltype)
89+
self.assertIsNone(valuemap)
90+
91+
in_values = ["2019-10-10 12:08", "2019-10-10 12:08",
92+
"2019-10-10 12:08", "2019-10-01 12:08"]
93+
valuemap, _, coltype = guess_data_type(in_values)
94+
self.assertEqual(TimeVariable, coltype)
95+
self.assertIsNone(valuemap)

Orange/data/variable.py

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -889,6 +889,13 @@ class TimeVariable(ContinuousVariable):
889889
r'\d{2}\d{2}\d{2}\.\d+|'
890890
r'\d{1,4}(-?\d{2,3})?'
891891
r')$')
892+
893+
class InvalidDateTimeFormatError(ValueError):
894+
def __init__(self, date_string):
895+
super().__init__(
896+
"Invalid datetime format '{}'. "
897+
"Only ISO 8601 supported.".format(date_string))
898+
892899
_matches_iso_format = re.compile(REGEX).match
893900

894901
# UTC offset and associated timezone. If parsed datetime values provide an
@@ -954,16 +961,14 @@ def parse(self, datestr):
954961
return Unknown
955962
datestr = datestr.strip().rstrip('Z')
956963

957-
ERROR = ValueError("Invalid datetime format '{}'. "
958-
"Only ISO 8601 supported.".format(datestr))
959964
if not self._matches_iso_format(datestr):
960965
try:
961966
# If it is a number, assume it is a unix timestamp
962967
value = float(datestr)
963968
self.have_date = self.have_time = 1
964969
return value
965970
except ValueError:
966-
raise ERROR
971+
raise self.InvalidDateTimeFormatError(datestr)
967972

968973
for i, (have_date, have_time, fmt) in enumerate(self._ISO_FORMATS):
969974
try:
@@ -984,7 +989,7 @@ def parse(self, datestr):
984989
self.UNIX_EPOCH.day)
985990
break
986991
else:
987-
raise ERROR
992+
raise self.InvalidDateTimeFormatError(datestr)
988993

989994
# Remember UTC offset. If not all parsed values share the same offset,
990995
# remember none of it.
@@ -1010,6 +1015,16 @@ def parse(self, datestr):
10101015
except OverflowError:
10111016
return -(self.UNIX_EPOCH - dt).total_seconds()
10121017

1018+
def parse_exact_iso(self, datestr):
1019+
"""
1020+
This function is a wrapper around the `parse` function. It checks
1021+
whether the date is in the ISO format - it does not accept float-like
1022+
dates.
1023+
"""
1024+
if not self._matches_iso_format(datestr):
1025+
raise self.InvalidDateTimeFormatError(datestr)
1026+
return self.parse(datestr)
1027+
10131028
def to_val(self, s):
10141029
"""
10151030
Convert a value, given as an instance of an arbitrary type, to a float.

0 commit comments

Comments
 (0)