Skip to content

Commit 8ecbe68

Browse files
authored
Merge pull request #4838 from PrimozGodec/csvimport-autovariable
[ENH] CSV Import: guess data types
2 parents c838736 + 9c19f2f commit 8ecbe68

File tree

3 files changed

+127
-1
lines changed

3 files changed

+127
-1
lines changed

Orange/widgets/data/owcsvimport.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1259,6 +1259,8 @@ def expand(ranges):
12591259
float_precision="round_trip",
12601260
**numbers_format_kwds
12611261
)
1262+
df = guess_types(df, dtypes, columns_ignored)
1263+
12621264
if columns_ignored:
12631265
# TODO: use 'usecols' parameter in `read_csv` call to
12641266
# avoid loading/parsing the columns
@@ -1270,6 +1272,86 @@ def expand(ranges):
12701272
return df
12711273

12721274

1275+
def guess_types(
    df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: List[int]
) -> pd.DataFrame:
    """
    Infer dtypes for the columns the user left on automatic.

    Parameters
    ----------
    df
        Data frame whose columns are inspected. It is modified in place.
    dtypes
        Data types set by the user, keyed by column position. Only columns
        that have no type defined here are guessed.
    columns_ignored
        Positions of the ignored columns. Ignored columns are skipped.

    Returns
    -------
    The same data frame, with every guessed column replaced by the result
    of `guess_data_type`.
    """
    for position, name in enumerate(df.columns):
        # A type chosen explicitly in the widget dialog always wins.
        has_user_type = dtypes.get(position) is not None
        if has_user_type or position in columns_ignored:
            continue
        df[name] = guess_data_type(df[name])
    return df
1300+
1301+
1302+
def guess_data_type(col: pd.Series) -> pd.Series:
    """
    Guess the type of a column and return it with a matching dtype.

    The logic follows guess_data_type from the io_utils module. This
    function only changes the dtype of the column so that the correct
    Orange.data.variable is used later.

    Logic:
    - if numeric (only numbers)
        - the non-missing values are a non-empty subset of {0, 1} or of
          {1, 2} -> category dtype (DiscreteVariable)
        - else -> unchanged (ContinuousVariable)
    - if not numeric:
        - every unique value can be converted to date-time -> date-time
          column (TimeVariable)
        - num_unique_values < len(data) ** 0.7 and < 100 -> category dtype
          (DiscreteVariable)
        - else -> unchanged (StringVariable)

    Parameters
    ----------
    col
        Data column

    Returns
    -------
    Data column with the guessed dtype. The input column itself is returned
    when no conversion applies.
    """
    def parse_dates(s):
        """
        Convert `s` to date-times, or return None if any value fails.

        For large data, the same dates are often repeated. Rather than
        re-parse these, we parse every unique value once and use a lookup
        to convert the whole column.
        """
        try:
            dates = {date: pd.to_datetime(date) for date in s.unique()}
        except (ValueError, TypeError, OverflowError):
            # Any conversion failure means "not a date column"; it must not
            # abort the import.
            return None
        return s.map(dates)

    if pdtypes.is_numeric_dtype(col):
        # Missing values must not take part in the decision: a binary
        # column with blank cells is still binary.
        unique_values = col.dropna().unique()
        # A column without any value stays numeric instead of becoming a
        # category that has no categories.
        if 0 < len(unique_values) <= 2 and (
                len(np.setdiff1d(unique_values, [0, 1])) == 0
                or len(np.setdiff1d(unique_values, [1, 2])) == 0):
            return col.astype("category")
    else:  # object
        # try to parse as date - None means it is not a date column
        parsed_col = parse_dates(col)
        if parsed_col is not None:
            return parsed_col
        unique_values = col.unique()
        if len(unique_values) < 100 and len(unique_values) < len(col)**0.7:
            return col.astype("category")
    return col
1353+
1354+
12731355
def clear_stack_on_cancel(f):
12741356
"""
12751357
A decorator that catches the TaskState.UserCancelException exception
@@ -1465,7 +1547,8 @@ def pandas_to_table(df):
14651547
)
14661548
# Remap the coldata into the var.values order/set
14671549
coldata = pd.Categorical(
1468-
coldata, categories=var.values, ordered=coldata.ordered
1550+
coldata.astype("str"), categories=var.values,
1551+
ordered=coldata.ordered,
14691552
)
14701553
codes = coldata.codes
14711554
assert np.issubdtype(codes.dtype, np.integer)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
time numeric1 discrete1 numeric2 discrete2 string
2+
2020-05-05 1 0 a a
3+
2020-05-06 2 1 a b
4+
2020-05-07 3 0 a c
5+
2020-05-08 4 1 b d
6+
2020-05-09 5 1 b e

Orange/widgets/data/tests/test_owcsvimport.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
from AnyQt.QtCore import QSettings
1414

15+
from Orange.data import DiscreteVariable, TimeVariable, ContinuousVariable, \
16+
StringVariable
1517
from Orange.tests import named_file
1618
from Orange.widgets.tests.base import WidgetTest, GuiTest
1719
from Orange.widgets.data import owcsvimport
@@ -127,6 +129,37 @@ def test_summary(self):
127129
output_sum.assert_called_with(len(output),
128130
format_summary_details(output))
129131

132+
data_csv_types_options = owcsvimport.Options(
133+
encoding="ascii", dialect=csv.excel_tab(),
134+
columntypes=[
135+
(range(0, 5), ColumnType.Auto),
136+
]
137+
)
138+
139+
def test_type_guessing(self):
    """ Check if correct column type is guessed when column type auto """
    data_path = os.path.join(
        os.path.dirname(__file__), "data-csv-types.tab"
    )
    session_items = [(data_path, self.data_csv_types_options.as_dict())]
    widget = self.create_widget(
        owcsvimport.OWCSVFileImport,
        stored_settings={"_session_items": session_items},
    )
    widget.commit()
    self.wait_until_finished(widget)
    domain = self.get_output("Data", widget).domain

    # Column name -> variable class expected for it, checked in order.
    expected_types = (
        ("time", TimeVariable),
        ("discrete1", DiscreteVariable),
        ("discrete2", DiscreteVariable),
        ("numeric1", ContinuousVariable),
        ("numeric2", ContinuousVariable),
        ("string", StringVariable),
    )
    for column, variable_type in expected_types:
        self.assertIsInstance(domain[column], variable_type)
162+
130163

131164
class TestImportDialog(GuiTest):
132165
def test_dialog(self):
@@ -253,3 +286,7 @@ class dialect(csv.excel):
253286
assert_array_equal(tb.X[:, 0], [np.nan, 0, np.nan])
254287
assert_array_equal(tb.X[:, 1], [0, np.nan, np.nan])
255288
assert_array_equal(tb.X[:, 2], [np.nan, 1, np.nan])
289+
290+
291+
if __name__ == "__main__":
292+
unittest.main()

0 commit comments

Comments
 (0)