CSV Import: guess data types

PrimozGodec · PrimozGodec · commit e97053574dc7 · 2020-06-03T13:02:13.000+02:00
diff --git a/Orange/widgets/data/owcsvimport.py b/Orange/widgets/data/owcsvimport.py
@@ -907,6 +907,7 @@ def __set_error_state(self, err):
         if isinstance(err, UnicodeDecodeError):
             self.Error.encoding_error(exc_info=err)
         else:
+            raise err
             self.Error.error(exc_info=err)
 
         path = self.current_item().path()
@@ -1259,6 +1260,8 @@ def expand(ranges):
             float_precision="round_trip",
             **numbers_format_kwds
         )
+        df = guess_types(df, dtypes, columns_ignored)
+
         if columns_ignored:
             # TODO: use 'usecols' parameter in `read_csv` call to
             # avoid loading/parsing the columns
@@ -1270,6 +1273,86 @@ def expand(ranges):
         return df
 
 
+def guess_types(
+        df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: List[int]
+) -> pd.DataFrame:
+    """
+    Guess the dtype of every column the user left on automatic detection.
+
+    Parameters
+    ----------
+    df
+        Data frame; guessed columns are reassigned on this same object.
+    dtypes
+        Data types set by the user, keyed by column position. Types are
+        guessed only for columns that do not have a data type defined.
+    columns_ignored
+        List with indices of ignored columns. Ignored columns are skipped.
+
+    Returns
+    -------
+    The same data frame, with dtypes changed by `guess_data_type`.
+    """
+    for i, col in enumerate(df):
+        # guess only when no type is set for this column position
+        if dtypes.get(i, None) is None and i not in columns_ignored:
+            df[col] = guess_data_type(df[col])
+    return df
+
+
+def guess_data_type(col: pd.Series) -> pd.Series:
+    """
+    Guess the type of a column from its values. The logic is meant to be the
+    same as in guess_data_type from the io_utils module. This function only
+    changes the dtype of the column so that the later conversion can pick
+    the matching Orange.data variable. Logic:
+    - if the dtype is numeric:
+        - only values {0, 1} or {1, 2} -> category (DiscreteVariable)
+        - else -> unchanged (ContinuousVariable)
+    - if the dtype is not numeric:
+        - all unique values parse with pd.to_datetime -> TimeVariable
+        - num_unique_values < len(data) ** 0.7 and < 100 -> DiscreteVariable
+        - else -> unchanged (StringVariable)
+
+    Parameters
+    ----------
+    col
+        Data column
+
+    Returns
+    -------
+    The data column, converted to the guessed dtype or returned unchanged.
+    """
+    def parse_dates(s):
+        """
+        Parse dates; return None if any unique value raises ValueError.
+        For large data, the same dates are often repeated. Rather than
+        re-parse these, we store all unique dates, parse them, and
+        use a lookup to convert all dates.
+        """
+        try:
+            dates = {date: pd.to_datetime(date) for date in s.unique()}
+        except ValueError:
+            return None
+        return s.map(dates)
+
+    if pdtypes.is_numeric_dtype(col):
+        unique_values = col.unique()
+        if len(unique_values) <= 2 and (
+                len(np.setdiff1d(unique_values, [0, 1])) == 0
+                or len(np.setdiff1d(unique_values, [1, 2])) == 0):
+            return col.astype("category")
+    else:  # non-numeric dtype (presumably object)
+        # try to parse as dates; None means the column is not a date column
+        parsed_col = parse_dates(col)
+        if parsed_col is not None:
+            return parsed_col
+        unique_values = col.unique()
+        if len(unique_values) < 100 and len(unique_values) < len(col)**0.7:
+            return col.astype("category")
+    return col
+
+
 def clear_stack_on_cancel(f):
     """
     A decorator that catches the TaskState.UserCancelException exception
@@ -1465,7 +1548,8 @@ def pandas_to_table(df):
             )
             # Remap the coldata into the var.values order/set
             coldata = pd.Categorical(
-                coldata, categories=var.values, ordered=coldata.ordered
+                coldata.astype("str"), categories=var.values,
+                ordered=coldata.ordered,
             )
             codes = coldata.codes
             assert np.issubdtype(codes.dtype, np.integer)
diff --git a/Orange/widgets/data/tests/data-csv-types.tab b/Orange/widgets/data/tests/data-csv-types.tab
@@ -0,0 +1,6 @@
+time	numeric1	discrete1	numeric2	discrete2	string
+2020-05-05	1	0		a	a
+2020-05-06	2	1		a	b
+2020-05-07	3	0		a	c
+2020-05-08	4	1		b	d
+2020-05-09	5	1		b	e
diff --git a/Orange/widgets/data/tests/test_owcsvimport.py b/Orange/widgets/data/tests/test_owcsvimport.py
@@ -12,6 +12,8 @@
 
 from AnyQt.QtCore import QSettings
 
+from Orange.data import DiscreteVariable, TimeVariable, ContinuousVariable, \
+    StringVariable
 from Orange.tests import named_file
 from Orange.widgets.tests.base import WidgetTest, GuiTest
 from Orange.widgets.data import owcsvimport
@@ -127,6 +129,37 @@ def test_summary(self):
         output_sum.assert_called_with(len(output),
                                       format_summary_details(output))
 
+    data_csv_types_options = owcsvimport.Options(
+        encoding="ascii", dialect=csv.excel_tab(),
+        columntypes=[
+            (range(0, 5), ColumnType.Auto),  # NOTE(review): 5 of 6 columns
+        ]
+    )
+
+    def test_type_guessing(self):
+        """Check that the correct type is guessed for columns set to Auto."""
+        dirname = os.path.dirname(__file__)
+        path = os.path.join(dirname, "data-csv-types.tab")
+        widget = self.create_widget(
+            owcsvimport.OWCSVFileImport,
+            stored_settings={
+                "_session_items": [
+                    (path, self.data_csv_types_options.as_dict())
+                ]
+            }
+        )
+        widget.commit()
+        self.wait_until_finished(widget)
+        output = self.get_output("Data", widget)
+        domain = output.domain
+
+        self.assertIsInstance(domain["time"], TimeVariable)
+        self.assertIsInstance(domain["discrete1"], DiscreteVariable)
+        self.assertIsInstance(domain["discrete2"], DiscreteVariable)
+        self.assertIsInstance(domain["numeric1"], ContinuousVariable)
+        self.assertIsInstance(domain["numeric2"], ContinuousVariable)
+        self.assertIsInstance(domain["string"], StringVariable)
+
 
 class TestImportDialog(GuiTest):
     def test_dialog(self):
@@ -253,3 +286,7 @@ class dialect(csv.excel):
         assert_array_equal(tb.X[:, 0], [np.nan, 0, np.nan])
         assert_array_equal(tb.X[:, 1], [0, np.nan, np.nan])
         assert_array_equal(tb.X[:, 2], [np.nan, 1, np.nan])
+
+
+if __name__ == "__main__":  # run the tests when executed as a script
+    unittest.main()