@@ -1259,6 +1259,8 @@ def expand(ranges):
12591259 float_precision = "round_trip" ,
12601260 ** numbers_format_kwds
12611261 )
1262+ df = guess_types (df , dtypes , columns_ignored )
1263+
12621264 if columns_ignored :
12631265 # TODO: use 'usecols' parameter in `read_csv` call to
12641266 # avoid loading/parsing the columns
@@ -1270,6 +1272,86 @@ def expand(ranges):
12701272 return df
12711273
12721274
def guess_types(
        df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: List[int]
) -> pd.DataFrame:
    """
    Infer dtypes for the columns whose type was not chosen by the user.

    The frame is updated in place: every eligible column is replaced by the
    result of `guess_data_type`, and the same frame object is returned.

    Parameters
    ----------
    df
        Data frame whose columns are inspected.
    dtypes
        Mapping from column position to the data type set by the user.
        A column is guessed only when its position has no entry here
        (or the entry is None).
    columns_ignored
        Positions of ignored columns; these are never guessed.

    Returns
    -------
    The data frame passed in, with dtypes changed for the guessed columns.
    """
    for position, name in enumerate(df):
        # A type set explicitly by the user always wins over guessing.
        if dtypes.get(position) is not None:
            continue
        # Ignored columns are left untouched.
        if position in columns_ignored:
            continue
        df[name] = guess_data_type(df[name])
    return df
1300+
1301+
def guess_data_type(col: pd.Series) -> pd.Series:
    """
    Guess the type of a column and return it with a matching dtype.

    The logic is the same as in guess_data_type from the io_utils module.
    This function only changes the dtype of the column so that the correct
    Orange.data.variable is used later.
    Logic:
    - if numeric (only numbers)
        - only values {0, 1} or {1, 2} -> DiscreteVariable
        - else -> ContinuousVariable
    - if not numeric:
        - if every unique value can be converted by `pd.to_datetime`
          -> TimeVariable
        - num_unique_values < len(data) ** 0.7 and < 100 -> DiscreteVariable
        - else -> StringVariable

    Parameters
    ----------
    col
        Data column

    Returns
    -------
    Data column with the guessed dtype: a categorical copy, a column of
    parsed datetimes, or `col` itself when no conversion applies.
    """
    def parse_dates(s):
        """
        Parse a column as datetimes, or return None if it is not a date.

        For large data, the same dates are often repeated. Rather than
        re-parse these, we parse each unique value once and use a lookup
        to convert the whole column.
        """
        try:
            dates = {date: pd.to_datetime(date) for date in s.unique()}
        except (ValueError, TypeError, OverflowError):
            # ValueError: the string is not a recognizable date.
            # TypeError: the value is not convertible at all, e.g. Python
            # bools stored in an object column next to missing values.
            # OverflowError: guards against out-of-range values.
            # In every case the column is simply not a date column.
            return None
        return s.map(dates)

    if pdtypes.is_numeric_dtype(col):
        unique_values = col.unique()
        # Note: a missing value counts as a distinct value here, so a
        # {0, 1} column with NaN is kept numeric.
        if len(unique_values) <= 2 and (
                len(np.setdiff1d(unique_values, [0, 1])) == 0
                or len(np.setdiff1d(unique_values, [1, 2])) == 0):
            return col.astype("category")
    else:  # object
        # try to parse as date - None means the column is not a date
        parsed_col = parse_dates(col)
        if parsed_col is not None:
            return parsed_col
        unique_values = col.unique()
        if len(unique_values) < 100 and len(unique_values) < len(col) ** 0.7:
            return col.astype("category")
    return col
1353+
1354+
12731355def clear_stack_on_cancel (f ):
12741356 """
12751357 A decorator that catches the TaskState.UserCancelException exception
@@ -1465,7 +1547,8 @@ def pandas_to_table(df):
14651547 )
14661548 # Remap the coldata into the var.values order/set
14671549 coldata = pd .Categorical (
1468- coldata , categories = var .values , ordered = coldata .ordered
1550+ coldata .astype ("str" ), categories = var .values ,
1551+ ordered = coldata .ordered ,
14691552 )
14701553 codes = coldata .codes
14711554 assert np .issubdtype (codes .dtype , np .integer )
0 commit comments