22import logging
33
44import pandas as pd
5+ import numpy as np
56from scipy import interpolate
67
78from dku_timeseries .dataframe_helpers import has_duplicates , nothing_to_do , filter_empty_columns , generic_check_compute_arguments
@@ -69,6 +70,7 @@ def check(self):
6970
7071
7172class Resampler :
73+ RESAMPLEABLE_TYPES = [int , float , np .float32 , np .int32 ]
7274
7375 def __init__ (self , params = None ):
7476
@@ -77,7 +79,7 @@ def __init__(self, params=None):
7779 self .params = params
7880 self .params .check ()
7981
80- def transform (self , df , datetime_column , groupby_columns = None ):
82+ def transform (self , df , datetime_column , groupby_columns = None ):
8183 if groupby_columns is None :
8284 groupby_columns = []
8385
@@ -94,8 +96,8 @@ def transform(self, df, datetime_column, groupby_columns=None):
9496 # when there are multiple time series, their time ranges are not necessarily the same,
9597 # so we compute a unified time index for all partitions
9698 reference_time_index = self ._compute_full_time_index (df_copy , datetime_column )
97- columns_to_resample = [col for col in df_copy .select_dtypes ([ int , float ] ).columns .tolist () if col != datetime_column and col not in groupby_columns ]
98- category_columns = [col for col in df .select_dtypes ([ object , bool ] ).columns .tolist () if col != datetime_column and col not in columns_to_resample and
99+ columns_to_resample = [col for col in df_copy .select_dtypes (Resampler . RESAMPLEABLE_TYPES ).columns .tolist () if col != datetime_column and col not in groupby_columns ]
100+ category_columns = [col for col in df .select_dtypes (exclude = Resampler . RESAMPLEABLE_TYPES ).columns .tolist () if col != datetime_column and col not in columns_to_resample and
99101 col not in groupby_columns ]
100102 if groupby_columns :
101103 grouped = df_copy .groupby (groupby_columns )
@@ -232,6 +234,13 @@ def _fill_in_category_values(self, df, category_columns):
232234 elif self .params .category_imputation_method == "clip" :
233235 category_filled_df .loc [:, category_columns ] = category_filled_df .loc [:, category_columns ].ffill ().bfill ()
234236 elif self .params .category_imputation_method == "mode" :
237+ # .mode() loses the timezone info of any datetimetz column, so it is restored on the computed modes below
235238 most_frequent_categoricals = category_filled_df .loc [:, category_columns ].mode ().iloc [0 ]
239+
240+ for col in category_columns :
241+ # only perform conversion if the column has a timezone
242+ if pd .api .types .is_datetime64_any_dtype (category_filled_df [col ]) and category_filled_df [col ].dt .tz is not None :
243+ most_frequent_categoricals [col ] = most_frequent_categoricals [col ].tz_localize ("UTC" )
244+
236245 category_filled_df .loc [:, category_columns ] = category_filled_df .loc [:, category_columns ].fillna (most_frequent_categoricals )
237246 return category_filled_df
0 commit comments