@@ -1027,7 +1027,6 @@ def get_model_outliers_withgap(df, window=3):
10271027 # todo - clean up, return
10281028
10291029
1030-
10311030# TODO: Add option - estimate_outl_size
10321031# TODO: Add option - sigmoid steps
10331032# TODO: ADD option - gaussian spikes
@@ -1092,206 +1091,46 @@ def get_model_outliers(df, window=3):
10921091 dfo .mean_group_diff = dfo .mean_group_diff .fillna (0 )
10931092 dfo .mean_group_diff_abs = dfo .mean_group_diff_abs .fillna (0 )
10941093
1095- dfo ['is_step' ] = (dfo .mean_group_diff < thr_low ) | (dfo .mean_group_diff > thr_hi )
1096-        dfo['is_spike'] = (dfo.mean_group_diff_abs - dfo.mean_group_diff) > (thr_hi - thr_low) / 2
1097- dfo [ 'ischange_cumsum' ] = dfo .ischange . cumsum ( )
1094+ # dfo['is_step'] = (dfo.mean_group_diff < thr_low) | (dfo.mean_group_diff > thr_hi)
1095+        dfo['is_step'] = dfo['ischange_group'] & (
1096+            ((dfo.mean_group_diff < thr_low) | (dfo.mean_group_diff > thr_hi)))
10981097
1099- # logger_info('DF_OUTL: ',dfo)
1098+ dfo ['is_spike' ] = (dfo .mean_group_diff_abs - dfo .mean_group_diff .abs ()) > (thr_hi - thr_low ) / 2
1099+ dfo ['ischange_cumsum' ] = dfo .ischange .cumsum ()
11001100
11011101 df_outl = (
1102- dfo .loc [dfo .ischange .astype (bool )].groupby ('change_group' )
1103- . apply ( lambda x :pd .Series ({'outl_start' :x . head ( 1 ) [x_col ].iloc [0 ],'outl_end' : x . tail ( 1 )[ x_col ]. iloc [ 0 ]}))
1104- .reset_index ()
1102+ dfo .loc [dfo .ischange .astype (bool )].groupby ('change_group' ). apply (
1103+ lambda x : pd .Series ({'outl_start' : x [x_col ].iloc [0 ],
1104+ 'outl_end' : x [ x_col ]. iloc [ - 1 ]})) .reset_index ()
11051105 )
11061106
11071107 if df_outl .empty : # No outliers - nothing to do
1108- return None , None
1108+ return np .full (dfo .index .size , False ), \
1109+ np .full (dfo .index .size , False )
11091110
11101111 df_outl = df_outl .merge (dfo [['change_group' , 'is_spike' , 'is_step' ]].drop_duplicates ())
11111112
11121113 dfo = dfo .merge (df_outl , how = 'left' )
1113- dfo ['outl_start' ] = dfo .outl_start .fillna (0 ).astype (int )
1114- dfo ['outl_end' ] = dfo .outl_end .fillna (0 ).astype (int )
1115-
1116- dfo = dfo # .reset_index()
1117-
1118- df_spikes = df_outl .loc [df_outl .is_spike ]
1119- df_steps = df_outl .loc [df_outl .is_step ]
1120-
1121- l_model_outl = []
1122- l_mask_step = []
1123- l_mask_spike = []
1124-
1125- for g in df_spikes .change_group :
1126- s_spike = df_spikes .loc [df_spikes .change_group == g ].iloc [0 ]
1127- if with_dates :
1128- mask_spike_tmp = ~ ((dfo .date >= pd .to_datetime (s_spike .outl_start )) &
1129- (dfo .date < pd .to_datetime (s_spike .outl_end )))
1130- else :
1131- mask_spike_tmp = ~ ((dfo .x .values >= s_spike .outl_start ) &
1132- (dfo .x .values < s_spike .outl_end ))
1133- l_mask_spike += [mask_spike_tmp .astype (float )]
1134-
1135- for g in df_steps .change_group :
1136- s_step = df_steps .loc [df_steps .change_group == g ].iloc [0 ]
1137- if with_dates :
1138- model_step_tmp = get_model_step_date (pd .to_datetime (s_step .outl_start ))
1139- else :
1140- model_step_tmp = (
1141- fix_params_fmodel (
1142- model_step , [s_step .outl_start , np .NaN ])
1143- )
1144- l_model_outl += [model_step_tmp ]
1145- if with_dates :
1146- mask_step_tmp = (dfo .date == pd .to_datetime (s_step .outl_start ))
1147- else :
1148- mask_step_tmp = (dfo .x .values == s_step .outl_start )
1149- l_mask_step += [mask_step_tmp .astype (float )]
1150-
1151- if len (l_model_outl ) == 0 :
1152- model_outliers = None
1153- else :
1154- model_outliers = np .prod (l_model_outl ) if is_mult else np .sum (l_model_outl )
1155- if len (l_mask_spike ) == 0 :
1156- mask_spike = None
1157- else :
1158- mask_spike = np .prod (l_mask_spike , axis = 0 )
1159- if len (l_mask_step )== 0 :
1160- mask_step = None
1114+ if with_dates :
1115+ dfo ['outl_start' ] = pd .to_datetime (dfo .outl_start )
1116+ dfo ['outl_end' ] = pd .to_datetime (dfo .outl_end )
1117+ dfo ['mask_spike' ] = (dfo ['is_spike' ] &
1118+ (dfo .date >= pd .to_datetime (dfo .outl_start )) &
1119+ (dfo .date < pd .to_datetime (dfo .outl_end )))
1120+ dfo ['mask_step' ] = (dfo ['is_step' ] &
1121+ (dfo .date >= pd .to_datetime (dfo .outl_start )) &
1122+ (dfo .date <= pd .to_datetime (dfo .outl_end )))
11611123 else :
1162- mask_step = np .sum (l_mask_step , axis = 0 )
1163- return mask_step , mask_spike
1164- #return model_outliers, mask_spike
1165- # return model_outliers, l_mask_spike
1166-
1167-
1168- # TODO: Remove - replaced by get_model_outliers()
1169- def find_steps_and_spikes (a_x , a_y , a_date , window = 3 , max_changes = None ):
1170- """
1171- Automatically locate steps and spikes for a_y time series.
1172- A rule of thumb for the window is two weeks.
1173-
1174- :param a_x: The values of the x axis
1175- :param a_y: The values of the y axis
1176- :param a_date: The values of the x axis, if they are dates
1177- :param window: The x-axis window to aggregate multiple steps/spikes
1178- :type window: int
1179- :param max_changes: Manual input of the number of changes that the
1180- input time series is expected to have.
1181- If max_changes = 0 or None, the function returns
1182- all changes (steps / spikes) found.
1183- :type max_changes: int
1184- :return: changes_list
1185- :rtype: list of dictionaries
1186- Each dictionary includes information on a single change,
1187- and is of the format:
1188- {'change_type': change_type, # ('spike' or 'step')
1189- 'duration': duration, # int - the x-axis duration of the change
1190- 'diff': diff, # the difference within the change
1191- 'x': x, # the x-axis at the middle of the change
1192- 'date': date} # the x-axis date at the middle of the change
1193- """
1194- # find NaNs in a_y
1195- df = pd .DataFrame ({'y' : a_y }).interpolate ('slinear' )
1196-
1197- # Find peak changes in diff
1198- df ['diff' ] = df .y .diff ()
1199-
1200- # Outliers are < Q1 - 3 * IQR, > Q3 + 3 * IQR
1201- # Q1 and Q3 are the 25th (1st) and 75th (3rd) quartiles, and
1202- # IQR is the inter-quartile range
1203- q1 = df ['diff' ].quantile (0.25 )
1204- q3 = df ['diff' ].quantile (0.75 )
1205- iqr = q3 - q1
1206- low_thresh = q1 - 1.5 * iqr
1207- high_thresh = q3 + 1.5 * iqr
1208-
1209- # df['is_change'] = 0
1210- step_filt = (df ['diff' ] < low_thresh ) | (df ['diff' ] > high_thresh )
1211- df ['is_change' ] = step_filt .astype (int )
1212-
1213- if not any (step_filt ):
1214- return [], []
1215-
1216- # df.loc[step_filt, 'is_change'] = 1
1217-
1218- # Now that we have found the outliers in differences,
1219- # group consecutive steps together
1220-
1221- # get only the diffs that correspond to changes
1222- df ['diff' ] = df ['diff' ] * df ['is_change' ]
1223- df [['diff_sum' , 'change_sum' ]] = df [['diff' , 'is_change' ]].rolling (
1224- window , win_type = None , center = True ).sum ()
1225-
1226- # we have steps, we may need to aggregate
1227- # We split the array with zeros.
1228- # This means that we treat all nearby changes as one,
1229- # within `window` values
1230- split = np .split (df ['change_sum' ],
1231- np .where (df ['change_sum' ] == 0. )[0 ])
1232- # get rid of zero only series
1233- split = [i for i in split if i .any ()]
1234-
1235- # Now we have a list of series with the changes
1236- changes_list = []
1237- for s in split :
1238- change_s = df .iloc [s .index ]
1239- change_max_occur = change_s [change_s .is_change == 1 ].index .max ()
1240- change_min_occur = change_s [change_s .is_change == 1 ].index .min ()
1241- diff = change_s ['diff' ].sum ()
1242- duration = change_max_occur - change_min_occur
1243-
1244- if low_thresh <= diff <= high_thresh : # Change is a spike
1245- change_type = 'spike'
1246- # we keep the starting point as x
1247- x = change_min_occur
1248- # get the average change for the values of the change that
1249- # are in the changing threshold
1250- diff = change_s .loc [(change_s .is_change == 1 ) &
1251- ((change_s ['diff' ] < low_thresh ) |
1252- (change_s ['diff' ] > high_thresh )), 'diff' ].abs ().mean ()
1253-
1254- else : # Change is a step
1255- # here we have a different starting point
1256- x = (change_max_occur + change_min_occur - 1 ) / 2.0
1257- change_type = 'step'
1258-
1259- d = {'change_type' : change_type ,
1260- 'duration' : duration ,
1261- 'diff' : diff ,
1262- 'x' : x }
1263- changes_list += [d ]
1264-
1265- # Sort by absolute difference, in descending order
1266- sorted_changes_list = sorted (changes_list , key = lambda ch : abs (ch ['diff' ]),
1267- reverse = True )
1268-
1269- # Rule of thumb: the maximum number of changes
1270- # is the square root of the time series length
1271- max_max_changes = int (np .floor (np .sqrt (len (a_y ))))
1272- # If we have a max_changes input value, select the ones with higher diff
1273- if (not max_changes ) or (max_changes > max_max_changes ):
1274- max_changes = max_max_changes
1275-
1276- changes_list = sorted_changes_list [:max_changes ]
1277-
1278- steps = []
1279- spikes = []
1280- for c in changes_list :
1281- # get models
1282- if c ['change_type' ] == 'spike' :
1283- # spike = create_fixed_spike(c['diff'], x=c['x'],
1284- # duration=c['duration'])
1285- spike = create_fixed_spike_ignored (x = c ['x' ],
1286- duration = c ['duration' ])
1287- spikes += [spike ]
1288- elif c ['change_type' ] == 'step' :
1289- step = create_fixed_step (diff = c ['diff' ], x = c ['x' ])
1290- steps += [step ]
1291- else :
1292- raise ValueError ('Invalid change type: ' + c ['change_type' ])
1293-
1294- return steps , spikes
1124+ dfo ['outl_start' ] = dfo .outl_start .fillna (0 ).astype (int )
1125+ dfo ['outl_end' ] = dfo .outl_end .fillna (0 ).astype (int )
1126+ dfo ['mask_spike' ] = (dfo ['is_spike' ] &
1127+ (dfo .x >= dfo .outl_start ) &
1128+ (dfo .x < dfo .outl_end ))
1129+ dfo ['mask_step' ] = (dfo ['is_step' ] &
1130+ (dfo .x >= dfo .outl_start ) &
1131+ (dfo .x <= dfo .outl_end ))
1132+
1133+ return dfo .mask_step .values , dfo .mask_spike .values
12951134
12961135
12971136def create_fixed_step (diff , x ):
0 commit comments