Skip to content
This repository was archived by the owner on Jul 8, 2025. It is now read-only.

Commit 07be98e

Browse files
authored
Remove redundant functions for outlier detection (#35)
close #6
1 parent 5599d27 commit 07be98e

File tree

5 files changed

+111
-529
lines changed

5 files changed

+111
-529
lines changed

anticipy/forecast.py

Lines changed: 9 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,6 @@ def run_forecast(df_y, l_model_trend=None, l_model_season=None,
563563
extrapolate_years=0, season_add_mult='add',
564564
include_all_fits=False,
565565
simplify_output=True,
566-
do_find_steps_and_spikes=False,
567566
find_outliers=False,
568567
l_season_yearly=None,
569568
l_season_weekly=None,
@@ -603,9 +602,6 @@ def run_forecast(df_y, l_model_trend=None, l_model_season=None,
603602
:param season_add_mult: 'add', 'mult', or 'both'. Whether forecast seasonality will be additive, multiplicative,
604603
or the best fit of the two.
605604
:type season_add_mult: str
606-
:param do_find_steps_and_spikes: if True, find steps and spikes, create fixed models and add them
607-
to the list of models
608-
:type do_find_steps_and_spikes: bool
609605
:param find_outliers: If True, find outliers in input data, ignore outlier samples in forecast
610606
:type find_outliers: bool
611607
:param include_all_fits: If True, also include non-optimal models in output
@@ -649,7 +645,6 @@ def run_forecast(df_y, l_model_trend=None, l_model_season=None,
649645
season_add_mult,
650646
include_all_fits,
651647
simplify_output,
652-
do_find_steps_and_spikes,
653648
find_outliers,
654649
l_season_yearly,
655650
l_season_weekly,
@@ -669,7 +664,6 @@ def run_forecast(df_y, l_model_trend=None, l_model_season=None,
669664
season_add_mult,
670665
include_all_fits,
671666
False, # Simplify output
672-
do_find_steps_and_spikes,
673667
find_outliers,
674668
l_season_yearly,
675669
l_season_weekly,
@@ -723,7 +717,6 @@ def run_forecast_single(df_y,
723717
season_add_mult='add',
724718
include_all_fits=False,
725719
simplify_output=True,
726-
do_find_steps_and_spikes=False,
727720
find_outliers=False,
728721
l_season_yearly=None,
729722
l_season_weekly=None,
@@ -753,9 +746,6 @@ def run_forecast_single(df_y,
753746
:param season_add_mult: 'add', 'mult', or 'both'. Whether forecast seasonality will be additive, multiplicative,
754747
or the best fit of the two.
755748
:type season_add_mult: str
756-
:param do_find_steps_and_spikes: if True, find steps and spikes, create fixed models and add them
757-
to the list of models
758-
:type do_find_steps_and_spikes: bool
759749
:param include_all_fits: If True, also include non-optimal models in output
760750
:type include_all_fits: bool
761751
:param simplify_output: If False, return dict with forecast and metadata. Otherwise, return only forecast.
@@ -802,15 +792,14 @@ def run_forecast_single(df_y,
802792

803793
# If we find outliers, we add a model with dummy variables for the outliers
804794
if find_outliers:
805-
model_outliers, outlier_mask = forecast_models.get_model_outliers(df_y)
806-
if outlier_mask is not None:
807-
if 'weight' in df_y.columns:
808-
df_y['weight'] = df_y['weight'] * outlier_mask
809-
else:
810-
df_y['weight'] = outlier_mask
811-
assert np.issubdtype(df_y.weight.astype(float), np.float64)
812-
else:
813-
model_outliers = None
795+
mask_step, mask_spike = forecast_models.get_model_outliers(df_y)
796+
# Make weight = 0 to ignore spike outliers
797+
if 'weight' in df_y.columns:
798+
df_y['weight'] = df_y['weight'] * (~mask_spike).astype(float)
799+
else:
800+
df_y['weight'] = (~mask_spike).astype(float)
801+
assert np.issubdtype(df_y.weight.astype(float), np.float64)
802+
# TODO add models for steps
814803

815804
# Add actuals to output
816805
# Get weight for metadata
@@ -872,28 +861,6 @@ def run_forecast_single(df_y,
872861
if l_model_naive is not None:
873862
l_model = l_model_naive+l_model
874863

875-
# if model_outliers is not None:
876-
# l_model_outlier = [forecast_models.model_null, model_outliers]
877-
# l_model = get_list_model(l_model, l_model_outlier, 'add')
878-
879-
if do_find_steps_and_spikes:
880-
a_y = df_y.y.values
881-
a_x = df_y.y
882-
883-
a_date = df_y.date if 'date' in df_y.columns else None
884-
885-
steps, spikes = forecast_models.find_steps_and_spikes(a_x, a_y, a_date)
886-
if steps:
887-
steps_summed = reduce(lambda x, y: x + y, steps)
888-
steps_summed.name = '{}_fixed_steps'.format(len(steps))
889-
l_model = [model + steps_summed for model in l_model]
890-
if spikes:
891-
spikes_mult = reduce(lambda x, y: x * y, spikes)
892-
spikes_mult.name = '{}_fixed_spikes'.format(len(spikes))
893-
# filter values during the spike
894-
a_y_filt = spikes_mult(a_x, a_date, [])
895-
df_y[a_y_filt == 0] = np.nan
896-
897864
# exclude samples with weight = 0
898865
df_y = df_y.loc[df_y.weight > 0]
899866
date_start_actuals = df_y.date.min() if 'date' in df_y.columns else df_y.x.min()
@@ -980,7 +947,6 @@ def run_l_forecast(l_fcast_input,
980947
col_name_source='source',
981948
extrapolate_years=0, season_add_mult='add',
982949
include_all_fits=False,
983-
do_find_steps_and_spikes=False,
984950
find_outliers=False):
985951
"""
986952
Generate forecasts for a list of SolverConfig objects, each including a time series, model functions, and other
@@ -1028,7 +994,6 @@ def run_l_forecast(l_fcast_input,
1028994
col_name_source,
1029995
extrapolate_years, season_add_mult,
1030996
include_all_fits, simplify_output=False,
1031-
do_find_steps_and_spikes=do_find_steps_and_spikes,
1032997
find_outliers=find_outliers)
1033998
l_dict_result += [dict_result]
1034999

@@ -1192,4 +1157,4 @@ def _get_pi_single_source(df_forecast, n=100):
11921157
df_actuals_unfiltered = df_actuals_unfiltered[l_cols + ['is_actuals', 'model', 'y']]
11931158
df_pi_result = pd.concat([df_actuals_unfiltered, df_forecast_past, df_forecast_pi, ], sort=False, ignore_index=True)
11941159

1195-
return df_pi_result
1160+
return df_pi_result

anticipy/forecast_models.py

Lines changed: 29 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,6 @@ def get_model_outliers_withgap(df, window=3):
10271027
# todo - clean up, return
10281028

10291029

1030-
10311030
# TODO: Add option - estimate_outl_size
10321031
# TODO: Add option - sigmoid steps
10331032
# TODO: ADD option - gaussian spikes
@@ -1092,206 +1091,46 @@ def get_model_outliers(df, window=3):
10921091
dfo.mean_group_diff = dfo.mean_group_diff.fillna(0)
10931092
dfo.mean_group_diff_abs = dfo.mean_group_diff_abs.fillna(0)
10941093

1095-
dfo['is_step'] = (dfo.mean_group_diff < thr_low) | (dfo.mean_group_diff > thr_hi)
1096-
dfo['is_spike'] = (dfo.mean_group_diff_abs - dfo.mean_group_diff) > (thr_hi - thr_low) / 2
1097-
dfo['ischange_cumsum'] = dfo.ischange.cumsum()
1094+
# dfo['is_step'] = (dfo.mean_group_diff < thr_low) | (dfo.mean_group_diff > thr_hi)
1095+
dfo['is_step'] = dfo['ischange_group'] & (
1096+
((dfo.mean_group_diff < thr_low) | (dfo.mean_group_diff > thr_hi)))
10981097

1099-
# logger_info('DF_OUTL: ',dfo)
1098+
dfo['is_spike'] = (dfo.mean_group_diff_abs - dfo.mean_group_diff.abs()) > (thr_hi - thr_low) / 2
1099+
dfo['ischange_cumsum'] = dfo.ischange.cumsum()
11001100

11011101
df_outl = (
1102-
dfo.loc[dfo.ischange.astype(bool)].groupby('change_group')
1103-
.apply(lambda x:pd.Series({'outl_start':x.head(1)[x_col].iloc[0],'outl_end':x.tail(1)[x_col].iloc[0]}))
1104-
.reset_index()
1102+
dfo.loc[dfo.ischange.astype(bool)].groupby('change_group').apply(
1103+
lambda x: pd.Series({'outl_start': x[x_col].iloc[0],
1104+
'outl_end': x[x_col].iloc[-1]})).reset_index()
11051105
)
11061106

11071107
if df_outl.empty: # No outliers - nothing to do
1108-
return None, None
1108+
return np.full(dfo.index.size, False), \
1109+
np.full(dfo.index.size, False)
11091110

11101111
df_outl = df_outl.merge(dfo[['change_group', 'is_spike', 'is_step']].drop_duplicates())
11111112

11121113
dfo = dfo.merge(df_outl, how='left')
1113-
dfo['outl_start'] = dfo.outl_start.fillna(0).astype(int)
1114-
dfo['outl_end'] = dfo.outl_end.fillna(0).astype(int)
1115-
1116-
dfo = dfo # .reset_index()
1117-
1118-
df_spikes = df_outl.loc[df_outl.is_spike]
1119-
df_steps = df_outl.loc[df_outl.is_step]
1120-
1121-
l_model_outl = []
1122-
l_mask_step = []
1123-
l_mask_spike = []
1124-
1125-
for g in df_spikes.change_group:
1126-
s_spike = df_spikes.loc[df_spikes.change_group == g].iloc[0]
1127-
if with_dates:
1128-
mask_spike_tmp = ~((dfo.date>=pd.to_datetime(s_spike.outl_start)) &
1129-
(dfo.date<pd.to_datetime(s_spike.outl_end)))
1130-
else:
1131-
mask_spike_tmp = ~((dfo.x.values>=s_spike.outl_start) &
1132-
(dfo.x.values<s_spike.outl_end))
1133-
l_mask_spike += [mask_spike_tmp.astype(float)]
1134-
1135-
for g in df_steps.change_group:
1136-
s_step = df_steps.loc[df_steps.change_group == g].iloc[0]
1137-
if with_dates:
1138-
model_step_tmp = get_model_step_date(pd.to_datetime(s_step.outl_start))
1139-
else:
1140-
model_step_tmp = (
1141-
fix_params_fmodel(
1142-
model_step, [s_step.outl_start, np.NaN])
1143-
)
1144-
l_model_outl += [model_step_tmp]
1145-
if with_dates:
1146-
mask_step_tmp = (dfo.date==pd.to_datetime(s_step.outl_start))
1147-
else:
1148-
mask_step_tmp = (dfo.x.values==s_step.outl_start)
1149-
l_mask_step += [mask_step_tmp.astype(float)]
1150-
1151-
if len(l_model_outl) == 0:
1152-
model_outliers = None
1153-
else:
1154-
model_outliers = np.prod(l_model_outl) if is_mult else np.sum(l_model_outl)
1155-
if len(l_mask_spike) == 0:
1156-
mask_spike = None
1157-
else:
1158-
mask_spike = np.prod(l_mask_spike, axis=0)
1159-
if len(l_mask_step)==0:
1160-
mask_step = None
1114+
if with_dates:
1115+
dfo['outl_start'] = pd.to_datetime(dfo.outl_start)
1116+
dfo['outl_end'] = pd.to_datetime(dfo.outl_end)
1117+
dfo['mask_spike'] = (dfo['is_spike'] &
1118+
(dfo.date >= pd.to_datetime(dfo.outl_start)) &
1119+
(dfo.date < pd.to_datetime(dfo.outl_end)))
1120+
dfo['mask_step'] = (dfo['is_step'] &
1121+
(dfo.date >= pd.to_datetime(dfo.outl_start)) &
1122+
(dfo.date <= pd.to_datetime(dfo.outl_end)))
11611123
else:
1162-
mask_step = np.sum(l_mask_step, axis=0)
1163-
return mask_step, mask_spike
1164-
#return model_outliers, mask_spike
1165-
# return model_outliers, l_mask_spike
1166-
1167-
1168-
# TODO: Remove - replaced by get_model_outliers()
1169-
def find_steps_and_spikes(a_x, a_y, a_date, window=3, max_changes=None):
1170-
"""
1171-
Automatically locate steps and spikes for a_y time series.
1172-
A rule of thumb for the window is two weeks.
1173-
1174-
:param a_x: The values of the x axis
1175-
:param a_y: The values of the y axis
1176-
:param a_date: The values of the x axis, if they are dates
1177-
:param window: The x-axis window to aggregate multiple steps/spikes
1178-
:type window: int
1179-
:param max_changes: Manual input of the number of changes that the
1180-
input time series is expected to have.
1181-
If max_changes = 0 or None, the function returns
1182-
all changes (steps / spikes) found.
1183-
:type max_changes: int
1184-
:return: changes_list
1185-
:rtype: list of dictionaries
1186-
Each dictionary includes information on a single change,
1187-
and is of the format:
1188-
{'change_type': change_type, # ('spike' or 'step')
1189-
'duration': duration, # int - the x-axis duration of the change
1190-
'diff': diff, # the difference within the change
1191-
'x': x, # the x-axis at the middle of the change
1192-
'date': date} # the x-axis date at the middle of the change
1193-
"""
1194-
# find NaNs in a_y
1195-
df = pd.DataFrame({'y': a_y}).interpolate('slinear')
1196-
1197-
# Find peak changes in diff
1198-
df['diff'] = df.y.diff()
1199-
1200-
# Outliers are < Q1 - 3 * IQR, > Q3 + 3 * IQR
1201-
# Q1 and Q3 are the 25th (1st) and 75th (3rd) quartiles, and
1202-
# IQR is the inter-quartile range
1203-
q1 = df['diff'].quantile(0.25)
1204-
q3 = df['diff'].quantile(0.75)
1205-
iqr = q3 - q1
1206-
low_thresh = q1 - 1.5 * iqr
1207-
high_thresh = q3 + 1.5 * iqr
1208-
1209-
# df['is_change'] = 0
1210-
step_filt = (df['diff'] < low_thresh) | (df['diff'] > high_thresh)
1211-
df['is_change'] = step_filt.astype(int)
1212-
1213-
if not any(step_filt):
1214-
return [], []
1215-
1216-
# df.loc[step_filt, 'is_change'] = 1
1217-
1218-
# Now that we have found the outliers in differences,
1219-
# group consecutive steps together
1220-
1221-
# get only the diffs that correspond to changes
1222-
df['diff'] = df['diff'] * df['is_change']
1223-
df[['diff_sum', 'change_sum']] = df[['diff', 'is_change']].rolling(
1224-
window, win_type=None, center=True).sum()
1225-
1226-
# we have steps, we may need to aggregate
1227-
# We split the array with zeros.
1228-
# This means that we treat all nearby changes as one,
1229-
# within `window` values
1230-
split = np.split(df['change_sum'],
1231-
np.where(df['change_sum'] == 0.)[0])
1232-
# get rid of zero only series
1233-
split = [i for i in split if i.any()]
1234-
1235-
# Now we have a list of series with the changes
1236-
changes_list = []
1237-
for s in split:
1238-
change_s = df.iloc[s.index]
1239-
change_max_occur = change_s[change_s.is_change == 1].index.max()
1240-
change_min_occur = change_s[change_s.is_change == 1].index.min()
1241-
diff = change_s['diff'].sum()
1242-
duration = change_max_occur - change_min_occur
1243-
1244-
if low_thresh <= diff <= high_thresh: # Change is a spike
1245-
change_type = 'spike'
1246-
# we keep the starting point as x
1247-
x = change_min_occur
1248-
# get the average change for the values of the change that
1249-
# are in the changing threshold
1250-
diff = change_s.loc[(change_s.is_change == 1) &
1251-
((change_s['diff'] < low_thresh) |
1252-
(change_s['diff'] > high_thresh)), 'diff'].abs().mean()
1253-
1254-
else: # Change is a step
1255-
# here we have a different starting point
1256-
x = (change_max_occur + change_min_occur - 1) / 2.0
1257-
change_type = 'step'
1258-
1259-
d = {'change_type': change_type,
1260-
'duration': duration,
1261-
'diff': diff,
1262-
'x': x}
1263-
changes_list += [d]
1264-
1265-
# Sort by absolute difference, in descending order
1266-
sorted_changes_list = sorted(changes_list, key=lambda ch: abs(ch['diff']),
1267-
reverse=True)
1268-
1269-
# Rule of thumb: the maximum number of changes
1270-
# is the square root of the time series length
1271-
max_max_changes = int(np.floor(np.sqrt(len(a_y))))
1272-
# If we have a max_changes input value, select the ones with higher diff
1273-
if (not max_changes) or (max_changes > max_max_changes):
1274-
max_changes = max_max_changes
1275-
1276-
changes_list = sorted_changes_list[:max_changes]
1277-
1278-
steps = []
1279-
spikes = []
1280-
for c in changes_list:
1281-
# get models
1282-
if c['change_type'] == 'spike':
1283-
# spike = create_fixed_spike(c['diff'], x=c['x'],
1284-
# duration=c['duration'])
1285-
spike = create_fixed_spike_ignored(x=c['x'],
1286-
duration=c['duration'])
1287-
spikes += [spike]
1288-
elif c['change_type'] == 'step':
1289-
step = create_fixed_step(diff=c['diff'], x=c['x'])
1290-
steps += [step]
1291-
else:
1292-
raise ValueError('Invalid change type: ' + c['change_type'])
1293-
1294-
return steps, spikes
1124+
dfo['outl_start'] = dfo.outl_start.fillna(0).astype(int)
1125+
dfo['outl_end'] = dfo.outl_end.fillna(0).astype(int)
1126+
dfo['mask_spike'] = (dfo['is_spike'] &
1127+
(dfo.x >= dfo.outl_start) &
1128+
(dfo.x < dfo.outl_end))
1129+
dfo['mask_step'] = (dfo['is_step'] &
1130+
(dfo.x >= dfo.outl_start) &
1131+
(dfo.x <= dfo.outl_end))
1132+
1133+
return dfo.mask_step.values, dfo.mask_spike.values
12951134

12961135

12971136
def create_fixed_step(diff, x):

0 commit comments

Comments (0)