Skip to content

Commit e67d2b5

Browse files
Jean Bredeche and luca-s
authored and committed
PERF: Speed up compute_forward_returns and get_clean_factor. (#327)
1 parent 23ba51e commit e67d2b5

File tree

1 file changed

+56
-30
lines changed

1 file changed

+56
-30
lines changed

alphalens/utils.py

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -269,48 +269,70 @@ def compute_forward_returns(factor,
269269
"they have the same convention in terms of datetimes "
270270
"and symbol-names")
271271

272-
forward_returns = pd.DataFrame(index=pd.MultiIndex.from_product(
273-
[factor_dateindex, prices.columns], names=['date', 'asset']))
272+
# chop prices down to only the assets we care about (= unique assets in
273+
# `factor`). we could modify `prices` in place, but that might confuse
274+
# the caller.
275+
prices = prices.filter(items=factor.index.levels[1])
274276

275-
forward_returns.index.levels[0].freq = freq
277+
raw_values_dict = {}
278+
column_list = []
276279

277280
for period in sorted(periods):
278-
#
279-
# build forward returns
280-
#
281-
fwdret = prices.shift(-period) / prices - 1
282-
fwdret = fwdret.reindex(factor_dateindex)
281+
forward_returns = \
282+
prices.pct_change(period).shift(-period).reindex(factor_dateindex)
283283

284284
if filter_zscore is not None:
285-
mask = abs(fwdret - fwdret.mean()) > (filter_zscore * fwdret.std())
286-
fwdret[mask] = np.nan
285+
mask = abs(
286+
forward_returns - forward_returns.mean()
287+
) > (filter_zscore * forward_returns.std())
288+
forward_returns[mask] = np.nan
287289

288290
#
289291
# Find the period length, which will be the column name. We'll test
290292
# several entries in order to find out the correct period length as
291293
# there could be non-trading days which would make the computation
292294
# wrong if made only one test
293295
#
294-
entries_to_test = min(30, len(fwdret.index),
295-
len(prices.index) - period)
296+
entries_to_test = min(
297+
30,
298+
len(forward_returns.index),
299+
len(prices.index) - period
300+
)
301+
296302
days_diffs = []
297303
for i in range(entries_to_test):
298-
p_idx = prices.index.get_loc(fwdret.index[i])
304+
p_idx = prices.index.get_loc(forward_returns.index[i])
299305
start = prices.index[p_idx]
300306
end = prices.index[p_idx + period]
301307
period_len = diff_custom_calendar_timedeltas(start, end, freq)
302308
days_diffs.append(period_len.components.days)
303309

304310
delta_days = period_len.components.days - mode(days_diffs).mode[0]
305311
period_len -= pd.Timedelta(days=delta_days)
312+
label = timedelta_to_string(period_len)
313+
314+
column_list.append(label)
315+
316+
raw_values_dict[label] = np.concatenate(forward_returns.values)
317+
318+
df = pd.DataFrame.from_dict(raw_values_dict)
319+
df.set_index(
320+
pd.MultiIndex.from_product(
321+
[factor_dateindex, prices.columns],
322+
names=['date', 'asset']
323+
),
324+
inplace=True
325+
)
326+
df = df.reindex(factor.index)
306327

307-
# Finally use period_len as column name
308-
column_name = timedelta_to_string(period_len)
309-
forward_returns[column_name] = fwdret.stack()
328+
# now set the columns correctly
329+
df = df[column_list]
310330

311-
forward_returns.index = forward_returns.index.rename(['date', 'asset'])
331+
df.index.levels[0].freq = freq
332+
df.index.levels[0].name = "date"
333+
df.index.levels[1].name = "asset"
312334

313-
return forward_returns
335+
return df
314336

315337

316338
def demean_forward_returns(factor_data, grouper=None):
@@ -527,24 +549,24 @@ def get_clean_factor(factor,
527549

528550
initial_amount = float(len(factor.index))
529551

530-
factor = factor.copy()
531-
factor.index = factor.index.rename(['date', 'asset'])
552+
factor_copy = factor.copy()
553+
factor_copy.index = factor_copy.index.rename(['date', 'asset'])
532554

533555
merged_data = forward_returns.copy()
534-
merged_data['factor'] = factor
556+
merged_data['factor'] = factor_copy
535557

536558
if groupby is not None:
537559
if isinstance(groupby, dict):
538-
diff = set(factor.index.get_level_values(
560+
diff = set(factor_copy.index.get_level_values(
539561
'asset')) - set(groupby.keys())
540562
if len(diff) > 0:
541563
raise KeyError(
542564
"Assets {} not in group mapping".format(
543565
list(diff)))
544566

545567
ss = pd.Series(groupby)
546-
groupby = pd.Series(index=factor.index,
547-
data=ss[factor.index.get_level_values(
568+
groupby = pd.Series(index=factor_copy.index,
569+
data=ss[factor_copy.index.get_level_values(
548570
'asset')].values)
549571

550572
if groupby_labels is not None:
@@ -565,12 +587,16 @@ def get_clean_factor(factor,
565587
fwdret_amount = float(len(merged_data.index))
566588

567589
no_raise = False if max_loss == 0 else True
568-
merged_data['factor_quantile'] = quantize_factor(merged_data,
569-
quantiles,
570-
bins,
571-
binning_by_group,
572-
no_raise,
573-
zero_aware)
590+
quantile_data = quantize_factor(
591+
merged_data,
592+
quantiles,
593+
bins,
594+
binning_by_group,
595+
no_raise,
596+
zero_aware
597+
)
598+
599+
merged_data['factor_quantile'] = quantile_data
574600

575601
merged_data = merged_data.dropna()
576602

0 commit comments

Comments (0)