@@ -269,48 +269,70 @@ def compute_forward_returns(factor,
269269 "they have the same convention in terms of datetimes "
270270 "and symbol-names" )
271271
272- forward_returns = pd .DataFrame (index = pd .MultiIndex .from_product (
273- [factor_dateindex , prices .columns ], names = ['date' , 'asset' ]))
272+ # chop prices down to only the assets we care about (= unique assets in
273+ # `factor`). we could modify `prices` in place, but that might confuse
274+ # the caller.
275+ prices = prices .filter (items = factor .index .levels [1 ])
274276
275- forward_returns .index .levels [0 ].freq = freq
277+ raw_values_dict = {}
278+ column_list = []
276279
277280 for period in sorted (periods ):
278- #
279- # build forward returns
280- #
281- fwdret = prices .shift (- period ) / prices - 1
282- fwdret = fwdret .reindex (factor_dateindex )
281+ forward_returns = \
282+ prices .pct_change (period ).shift (- period ).reindex (factor_dateindex )
283283
284284 if filter_zscore is not None :
285- mask = abs (fwdret - fwdret .mean ()) > (filter_zscore * fwdret .std ())
286- fwdret [mask ] = np .nan
285+ mask = abs (
286+ forward_returns - forward_returns .mean ()
287+ ) > (filter_zscore * forward_returns .std ())
288+ forward_returns [mask ] = np .nan
287289
288290 #
289291 # Find the period length, which will be the column name. We'll test
290292 # several entries in order to find out the correct period length as
291293 # there could be non-trading days which would make the computation
292294 # wrong if made only one test
293295 #
294- entries_to_test = min (30 , len (fwdret .index ),
295- len (prices .index ) - period )
296+ entries_to_test = min (
297+ 30 ,
298+ len (forward_returns .index ),
299+ len (prices .index ) - period
300+ )
301+
296302 days_diffs = []
297303 for i in range (entries_to_test ):
298- p_idx = prices .index .get_loc (fwdret .index [i ])
304+ p_idx = prices .index .get_loc (forward_returns .index [i ])
299305 start = prices .index [p_idx ]
300306 end = prices .index [p_idx + period ]
301307 period_len = diff_custom_calendar_timedeltas (start , end , freq )
302308 days_diffs .append (period_len .components .days )
303309
304310 delta_days = period_len .components .days - mode (days_diffs ).mode [0 ]
305311 period_len -= pd .Timedelta (days = delta_days )
312+ label = timedelta_to_string (period_len )
313+
314+ column_list .append (label )
315+
316+ raw_values_dict [label ] = np .concatenate (forward_returns .values )
317+
318+ df = pd .DataFrame .from_dict (raw_values_dict )
319+ df .set_index (
320+ pd .MultiIndex .from_product (
321+ [factor_dateindex , prices .columns ],
322+ names = ['date' , 'asset' ]
323+ ),
324+ inplace = True
325+ )
326+ df = df .reindex (factor .index )
306327
307- # Finally use period_len as column name
308- column_name = timedelta_to_string (period_len )
309- forward_returns [column_name ] = fwdret .stack ()
328+ # now set the columns correctly
329+ df = df [column_list ]
310330
311- forward_returns .index = forward_returns .index .rename (['date' , 'asset' ])
331+ df .index .levels [0 ].freq = freq
332+ df .index .levels [0 ].name = "date"
333+ df .index .levels [1 ].name = "asset"
312334
313- return forward_returns
335+ return df
314336
315337
316338def demean_forward_returns (factor_data , grouper = None ):
@@ -527,24 +549,24 @@ def get_clean_factor(factor,
527549
528550 initial_amount = float (len (factor .index ))
529551
530- factor = factor .copy ()
531- factor .index = factor .index .rename (['date' , 'asset' ])
552+ factor_copy = factor .copy ()
553+ factor_copy .index = factor_copy .index .rename (['date' , 'asset' ])
532554
533555 merged_data = forward_returns .copy ()
534- merged_data ['factor' ] = factor
556+ merged_data ['factor' ] = factor_copy
535557
536558 if groupby is not None :
537559 if isinstance (groupby , dict ):
538- diff = set (factor .index .get_level_values (
560+ diff = set (factor_copy .index .get_level_values (
539561 'asset' )) - set (groupby .keys ())
540562 if len (diff ) > 0 :
541563 raise KeyError (
542564 "Assets {} not in group mapping" .format (
543565 list (diff )))
544566
545567 ss = pd .Series (groupby )
546- groupby = pd .Series (index = factor .index ,
547- data = ss [factor .index .get_level_values (
568+ groupby = pd .Series (index = factor_copy .index ,
569+ data = ss [factor_copy .index .get_level_values (
548570 'asset' )].values )
549571
550572 if groupby_labels is not None :
@@ -565,12 +587,16 @@ def get_clean_factor(factor,
565587 fwdret_amount = float (len (merged_data .index ))
566588
567589 no_raise = False if max_loss == 0 else True
568- merged_data ['factor_quantile' ] = quantize_factor (merged_data ,
569- quantiles ,
570- bins ,
571- binning_by_group ,
572- no_raise ,
573- zero_aware )
590+ quantile_data = quantize_factor (
591+ merged_data ,
592+ quantiles ,
593+ bins ,
594+ binning_by_group ,
595+ no_raise ,
596+ zero_aware
597+ )
598+
599+ merged_data ['factor_quantile' ] = quantile_data
574600
575601 merged_data = merged_data .dropna ()
576602
0 commit comments