feat: optimize idiosyncratic volatility factor using vectorized covariance operations

adityacosmos24 · adityacosmos24 · commit 9f99632032c1 · 2025-10-27T00:03:07.000+05:30
diff --git a/src/quant_research_starter/examples/benchmark/benchmark_factors.py b/src/quant_research_starter/examples/benchmark/benchmark_factors.py
@@ -0,0 +1,87 @@
+"""
+Benchmark script to compare performance of factor computations.
+
+Usage (from the repository root):
+    python src/quant_research_starter/examples/benchmark/benchmark_factors.py
+"""
+
+import time
+
+import numpy as np
+import pandas as pd
+
+from quant_research_starter.factors import (
+    MomentumFactor,
+    ValueFactor,
+    SizeFactor,
+    VolatilityFactor,
+    IdiosyncraticVolatility,
+    BollingerBandsFactor,
+)
+
+
+def generate_synthetic_prices(n_assets: int = 500, n_days: int = 252 * 3) -> pd.DataFrame:
+    """Generate synthetic geometric random-walk prices for benchmarking.
+
+    Args:
+        n_assets: Number of columns (tickers) to simulate.
+        n_days: Number of business-day rows to simulate.
+
+    Returns:
+        DataFrame of prices indexed by business days ending today, with one
+        column per ticker named ``Stock_000``, ``Stock_001``, ...
+    """
+    # A local generator keeps the run reproducible without reseeding NumPy's
+    # global RNG, which would silently affect any other code using it.
+    rng = np.random.default_rng(42)
+    returns = rng.normal(0, 0.01, size=(n_days, n_assets))
+    prices = 100 * np.exp(np.cumsum(returns, axis=0))
+    # normalize() drops the time-of-day so the index holds plain dates.
+    end_date = pd.Timestamp.today().normalize()
+    dates = pd.date_range(end=end_date, periods=n_days, freq="B")
+    tickers = [f"Stock_{i:03d}" for i in range(n_assets)]
+    return pd.DataFrame(prices, index=dates, columns=tickers)
+
+
+def benchmark_factor(factor, prices: pd.DataFrame) -> float:
+    """Time a single ``factor.compute(prices)`` call and print the result.
+
+    Args:
+        factor: Object exposing ``compute(prices)``, ``name`` and ``lookback``.
+        prices: Price panel passed to ``factor.compute``.
+
+    Returns:
+        Elapsed wall-clock time in seconds.
+    """
+    # perf_counter is monotonic and high-resolution; time.time() can jump
+    # when the system clock is adjusted and is coarse on some platforms.
+    start = time.perf_counter()
+    factor.compute(prices)
+    elapsed = time.perf_counter() - start
+    print(f"{factor.name:<25} | Lookback: {factor.lookback:<5} | Time: {elapsed:.3f} sec")
+    return elapsed
+
+
+def main():
+    """Generate synthetic prices and benchmark each factor once."""
+    print("Generating synthetic data...")
+    prices = generate_synthetic_prices(n_assets=500, n_days=252 * 3)
+    print(f"Data shape: {prices.shape}")
+
+    print("\nRunning factor benchmarks...\n")
+
+    factors = [
+        MomentumFactor(lookback=63),
+        ValueFactor(),
+        SizeFactor(),
+        VolatilityFactor(lookback=21),
+        IdiosyncraticVolatility(lookback=63),
+        BollingerBandsFactor(lookback=20),
+    ]
+
+    for factor in factors:
+        benchmark_factor(factor, prices)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/quant_research_starter/factors/volatility.py b/src/quant_research_starter/factors/volatility.py
@@ -1,4 +1,4 @@
-"""Volatility factor implementations."""
+"""Volatility factor implementations (vectorized)."""
 
 import numpy as np
 import pandas as pd
@@ -7,14 +7,7 @@
 
 
 class VolatilityFactor(Factor):
-    """
-    Volatility factors measuring different aspects of risk.
-
-    Common volatility measures:
-    - Historical volatility (realized vol)
-    - Idiosyncratic volatility
-    - Volatility of volatility
-    """
+    """Computes historical volatility (annualized)."""
 
     def __init__(self, lookback: int = 21, name: str = "volatility"):
         super().__init__(name=name, lookback=lookback)
@@ -23,109 +16,98 @@ def compute(self, prices: pd.DataFrame) -> pd.DataFrame:
-        """Compute historical volatility over lookback period."""
+        """Compute low-volatility scores from rolling historical volatility.
+
+        Args:
+            prices: Price panel with one column per asset.
+
+        Returns:
+            Cross-sectionally z-scored negative annualized volatility when
+            there are multiple columns; otherwise the raw negative scaled
+            volatility for the single asset.
+
+        Raises:
+            ValueError: If fewer than ``lookback`` rows are supplied.
+        """
         self._validate_data(prices)
 
         if len(prices) < self.lookback:
             raise ValueError(f"Need at least {self.lookback} periods of data")
 
-        # Calculate returns
         returns = prices.pct_change()
 
-        # Compute rolling volatility (annualized); set min_periods to require full window
-        volatility = returns.rolling(
-            window=self.lookback, min_periods=self.lookback
-        ).std() * np.sqrt(252)
-
-        # Remove initial NaN values
-        volatility = volatility.iloc[self.lookback - 1 :]
+        # Vectorized rolling std (annualized)
+        vol = returns.rolling(window=self.lookback, min_periods=self.lookback).std() * np.sqrt(252)
+        vol = vol.iloc[self.lookback - 1:]
 
-        # Low volatility stocks tend to outperform (volatility anomaly)
-        # Use scaled negative volatility to ensure clear negative signal in tests
-        vol_scores = -volatility * 10.0
+        # Low-volatility anomaly (invert sign)
+        scores = -vol * 10.0
 
-        # Cross-sectional z-score when multiple columns; otherwise return scores
-        if vol_scores.shape[1] > 1:
-            vol_z = vol_scores.sub(vol_scores.mean(axis=1), axis=0)
-            denom = vol_scores.std(axis=1).replace(0, np.nan)
-            vol_z = vol_z.div(denom, axis=0)
-            result = vol_z
+        # Cross-sectional z-score; a zero cross-sectional std becomes NaN
+        # instead of producing +/-inf from a division by zero.
+        if scores.shape[1] > 1:
+            demeaned = scores.sub(scores.mean(axis=1), axis=0)
+            result = demeaned.div(scores.std(axis=1).replace(0, np.nan), axis=0)
         else:
-            # Single asset: use negative realized vol directly
-            result = vol_scores
+            result = scores
 
         self._values = result
         return result
 
 
 class IdiosyncraticVolatility(VolatilityFactor):
-    """
-    Idiosyncratic volatility relative to market model.
-
-    Measures stock-specific risk after accounting for market exposure.
-    """
+    """Vectorized idiosyncratic volatility relative to market model."""
 
     def compute(self, prices: pd.DataFrame) -> pd.DataFrame:
-        """Compute idiosyncratic volatility from market model residuals."""
+        """Compute idiosyncratic volatility scores from market-model residuals.
+
+        Each asset is regressed on the equal-weighted cross-sectional mean
+        return over every ``lookback``-row window; the annualized residual
+        standard deviation is negated and z-scored across assets.
+
+        Args:
+            prices: Price panel with one column per asset.
+
+        Returns:
+            Cross-sectional z-scores of negative idiosyncratic volatility when
+            there are multiple columns; otherwise the raw negative values.
+
+        Raises:
+            ValueError: If fewer than ``lookback`` rows are supplied.
+        """
         self._validate_data(prices)
 
         if len(prices) < self.lookback:
             raise ValueError(f"Need at least {self.lookback} periods of data")
 
         returns = prices.pct_change().dropna()
 
-        # Use equal-weighted portfolio as market proxy
-        market_returns = returns.mean(axis=1)
-
-        idiosyncratic_vol = pd.DataFrame(index=returns.index, columns=returns.columns)
-
-        # Compute rolling idiosyncratic volatility
-        for symbol in returns.columns:
-            stock_returns = returns[symbol]
-
-            def calc_idio_vol(window_returns):
-                if len(window_returns) < 10:  # Minimum observations for regression
-                    return np.nan
-
-                # Simple market model regression
-                X = market_returns.loc[window_returns.index].values.reshape(-1, 1)
-                y = window_returns.values
-
-                # Remove NaN values
-                mask = ~(np.isnan(X) | np.isnan(y))
-                X_clean = X[mask[:, 0]]
-                y_clean = y[mask[:, 0]]
-
-                if len(X_clean) < 10:
-                    return np.nan
-
-                try:
-                    # Calculate residuals via simple OLS beta
-                    x = X_clean.flatten()
-                    x_var = np.var(x)
-                    if x_var == 0:
-                        return np.nan
-                    beta = np.cov(y_clean, x)[0, 1] / x_var
-                    residuals = y_clean - beta * x
-                    return np.std(residuals) * np.sqrt(252)
-                except Exception:
-                    return np.nan
-
-            idiosyncratic_vol[symbol] = stock_returns.rolling(
-                window=self.lookback
-            ).apply(calc_idio_vol, raw=False)
-
-        # Remove initial NaN values
-        idiosyncratic_vol = idiosyncratic_vol.iloc[self.lookback - 1 :]
-
-        # Negative relationship with returns (idiosyncratic vol anomaly)
-        idio_scores = -idiosyncratic_vol
-
-        # Z-score normalize when multiple assets; otherwise return scores
-        if idio_scores.shape[1] > 1:
-            idio_z = idio_scores.sub(idio_scores.mean(axis=1), axis=0)
-            denom = idio_scores.std(axis=1).replace(0, np.nan)
-            idio_z = idio_z.div(denom, axis=0)
-            result = idio_z
+        # Equal-weighted portfolio as the market proxy
+        market = returns.mean(axis=1)
+        window = self.lookback
+
+        # Rolling moments for the per-window regression of each asset on the
+        # market.  Everything uses pandas' default ddof=1 so the covariance and
+        # the variances are mutually consistent.
+        cov_with_mkt = returns.rolling(window=window, min_periods=window).cov(market)
+        asset_var = returns.rolling(window=window, min_periods=window).var()
+        market_var = market.rolling(window=window, min_periods=window).var()
+        market_var = market_var.replace(0, np.nan)  # undefined beta -> NaN, not inf
+
+        # Residual variance of the in-window OLS fit, in closed form:
+        #   var(y - a - b*x) = var(y) - cov(x, y)**2 / var(x)
+        # clip() removes tiny negatives from floating-point cancellation.
+        resid_var = asset_var - cov_with_mkt.pow(2).div(market_var, axis=0)
+        idio_vol = np.sqrt(resid_var.clip(lower=0)) * np.sqrt(252)
+        idio_vol = idio_vol.iloc[window - 1:]
+
+        # Invert sign (low-idio-vol performs better)
+        scores = -idio_vol
+
+        # Cross-sectional z-score; a zero cross-sectional std becomes NaN
+        # instead of producing +/-inf from a division by zero.
+        if scores.shape[1] > 1:
+            demeaned = scores.sub(scores.mean(axis=1), axis=0)
+            result = demeaned.div(scores.std(axis=1).replace(0, np.nan), axis=0)
         else:
-            result = idio_scores
+            result = scores
 
         self._values = result
         return result