diff --git a/arch/bootstrap/base.py b/arch/bootstrap/base.py
index 92f3cde1b2..23a788ef72 100644
--- a/arch/bootstrap/base.py
+++ b/arch/bootstrap/base.py
@@ -20,6 +20,7 @@
     ArrayLike2D,
     BootstrapIndexT,
     Float64Array,
+    Float64Array2D,
     Int64Array,
     Int64Array1D,
     Literal,
@@ -201,7 +202,7 @@ def optimal_block_length(x: ArrayLike1D | ArrayLike2D) -> pd.DataFrame:
     return pd.DataFrame(opt, index=idx, columns=["stationary", "circular"])
 
 
-def _get_acceleration(jk_params: Float64Array) -> float:
+def _get_acceleration(jk_params: Float64Array) -> Float64Array2D:
     """
     Estimates the BCa acceleration parameter using jackknife estimates
    of theta.
@@ -772,7 +773,7 @@ def conf_int(
                 )
                 a = self._bca_acceleration(func, extra_kwargs)
             else:
-                a = 0.0
+                a = np.zeros((1, 1))
             percentiles = stats.norm.cdf(
                 b + (b + norm_quantiles) / (1.0 - a * (b + norm_quantiles))
             )
@@ -834,7 +835,7 @@ def _bca_bias(self) -> Float64Array:
 
     def _bca_acceleration(
         self, func: Callable[..., Float64Array], extra_kwags: dict[str, Any] | None
-    ) -> float:
+    ) -> Float64Array2D:
         nobs = self._num_items
         jk_params = _loo_jackknife(func, nobs, self._args, self._kwargs, extra_kwags)
         return _get_acceleration(jk_params)
diff --git a/arch/tests/univariate/test_volatility.py b/arch/tests/univariate/test_volatility.py
index 2ddc61e77a..52b6d1f360 100644
--- a/arch/tests/univariate/test_volatility.py
+++ b/arch/tests/univariate/test_volatility.py
@@ -12,7 +12,7 @@
 )
 import pytest
 from scipy.special import gamma, gammaln
-
+from arch import arch_model
 from arch.univariate import ZeroMean
 from arch.univariate.distribution import Normal, SkewStudent, StudentsT
 import arch.univariate.recursions_python as recpy
@@ -28,6 +28,7 @@
     FixedVariance,
     MIDASHyperbolic,
     RiskMetrics2006,
+    MSGARCH,
 )
 from arch.utility.exceptions import InitialValueWarning
@@ -1640,7 +1641,7 @@ def test_aparch(setup, initial_value):
     assert_equal(aparch._delta, np.nan)
     assert aparch.p == aparch.o == aparch.q == 1
 
-    parameters[1] = 0.9
+    parameters[1] = 0.20001
     with pytest.warns(InitialValueWarning):
         aparch.simulate(parameters, setup.t, rng.simulate([]))
@@ -1804,3 +1805,28 @@ def test_figarch_weights():
     lam_cy = rec.figarch_weights(params, 1, 1, 1000)
     assert_allclose(lam_py, lam_nb)
     assert_allclose(lam_py, lam_cy)
+
+
+def test_msgarch():
+    data = np.random.randn(100)  # arbitrary data series
+    model = arch_model(data, vol="MSGARCH", p=1, q=1, mean="zero")  # 2 regimes
+    result = model.fit(disp="off")
+    forecast = result.forecast(horizon=1)
+    cond_var = result.conditional_volatility
+    probs = result.model.volatility.filtered_probs
+    ll = result.loglikelihood
+
+    assert hasattr(result, "params")
+    assert result is not None
+    # 2 * 3 GARCH parameters + 2 free transition probabilities (valid while k=2)
+    assert result.params.shape[0] == 8
+    assert np.all(np.isfinite(result.params))
+    assert result.model.volatility.k == 2
+    assert np.all(forecast.variance.values > 0)
+    assert forecast.variance.shape[1] == 1
+    # excludes the t=0 probabilities, which have not been filtered
+    assert np.allclose(probs[:, 1:].sum(axis=0), 1.0, atol=1e-6)
+    assert np.all((probs >= 0) & (probs <= 1))
+    assert cond_var.shape[0] == len(data)
+    assert np.all(cond_var > 0)
+    assert np.isfinite(ll)
diff --git a/arch/univariate/__init__.py b/arch/univariate/__init__.py
index b4bb335b34..c25a28973b 100644
--- a/arch/univariate/__init__.py
+++ b/arch/univariate/__init__.py
@@ -29,6 +29,7 @@
     FixedVariance,
     MIDASHyperbolic,
     RiskMetrics2006,
+    MSGARCH,
 )
 
 recursions: types.ModuleType
@@ -55,6 +56,7 @@
     "HARX",
     "LS",
"MIDASHyperbolic", + "MSGARCH", "Normal", "RiskMetrics2006", "SkewStudent", diff --git a/arch/univariate/base.py b/arch/univariate/base.py index 18e543539e..a61faecaac 100644 --- a/arch/univariate/base.py +++ b/arch/univariate/base.py @@ -34,7 +34,7 @@ Literal, ) from arch.univariate.distribution import Distribution, Normal -from arch.univariate.volatility import ConstantVariance, VolatilityProcess +from arch.univariate.volatility import ConstantVariance, VolatilityProcess, MSGARCH from arch.utility.array import ensure1d, to_array_1d from arch.utility.exceptions import ( ConvergenceWarning, @@ -714,15 +714,21 @@ def fit( if total_params == 0: return self._fit_parameterless_model(cov_type=cov_type, backcast=backcast) - sigma2 = np.zeros(resids.shape[0], dtype=float) + n_regimes = v.k if isinstance(v, MSGARCH) else 1 + sigma2 = np.zeros((resids.shape[0], n_regimes)) if n_regimes > 1 else np.zeros(resids.shape[0]) self._backcast = backcast - sv_volatility = v.starting_values(resids) + sv_volatility = v.starting_values(resids) # initial guess for GARCH recursion self._var_bounds = var_bounds = v.variance_bounds(resids) - v.compute_variance(sv_volatility, resids, sigma2, backcast, var_bounds) - std_resids = resids / np.sqrt(sigma2) + v.compute_variance(sv_volatility, resids, sigma2, backcast, var_bounds) # fills sigma 2 recursively using chosen vol model + if n_regimes == 1: + std_resids = resids / np.sqrt(sigma2) + else: + pi = self.volatility.pi # shape (k,) + pi_weighted_sigma2 = sigma2 @ pi # (t,k) @ (k,) = (t,) + std_resids = resids / np.sqrt(pi_weighted_sigma2) ## Using stationary distribution to weight regime variances. This is only an approximation (true weights should come from filtered probabilties), but we don't have these available at this stage # 2. 
-        constraints = (
+        constraints = (
             self.constraints(),
             self.volatility.constraints(),
             self.distribution.constraints(),
@@ -790,12 +796,19 @@ def fit(
         _callback_info["display"] = update_freq
         disp_flag = True if disp == "final" else False
 
-        func = self._loglikelihood
-        args = (sigma2, backcast, var_bounds)
-        ineq_constraints = constraint(a, b)
-        from scipy.optimize import minimize
+        if isinstance(self.volatility, MSGARCH):
+            func = self.volatility.msgarch_loglikelihood  # MS-GARCH
+            args = (resids, sigma2, backcast, var_bounds)
+        else:
+            func = self._loglikelihood  # standard volatility models
+            args = (sigma2, backcast, var_bounds)
+
+        ineq_constraints = constraint(a, b)
+        from scipy.optimize import minimize
 
         options = {} if options is None else options
         options.setdefault("disp", disp_flag)
         with warnings.catch_warnings():
@@ -835,7 +848,7 @@ def fit(
         mp, vp, dp = self._parse_parameters(params)
         resids = self.resids(mp)
-        vol = np.zeros(resids.shape[0], dtype=float)
+        vol = self.volatility._initialise_vol(resids, n_regimes)
         self.volatility.compute_variance(vp, resids, vol, backcast, var_bounds)
         vol = cast(Float64Array1D, np.sqrt(vol))
 
@@ -849,8 +862,17 @@ def fit(
         first_obs, last_obs = self._fit_indices
         resids_final = np.full(self._y.shape, np.nan, dtype=float)
         resids_final[first_obs:last_obs] = resids
-        vol_final = np.full(self._y.shape, np.nan, dtype=float)
-        vol_final[first_obs:last_obs] = vol
+        filtered_probs = self.volatility.compute_filtered_probs(
+            params, resids, sigma2, backcast, var_bounds
+        )
+
+        if isinstance(self.volatility, MSGARCH):
+            vol_final = np.full(self._y.shape, np.nan, dtype=float)
+            weighted_sigma2 = (vol**2 * filtered_probs.T).sum(axis=1)
+            vol_final[first_obs:last_obs] = np.sqrt(weighted_sigma2)
+        else:
+            vol_final = np.full(self._y.shape, np.nan, dtype=float)
+            vol_final[first_obs:last_obs] = vol
 
         fit_start, fit_stop = self._fit_indices
         model_copy = deepcopy(self)
@@ -870,6 +892,7 @@ def fit(
             fit_start,
             fit_stop,
             model_copy,
+            filtered_probs,
         )
 
     @abstractmethod
@@ -1803,6 +1826,7 @@ def __init__(
         fit_start: int,
         fit_stop: int,
         model: ARCHModel,
+        filtered_probs: Float64Array | None = None,
     ) -> None:
         super().__init__(
             params, resid, volatility, dep_var, names, loglikelihood, is_pandas, model
         )
@@ -1813,6 +1837,7 @@
         self._r2 = r2
         self.cov_type: str = cov_type
         self._optim_output = optim_output
+        self._filtered_probs = filtered_probs
 
     @cached_property
     def scale(self) -> float:
@@ -2063,6 +2088,10 @@ def optimization_result(self) -> OptimizeResult:
         Result from numerical optimization of the log-likelihood.
         """
         return self._optim_output
+
+    @property
+    def filtered_probs(self):
+        """Filtered regime probabilities (populated for MS-GARCH models only)."""
+        return self._filtered_probs
 
 
 def _align_forecast(
diff --git a/arch/univariate/mean.py b/arch/univariate/mean.py
index 48fdab8a3f..4afdedf01a 100644
--- a/arch/univariate/mean.py
+++ b/arch/univariate/mean.py
@@ -67,6 +67,7 @@
     HARCH,
     ConstantVariance,
     VolatilityProcess,
+    MSGARCH,
 )
 from arch.utility.array import (
     AbstractDocStringInheritor,
@@ -188,7 +189,7 @@ class HARX(ARCHModel, metaclass=AbstractDocStringInheritor):
         Flag indicating whether to automatically rescale data if the scale of the
         data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without transformation. If True,
-        than y is rescaled and the new scale is reported in the estimation results.
+        then y is rescaled and the new scale is reported in the estimation results.
 
     Examples
     --------
@@ -1107,7 +1108,7 @@ class ConstantMean(HARX):
         Flag indicating whether to automatically rescale data if the scale of the
         data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without transformation. If True,
-        than y is rescaled and the new scale is reported in the estimation results.
+        then y is rescaled and the new scale is reported in the estimation results.
 
     Examples
     --------
@@ -1252,7 +1253,7 @@ class ZeroMean(HARX):
         Flag indicating whether to automatically rescale data if the scale of the
         data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without transformation. If True,
-        than y is rescaled and the new scale is reported in the estimation results.
+        then y is rescaled and the new scale is reported in the estimation results.
 
     Examples
     --------
@@ -1414,7 +1415,7 @@ class ARX(HARX):
         Flag indicating whether to automatically rescale data if the scale of the
         data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without transformation. If True,
-        than y is rescaled and the new scale is reported in the estimation results.
+        then y is rescaled and the new scale is reported in the estimation results.
 
     Examples
     --------
@@ -1540,7 +1541,7 @@ class LS(HARX):
         Flag indicating whether to automatically rescale data if the scale of the
         data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without transformation. If True,
-        than y is rescaled and the new scale is reported in the estimation results.
+        then y is rescaled and the new scale is reported in the estimation results.
 
     Examples
     --------
@@ -1615,7 +1616,7 @@ class ARCHInMean(ARX):
         Flag indicating whether to automatically rescale data if the scale of the
         data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without transformation. If True,
-        than y is rescaled and the new scale is reported in the estimation results.
+        then y is rescaled and the new scale is reported in the estimation results.
     form : {"log", "vol", "var", int, float}
         The form of the conditional variance that appears in the mean equation. The string
         names use the log of the conditional variance ("log"), the square-root
@@ -1891,7 +1892,7 @@ def arch_model(
     ] = "Constant",
     lags: int | list[int] | Int32Array | Int64Array | None = 0,
     vol: Literal[
-        "GARCH", "ARCH", "EGARCH", "FIGARCH", "APARCH", "HARCH", "FIGARCH"
+        "GARCH", "ARCH", "EGARCH", "FIGARCH", "APARCH", "HARCH", "FIGARCH", "MSGARCH"
     ] = "GARCH",
     p: int | list[int] = 1,
     o: int = 0,
@@ -1953,7 +1954,7 @@
         Flag indicating whether to automatically rescale data if the scale of
         the data is likely to produce convergence issues when estimating model
         parameters. If False, the model is estimated on the data without
-        transformation. If True, than y is rescaled and the new scale is
+        transformation. If True, then y is rescaled and the new scale is
         reported in the estimation results.
 
     Returns
@@ -2000,6 +2001,7 @@ def arch_model(
         "harch",
         "constant",
         "egarch",
+        "msgarch",
     )
     known_dist = (
         "normal",
@@ -2060,8 +2062,10 @@ def arch_model(
     elif vol_model == "aparch":
         assert isinstance(p, int)
         v = APARCH(p=p, o=o, q=q)
-    else:  # vol == 'harch'
+    elif vol_model == "harch":
         v = HARCH(lags=p)
+    else:  # vol_model == "msgarch"
+        v = MSGARCH()
 
     if dist_name in ("skewstudent", "skewt"):
         d: Distribution = SkewStudent()
diff --git a/arch/univariate/volatility.py b/arch/univariate/volatility.py
index 42378a345e..57a6a86e7f 100644
--- a/arch/univariate/volatility.py
+++ b/arch/univariate/volatility.py
@@ -12,7 +12,7 @@
 import numpy as np
 from numpy.random import RandomState
-from scipy.special import gammaln
+from scipy.special import gammaln, logsumexp
 
 from arch.typing import (
     ArrayLike1D,
@@ -208,7 +208,7 @@ class VolatilityProcess(metaclass=ABCMeta):
 
     _updatable: bool = True
 
-    def __init__(self) -> None:
+    def __init__(self, model=None) -> None:
         self._num_params = 0
         self._name = ""
         self.closed_form: bool = False
@@ -217,6 +217,8 @@ def __init__(self) -> None:
         self._start = 0
         self._stop = -1
         self._volatility_updater: rec.VolatilityUpdater | None = None
+        self._loglikelihood = None
+        self.model = model
 
     def __str__(self) -> str:
         return self.name
@@ -860,6 +862,24 @@ def parameter_names(self) -> list[str]:
         """
 
+    def compute_filtered_probs(self, params, resids, sigma2, backcast, var_bounds):
+        """
+        Default behavior for non-MS models: return probability one for every time
+        period, since the process is always in its single regime.
+        """
+        return np.ones((1, len(resids)))
+
+    def _initialise_vol(self, resids, n_regimes):
+        """
+        Construct an empty conditional variance array.
+        """
+        return np.zeros(resids.shape[0], dtype=float)
+
+    def compute_loglikelihood(self, parameters, resids, sigma2, backcast, var_bounds):
+        """
+        Delegate to the default ARCHModel._loglikelihood.
+        """
+        return self.model._loglikelihood(parameters, sigma2, backcast, var_bounds)
+
 
 class ConstantVariance(VolatilityProcess, metaclass=AbstractDocStringInheritor):
     r"""
     Constant volatility process
@@ -3808,3 +3828,353 @@ def __str__(self) -> str:
         descr = descr[:-2] + ")"
 
         return descr
+
+
+class MSGARCH(VolatilityProcess, metaclass=AbstractDocStringInheritor):
+    """
+    Markov Switching GARCH volatility process.
+
+    This model combines multiple GARCH processes (one per regime) with a
+    Markov chain dictating regime transitions. At each time step, the
+    conditional variance is computed as a weighted average of the
+    variances in each regime, with weights given by the filtered
+    probabilities of being in each regime.
+    """
+
+    def __init__(self, garch_type=GARCH) -> None:
+        super().__init__()
+        self.p = 1  # fixed for now
+        self.q = 1  # fixed for now
+        self.k = 2  # fixed for now
+        self.garch_type = garch_type  # fixed for now
+        self.o = 0  # fixed for now
+        self.power = 2.0  # fixed for now
+
+        self.num_params_single = 1 + self.p + self.q  # parameters in a single regime
+        # regime-specific parameters plus transition matrix parameters
+        self._num_params = self.num_params_single * self.k + self.k
+
+        # Instantiate one GARCH-type object per regime
+        self.regimes = [GARCH(p=self.p, o=self.o, q=self.q, power=self.power) for _ in range(self.k)]
+
+        # Initialise a constant transition matrix
+        self.transition_matrix = np.full((self.k, self.k), 1.0 / self.k)
+
+        self._name = "MS GARCH"
+        self.filtered_probs = None
+        self.pi = None
+        self.base_garch = GARCH(self.p, self.o, self.q, self.power)  # used later for forecasting
+
+    def compute_variance(self, parameters, resids, sigma2, backcast, var_bounds):
+        power = self.power
+        fresids = np.abs(resids) ** power
+        sresids = np.sign(resids)
+        p, o, q = self.p, self.o, self.q
+        t = resids.shape[0]
+        sigma2 = np.ascontiguousarray(sigma2)
+
+        # regime 1
+        col1 = np.ascontiguousarray(sigma2[:, 0])
+        rec.garch_recursion(parameters[0:3], fresids, sresids, col1, p, o, q, t, backcast, var_bounds)
+        sigma2[:, 0] = col1
+
+        # regime 2
+        col2 = np.ascontiguousarray(sigma2[:, 1])
+        rec.garch_recursion(parameters[3:6], fresids, sresids, col2, p, o, q, t, backcast, var_bounds)
+        sigma2[:, 1] = col2
+
+        inv_power = 2.0 / power
+        sigma2 **= inv_power
+        return sigma2
+
+    def bounds(self, resids):
+        v = float(np.mean(np.abs(resids) ** self.power))
+        bounds = []
+        for _ in range(self.k):
+            bounds.append((1e-4 * v, 10.0 * v))  # omega
+            bounds.extend([(1e-4, 0.999)] * self.p)  # alpha
+            bounds.extend([(1e-4, 0.999)] * self.q)  # beta
+        bounds.extend([(1e-6, 0.999)] * self.k)  # transition probabilities: one free parameter per row
+        return bounds
+
+    def parameter_names(self):
+        names = []
+        for r in range(1, self.k + 1):
+            names.extend([f"omega_r{r}", f"alpha_r{r}", f"beta_r{r}"])
+        names.extend(["p11", "p21"])
+        return names
+
+    def simulate(self, *args, **kwargs):
+        """Placeholder for a future MS-GARCH simulate method"""
+        pass
+
+    def _check_forecasting_method(self, method: ForecastingMethod, horizon: int) -> None:
+        return
+
+    def _simulation_forecast(self, *args, **kwargs):
+        """Placeholder for a future MS-GARCH simulation forecast"""
+        pass
+
+    def constraints(self) -> tuple[np.ndarray, np.ndarray]:
+        k = self.k  # number of regimes
+        p, q = self.p, self.q
+        k_arch = 1 + p + q  # omega + alphas + betas per regime
+
+        a_list = []
+        b_list = []
+
+        # 1. Regime-specific volatility constraints
+        for r in range(k):
+            # Positivity constraints: omega, alpha, beta >= 0 (a @ x - b >= 0 convention)
+            a = np.zeros((k_arch, k_arch * k + k))  # k_arch rows, total free params columns
+            b = np.zeros(k_arch)
+            for i in range(k_arch):
+                a[i, r * k_arch + i] = 1.0
+            a_list.append(a)
+            b_list.append(b)
+
+            # stationarity: alpha + beta <= 0.9999
+            a_stationarity = np.zeros(k_arch * k + k)
+            a_stationarity[r * k_arch + 1] = -1.0  # alpha
+            a_stationarity[r * k_arch + 2] = -1.0  # beta
+            a_list.append(a_stationarity.reshape(1, -1))
+            b_list.append(np.array([-0.9999]))
+
+        # 2. Transition probability constraints
+        # Free parameters: P11, P21 (upper bounds are enforced by bounds())
+        a_trans = np.zeros((k, k_arch * k + k))
+        b_trans = np.zeros(k)
+
+        # P11 >= 0
+        a_trans[0, k_arch * k + 0] = 1.0
+        b_trans[0] = 0.0
+
+        # P21 >= 0
+        a_trans[1, k_arch * k + 1] = 1.0
+        b_trans[1] = 0.0
+
+        # combine all
+        a_total = np.vstack(a_list + [a_trans])
+        b_total = np.concatenate(b_list + [b_trans])
+
+        return a_total, b_total
+
+    def variance_bounds(self, resids: ArrayLike1D, power: float = 2.0) -> Float64Array2D:
+        return super().variance_bounds(resids, self.power)
+
+    def starting_values(self, resids):
+        """
+        MSGARCH starting values via a Gaussian mixture classification plus per-regime GARCH fits.
+        Returns: [omega_1, alpha_1, beta_1, omega_2, alpha_2, beta_2, p11, p21]
+        """
+        from sklearn.mixture import GaussianMixture
+        from arch import arch_model
+
+        # The Gaussian mixture groups returns by volatility level
+        gm = GaussianMixture(n_components=self.k, random_state=1).fit(np.abs(resids).reshape(-1, 1))
+
+        # Hard regime classification from the per-observation posterior probabilities
+        viterbi_regimes = np.argmax(gm.predict_proba(np.abs(resids).reshape(-1, 1)), axis=1)
+
+        # fit a GARCH(1,1) to each regime's returns
+        garch_params = []
+        for x in range(self.k):
+            regime_mask = viterbi_regimes == x
+            if np.sum(regime_mask) < 10:  # use all data if the regime is tiny
+                regime_mask = np.ones_like(resids, dtype=bool)
+
+            # simple GARCH(1,1) for each regime
+            am = arch_model(resids[regime_mask], vol="GARCH", p=1, q=1, dist="normal")
+            res = am.fit(disp="off")
+            garch_params.extend([res.params["omega"], res.params["alpha[1]"], res.params["beta[1]"]])
+
+        # Estimate the transition matrix from the hard regime path
+        P = np.zeros((self.k, self.k))  # k is the number of regimes
+        for t in range(1, len(viterbi_regimes)):
+            i, j = viterbi_regimes[t - 1], viterbi_regimes[t]
+            P[i, j] += 1  # count regime transitions
+        P = P / np.sum(P, axis=1, keepdims=True)  # normalise so rows sum to 1
+
+        # stationary distribution for the initial probabilities
+        eigvals, eigvecs = np.linalg.eig(P.T)
+        pi = np.real(eigvecs[:, np.isclose(eigvals, 1)].flatten())
+        self.pi = pi / pi.sum()
+
+        # include the free transition parameters
+        free_P = [P[0, 0], P[1, 0]]
+
+        return np.array(garch_params + free_P)
+
+    def msgarch_loglikelihood(self, parameters, resids, sigma2, backcast, var_bounds):
+        from arch.univariate.base import _callback_info
+        _callback_info["count"] += 1
+
+        # 1. Parse parameters into regime-specific components
+        # mean params, volatility params per regime, distribution params
+        mp, vp, dp = self._parse_parameters(parameters)
+
+        # extract the free parameters for the transition matrix
+        start = self.k * self.num_params_single
+        end = start + self.k  # only one free parameter per row
+        free_P = vp[start:end]  # shape: (k,)
+
+        # reconstruct the full 2x2 transition matrix
+        transition_matrix = np.zeros((self.k, self.k))
+        transition_matrix[0, 0] = free_P[0]
+        transition_matrix[0, 1] = 1.0 - free_P[0]
+        transition_matrix[1, 0] = free_P[1]
+        transition_matrix[1, 1] = 1.0 - free_P[1]
+
+        # Initial probabilities from the stationary distribution
+        eigvals, eigvecs = np.linalg.eig(transition_matrix.T)
+        pi = np.real(eigvecs[:, np.isclose(eigvals, 1)].flatten())
+        self.pi = pi / pi.sum()
+
+        # Initialise the regime probabilities array
+        p = np.zeros((self.k, len(resids)))
+        p[:, 0] = self.pi
+
+        # Initialise the regime-specific variance recursions using the backcast
+        sigma2 = np.zeros((len(resids), self.k))
+        sigma2 = self.compute_variance(vp, resids, sigma2, backcast, var_bounds)
+
+        loglikelihood = 0.0  # accumulator for the total log-likelihood
+        var_floor = 1e-4
+        sigma2 = np.maximum(sigma2, var_floor)
+        log_p = np.log(np.maximum(p, 1e-12))
+
+        # Loop over each time point
+        for t in range(1, len(resids)):
+            # sigma2 for the current time step
+            sigma2_t = sigma2[t, :]
+
+            # log-likelihood for each regime
+            log_likes = -0.5 * (np.log(2 * np.pi * sigma2_t) + (resids[t] ** 2) / sigma2_t)
+
+            # one-step-ahead regime probabilities: log(P' @ p_{t-1}) in log space,
+            # computed by broadcasting log_p[:, t-1] over the rows of log(P)
+            log_pred_probs = logsumexp(np.log(np.maximum(transition_matrix, 1e-12)) + log_p[:, t - 1][:, None], axis=0)
+
+            # log of the mixture likelihood
+            log_mixture = logsumexp(log_pred_probs + log_likes)
+
+            # update the filtered probabilities
+            log_p[:, t] = log_pred_probs + log_likes - log_mixture
+
+            # accumulate the log-likelihood
+            loglikelihood += log_mixture
+
+        p[:, :] = np.exp(log_p)
+        self.filtered_probs = p.copy()
+        return -loglikelihood
+
+    def _analytic_forecast(self, parameters, resids, backcast, var_bounds, start, horizon, filtered_probs=None):
+        t = resids.shape[0]  # number of observations
+        k = self.k
+        vol_per_regime = np.zeros((t, k))  # volatility per regime
+        for r in range(k):
+            # unpack regime r parameters from the flat parameter vector
+            omega = parameters[r * 3 + 0]  # assuming p=q=1, no asymmetry, for simplicity
+            alpha = parameters[r * 3 + 1]
+            beta = parameters[r * 3 + 2]
+
+            vol_r = np.zeros(t)
+            self.base_garch.compute_variance(np.array([omega, alpha, beta]), resids, vol_r, backcast, var_bounds)
+            vol_per_regime[:, r] = np.sqrt(vol_r)
+
+        # Get the filtered probabilities
+        if filtered_probs is None:
+            filtered_probs = self.filtered_probs  # shape (k, n_obs)
+
+        # Weighted average volatility
+        vol_final = np.sum(vol_per_regime * filtered_probs.T, axis=1)  # (n_obs,)
+        vol_final = vol_final[start:].reshape(-1, horizon)
+
+        self.vol_per_regime = vol_per_regime
+        self.vol_final = vol_final
+
+        return VarianceForecast(vol_final)
+
+    def _parse_parameters(self, x: np.ndarray):
+        x = np.asarray(x, dtype=float)
+        km = int(0)  # the current implementation has zero mean-model parameters
+        kv = int(self.num_params)  # total number of volatility parameters for MSGARCH
+
+        mp = x[:km]
+        vp = x[km:km + kv]
+        dp = x[km + kv:]
+
+        return mp, vp, dp
+
+    def compute_filtered_probs(self, parameters, resids, sigma2, backcast, var_bounds):
+        # parse parameters
+        mp, vp, dp = self._parse_parameters(parameters)
+
+        # transition matrix
+        start = self.k * self.num_params_single
+        end = start + self.k
+        free_P = vp[start:end]
+        transition_matrix = np.zeros((self.k, self.k))
+        transition_matrix[0, 0] = free_P[0]
+        transition_matrix[0, 1] = 1.0 - free_P[0]
+        transition_matrix[1, 0] = free_P[1]
+        transition_matrix[1, 1] = 1.0 - free_P[1]
+
+        # Initial probabilities
+        eigvals, eigvecs = np.linalg.eig(transition_matrix.T)
+        pi = np.real(eigvecs[:, np.isclose(eigvals, 1)].flatten())
+        pi /= pi.sum()
+
+        p = np.zeros((self.k, len(resids)))
+        p[:, 0] = pi
+        sigma2 = self.compute_variance(vp, resids, np.zeros((len(resids), self.k)), backcast, var_bounds)
+
+        # Hamilton filter
+        var_floor = 1e-4
+        sigma2 = np.maximum(sigma2, var_floor)
+        log_p = np.log(np.maximum(p, 1e-12))
+
+        for t in range(1, len(resids)):
+            sigma2_t = sigma2[t, :]
+            log_likes = -0.5 * (np.log(2 * np.pi * sigma2_t) + (resids[t] ** 2) / sigma2_t)
+            log_pred_probs = logsumexp(np.log(np.maximum(transition_matrix, 1e-12)) + log_p[:, t - 1][:, None], axis=0)
+            log_mixture = logsumexp(log_pred_probs + log_likes)
+            log_p[:, t] = log_pred_probs + log_likes - log_mixture
+
+        # return the filtered probabilities
+        return np.exp(log_p)
+
+    def _initialise_vol(self, resids, n_regimes):
+        return np.zeros((resids.shape[0], n_regimes), dtype=float)
+
+    def compute_loglikelihood(self, parameters=None, resids=None, sigma2=None, backcast=None, var_bounds=None):
+        return self.msgarch_loglikelihood(parameters, resids, sigma2, backcast, var_bounds)
diff --git a/ci/azure/install-posix.sh b/ci/azure/install-posix.sh
index fa287898da..e2bf4651fa 100644
--- a/ci/azure/install-posix.sh
+++ b/ci/azure/install-posix.sh
@@ -16,7 +16,7 @@ else
 fi
 
 python -m pip install --upgrade pip "setuptools>=61" wheel
-python -m pip install cython "pytest>=7,<8" pytest-xdist coverage pytest-cov ipython jupyter notebook nbconvert "property_cached>=1.6.3" black isort flake8 nbconvert setuptools_scm colorama
+python -m pip install cython "pytest>=8.4.1,<9" pytest-xdist coverage pytest-cov ipython jupyter notebook nbconvert "property_cached>=1.6.3" black isort flake8 nbconvert setuptools_scm colorama
 
 if [[ -n ${NUMPY} ]]; then CMD="$CMD~=${NUMPY}"; fi;
 CMD="$CMD scipy"
diff --git a/pyproject.toml b/pyproject.toml
index ba1188f791..52949194c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,7 +55,7 @@ changelog = "https://bashtage.github.io/arch/changes.html"
 requires = [
     "setuptools>=61",
     "wheel",
-    "setuptools_scm[toml]>=8.0.3,<9",
+    "setuptools_scm>=9.0.3,<10",
     "cython>=3.0.10",
     "numpy>=2.0.0rc1,<3"
 ]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 5c4c27cd2a..34a417205e 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -11,7 +11,7 @@ matplotlib>=3
 seaborn
 
 # Tests
-pytest>=7.3,<8
+pytest>=8.4.1,<9
 pytest-xdist
 pytest-cov
diff --git a/setup.cfg b/setup.cfg
index 1bfa92d7bb..b7054d90f3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,8 +31,6 @@ filterwarnings =
     error:random_state is deprecated:FutureWarning
     error:seed is deprecated:FutureWarning
     error:get_state is deprecated:FutureWarning
-    error:Passing None has been deprecated:pytest.PytestRemovedIn8Warning:
-    # error:y is poorly scaled:arch.utility.exceptions.DataScaleWarning:
    error:Conversion of an array with ndim:DeprecationWarning:arch
 markers =
     slow: mark a test as slow
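The transition-matrix parametrisation used throughout the MSGARCH additions (free parameters `p11` and `p21`, stationary distribution taken from the unit-eigenvalue eigenvector of Pᵀ) can be illustrated in isolation. This is a minimal standalone sketch; the variable names are illustrative and not part of the diff:

```python
import numpy as np

# Free parameters as in MSGARCH: p11 = P(stay in regime 1), p21 = P(move from regime 2 to 1).
p11, p21 = 0.95, 0.10
P = np.array([[p11, 1.0 - p11],
              [p21, 1.0 - p21]])  # rows are "from", columns are "to"; rows sum to 1

# Stationary distribution: eigenvector of P.T with eigenvalue 1, normalised to sum to 1.
eigvals, eigvecs = np.linalg.eig(P.T)
pi = np.real(eigvecs[:, np.isclose(eigvals, 1.0)].flatten())
pi = pi / pi.sum()

# For a two-state chain this matches the closed form pi_1 = p21 / (1 - p11 + p21).
assert np.isclose(pi[0], p21 / (1.0 - p11 + p21))
```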
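The recursion inside `msgarch_loglikelihood` and `compute_filtered_probs` is a log-space Hamilton filter. Below is a self-contained sketch of the same recursion under the assumption that the per-regime conditional variances are already available; the function and variable names are illustrative only, not part of the library API:

```python
import numpy as np
from scipy.special import logsumexp

def hamilton_filter(resids, sigma2, P, pi0):
    """Log-space Hamilton filter for a k-regime Gaussian mixture.

    resids : (t,) residuals; sigma2 : (t, k) per-regime conditional variances;
    P : (k, k) row-stochastic transition matrix; pi0 : (k,) initial probabilities.
    Returns the log-likelihood accumulated from t=1 and the (k, t) filtered probabilities.
    """
    t, k = sigma2.shape
    log_P = np.log(np.maximum(P, 1e-12))
    log_p = np.zeros((k, t))
    log_p[:, 0] = np.log(np.maximum(pi0, 1e-12))
    loglik = 0.0
    for s in range(1, t):
        # per-regime Gaussian log-densities of the observation at time s
        log_likes = -0.5 * (np.log(2 * np.pi * sigma2[s]) + resids[s] ** 2 / sigma2[s])
        # predicted regime probabilities log(P.T @ p_{s-1}), via broadcasting over rows of log(P)
        log_pred = logsumexp(log_P + log_p[:, s - 1][:, None], axis=0)
        # mixture density of the observation, then the filtering update
        log_mix = logsumexp(log_pred + log_likes)
        log_p[:, s] = log_pred + log_likes - log_mix
        loglik += log_mix
    return loglik, np.exp(log_p)

# toy usage with fixed variances per regime
rng = np.random.default_rng(0)
r = rng.standard_normal(250)
s2 = np.column_stack([np.full(250, 0.5), np.full(250, 2.0)])
ll, probs = hamilton_filter(r, s2, np.array([[0.95, 0.05], [0.10, 0.90]]), np.array([2 / 3, 1 / 3]))
```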
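`starting_values` seeds the transition matrix by counting transitions along a hard regime classification of the data. A small sketch of that counting step under the same row-stochastic convention; the helper name is hypothetical:

```python
import numpy as np

def count_transition_matrix(path, k):
    """Row-stochastic transition matrix estimated by counting transitions along a hard regime path."""
    P = np.zeros((k, k))
    for prev, curr in zip(path[:-1], path[1:]):
        P[prev, curr] += 1.0
    # normalise each row; guard against regimes that are never visited
    row_sums = P.sum(axis=1, keepdims=True)
    return P / np.where(row_sums == 0, 1.0, row_sums)

path = np.array([0, 0, 1, 1, 1, 0, 0, 1])
P0 = count_transition_matrix(path, k=2)
# starting values for the free parameters are then (p11, p21) = (P0[0, 0], P0[1, 0])
```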
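The two weighting schemes used in `base.py` (stationary-distribution weights before estimation, filtered-probability weights for the reported conditional volatility) reduce to simple array products. A toy sketch with made-up numbers, purely for illustration:

```python
import numpy as np

rng = np.random.default_rng(1)
t, k = 5, 2
sigma2 = rng.uniform(0.5, 2.0, size=(t, k))      # per-regime conditional variances, shape (t, k)
pi = np.array([2 / 3, 1 / 3])                    # stationary regime probabilities, shape (k,)
filtered = rng.dirichlet(np.ones(k), size=t).T   # filtered probabilities, shape (k, t)

# Before estimation: weight the regime variances by the stationary distribution.
pi_weighted_sigma2 = sigma2 @ pi                 # shape (t,)

# After estimation: weight by the filtered probabilities, then take square roots for volatility.
filtered_weighted_sigma2 = (sigma2 * filtered.T).sum(axis=1)  # shape (t,)
vol = np.sqrt(filtered_weighted_sigma2)
```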