import math
import stats
import strutils
import sequtils
import strformat
import sugar
import core
import distributions
import ols

###########################################################
# OLS
###########################################################
type OLSModel2 = ref object
  residuals*: vector
  sum_squared_errors*: float
  degrees_of_freedom*: float
  variance_matrix_coefficients*: matrix
  #
  include_intercept: bool  # Whether the design matrix contains an intercept column; kept because it can be informative.
  #
  loglikelihood*: float
  #
  R2*: float          # Coefficient of determination
  adjustedR2*: float  # Adjusted coefficient of determination
  beta_hat*: vector
  #
  coefficients*: seq[SimpleEstimator[StudentT]]
  noise_variance*: SimpleEstimator[InvChiSquare]
  # https://stats.stackexchange.com/questions/256726/linear-regression-what-does-the-f-statistic-r-squared-and-residual-standard-err
  model_significance*: HypothesisScore[CentralF]
  #
  feature_names*: seq[string]


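# Summary of the closed-form OLS quantities the procs below compute (a reading aid,
# not an exhaustive specification; n = X.rows, k = X.cols, and p excludes the
# intercept column when one is present):
#   beta_hat      = (X'X)^-1 X'Y
#   var(beta_hat) = s^2 (X'X)^-1,  with s^2 = SSE / (n - k)
#   F             = (ESS / p) / (SSE / (n - k))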
proc simple_ls_model_cov_independient*(y: vector, names: seq[string] = @[]): OLSModel2 =
  new result
  let
    X = constant_matrix(y.len, 1, 1.0)
  if names.len > 0 and names.len != X.cols:
    raise newException(ValueError, "incorrect number of feature names")
  let
    Y = y.as_column_vector
    XpX = (X.T * X).inverse
    beta_hat = (XpX * X.T) * Y
    Ypred = X * beta_hat
    residuals = (Y - Ypred).T[0]
    sse = norm(residuals) ^ 2
    #var total_sample_variation = norm(Y - mean(y)) ^ 2  # SST
    variance_normalization_factor = ((X.rows - X.cols - 1).toFloat / (X.rows - X.cols).toFloat)
    s2 = sse / (X.rows - X.cols - 1).toFloat * variance_normalization_factor  # equivalent to sse / (n - k)
    var_beta_hat = s2 * XpX
    estimate_std = var_beta_hat.diag .^ 0.5
    coefficients = beta_hat.T[0]
    include_intercept = X.wise_standard_deviation(axis=0).any_val(true_val=0.0)  # a constant column signals an intercept

  var total_model_variation = norm(Ypred.ravel - Ypred.ravel.mean) ^ 2  # ESS (model/explained sum of squares)
  total_model_variation = norm(y - residuals) ^ 2  # overwritten with the uncentered fitted-value version
  let f_score = (total_model_variation / (X.cols + (if include_intercept: -1 else: 0)).toFloat) / (sse / (X.rows - X.cols).toFloat)

  result.include_intercept = include_intercept
  result.residuals = residuals
  result.sum_squared_errors = sse
  result.variance_matrix_coefficients = var_beta_hat
  result.R2 = total_model_variation/(total_model_variation + sse)
  result.adjustedR2 = 1.0 - (X.rows + (if include_intercept: -1 else: 0)).toFloat / (X.rows - X.cols).toFloat * (1.0 - result.R2)
  result.beta_hat = beta_hat.ravel
  #
  # Fixed according to https://stats.stackexchange.com/questions/277009/why-are-the-degrees-of-freedom-for-multiple-regression-n-k-1-for-linear-reg
  #let dof = (X.rows - X.cols - 1).toFloat  # DOF without intercept
  result.degrees_of_freedom = (X.rows - X.cols).toFloat  # the intercept, if included, is already a column of the design matrix
  # degrees_of_freedom is assigned before the loglikelihood, which depends on it
  result.loglikelihood = normal(mean=0.0, std=(sse * (result.degrees_of_freedom + 1e-10 - 2.0) / (X.rows - X.cols).toFloat).abs.sqrt).loglikelihood(residuals)

  result.noise_variance = shifted_estimator(
    distribution=invchisquare(dof=result.degrees_of_freedom + 1e-10),
    location=0.0,  # result.degrees_of_freedom,
    scale=sse * (result.degrees_of_freedom + 1e-10 - 2.0) / (X.rows - X.cols).toFloat
  )
  result.coefficients = (0..estimate_std.high).mapIt(
    shifted_estimator(
      distribution=studentt(dof=result.degrees_of_freedom),
      location=coefficients[it], scale=estimate_std[it]
    )
  )
  # Note: s2 and noise_variance.estimate are the same value; s2 just avoids recomputing it.

  let ms_model_dof = (X.cols + (if include_intercept: -1 else: 0)).toFloat
  let ms_residual_dof = (X.rows - X.cols).toFloat
  result.model_significance = central_f(df1=ms_model_dof, df2=ms_residual_dof).htest_score(
    score=(total_model_variation/ms_model_dof)/(sse/ms_residual_dof),
    test_type=oneTailed
  )
  if names.len > 0:
    result.feature_names = names
  else:
    result.feature_names = (1..X.cols).mapIt(fmt"x{it}")


proc simple_ls_model_cov_exchangeable*(y: vector, names: seq[string] = @[]): OLSModel2 =
  new result
  let
    X = constant_matrix(y.len, 1, 1.0)
  if names.len > 0 and names.len != X.cols:
    raise newException(ValueError, "incorrect number of feature names")
  let
    Y = y.as_column_vector
    XpX = (X.T * X).inverse
    beta_hat = (XpX * X.T) * Y
    Ypred = X * beta_hat
    residuals = (Y - Ypred).T[0]
    sse = norm(residuals) ^ 2
    #var total_sample_variation = norm(Y - mean(y)) ^ 2  # SST
    variance_normalization_factor = ((X.rows - X.cols - 1).toFloat / (X.rows - X.cols).toFloat)
    s2 = sse / (X.rows - X.cols - 1).toFloat * variance_normalization_factor  # equivalent to sse / (n - k)
    var_beta_hat = s2 * XpX
    estimate_std = var_beta_hat.diag .^ 0.5
    coefficients = beta_hat.T[0]
    include_intercept = X.wise_standard_deviation(axis=0).any_val(true_val=0.0)  # a constant column signals an intercept

  var total_model_variation = norm(Ypred.ravel - Ypred.ravel.mean) ^ 2  # ESS (model/explained sum of squares)
  total_model_variation = norm(y - residuals) ^ 2  # overwritten with the uncentered fitted-value version
  let f_score = (total_model_variation / (X.cols + (if include_intercept: -1 else: 0)).toFloat) / (sse / (X.rows - X.cols).toFloat)

  result.include_intercept = include_intercept
  result.residuals = residuals
  result.sum_squared_errors = sse
  result.variance_matrix_coefficients = var_beta_hat
  result.R2 = total_model_variation/(total_model_variation + sse)
  result.adjustedR2 = 1.0 - (X.rows + (if include_intercept: -1 else: 0)).toFloat / (X.rows - X.cols).toFloat * (1.0 - result.R2)
  result.beta_hat = beta_hat.ravel
  #
  # Fixed according to https://stats.stackexchange.com/questions/277009/why-are-the-degrees-of-freedom-for-multiple-regression-n-k-1-for-linear-reg
  #let dof = (X.rows - X.cols - 1).toFloat  # DOF without intercept
  result.degrees_of_freedom = (X.rows - X.cols).toFloat  # the intercept, if included, is already a column of the design matrix
  # degrees_of_freedom is assigned before the loglikelihood, which depends on it
  result.loglikelihood = normal(mean=0.0, std=(sse * (result.degrees_of_freedom + 1e-10 - 2.0) / (X.rows - X.cols).toFloat).abs.sqrt).loglikelihood(residuals)

  result.noise_variance = shifted_estimator(
    distribution=invchisquare(dof=result.degrees_of_freedom + 1e-10),
    location=0.0,  # result.degrees_of_freedom,
    scale=sse * (result.degrees_of_freedom + 1e-10 - 2.0) / (X.rows - X.cols).toFloat
  )
  result.coefficients = (0..estimate_std.high).mapIt(
    shifted_estimator(
      distribution=studentt(dof=result.degrees_of_freedom),
      location=coefficients[it], scale=estimate_std[it]
    )
  )
  # Note: s2 and noise_variance.estimate are the same value; s2 just avoids recomputing it.

  let ms_model_dof = (X.cols + (if include_intercept: -1 else: 0)).toFloat
  let ms_residual_dof = (X.rows - X.cols).toFloat
  result.model_significance = central_f(df1=ms_model_dof, df2=ms_residual_dof).htest_score(
    score=(total_model_variation/ms_model_dof)/(sse/ms_residual_dof),
    test_type=oneTailed
  )
  if names.len > 0:
    result.feature_names = names
  else:
    result.feature_names = (1..X.cols).mapIt(fmt"x{it}")


proc `$`*(model: OLSModel2): string =
  result = "[Ordinary Least Squares Model]\n"
  result &= fmt"* Design matrix: {model.residuals.len}x{model.coefficients.len}" & (if model.include_intercept: " (includes intercept)\n" else: "\n")
  result &= "* Coefficients:\n"
  #result &= model.coefficients_as_string & "\n"
  result &= "* Noise:\n"
  result &= $model.noise_variance & "\n"
  result &= &"* Residual standard error: {sqrt(model.noise_variance.estimate):.3f} on {model.degrees_of_freedom.toInt} degrees of freedom\n"
  result &= &"* Multiple R-squared: {model.R2:.5f}, Adjusted R-squared: {model.adjustedR2:.5f}\n"
  #result &= &"* F-statistic: {model.model_significance.test_score:.5f} on {model.model_significance.distribution.df1.toInt} and {model.model_significance.distribution.df2.toInt} DF, p-value: {model.model_significance.p_value:.5e} \n"
  #
  result &= &"* Analysis of variance:\n"
  let anova_model_dof = model.model_significance.distribution.df1
  let anova_residual_dof = model.model_significance.distribution.df2
  let anova_residual_mse = model.sum_squared_errors / anova_residual_dof
  let anova_model_mse = model.model_significance.test_score * anova_residual_mse
  result &= "            | D.f.| Sum Sq.| Mean Sq.| F value | p-value\n"
  result &= &"  Predictors| {anova_model_dof:>4}| {anova_model_mse*anova_model_dof:>9.5e}| {anova_model_mse:>9.5e}| {model.model_significance.test_score:>9.5e}| {model.model_significance.p_value:.5e}\n"
  result &= &"  Residuals | {anova_residual_dof:>4}| {anova_residual_mse*anova_residual_dof:>9.5e}| {anova_residual_mse:>9.5e}|"
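

# A minimal usage sketch, not part of the module above: it assumes a `vector` can be
# built directly from a seq[float] literal (e.g. if `core` aliases `vector` to
# seq[float]); adapt the construction to core's actual vector constructor if it differs.
when isMainModule:
  let y: vector = @[1.2, 0.9, 1.1, 1.4, 0.8]
  let model = simple_ls_model_cov_independient(y, names = @["intercept"])
  echo model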