-
Notifications
You must be signed in to change notification settings - Fork 2k
GH-16583: Access GLM Variance-Covariance Matrix with vcov
#16586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 5 commits
3231144
467b294
55b9ccd
72556df
fa1cbc6
33d3aee
1471a91
20e1ba6
de3e960
348de6a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| import sys | ||
|
|
||
| sys.path.insert(1, "../../../") | ||
| import h2o | ||
| from tests import pyunit_utils | ||
| from h2o.estimators.gbm import H2OGradientBoostingEstimator | ||
| from h2o.estimators.glm import H2OGeneralizedLinearEstimator | ||
|
|
||
| # make sure error is thrown when non glm model try to call vcov | ||
| def test_glm_vcov(): | ||
| cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) | ||
| y = "economy_20mpg" | ||
| predictors = ["displacement", "power", "weight", "acceleration", "year"] | ||
| cars[y] = cars[y].asfactor() | ||
|
|
||
| gbm_model = H2OGradientBoostingEstimator(distribution="bernoulli", seed=1234) | ||
| gbm_model.train(x=predictors, y=y, training_frame=cars) | ||
|
|
||
| try: | ||
| gbm_model.vcov() | ||
| except ValueError as e: | ||
| assert "The variance-covariance matrix is only found in GLM." in e.args[0], "Wrong error messages received." | ||
|
|
||
| # test to make sure we have error when compute_p_value=False and vcov is called | ||
| def test_wrong_model_vcov(): | ||
| cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) | ||
| y = "economy_20mpg" | ||
| predictors = ["displacement", "power", "weight", "acceleration", "year"] | ||
| cars[y] = cars[y].asfactor() | ||
|
|
||
| h2oglm_vcov = H2OGeneralizedLinearEstimator(family="binomial", seed=1234) | ||
| h2oglm_vcov.train(x=predictors, y=y, training_frame=cars) | ||
|
|
||
| try: | ||
| h2oglm_vcov.vcov() | ||
| except ValueError as e: | ||
| assert "The variance-covariance matrix is only calculated when compute_p_values=True" in e.args[0], "Wrong error messages received." | ||
|
|
||
| # make sure correct covariances are returned when vcov is called | ||
| def test_glm_vcov_values(): | ||
| cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) | ||
| y = "economy_20mpg" | ||
| predictors = ["displacement","power","weight","acceleration","year"] | ||
| cars[y] = cars[y].asfactor() | ||
|
|
||
| h2oglm_compute_vcov = H2OGeneralizedLinearEstimator(family = "binomial", lambda_=0.0, compute_p_values=True, | ||
| seed = 1234) | ||
|
|
||
| h2oglm_compute_vcov.train(x = predictors, y = y, training_frame = cars) | ||
| vcov = h2oglm_compute_vcov.vcov() | ||
| vcov_intercept = vcov['intercept'] | ||
| vcov_displacement = vcov['displacement'] | ||
| vcov_power = vcov['power'] | ||
| vcov_weight = vcov['weight'] | ||
| vcov_acceleration = vcov['acceleration'] | ||
| vcov_year = vcov['year'] | ||
|
|
||
| print("variance-covariance table: {0}".format(vcov)) | ||
|
|
||
| # manually obtain covariances and compare with ones using functions | ||
| names = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["names"] | ||
| manual_intercept = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["intercept"] | ||
| manual_displacement = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["displacement"] | ||
| manual_power = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["power"] | ||
| manual_weight = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["weight"] | ||
| manual_acceleration = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["acceleration"] | ||
| manual_year = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["year"] | ||
|
|
||
| # check both sets of values are equal | ||
| for i, el in enumerate(names): | ||
| assert abs(vcov_intercept[i]-manual_intercept[i]) < 1e-12, "Expected covariance between intercept and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_intercept[count], manual_intercept[count]) | ||
| assert abs(vcov_displacement[i]-manual_displacement[i]) < 1e-12, "Expected covariance between displacement and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_displacement[count], manual_displacement[count]) | ||
| assert abs(vcov_power[i]-manual_power[i]) < 1e-12, "Expected covariance between power and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_power[count], manual_power[count]) | ||
| assert abs(vcov_weight[i]-manual_weight[i]) < 1e-12, "Expected covariance between weight and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_weight[count], manual_weight[count]) | ||
| assert abs(vcov_acceleration[i]-manual_acceleration[i]) < 1e-12, "Expected covariance between acceleration and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_acceleration[count], manual_acceleration[count]) | ||
| assert abs(vcov_year[i]-manual_year[i]) < 1e-12, "Expected covariance between year and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_year[count], manual_year[count]) | ||
|
|
||
| if __name__ == "__main__": | ||
| pyunit_utils.standalone_test(test_glm_vcov) | ||
| pyunit_utils.standalone_test(test_wrong_model_vcov) | ||
| pyunit_utils.standalone_test(test_glm_vcov_values) | ||
| else: | ||
| test_glm_vcov() | ||
| test_wrong_model_vcov() | ||
| test_glm_vcov_values() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) | ||
| source("../../../scripts/h2o-r-test-setup.R") | ||
|
|
||
| # call glm functions by a gbm model. Should generate an error | ||
| testGBMvcov <- function() { | ||
| bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv")) | ||
| maxX <- 11 | ||
| Y <- 4 | ||
| X <- 3:maxX | ||
| X <- X[ X != Y ] | ||
| mFV <- h2o.gbm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV) | ||
|
|
||
| assertError(vcovValues <- h2o.vcov(mFV)) | ||
| } | ||
|
|
||
| # should throw an error when compute_p_value=FALSE | ||
| testGLMvcovcomputePValueFALSE <- function() { | ||
| bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv")) | ||
| maxX <- 11 | ||
| Y <- 4 | ||
| X <- 3:maxX | ||
| X <- X[ X != Y ] | ||
| mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, lambda=1.0) | ||
|
|
||
| assertError(vcovValues <- h2o.vcov(mFV)) | ||
| } | ||
|
|
||
| testGLMPValZValStdError <- function() { | ||
| bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv")) | ||
| maxX <- 11 | ||
| Y <- 4 | ||
| X <- 3:maxX | ||
| X <- X[ X != Y ] | ||
| bhexFV$FNDX <- h2o.asfactor(bhexFV$FNDX) | ||
| mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, family="binomial", lambda=0.0, compute_p_values=TRUE) | ||
|
|
||
| vcovValues <- h2o.coef_with_p_values(mFV) | ||
| print("variance-covariance table") | ||
| print(vcovValues) | ||
| vcovIntercept <- vcovValues$intercept | ||
| vcovDisplacement <- vcovValues$displacement | ||
| vcovPower <- vcovValues$power | ||
| vcovWeight <- vcovValues$weight | ||
| vcovAcceleration <- vcovValues$acceleration | ||
| vcovYear <- vcovValues$year | ||
|
|
||
| manualIntercept <- mFV@model$coefficients_table$intercept | ||
| manualDisplacement <- mFV@model$coefficients_table$displacement | ||
| manualPower <- mFV@model$coefficients_table$power | ||
| manualWeight <- mFV@model$coefficients_table$weight | ||
| manualAcceleration <- mFV@model$coefficients_table$acceleration | ||
| manualYear <- mFV@model$coefficients_table$year | ||
|
|
||
| # compare values from model and obtained manually | ||
| for (ind in c(1:length(manuelPValues))) | ||
|
||
| expect_equal(manualIntercept[ind], vcovIntercept[ind]) | ||
| expect_equal(manualDisplacement[ind], vcovDisplacement[ind]) | ||
| expect_equal(manualPower[ind], vcovPower[ind]) | ||
| expect_equal(manualWeight[ind], vcovWeight[ind]) | ||
| expect_equal(manualAcceleration[ind], vcovAcceleration[ind]) | ||
| expect_equal(manualYear[ind], vcovYear[ind]) | ||
| } | ||
|
|
||
| doTest("GLM: make sure error is generated when a gbm model calls glm functions", testGBMvcov) | ||
| doTest("GLM: make sure error is generated when compute_p_values=FALSE", testGLMvcovcomputePValueFALSE) | ||
| doTest("GLM: test variance-covariance values", testGLMPValZValStdError) | ||
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no test that would test if the implementation is working. It just tests if it throws an error if used when unsupported.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The last R test was supposed to, but it looks like I must have pushed the wrong thing. It should now check if the covariance matrix returned by the function matches the one returned by going and grabbing the h2o frame by it's reference name.
I thought a little bit about adding in a test that the actual values themselves were calculated correctly, but since the vcov matrix was already in the code, and the p-values all rely on it already, I assume that's already tested somewhere. I'm just making it accessible. I did do testing to make sure they matched what I got with the statsmodels in python though. It looks like they're calculated based on the Observed Information Matrix instead of the Expected Information Matrix, so it was a little tricky.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No worries!
Thank you for being so vigilant! Unfortunately, the person who implemented this is no longer working in H2O so I can't easily ask (and I'm not familiar with this piece of code) but I hope we have the test somewhere.