-
Notifications
You must be signed in to change notification settings - Fork 2k
GH-16583: Access GLM Variance-Covariance Matrix with vcov
#16586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
3231144
467b294
55b9ccd
72556df
fa1cbc6
33d3aee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| import sys | ||
|
|
||
| sys.path.insert(1, "../../../") | ||
| import h2o | ||
| from tests import pyunit_utils | ||
| from h2o.estimators.gbm import H2OGradientBoostingEstimator | ||
| from h2o.estimators.glm import H2OGeneralizedLinearEstimator | ||
|
|
||
| # make sure error is thrown when non glm model try to call vcov | ||
| def test_glm_vcov(): | ||
| cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) | ||
| y = "economy_20mpg" | ||
| predictors = ["displacement", "power", "weight", "acceleration", "year"] | ||
| cars[y] = cars[y].asfactor() | ||
|
|
||
| gbm_model = H2OGradientBoostingEstimator(distribution="bernoulli", seed=1234) | ||
| gbm_model.train(x=predictors, y=y, training_frame=cars) | ||
|
|
||
| try: | ||
| gbm_model.vcov() | ||
| except ValueError as e: | ||
| assert "The variance-covariance matrix is only found in GLM." in e.args[0], "Wrong error messages received." | ||
|
|
||
| # test to make sure we have error when compute_p_value=False and vcov is called | ||
| def test_wrong_model_vcov(): | ||
| cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) | ||
| y = "economy_20mpg" | ||
| predictors = ["displacement", "power", "weight", "acceleration", "year"] | ||
| cars[y] = cars[y].asfactor() | ||
|
|
||
| h2oglm_vcov = H2OGeneralizedLinearEstimator(family="binomial", seed=1234) | ||
| h2oglm_vcov.train(x=predictors, y=y, training_frame=cars) | ||
|
|
||
| try: | ||
| h2oglm_vcov.vcov() | ||
| except ValueError as e: | ||
| assert "The variance-covariance matrix is only calculated when compute_p_values=True" in e.args[0], "Wrong error messages received." | ||
|
|
||
| # make sure correct covariances are returned when vcov is called | ||
| def test_glm_vcov_values(): | ||
| cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) | ||
| y = "economy_20mpg" | ||
| predictors = ["displacement","power","weight","acceleration","year"] | ||
| cars[y] = cars[y].asfactor() | ||
|
|
||
| h2oglm_compute_vcov = H2OGeneralizedLinearEstimator(family = "binomial", lambda_=0.0, compute_p_values=True, | ||
| seed = 1234) | ||
|
|
||
| h2oglm_compute_vcov.train(x = predictors, y = y, training_frame = cars) | ||
| vcov = h2oglm_compute_vcov.vcov() | ||
| vcov_intercept = vcov['intercept'] | ||
| vcov_displacement = vcov['displacement'] | ||
| vcov_power = vcov['power'] | ||
| vcov_weight = vcov['weight'] | ||
| vcov_acceleration = vcov['acceleration'] | ||
| vcov_year = vcov['year'] | ||
|
|
||
| print("variance-covariance table: {0}".format(vcov)) | ||
|
|
||
| # manually obtain covariances and compare with ones using functions | ||
| names = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["names"] | ||
| manual_intercept = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["intercept"] | ||
| manual_displacement = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["displacement"] | ||
| manual_power = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["power"] | ||
| manual_weight = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["weight"] | ||
| manual_acceleration = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["acceleration"] | ||
| manual_year = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["year"] | ||
|
|
||
| # check both sets of values are equal | ||
| for i, el in enumerate(names): | ||
| assert abs(vcov_intercept[i]-manual_intercept[i]) < 1e-12, "Expected covariance between intercept and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_intercept[count], manual_intercept[count]) | ||
| assert abs(vcov_displacement[i]-manual_displacement[i]) < 1e-12, "Expected covariance between displacement and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_displacement[count], manual_displacement[count]) | ||
| assert abs(vcov_power[i]-manual_power[i]) < 1e-12, "Expected covariance between power and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_power[count], manual_power[count]) | ||
| assert abs(vcov_weight[i]-manual_weight[i]) < 1e-12, "Expected covariance between weight and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_weight[count], manual_weight[count]) | ||
| assert abs(vcov_acceleration[i]-manual_acceleration[i]) < 1e-12, "Expected covariance between acceleration and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_acceleration[count], manual_acceleration[count]) | ||
| assert abs(vcov_year[i]-manual_year[i]) < 1e-12, "Expected covariance between year and {el} : {0} " \ | ||
| ", actual covariance: {1}. They are different".format( | ||
| vcov_year[count], manual_year[count]) | ||
|
|
||
| if __name__ == "__main__": | ||
| pyunit_utils.standalone_test(test_glm_vcov) | ||
| pyunit_utils.standalone_test(test_wrong_model_vcov) | ||
| pyunit_utils.standalone_test(test_glm_vcov_values) | ||
| else: | ||
| test_glm_vcov() | ||
| test_wrong_model_vcov() | ||
| test_glm_vcov_values() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) | ||
| source("../../../scripts/h2o-r-test-setup.R") | ||
|
|
||
| # call glm functions by a gbm model. Should generate an error | ||
| testGBMvcov <- function() { | ||
| bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv")) | ||
| maxX <- 11 | ||
| Y <- 4 | ||
| X <- 3:maxX | ||
| X <- X[ X != Y ] | ||
| mFV <- h2o.gbm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV) | ||
|
|
||
| assertError(vcovValues <- h2o.vcov(mFV)) | ||
| } | ||
|
|
||
| # should throw an error when compute_p_value=FALSE | ||
| testGLMvcovcomputePValueFALSE <- function() { | ||
| bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv")) | ||
| maxX <- 11 | ||
| Y <- 4 | ||
| X <- 3:maxX | ||
| X <- X[ X != Y ] | ||
| mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, lambda=1.0) | ||
|
|
||
| assertError(vcovValues <- h2o.vcov(mFV)) | ||
| } | ||
|
|
||
| testGLMPValZValStdError <- function() { | ||
| bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv")) | ||
| maxX <- 11 | ||
| Y <- 4 | ||
| X <- 3:maxX | ||
| X <- X[ X != Y ] | ||
| bhexFV$FNDX <- h2o.asfactor(bhexFV$FNDX) | ||
| mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, family="binomial", lambda=0.0, compute_p_values=TRUE) | ||
|
|
||
| vcovValues <- h2o.coef_with_p_values(mFV) | ||
| print("variance-covariance table") | ||
| print(vcovValues) | ||
| vcovIntercept <- vcovValues$intercept | ||
| vcovDisplacement <- vcovValues$displacement | ||
| vcovPower <- vcovValues$power | ||
| vcovWeight <- vcovValues$weight | ||
| vcovAcceleration <- vcovValues$acceleration | ||
| vcovYear <- vcovValues$year | ||
|
|
||
| manualIntercept <- mFV@model$coefficients_table$intercept | ||
| manualDisplacement <- mFV@model$coefficients_table$displacement | ||
| manualPower <- mFV@model$coefficients_table$power | ||
| manualWeight <- mFV@model$coefficients_table$weight | ||
| manualAcceleration <- mFV@model$coefficients_table$acceleration | ||
| manualYear <- mFV@model$coefficients_table$year | ||
|
|
||
| # compare values from model and obtained manually | ||
| for (ind in c(1:length(manuelPValues))) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| expect_equal(manualIntercept[ind], vcovIntercept[ind]) | ||
| expect_equal(manualDisplacement[ind], vcovDisplacement[ind]) | ||
| expect_equal(manualPower[ind], vcovPower[ind]) | ||
| expect_equal(manualWeight[ind], vcovWeight[ind]) | ||
| expect_equal(manualAcceleration[ind], vcovAcceleration[ind]) | ||
| expect_equal(manualYear[ind], vcovYear[ind]) | ||
| } | ||
|
|
||
| doTest("GLM: make sure error is generated when a gbm model calls glm functions", testGBMvcov) | ||
| doTest("GLM: make sure error is generated when compute_p_values=FALSE", testGLMvcovcomputePValueFALSE) | ||
| doTest("GLM: test variance-covariance values", testGLMPValZValStdError) | ||
|
Comment on lines
+64
to
+66
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer something like: doSuite("GLM: VCOV support", makeSuite(testGBMvcov, testGLMvcovcomputePValueFALSE, testGLMPValZValStdError)) |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no test that would test if the implementation is working. It just tests if it throws an error if used when unsupported.