diff --git a/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java b/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java
index 495c3e6c6552..f78e9b2b1afb 100644
--- a/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java
+++ b/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java
@@ -56,10 +56,10 @@ public GLMModelV3 make_model(int version, MakeGLMModelV3 args){
public GLMModelV3 make_unrestricted_model(int version, MakeUnrestrictedGLMModelV3 args){
GLMModel model = DKV.getGet(args.model.key());
- if(model == null)
+ if (model == null)
throw new IllegalArgumentException("Missing source model " + args.model);
- if(model._parms._control_variables == null){
- throw new IllegalArgumentException("Source model is not trained with control variables.");
+ if (model._parms._control_variables == null && !model._parms._remove_offset_effects){
+ throw new IllegalArgumentException("Source model is not trained with control variables or remove offset effects.");
}
Key generatedKey = Key.make(model._key.toString()+"_unrestricted_model");
Key key = args.dest != null ? Key.make(args.dest) : generatedKey;
@@ -73,7 +73,9 @@ public GLMModelV3 make_unrestricted_model(int version, MakeUnrestrictedGLMModelV
Double.NaN, Double.NaN, -1);
m.setInputParms(inputParms);
m._input_parms._control_variables = null;
+ m._input_parms._remove_offset_effects = false;
m._parms._control_variables = null;
+ m._parms._remove_offset_effects = false;
DataInfo dinfo = model.dinfo();
dinfo.setPredictorTransform(TransformType.NONE);
m._output = new GLMOutput(model.dinfo(), model._output._names, model._output._column_types, model._output._domains,
diff --git a/h2o-algos/src/main/java/hex/glm/GLM.java b/h2o-algos/src/main/java/hex/glm/GLM.java
index 06158a7601aa..980485289cd8 100644
--- a/h2o-algos/src/main/java/hex/glm/GLM.java
+++ b/h2o-algos/src/main/java/hex/glm/GLM.java
@@ -1430,7 +1430,7 @@ private void restoreScoringHistoryFromCheckpoint() {
else {
_scoringHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex);
}
- if (_model._parms._control_variables != null) {
+ if (_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
TwoDimTable scoringHistoryControlVal = _model._output._scoring_history_unrestricted_model;
_scoringHistoryUnrestrictedModel.restoreFromCheckpoint(scoringHistoryControlVal, colHeadersIndex);
}
@@ -3384,19 +3384,21 @@ private void scoreAndUpdateModel() {
Frame train = DKV.getGet(_parms._train); // need to keep this frame to get scoring metrics back
_model.score(_parms.train(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
scorePostProcessing(train, t1);
- if (_model._parms._control_variables != null){
+ if (_model._parms._control_variables != null || _model._parms._remove_offset_effects){
try {
- _model._useControlVariables = true;
+ _model._useControlVariables = _model._parms._control_variables != null;
+ _model._useRemoveOffsetEffects = _model._parms._remove_offset_effects;
long t2 = System.currentTimeMillis();
_model.score(train, null, CFuncRef.from(_parms._custom_metric_func)).delete();
- scorePostProcessingControlVal(train, t2);
+ scorePostProcessingRestrictedModel(train, t2);
} finally {
_model._useControlVariables = false;
+ _model._useRemoveOffsetEffects = false;
}
}
}
- private void scorePostProcessingControlVal(Frame train, long t1) {
+ private void scorePostProcessingRestrictedModel(Frame train, long t1) {
ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
long t2 = System.currentTimeMillis();
if (mtrain != null) {
@@ -3408,15 +3410,10 @@ private void scorePostProcessingControlVal(Frame train, long t1) {
} else {
Log.info(LogMsg("ModelMetrics mtrain is null"));
}
- Log.info(LogMsg("Control values training metrics computed in " + (t2 - t1) + "ms"));
+ Log.info(LogMsg("Restricted model training metrics computed in " + (t2 - t1) + "ms"));
if (_valid != null) {
Frame valid = DKV.getGet(_parms._valid);
- try {
- _model._useControlVariables = true;
- _model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
- } finally {
- _model._useControlVariables = true;
- }
+ _model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
_model._output._validation_metrics = ModelMetrics.getFromDKV(_model, valid); //updated by model.scoreAndUpdateModel
ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
validScore.fillFrom(_model._output._validation_metrics);
@@ -3424,18 +3421,30 @@ private void scorePostProcessingControlVal(Frame train, long t1) {
_model.addScoringInfo(_parms, nclasses(), t2, _state._iter); // add to scoringInfo for early stopping
if (_parms._generate_scoring_history) { // update scoring history with deviance train and valid if available
- double[] betaContrVal = _model._output.getControlValBeta(_state.expandBeta(_state.beta()).clone());
- GLMResDevTask task = new GLMResDevTask(_job._key, _dinfo, _parms, betaContrVal).doAll(_dinfo._adaptedFrame);
- double objectiveControlVal = _state.objective(betaContrVal, task._likelihood);
-
- if ((mtrain != null) && (_valid != null)) {
- _scoringHistory.addIterationScore(true, true, _state._iter, task._likelihood,
- objectiveControlVal, _state.deviance(task._likelihood), ((GLMMetrics) _model._output._validation_metrics).residual_deviance(),
- mtrain._nobs, _model._output._validation_metrics._nobs, _state.lambda(), _state.alpha());
- } else { // only doing training deviance
- _scoringHistory.addIterationScore(true, false, _state._iter, task._likelihood,
- objectiveControlVal, _state.deviance(task._likelihood), Double.NaN, mtrain._nobs, 1, _state.lambda(),
- _state.alpha());
+ if(_model._useControlVariables) {
+ double[] betaContrVal = _model._output.getControlValBeta(_state.expandBeta(_state.beta()).clone());
+ GLMResDevTask task = new GLMResDevTask(_job._key, _dinfo, _parms, betaContrVal).doAll(_dinfo._adaptedFrame);
+ double objectiveControlVal = _state.objective(betaContrVal, task._likelihood);
+
+ if ((mtrain != null) && (_valid != null)) {
+ _scoringHistory.addIterationScore(true, true, _state._iter, task._likelihood,
+ objectiveControlVal, _state.deviance(task._likelihood), ((GLMMetrics) _model._output._validation_metrics).residual_deviance(),
+ mtrain._nobs, _model._output._validation_metrics._nobs, _state.lambda(), _state.alpha());
+ } else { // only doing training deviance
+ _scoringHistory.addIterationScore(true, false, _state._iter, task._likelihood,
+ objectiveControlVal, _state.deviance(task._likelihood), Double.NaN, mtrain._nobs, 1, _state.lambda(),
+ _state.alpha());
+ }
+ } else if (_model._useRemoveOffsetEffects) {
+ if ((mtrain != null) && (_valid != null)) {
+ _scoringHistory.addIterationScore(true, true, _state._iter, _state.likelihood(),
+ _state.objective(), _state.deviance(), ((GLMMetrics) _model._output._validation_metrics).residual_deviance(),
+ mtrain._nobs, _model._output._validation_metrics._nobs, _state.lambda(), _state.alpha());
+ } else { // only doing training deviance
+ _scoringHistory.addIterationScore(true, false, _state._iter, _state.likelihood(),
+ _state.objective(), _state.deviance(), Double.NaN, mtrain._nobs, 1, _state.lambda(),
+ _state.alpha());
+ }
}
_job.update(_workPerIteration, _state.toString());
}
@@ -3447,7 +3456,7 @@ private void scorePostProcessing(Frame train, long t1) {
ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
long t2 = System.currentTimeMillis();
if (mtrain != null) {
- if (_model._parms._control_variables != null){
+ if (_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_model._output._training_metrics_unrestricted_model = mtrain;
_model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
} else {
@@ -3464,7 +3473,7 @@ private void scorePostProcessing(Frame train, long t1) {
if (_valid != null) {
Frame valid = DKV.getGet(_parms._valid);
_model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
- if(_model._parms._control_variables != null){
+ if(_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_model._output._validation_metrics_unrestricted_model = ModelMetrics.getFromDKV(_model, valid);
} else {
_model._output._validation_metrics = ModelMetrics.getFromDKV(_model, valid); //updated by model.scoreAndUpdateModel
@@ -3472,7 +3481,7 @@ private void scorePostProcessing(Frame train, long t1) {
ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
validScore.fillFrom(_model._output._validation_metrics);
}
- if(_model._parms._control_variables != null) {
+ if(_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
_model.addUnrestrictedModelScoringInfo(_parms, nclasses(), t2, _state._iter);
} else {
_model.addScoringInfo(_parms, nclasses(), t2, _state._iter);
@@ -3495,7 +3504,7 @@ private void scorePostProcessing(Frame train, long t1) {
_model._output._validation_metrics._nobs;
_lambdaSearchScoringHistory.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()),
_state.lambda(), trainDev, validDev, xval_deviance, xval_se, _state.alpha());
- } else if(_model._parms._control_variables != null){
+ } else if(_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_scoringHistoryUnrestrictedModel.addIterationScore(true, true, _state._iter, _state.likelihood(),
_state.objective(), _state.deviance(), ((GLMMetrics) _model._output._validation_metrics_unrestricted_model).residual_deviance(),
mtrain._nobs, _model._output._validation_metrics_unrestricted_model._nobs, _state.lambda(), _state.alpha());
@@ -3509,7 +3518,7 @@ private void scorePostProcessing(Frame train, long t1) {
_lambdaSearchScoringHistory.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()),
_state.lambda(), _state.deviance() / mtrain._nobs, Double.NaN, xval_deviance,
xval_se, _state.alpha());
- } else if(_model._parms._control_variables != null) {
+ } else if(_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
_scoringHistoryUnrestrictedModel.addIterationScore(true, false, _state._iter, _state.likelihood(),
_state.objective(), _state.deviance(), Double.NaN, mtrain._nobs, 1, _state.lambda(),
_state.alpha());
@@ -3523,7 +3532,7 @@ private void scorePostProcessing(Frame train, long t1) {
}
if (_parms._lambda_search) {
_model._output._scoring_history = _lambdaSearchScoringHistory.to2dTable();
- } else if(_model._parms._control_variables != null){
+ } else if(_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_model._output._scoring_history_unrestricted_model = _scoringHistoryUnrestrictedModel.to2dTable(_parms, _xval_deviances_generate_SH,
_xval_sd_generate_SH);
} else {
@@ -3846,24 +3855,24 @@ private void doCompute() {
if (_parms._generate_variable_inflation_factors) {
_model._output._vif_predictor_names = _model.buildVariableInflationFactors(_train, _dinfo);
}// build variable inflation factors for numerical predictors
- if(_model._parms._control_variables != null) {
- // create combination of scoring history with control variables enabled and disabled
- // keep unrestricted model scoring history in _model._output._control_val_scoring_history
+ if(_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
+ // create combination of scoring histories with the restriction (control variables or remove offset effects) enabled and disabled
+ // keep unrestricted model scoring history in _model._output._scoring_history_unrestricted_model
TwoDimTable scoringHistoryEarlyStop = ScoringInfo.createScoringHistoryTable(_model.getScoringInfo(),
(null != _parms._valid), false, _model._output.getModelCategory(), false, _parms.hasCustomMetricFunc());
- TwoDimTable scoringHistoryEarlyStopControlVal = ScoringInfo.createScoringHistoryTable(_model.getUnrestrictedModelScoringInfo(),
+ TwoDimTable scoringHistoryEarlyStopRestricted = ScoringInfo.createScoringHistoryTable(_model.getUnrestrictedModelScoringInfo(),
(null != _parms._valid), false, _model._output.getModelCategory(), false, _parms.hasCustomMetricFunc());
- scoringHistoryEarlyStopControlVal.setTableHeader("Scoring history with control variables enabled");
ScoreKeeper.StoppingMetric sm = _model._parms._stopping_metric.name().equals("AUTO") ? _model._output.isClassifier() ?
ScoreKeeper.StoppingMetric.logloss : ScoreKeeper.StoppingMetric.deviance : _model._parms._stopping_metric;
- _model._output._scoring_history = combineScoringHistoryControlVariables(_model._output._scoring_history, _model._output._scoring_history_unrestricted_model,
- scoringHistoryEarlyStop, scoringHistoryEarlyStopControlVal, sm, null != _parms._valid);
- _model._output._scoring_history_unrestricted_model = combineScoringHistory(_model._output._scoring_history_unrestricted_model, scoringHistoryEarlyStopControlVal);
+ _model._output._scoring_history = combineScoringHistoryRestricted(_model._output._scoring_history, _model._output._scoring_history_unrestricted_model,
+ scoringHistoryEarlyStop, scoringHistoryEarlyStopRestricted, sm, null != _parms._valid);
+ _model._output._scoring_history_unrestricted_model = combineScoringHistory(_model._output._scoring_history_unrestricted_model, scoringHistoryEarlyStopRestricted);
_model._output._scoring_history_unrestricted_model.setTableHeader(_model._output._scoring_history_unrestricted_model.getTableHeader()+" unrestricted model");
- // set control variables flag to true for scoring after training
- _model._useControlVariables = true;
- _model._output._varimp = _model._output.calculateVarimp(true);
+ // enable the control-variables / remove-offset-effects flags (as configured) for scoring after training
+ _model._useControlVariables = _model._parms._control_variables != null;
+ _model._useRemoveOffsetEffects = _model._parms._remove_offset_effects;
+ _model._output._varimp = _model._output.calculateVarimp(_model._useControlVariables);
_model._output._variable_importances_unrestricted_model = calcVarImp(_model._output.calculateVarimp(false));
_model._output._variable_importances_unrestricted_model.setTableHeader(_model._output._variable_importances_unrestricted_model.getTableHeader()+" unrestricted model");
_model._output._variable_importances = calcVarImp(_model._output._varimp);
@@ -4063,6 +4072,9 @@ protected void updateProgress(boolean canScore) {
GLMResDevTask task = new GLMResDevTask(_job._key,_dinfo,_parms, betaContrVal).doAll(_state._dinfo._adaptedFrame);
double objectiveControlVal = _state.objective(betaContrVal, task._likelihood);
_scoringHistory.addIterationScore(_state._iter, task._likelihood, objectiveControlVal);
+ } else if (_model._parms._remove_offset_effects) {
+ _scoringHistoryUnrestrictedModel.addIterationScore(_state._iter, _state.likelihood(), _state.objective());
+ _scoringHistory.addIterationScore(_state._iter, _state.likelihood(), _state.objective());
} else {
_scoringHistory.addIterationScore(_state._iter, _state.likelihood(), _state.objective());
}
@@ -4079,7 +4091,7 @@ protected void updateProgress(boolean canScore) {
}
private boolean updateEarlyStop() {
- ScoreKeeper[] sk = _parms._control_variables != null ? _model.unrestritedModelScoreKeepers() : _model.scoreKeepers();
+ ScoreKeeper[] sk = _parms._control_variables != null || _parms._remove_offset_effects ? _model.unrestritedModelScoreKeepers() : _model.scoreKeepers();
return _earlyStop || ScoreKeeper.stopEarly(sk,
_parms._stopping_rounds, ScoreKeeper.ProblemType.forSupervised(_nclass > 1), _parms._stopping_metric,
_parms._stopping_tolerance, "model's last", true);
diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java
index 19cd3701bf1b..b934266c7810 100755
--- a/h2o-algos/src/main/java/hex/glm/GLMModel.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMModel.java
@@ -579,6 +579,7 @@ public enum Constraints {EqualTo, LessThanEqualTo};
public double _constraint_beta = 0.9; // eta_k+1 = eta_k/pow(c_k, beta)
public double _constraint_c0 = 10; // set initial epsilon k as 1/c0
public String[] _control_variables; // control variables definition, list of column names
+ public boolean _remove_offset_effects; // remove the offset effect from predictions and metric calculation
public void validate(GLM glm) {
if (_remove_collinear_columns) {
@@ -724,10 +725,10 @@ public void validate(GLM glm) {
glm.error("_control_variables", "Control variables option is not supported with interactions.");
}
if(_lambda_search) {
- glm.error("_control_variables", "Control variables option is not supported with lambda search.");
+ glm.error("_control_variables", "Control variables option is not supported with Lambda search.");
}
if(_fold_column != null || _nfolds > 0){
- glm.error("_control_variables", "Control variables option is not supported with cross validation.");
+ glm.error("_control_variables", "Control variables option is not supported with cross-validation.");
}
for(String col: _control_variables){
Vec v = train().vec(col);
@@ -755,7 +756,23 @@ public void validate(GLM glm) {
}
}
}
- }
+ } if (_remove_offset_effects) {
+ if (_offset_column == null) {
+ glm.error("_remove_offset_effects", "The offset_column is missing.");
+ }
+ if (_distribution.equals(DistributionFamily.multinomial) || _distribution.equals(DistributionFamily.ordinal) || _distribution.equals(DistributionFamily.custom)){
+ glm.error("_remove_offset_effects", "The "+_distribution.name()+ " distribution is not supported with remove offset effects.");
+ }
+ if (_interactions != null || _interaction_pairs != null) {
+ glm.error("_remove_offset_effects", "Remove offset effects option is not supported with interactions.");
+ }
+ if (_lambda_search) {
+ glm.error("_remove_offset_effects", "Remove offset effects option is not supported with Lambda search.");
+ }
+ if (_fold_column != null || _nfolds > 0) {
+ glm.error("_remove_offset_effects", "Remove offset effects option is not supported with cross-validation.");
+ }
+ }
}
public GLMParameters() {
@@ -1531,6 +1548,7 @@ public Submodel(double lambda, double alpha, double[] beta, int iteration, doubl
public double[] _betaCndCheckpoint; // store temporary beta coefficients for checkpointing purposes
public boolean _finalScoring = false; // used while scoring to indicate if it is a final or partial scoring
public boolean _useControlVariables = false;
+ public boolean _useRemoveOffsetEffects = false;
private static String[] binomialClassNames = new String[]{"0", "1"};
@@ -1718,7 +1736,6 @@ public double[] variableInflationFactors() {
public boolean _binomial;
public boolean _multinomial;
public boolean _ordinal;
- public boolean _score_control_vals_used_but_disabled;
public void setLambdas(GLMParameters parms) {
if (parms._lambda_search) {
@@ -2202,7 +2219,10 @@ else if (_output.bestSubmodel().alpha_value == 1)
}
} else {
double[] b = beta();
- double eta = b[b.length - 1] + o; // intercept + offset
+ double eta = b[b.length - 1]; // intercept
+ if (!this._useRemoveOffsetEffects){ // offset
+ eta += o;
+ }
double[] bcv = b.clone();
if (this._useControlVariables)
bcv = _output.getControlValBeta(bcv); // make beta connected to control variables zero
diff --git a/h2o-algos/src/main/java/hex/glm/GLMScore.java b/h2o-algos/src/main/java/hex/glm/GLMScore.java
index ea6390c422c9..ea46d27e79a8 100644
--- a/h2o-algos/src/main/java/hex/glm/GLMScore.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMScore.java
@@ -94,10 +94,7 @@ public GLMScore(Job j, GLMModel m, DataInfo dinfo, String[] domain, boolean comp
}
_beta_multinomial = null;
}
-
_dinfo._valid = true; // marking dinfo as validation data set disables an assert on unseen levels (which should not happen in train)
-
- m._output._score_control_vals_used_but_disabled = m._parms._control_variables != null && !m._useControlVariables;
_defaultThreshold = m.defaultThreshold();
}
@@ -108,7 +105,10 @@ public GLMScore(Job j, GLMModel m, DataInfo dinfo, String[] domain, boolean comp
Arrays.fill(preds,0);
double previousCDF = 0.0;
for (int cInd = 0; cInd < lastClass; cInd++) {
- double eta = r.innerProduct(bm[cInd]) + o;
+ double eta = r.innerProduct(bm[cInd]);
+ if(!_m._useRemoveOffsetEffects) {
+ eta += o;
+ }
double currCDF = 1.0 / (1 + Math.exp(-eta));
preds[cInd + 1] = currCDF - previousCDF;
previousCDF = currCDF;
@@ -121,7 +121,10 @@ public GLMScore(Job j, GLMModel m, DataInfo dinfo, String[] domain, boolean comp
double sumExp = 0;
double maxRow = 0;
for (int c = 0; c < bm.length; ++c) {
- eta[c] = r.innerProduct(bm[c]) + o;
+ eta[c] = r.innerProduct(bm[c]);
+ if(!_m._useRemoveOffsetEffects) {
+ eta[c] += o;
+ }
if(eta[c] > maxRow)
maxRow = eta[c];
}
@@ -132,8 +135,11 @@ public GLMScore(Job j, GLMModel m, DataInfo dinfo, String[] domain, boolean comp
preds[c + 1] = eta[c] * sumExp;
preds[0] = ArrayUtils.maxIndex(eta);
} else {
- double mu = _m._parms.linkInv(r.innerProduct(_beta) + o);
-
+ double x = r.innerProduct(_beta);
+ if(!_m._useRemoveOffsetEffects) {
+ x += o;
+ }
+ double mu = _m._parms.linkInv(x);
if (_m._parms._family == GLMModel.GLMParameters.Family.binomial
|| _m._parms._family == GLMModel.GLMParameters.Family.quasibinomial
|| _m._parms._family == GLMModel.GLMParameters.Family.fractionalbinomial) { // threshold for prediction
diff --git a/h2o-algos/src/main/java/hex/glm/GLMUtils.java b/h2o-algos/src/main/java/hex/glm/GLMUtils.java
index d0e1c52272fb..f8a1e9489317 100644
--- a/h2o-algos/src/main/java/hex/glm/GLMUtils.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMUtils.java
@@ -162,18 +162,18 @@ public static List getStoppingMetricIndices(ScoreKeeper.StoppingMetric
* timestamp duration iterations Unrestricted negative_log_likelihood Unrestricted objective = training metrics calculated during optimization with control variables included (in glmSc)
* Training RMSE Training LogLoss Training r2 Training AUC Training pr_auc Training Lift Training Classification Error = early stopping training metrics with control variables excluded (in earlyStopSc)
* Validation RMSE Validation LogLoss Validation r2 Validation AUC Validation pr_auc Validation Lift Validation Classification Error = early stopping validation metrics with control variables excluded (in earlyStopSc)
- * Unrestricted Training AUC Unrestricted Validation AUC = stopping metrics with control variables included (in earlyStopScControlVariables)
+ * Unrestricted Training AUC Unrestricted Validation AUC = stopping metrics with control variables or offset effects included (in earlyStopScRestricted)
* @param glmSc
* @param earlyStopSc
* @param stoppingMetric
- * @param earlyStopScControlVariables
+ * @param earlyStopScRestricted
* @return Combined scoring history table
*/
- public static TwoDimTable combineScoringHistoryControlVariables(TwoDimTable glmSc, TwoDimTable glmScControlVariables,
- TwoDimTable earlyStopSc,
- TwoDimTable earlyStopScControlVariables,
- ScoreKeeper.StoppingMetric stoppingMetric,
- boolean hasValidationMetrics) {
+ public static TwoDimTable combineScoringHistoryRestricted(TwoDimTable glmSc, TwoDimTable glmScRestricted,
+ TwoDimTable earlyStopSc,
+ TwoDimTable earlyStopScRestricted,
+ ScoreKeeper.StoppingMetric stoppingMetric,
+ boolean hasValidationMetrics) {
String[] esColTypes = earlyStopSc.getColTypes();
String[] esColFormats = earlyStopSc.getColFormats();
List finalColHeaders = new ArrayList<>(Arrays.asList(glmSc.getColHeaders()));
@@ -186,13 +186,13 @@ public static TwoDimTable combineScoringHistoryControlVariables(TwoDimTable glmS
List finalColTypes = new ArrayList<>(Arrays.asList(glmSc.getColTypes()));
List finalColFormats = new ArrayList<>(Arrays.asList(glmSc.getColFormats()));
List earlyStopColIndices = new ArrayList<>();
- List earlyStopColIndicesContrVals = getStoppingMetricIndices(stoppingMetric, earlyStopScControlVariables.getColHeaders());
+ List earlyStopColIndicesContrVals = getStoppingMetricIndices(stoppingMetric, earlyStopScRestricted.getColHeaders());
int colCounter = 0;
- String[] glmSCContrColTypes = glmScControlVariables.getColTypes();
- String[] glmSCContrColFormats = glmScControlVariables.getColFormats();
+ String[] glmSCContrColTypes = glmScRestricted.getColTypes();
+ String[] glmSCContrColFormats = glmScRestricted.getColFormats();
List glmScContrValsColIndices = new ArrayList<>();
- String[] glmScControlVariablesHeaders = glmScControlVariables.getColHeaders();
+ String[] glmScControlVariablesHeaders = glmScRestricted.getColHeaders();
for(int i=0; i < glmScControlVariablesHeaders.length; i++){
String colName = glmScControlVariablesHeaders[i];
String colNameLower = colName.toLowerCase();
@@ -233,7 +233,7 @@ public static TwoDimTable combineScoringHistoryControlVariables(TwoDimTable glmS
TwoDimTable res = new TwoDimTable("Scoring History", "",
rowHeaders, finalColHeaders.toArray(new String[tableSize]), finalColTypes.toArray(new String[tableSize]),
finalColFormats.toArray(new String[tableSize]), "");
- res = combineTableContentsControlVariables(glmSc, glmScControlVariables, earlyStopSc, earlyStopScControlVariables, res, glmScContrValsColIndices, earlyStopColIndices, earlyStopColIndicesContrVals, indexOfIter, earlyStopSCIterIndex,
+ res = combineTableContentsControlVariables(glmSc, glmScRestricted, earlyStopSc, earlyStopScRestricted, res, glmScContrValsColIndices, earlyStopColIndices, earlyStopColIndicesContrVals, indexOfIter, earlyStopSCIterIndex,
overlapSize);
return res;
}
diff --git a/h2o-algos/src/main/java/hex/schemas/GLMV3.java b/h2o-algos/src/main/java/hex/schemas/GLMV3.java
index 655431d0791e..30b7cc1893ad 100644
--- a/h2o-algos/src/main/java/hex/schemas/GLMV3.java
+++ b/h2o-algos/src/main/java/hex/schemas/GLMV3.java
@@ -74,6 +74,7 @@ public static final class GLMParametersV3 extends ModelParametersSchemaV3make("predsR"),new String[]{"predict"},new Vec[]{predsRVec});
- Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, train, "manualPredsR", null, true);
- Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, train, "manualPredsH2o", null, true);
+ Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, train, "manualPredsR", true);
+ Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, train, "manualPredsH2o", true);
Frame manualPredsControl = scoreManualWithCoefficients(coefficientsControl, train, "manualPredsControl", new int[]{0}, true);
Frame manualPredsRControl = scoreManualWithCoefficients(coefficientsR, train, "manualPredsR", new int[]{0}, true);
@@ -737,14 +737,26 @@ public void testBasicDataBinomial(){
}
private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName){
- return scoreManualWithCoefficients(coefficients, data, frameName, null, false);
+ return scoreManualWithCoefficients(coefficients, data, frameName, null, false, null);
+ }
+
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, boolean binomial){
+ return scoreManualWithCoefficients(coefficients, data, frameName, null, binomial, null);
}
private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdx){
- return scoreManualWithCoefficients(coefficients, data, frameName, controlVariablesIdx, false);
+ return scoreManualWithCoefficients(coefficients, data, frameName, controlVariablesIdx, false, null);
+ }
+
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdx, boolean binomial){
+ return scoreManualWithCoefficients(coefficients, data, frameName, controlVariablesIdx, binomial, null);
+ }
+
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, boolean binomial, Vec offset){
+ return scoreManualWithCoefficients(coefficients, data, frameName, null, binomial, offset);
}
- private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdxs, boolean binomial){
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdxs, boolean binomial, Vec offset){
Vec predictions = Vec.makeZero(data.numRows(), Vec.T_NUM);
for (long i = 0; i < data.numRows(); i++) {
double prediction = 0;
@@ -756,12 +768,604 @@ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, Str
}
}
prediction += coefficients[coefficients.length-1];
- if(binomial){
+ if (offset != null) prediction += offset.at(i);
+ if (binomial){
prediction = 1.0 / (Math.exp(-prediction) + 1.0);
}
predictions.set(i, prediction);
}
return new Frame(Key.make(frameName),new String[]{"predict"},new Vec[]{predictions});
}
-
+
+ @Test
+ public void compareModelWithOffsetEnabledAndDisabled() {
+ Frame train = null;
+ Frame test = null;
+ Frame preds = null;
+ GLMModel glm = null;
+ Frame preds2 = null;
+ GLMModel glm2 = null;
+ try {
+ Scope.enter();
+ train = parseTestFile("smalldata/glm_test/binomial_20_cols_10KRows.csv");
+ GLMModel.GLMParameters.Family family = GLMModel.GLMParameters.Family.binomial;
+ String responseColumn = "C21";
+
+ // set cat columns
+ int numCols = train.numCols();
+ int enumCols = (numCols - 1) / 2;
+ for (int cindex = 0; cindex < enumCols; cindex++) {
+ train.replace(cindex, train.vec(cindex).toCategoricalVec()).remove();
+ }
+ int response_index = numCols - 1;
+
+ train.replace((response_index), train.vec(response_index).toCategoricalVec()).remove();
+
+ DKV.put(train);
+ Scope.track_generic(train);
+
+ test = new Frame(train);
+ test.remove(responseColumn);
+
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters(family);
+ params._response_column = responseColumn;
+ params._train = train._key;
+ params._score_each_iteration = true;
+ params._offset_column = "C20";
+ params._remove_offset_effects = true;
+
+ // train model with remove offset effects enabled
+ glm = new GLM(params).trainModel().get();
+ Scope.track_generic(glm);
+
+ System.out.println("_________________________________");
+ System.out.println(glm);
+ System.out.println("______");
+
+ preds = glm.score(test);
+ Scope.track_generic(preds);
+
+ // train model with remove offset effects disabled
+ params._remove_offset_effects = false;
+
+ glm2 = new GLM(params).trainModel().get();
+ Scope.track_generic(glm2);
+
+ preds2 = glm2.score(test);
+ Scope.track_generic(preds2);
+
+ // check result training metrics are not the same
+ double delta = 10e-10;
+ assertNotEquals(glm.auc(), glm2.auc(), delta);
+ assertNotEquals(glm.mse(), glm2.mse(), delta);
+ //assertNotEquals(glm.logloss(), glm2.logloss(), delta);
+
+ double tMse = glm._output._training_metrics._MSE;
+ double tMse2 = glm2._output._training_metrics._MSE;
+ System.out.println(tMse+" "+tMse2);
+ assertNotEquals(tMse, tMse2, delta);
+
+ // check result training metrics unrestricted model and glm model with remove offset effects disabled are the same
+ assertEquals(glm2._output._training_metrics.auc_obj()._auc, glm._output._training_metrics_unrestricted_model.auc_obj()._auc, delta);
+ assertEquals(glm2._output._training_metrics.mse(), glm._output._training_metrics_unrestricted_model.mse(), delta);
+ assertEquals(glm2._output._training_metrics.rmse(), glm._output._training_metrics_unrestricted_model.rmse(), delta);
+
+ // check preds differ
+ int differ = 0;
+ int testRowNumber = 100;
+ double threshold = (2 * testRowNumber)/1.1;
+ for (int i = 0; i < testRowNumber; i++) {
+ if(preds.vec(1).at(i) != preds2.vec(1).at(i)) differ++;
+ if(preds.vec(2).at(i) != preds2.vec(2).at(i)) differ++;
+ }
+
+ assertTrue("Expected number of differing predictions to exceed threshold", differ > threshold);
+
+ System.out.println("Scoring history remove offset enabled");
+ TwoDimTable glmSH = glm._output._scoring_history;
+ System.out.println(glmSH);
+ System.out.println("Scoring history remove offset disabled");
+ TwoDimTable glm2SH = glm2._output._scoring_history;
+ System.out.println(glm2SH);
+ System.out.println("Scoring history remove offset enabled unrestricted model");
+ TwoDimTable glmSHROE = glm._output._scoring_history_unrestricted_model;
+ System.out.println(glmSHROE);
+ System.out.println("Scoring history remove offset disabled unrestricted model");
+ TwoDimTable glm2SHROE = glm2._output._scoring_history_unrestricted_model;
+ System.out.println(glm2SHROE);
+
+ // check scoring history is the same (instead of timestamp and duration column)
+ // change table header because it contains " unrestricted model"
+ glm2SH.setTableHeader(glmSHROE.getTableHeader());
+ assertTwoDimTableEquals(glmSHROE, glm2SH, new int[]{0,1});
+
+ // check unrestricted model scoring history is not null when remove offset effects feature is enabled
+ assertNotNull(glmSHROE);
+
+ // check unrestricted model scoring history is null when remove offset effects feature is disabled
+ assertNull(glm2SHROE);
+
+ //check variable importance
+ TwoDimTable vi = glm._output._variable_importances;
+ TwoDimTable vi_unrestricted = glm._output._variable_importances_unrestricted_model;
+ TwoDimTable vi_unrestristed_2 = glm2._output._variable_importances;
+
+ assertTrue(Arrays.equals(vi.getRowHeaders(), vi_unrestricted.getRowHeaders()));
+ assertTrue(Arrays.equals(vi_unrestricted.getRowHeaders(), vi_unrestristed_2.getRowHeaders()));
+
+ } finally {
+ if(train != null) train.remove();
+ if(test != null) test.remove();
+ if(preds != null) preds.remove();
+ if(glm != null) glm.remove();
+ if(preds2 != null) preds2.remove();
+ if(glm2 != null) glm2.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test
+ public void compareModelWithOffsetAndControlVariablesEnabledAndDisabled() {
+ Frame train = null;
+ Frame test = null;
+ Frame preds = null;
+ GLMModel glm = null;
+ Frame preds2 = null;
+ GLMModel glm2 = null;
+ try {
+ Scope.enter();
+ train = parseTestFile("smalldata/glm_test/binomial_20_cols_10KRows.csv");
+ GLMModel.GLMParameters.Family family = GLMModel.GLMParameters.Family.binomial;
+ String responseColumn = "C21";
+
+ // set cat columns
+ int numCols = train.numCols();
+ int enumCols = (numCols - 1) / 2;
+ for (int cindex = 0; cindex < enumCols; cindex++) {
+ train.replace(cindex, train.vec(cindex).toCategoricalVec()).remove();
+ }
+ int response_index = numCols - 1;
+
+ train.replace((response_index), train.vec(response_index).toCategoricalVec()).remove();
+
+ DKV.put(train);
+ Scope.track_generic(train);
+
+ test = new Frame(train);
+ test.remove(responseColumn);
+
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters(family);
+ params._response_column = responseColumn;
+ params._train = train._key;
+ params._score_each_iteration = true;
+ params._offset_column = "C20";
+ params._remove_offset_effects = true;
+ params._control_variables = new String[]{"C5"};
+
+ // train model with remove offset effects enabled
+ glm = new GLM(params).trainModel().get();
+ Scope.track_generic(glm);
+
+ System.out.println("_________________________________");
+ System.out.println(glm);
+ System.out.println("______");
+
+ preds = glm.score(test);
+ Scope.track_generic(preds);
+
+ // train model with remove offset effects and control variables disabled
+ params._remove_offset_effects = false;
+ params._control_variables = null;
+
+ glm2 = new GLM(params).trainModel().get();
+ Scope.track_generic(glm2);
+
+ preds2 = glm2.score(test);
+ Scope.track_generic(preds2);
+
+ // check result training metrics are not the same
+ double delta = 10e-10;
+ assertNotEquals(glm.auc(), glm2.auc(), delta);
+ assertNotEquals(glm.mse(), glm2.mse(), delta);
+ //assertNotEquals(glm.logloss(), glm2.logloss(), delta);
+
+ double tMse = glm._output._training_metrics._MSE;
+ double tMse2 = glm2._output._training_metrics._MSE;
+ System.out.println(tMse+" "+tMse2);
+ assertNotEquals(tMse, tMse2, delta);
+
+ // check result training metrics unrestricted model and glm model with remove offset effects disabled are the same
+ assertEquals(glm2._output._training_metrics.auc_obj()._auc, glm._output._training_metrics_unrestricted_model.auc_obj()._auc, delta);
+ assertEquals(glm2._output._training_metrics.mse(), glm._output._training_metrics_unrestricted_model.mse(), delta);
+ assertEquals(glm2._output._training_metrics.rmse(), glm._output._training_metrics_unrestricted_model.rmse(), delta);
+
+ // check preds differ
+ int differ = 0;
+ int testRowNumber = 100;
+ double threshold = (2 * testRowNumber)/1.1;
+ for (int i = 0; i < testRowNumber; i++) {
+ if(preds.vec(1).at(i) != preds2.vec(1).at(i)) differ++;
+ if(preds.vec(2).at(i) != preds2.vec(2).at(i)) differ++;
+ }
+ System.out.println(differ + " " + threshold);
+ assert differ > threshold;
+
+ System.out.println("Scoring history remove offset enabled");
+ TwoDimTable glmSH = glm._output._scoring_history;
+ System.out.println(glmSH);
+ System.out.println("Scoring history remove offset disabled");
+ TwoDimTable glm2SH = glm2._output._scoring_history;
+ System.out.println(glm2SH);
+ System.out.println("Scoring history remove offset enabled unrestricted model");
+ TwoDimTable glmSHCV = glm._output._scoring_history_unrestricted_model;
+ System.out.println(glmSHCV);
+ System.out.println("Scoring history remove offset disabled unrestricted model");
+ TwoDimTable glm2SHCV = glm2._output._scoring_history_unrestricted_model;
+ System.out.println(glm2SHCV);
+
+ // check scoring history is the same (instead of timestamp and duration column)
+ // change table header because it contains " unrestricted model"
+ glm2SH.setTableHeader(glmSHCV.getTableHeader());
+ assertTwoDimTableEquals(glmSHCV, glm2SH, new int[]{0,1});
+
+ // check unrestricted model scoring history is not null when control variables and remove offset effects are enabled
+ assertNotNull(glmSHCV);
+
+ // check unrestricted model scoring history is null when control variables and remove offset effects are disabled
+ assertNull(glm2SHCV);
+
+ //check variable importance
+ TwoDimTable vi = glm._output._variable_importances;
+ TwoDimTable vi_unrestricted = glm._output._variable_importances_unrestricted_model;
+ TwoDimTable vi_unrestristed_2 = glm2._output._variable_importances;
+
+ assertFalse(Arrays.equals(vi.getRowHeaders(), vi_unrestricted.getRowHeaders()));
+ assertTrue(Arrays.equals(vi_unrestricted.getRowHeaders(), vi_unrestristed_2.getRowHeaders()));
+ } finally {
+ if(train != null) train.remove();
+ if(test != null) test.remove();
+ if(preds != null) preds.remove();
+ if(glm != null) glm.remove();
+ if(preds2 != null) preds2.remove();
+ if(glm2 != null) glm2.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetEffectsMissingOffsetColumn() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"black","red"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"a","b"},Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,2,0,0},cat1.group().addVec());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "y"},new Vec[]{cat1, cat2,res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._distribution = DistributionFamily.bernoulli;
+ glm = new GLM(params).trainModel().get();
+
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
+
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetEffectsMultinomial() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"black","red"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new double[]{1,1,1,0,0}, cat1.group().addVec());
+ Vec res = Vec.makeVec(new double[]{1,1,2,0,0},cat1.group().addVec());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "y"},new Vec[]{cat1, cat2, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._offset_column = "x2";
+ params._distribution = DistributionFamily.multinomial;
+ glm = new GLM(params).trainModel().get();
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test
+ public void testBasicDataBinomialOffset(){
+ /** Test against GLM in R
+ * cat1 <- factor(c(1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0))
+ * cat2 <- factor(c(1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0))
+ * offset <- c(0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0)
+ * res <- factor(c(1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1))
+ * data <- data.frame(cat1, cat2, offset, res)
+ * glm <- glm(res ~ cat1 + cat2 + offset(offset), data=data, family = binomial)
+ * summary(glm)
+ * predict(glm)
+ *
+ * Call:
+ * glm(formula = res ~ cat1 + cat2 + offset(offset), family = binomial,
+ * data = data)
+ *
+ * Coefficients:
+ * Estimate Std. Error z value Pr(>|z|)
+ * (Intercept) -0.3310 0.7256 -0.456 0.648
+ * cat11 0.9780 0.8467 1.155 0.248
+ * cat21 0.2295 0.8586 0.267 0.789
+ *
+ * (Dispersion parameter for binomial family taken to be 1)
+ *
+ * Null deviance: 33.557 on 25 degrees of freedom
+ * Residual deviance: 32.173 on 23 degrees of freedom
+ * AIC: 38.173
+ *
+ * Number of Fisher Scoring iterations: 4
+ *
+ * 1 2 3 4 5 6
+ * 0.976506946 0.847045758 1.076506946 -0.130997049 -0.230997049 0.647045758
+ * 7 8 9 10 11 12
+ * 0.647045758 0.098464139 0.198464139 1.147045758 0.198464139 1.047045758
+ * 13 14 15 16 17 18
+ * 0.469002951 1.276506946 1.047045758 1.376506946 -0.330997049 -0.330997049
+ * 19 20 21 22 23 24
+ * 0.398464139 -0.001535861 0.647045758 0.647045758 0.976506946 0.647045758
+ * 25 26
+ * -0.001535861 -0.330997049
+ **/
+ Frame train = null;
+ GLMModel glm = null;
+ GLMModel glmOffset = null;
+ Frame preds = null;
+ Frame predsOffset = null;
+ Frame predsR = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"cat1", "cat2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._lambda = new double[]{0};
+ params._alpha = new double[]{0};
+ params._standardize = false;
+ params._non_negative = true;
+ params._intercept = true;
+ params._objective_epsilon = 1e-10;
+ params._gradient_epsilon = 1e-6;
+ params._response_column = "y";
+ params._distribution = DistributionFamily.bernoulli;
+ params._link = GLMModel.GLMParameters.Link.logit;
+ params._max_iterations = 4;
+ params._dispersion_epsilon = 1;
+ params._offset_column = "offset";
+ glm = new GLM(params).trainModel().get();
+ preds = glm.score(train);
+ System.out.println(preds.toTwoDimTable().toString());
+
+ System.out.println(glm._output._variable_importances);
+ System.out.println(glm.coefficients().toString());
+ Double[] coefficients = glm.coefficients().values().toArray(new Double[0]);
+
+ params._remove_offset_effects = true;
+ glmOffset = new GLM(params).trainModel().get();
+ predsOffset = glmOffset.score(train);
+ System.out.println(predsOffset.toTwoDimTable().toString());
+ Double[] coefficientsOffset = glmOffset.coefficients().values().toArray(new Double[0]);
+
+ Double[] coefficientsR = new Double[]{0.9780, 0.2295, -0.3310};
+ Vec predsRVec = Vec.makeVec(new double[]{0.976506946, 0.847045758, 1.076506946, -0.130997049, -0.230997049,
+ 0.647045758, 0.647045758, 0.098464139, 0.198464139, 1.147045758, 0.198464139, 1.047045758,
+ 0.469002951, 1.276506946, 1.047045758, 1.376506946, -0.330997049, -0.330997049, 0.398464139,
+ -0.001535861, 0.647045758, 0.647045758, 0.976506946, 0.647045758, -0.001535861, -0.330997049},
+ Vec.newKey());
+ predsR = new Frame(Key.make("predsR"),new String[]{"predict"},new Vec[]{predsRVec});
+
+ Frame trainWithoutOffset = train.deepCopy("trainWithoutOffset");
+ Vec offsetVec = trainWithoutOffset.remove("offset");
+ Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", true, offsetVec);
+ Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, trainWithoutOffset, "manualPredsH2o", true, offsetVec);
+ Frame manualPredsRemoveOffset = scoreManualWithCoefficients(coefficientsOffset, trainWithoutOffset, "manualPredsRemoveOffset", true);
+ Frame manualPredsRRemoveOffset = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", true);
+
+ double tol = 1e-3;
+ for (long i = 0; i < manualPredsH2o.numRows(); i++) {
+ double h2o = preds.vec(2).at(i);
+ double manualH2o = manualPredsH2o.vec(0).at(i);
+ // predict output from glm in R is not in logit
+ double r = (1.0 / (Math.exp(-predsR.vec(0).at(i)) + 1.0));
+ double manualR = manualPredsR.vec(0).at(i);
+ double h2oOffset = predsOffset.vec(2).at(i);
+ double manualH2oOffset = manualPredsRemoveOffset.vec(0).at(i);
+ double manualROffset = manualPredsRRemoveOffset.vec(0).at(i);
+
+ System.out.println(i+" h2o: "+h2o+ " h2o manual:" +manualH2o+
+ " R: "+r+" R manual: "+manualR +
+ " h2o remove offset: "+h2oOffset+" h2o remove offset manual "+manualH2oOffset+
+ " R remove offset manual: "+manualROffset);
+
+ // glm score calculation check
+ Assert.assertEquals(h2o, manualH2o, tol);
+ Assert.assertEquals(h2o, r, tol);
+ Assert.assertEquals(h2o, manualR, tol);
+
+ // offset calculation check
+ Assert.assertEquals(h2oOffset, manualH2oOffset, tol);
+ Assert.assertEquals(h2oOffset, manualROffset, tol);
+ }
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ if (glmOffset != null) glmOffset.remove();
+ if (preds != null) preds.remove();
+ if (predsOffset != null) predsOffset.remove();
+ if (predsR != null) predsR.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test
+ public void testBasicDataBinomialControlValuesAndOffset(){
+ /** Test against GLM in R
+ * cat1 <- factor(c(1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0))
+ * cat2 <- factor(c(1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0))
+ * offset <- c(0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0)
+ * res <- factor(c(1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1))
+ * data <- data.frame(cat1, cat2, offset, res)
+ * glm <- glm(res ~ cat1 + cat2 + offset(offset), data=data, family = binomial)
+ * summary(glm)
+ * predict(glm)
+ *
+ * Call:
+ * glm(formula = res ~ cat1 + cat2 + offset(offset), family = binomial,
+ * data = data)
+ *
+ * Coefficients:
+ * Estimate Std. Error z value Pr(>|z|)
+ * (Intercept) -0.3310 0.7256 -0.456 0.648
+ * cat11 0.9780 0.8467 1.155 0.248
+ * cat21 0.2295 0.8586 0.267 0.789
+ *
+ * (Dispersion parameter for binomial family taken to be 1)
+ *
+ * Null deviance: 33.557 on 25 degrees of freedom
+ * Residual deviance: 32.173 on 23 degrees of freedom
+ * AIC: 38.173
+ *
+ * Number of Fisher Scoring iterations: 4
+ *
+ * 1 2 3 4 5 6
+ * 0.976506946 0.847045758 1.076506946 -0.130997049 -0.230997049 0.647045758
+ * 7 8 9 10 11 12
+ * 0.647045758 0.098464139 0.198464139 1.147045758 0.198464139 1.047045758
+ * 13 14 15 16 17 18
+ * 0.469002951 1.276506946 1.047045758 1.376506946 -0.330997049 -0.330997049
+ * 19 20 21 22 23 24
+ * 0.398464139 -0.001535861 0.647045758 0.647045758 0.976506946 0.647045758
+ * 25 26
+ * -0.001535861 -0.330997049
+ **/
+ Frame train = null;
+ GLMModel glm = null;
+ GLMModel glmCVOffset = null;
+ Frame preds = null;
+ Frame predsCVOffset = null;
+ Frame predsR = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"cat1", "cat2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._lambda = new double[]{0};
+ params._alpha = new double[]{0};
+ params._standardize = false;
+ params._non_negative = true;
+ params._intercept = true;
+ params._objective_epsilon = 1e-10;
+ params._gradient_epsilon = 1e-6;
+ params._response_column = "y";
+ params._distribution = DistributionFamily.bernoulli;
+ params._link = GLMModel.GLMParameters.Link.logit;
+ params._max_iterations = 4;
+ params._dispersion_epsilon = 1;
+ params._offset_column = "offset";
+ glm = new GLM(params).trainModel().get();
+ preds = glm.score(train);
+ System.out.println(preds.toTwoDimTable().toString());
+
+ System.out.println(glm._output._variable_importances);
+ System.out.println(glm.coefficients().toString());
+ Double[] coefficients = glm.coefficients().values().toArray(new Double[0]);
+
+ params._control_variables = new String[]{"cat1"};
+ params._remove_offset_effects = true;
+
+ glmCVOffset = new GLM(params).trainModel().get();
+ predsCVOffset = glmCVOffset.score(train);
+ System.out.println(predsCVOffset.toTwoDimTable().toString());
+ Double[] coefficientsOffset = glmCVOffset.coefficients().values().toArray(new Double[0]);
+
+ Double[] coefficientsR = new Double[]{0.9780, 0.2295, -0.3310};
+ Vec predsRVec = Vec.makeVec(new double[]{0.976506946, 0.847045758, 1.076506946, -0.130997049, -0.230997049,
+ 0.647045758, 0.647045758, 0.098464139, 0.198464139, 1.147045758, 0.198464139, 1.047045758,
+ 0.469002951, 1.276506946, 1.047045758, 1.376506946, -0.330997049, -0.330997049, 0.398464139,
+ -0.001535861, 0.647045758, 0.647045758, 0.976506946, 0.647045758, -0.001535861, -0.330997049},
+ Vec.newKey());
+ predsR = new Frame(Key.make("predsR"),new String[]{"predict"},new Vec[]{predsRVec});
+
+ Frame trainWithoutOffset = train.deepCopy("trainWithoutOffset");
+ Vec offsetVec = trainWithoutOffset.remove("offset");
+ Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", true, offsetVec);
+ Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, trainWithoutOffset, "manualPredsH2o", true, offsetVec);
+ Frame manualPredsRemoveCVOffset = scoreManualWithCoefficients(coefficientsOffset, trainWithoutOffset, "manualPredsCVRemoveOffset", new int[]{0}, true);
+ Frame manualPredsRRemoveCVOffset = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", new int[]{0}, true);
+
+ double tol = 1e-3;
+ for (long i = 0; i < manualPredsH2o.numRows(); i++) {
+ double h2o = preds.vec(2).at(i);
+ double manualH2o = manualPredsH2o.vec(0).at(i);
+ // predict output from glm in R is not in logit
+ double r = (1.0 / (Math.exp(-predsR.vec(0).at(i)) + 1.0));
+ double manualR = manualPredsR.vec(0).at(i);
+ double h2oCVOffset = predsCVOffset.vec(2).at(i);
+ double manualH2oCVOffset = manualPredsRemoveCVOffset.vec(0).at(i);
+ double manualRCVOffset = manualPredsRRemoveCVOffset.vec(0).at(i);
+
+ System.out.println(i+" h2o: "+h2o+ " h2o manual:" +manualH2o+
+ " R: "+r+" R manual: "+manualR +
+ " h2o control and remove offset: "+h2oCVOffset+" h2o control variables and remove offset manual "+manualH2oCVOffset+
+ " R control variables and remove offset manual: "+manualRCVOffset);
+
+ // glm score calculation check
+ Assert.assertEquals(h2o, manualH2o, tol);
+ Assert.assertEquals(h2o, r, tol);
+ Assert.assertEquals(h2o, manualR, tol);
+
+ // offset calculation check
+ Assert.assertEquals(h2oCVOffset, manualH2oCVOffset, tol);
+ Assert.assertEquals(h2oCVOffset, manualRCVOffset, tol);
+ }
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ if (glmCVOffset != null) glmCVOffset.remove();
+ if (preds != null) preds.remove();
+ if (predsCVOffset != null) predsCVOffset.remove();
+ if (predsR != null) predsR.remove();
+ Scope.exit();
+ }
+ }
}
diff --git a/h2o-bindings/bin/custom/R/gen_glm.py b/h2o-bindings/bin/custom/R/gen_glm.py
index e682a89ea7c7..e80e9ea459c8 100644
--- a/h2o-bindings/bin/custom/R/gen_glm.py
+++ b/h2o-bindings/bin/custom/R/gen_glm.py
@@ -76,7 +76,7 @@ def update_param(name, param):
#' @param destination_key a string or a NULL
#' @export
h2o.make_unrestricted_glm_model <- function(model, destination_key = NULL) {
- stopifnot("GLM wasn't trained with control variables." = !is.null(model@params$actual[["control_variables"]]))
+ stopifnot("GLM wasn't trained with control variables or with remove offset effects." = !is.null(model@params$actual[["control_variables"]]) || isTRUE(model@params$actual[["remove_offset_effects"]]))
query <- list(method = "POST", .h2o.__GLMMakeUnrestrictedModel, model = model@model_id)
if (!missing(destination_key) && !is.null(destination_key)) {
query <- c(query, list(dest = destination_key))
diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java
index cacb20b7dc73..f5928eabab3e 100755
--- a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java
+++ b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java
@@ -186,7 +186,7 @@ public static class MetricBuilderBinomial> ex
// Passed a float[] sized nclasses+1; ds[0] must be a prediction. ds[1...nclasses-1] must be a class
// distribution;
@Override public double[] perRow(double ds[], float[] yact, Model m) {return perRow(ds, yact, 1, 0, m);}
- @Override public double[] perRow(double ds[], float[] yact, double w, double o, Model m) {
+ @Override public double[] perRow(double ds[], float[] yact, double w, double offset, Model m) {
if( Float .isNaN(yact[0]) ) return ds; // No errors if actual is missing
if(ArrayUtils.hasNaNs(ds)) return ds; // No errors if prediction has missing values (can happen for GLM)
if(w == 0 || Double.isNaN(w)) return ds;
diff --git a/h2o-docs/src/product/data-science/algo-params/control_variables.rst b/h2o-docs/src/product/data-science/algo-params/control_variables.rst
index 75222c3a5e22..b9870358c4ca 100644
--- a/h2o-docs/src/product/data-science/algo-params/control_variables.rst
+++ b/h2o-docs/src/product/data-science/algo-params/control_variables.rst
@@ -17,7 +17,7 @@ Common use cases include:
When control variables are specified, GLM will exclude them during scoring. Model metrics and scoring history are calculated for both the restricted model (with control variables excluded) and the unrestricted model (with control variables included).
-To get the unrestricted model with its own metrics use ``glm.make_unrestriced_glm_model()``/``h2o.make_unrestricted_glm_model(glm)``.
+To get the unrestricted model with its own metrics use ``glm.make_unrestricted_glm_model()`` / ``h2o.make_unrestricted_glm_model(glm)``.
The control variables' coefficients are set to zero in the variable importance table. Use the unrestricted model to get the variable importance table with all variables included.
@@ -33,7 +33,7 @@ The control variables' coefficients are set to zero in the variable importance t
Related Parameters
~~~~~~~~~~~~~~~~~~
-- None
+- `remove_offset_effects `__
Example
~~~~~~~
diff --git a/h2o-docs/src/product/data-science/algo-params/remove_offset_effects.rst b/h2o-docs/src/product/data-science/algo-params/remove_offset_effects.rst
new file mode 100644
index 000000000000..4ce196f41e24
--- /dev/null
+++ b/h2o-docs/src/product/data-science/algo-params/remove_offset_effects.rst
@@ -0,0 +1,122 @@
+``remove_offset_effects``
+-------------------------
+
+- Available in: GLM
+- Hyperparameter: no
+
+Description
+~~~~~~~~~~~
+
+This feature allows you to remove offset effects during scoring and metric calculation.
+
+Model metrics and scoring history are calculated for both the restricted model (with offset effects removed) and the unrestricted model (with offset effect included).
+
+To get the unrestricted model with its own metrics use ``glm.make_unrestricted_glm_model()`` / ``h2o.make_unrestricted_glm_model(glm)``.
+
+
+**Notes**:
+
+- This option is experimental.
+- This option is applicable only for regression and binomial distribution.
+- This option is not available when cross validation is enabled.
+- This option is not available when Lambda search is enabled.
+- This option is not available when interactions are enabled.
+
+Related Parameters
+~~~~~~~~~~~~~~~~~~
+
+- `control_variables `__
+
+Example
+~~~~~~~
+
+.. tabs::
+ .. code-tab:: r R
+
+ library(h2o)
+ h2o.init()
+ # import the airlines dataset:
+ # This dataset is used to classify whether a flight will be delayed 'YES' or not "NO"
+ # original data can be found at http://www.transtats.bts.gov/
+ airlines <- h2o.importFile("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
+
+ # convert columns to factors
+ airlines["Year"] <- as.factor(airlines["Year"])
+ airlines["Month"] <- as.factor(airlines["Month"])
+ airlines["DayOfWeek"] <- as.factor(airlines["DayOfWeek"])
+ airlines["Cancelled"] <- as.factor(airlines["Cancelled"])
+ airlines['FlightNum'] <- as.factor(airlines['FlightNum'])
+
+ # set the predictor names and the response column name
+ predictors <- c("Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum")
+ response <- "IsDepDelayed"
+
+ # split into train and validation
+ airlines_splits <- h2o.splitFrame(data = airlines, ratios = 0.8)
+ train <- airlines_splits[[1]]
+ valid <- airlines_splits[[2]]
+
+ # try using the `remove_offset_effects` parameter:
+ airlines_glm <- h2o.glm(family = 'binomial', x = predictors, y = response, training_frame = train,
+ validation_frame = valid,
+ remove_collinear_columns = FALSE,
+ score_each_iteration = TRUE,
+ generate_scoring_history = TRUE,
+ offset_column = "Distance",
+ remove_offset_effects = TRUE)
+
+ # print the AUC for the validation data
+ print(h2o.auc(airlines_glm, valid = TRUE))
+
+ # take a look at the learning curve
+ h2o.learning_curve_plot(airlines_glm)
+
+ # get the unrestricted GLM model
+ unrestricted_airlines_glm <- h2o.make_unrestricted_glm_model(airlines_glm)
+
+
+ .. code-tab:: python
+
+ import h2o
+ from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+ h2o.init()
+
+ # import the airlines dataset:
+ # This dataset is used to classify whether a flight will be delayed 'YES' or not "NO"
+ # original data can be found at http://www.transtats.bts.gov/
+ airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
+
+ # convert columns to factors
+ airlines["Year"]= airlines["Year"].asfactor()
+ airlines["Month"]= airlines["Month"].asfactor()
+ airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
+ airlines["Cancelled"] = airlines["Cancelled"].asfactor()
+ airlines['FlightNum'] = airlines['FlightNum'].asfactor()
+
+ # set the predictor names and the response column name
+ predictors = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]
+ response = "IsDepDelayed"
+
+ # split into train and validation sets
+ train, valid= airlines.split_frame(ratios = [.8])
+
+ # try using the `remove_offset_effects` parameter:
+ # initialize your estimator
+ airlines_glm = H2OGeneralizedLinearEstimator(family = 'binomial',
+ remove_collinear_columns = True,
+ score_each_iteration = True,
+ generate_scoring_history = True,
+ offset_column = "Distance",
+ remove_offset_effects = True)
+
+ # then train your model
+ airlines_glm.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
+
+ # print the auc for the validation data
+ print(airlines_glm.auc(valid=True))
+
+ # take a look at the learning curve
+ airlines_glm.learning_curve_plot()
+
+ # get the unrestricted GLM model
+ unrestricted_airlines_glm = airlines_glm.make_unrestricted_glm_model()
diff --git a/h2o-py/h2o/estimators/glm.py b/h2o-py/h2o/estimators/glm.py
index 4d09e146c6ba..3e9569488e10 100644
--- a/h2o-py/h2o/estimators/glm.py
+++ b/h2o-py/h2o/estimators/glm.py
@@ -95,6 +95,7 @@ def __init__(self,
stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
stopping_tolerance=0.001, # type: float
control_variables=None, # type: Optional[List[str]]
+ remove_offset_effects=False, # type: bool
balance_classes=False, # type: bool
class_sampling_factors=None, # type: Optional[List[float]]
max_after_balance_size=5.0, # type: float
@@ -341,6 +342,9 @@ def __init__(self,
Experimental.
Defaults to ``None``.
:type control_variables: List[str], optional
+ :param remove_offset_effects: Remove offset effects from scoring and metric calculation. Experimental.
+ Defaults to ``False``.
+ :type remove_offset_effects: bool
:param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
Defaults to ``False``.
:type balance_classes: bool
@@ -504,6 +508,7 @@ def __init__(self,
self.stopping_metric = stopping_metric
self.stopping_tolerance = stopping_tolerance
self.control_variables = control_variables
+ self.remove_offset_effects = remove_offset_effects
self.balance_classes = balance_classes
self.class_sampling_factors = class_sampling_factors
self.max_after_balance_size = max_after_balance_size
@@ -2056,6 +2061,20 @@ def control_variables(self, control_variables):
assert_is_type(control_variables, None, [str])
self._parms["control_variables"] = control_variables
+ @property
+ def remove_offset_effects(self):
+ """
+ Remove offset effects from scoring and metric calculation. Experimental.
+
+ Type: ``bool``, defaults to ``False``.
+ """
+ return self._parms.get("remove_offset_effects")
+
+ @remove_offset_effects.setter
+ def remove_offset_effects(self, remove_offset_effects):
+ assert_is_type(remove_offset_effects, None, bool)
+ self._parms["remove_offset_effects"] = remove_offset_effects
+
@property
def balance_classes(self):
"""
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects.py b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects.py
new file mode 100644
index 000000000000..83439ff7ccdf
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects.py
@@ -0,0 +1,61 @@
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+
+
+def glm_remove_offset_effects():
+
+ cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+ cars = cars[cars["economy_20mpg"].isna() == 0]
+ cars["name"] = cars["name"].asfactor()
+ cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
+ offset = h2o.H2OFrame([[.5]]*398)
+ offset.set_names(["offset"])
+ cars = cars.cbind(offset)
+
+ glm_model = H2OGeneralizedLinearEstimator(family="binomial")
+ glm_model.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+
+ predictions_train = glm_model.predict(cars).as_data_frame()
+ print(glm_model._model_json["output"]["scoring_history"])
+
+ glm_model_2 = H2OGeneralizedLinearEstimator(family="binomial", generate_scoring_history=True)
+ glm_model_2.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+
+ predictions_train_2 = glm_model_2.predict(cars).as_data_frame()
+ print(glm_model_2._model_json["output"]["scoring_history"])
+
+ glm_model_roe = H2OGeneralizedLinearEstimator(family="binomial", offset_column="offset", remove_offset_effects=True)
+ glm_model_roe.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+
+ predictions_train_roe = glm_model_roe.predict(cars).as_data_frame()
+ print(glm_model_roe._model_json["output"]["scoring_history"])
+
+ glm_model_roe_2 = H2OGeneralizedLinearEstimator(family="binomial", offset_column="offset", remove_offset_effects=True,
+ generate_scoring_history=True)
+ glm_model_roe_2.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+ predictions_train_roe2 = glm_model_roe_2.predict(cars).as_data_frame()
+ print(glm_model_roe_2._model_json["output"]["scoring_history"])
+
+ # check model metrics are not the same
+ try:
+ pyunit_utils.check_model_metrics(glm_model, glm_model_roe, "")
+ except AssertionError as err:
+ assert "Scoring history is not the same" in str(err)
+
+ # check predictions are different
+ for i in range(predictions_train.shape[0]):
+ pyunit_utils.assert_not_equal(predictions_train.iloc[i, 1], predictions_train_roe.iloc[i, 1], f"Predictions at position {i} should differ but they don't!")
+
+ # check predictions are the same with and without generate_scoring_history
+ for i in range(predictions_train.shape[0]):
+ pyunit_utils.assert_equals(predictions_train.iloc[i, 1], predictions_train_2.iloc[i, 1], f"Predictions at position {i} should not differ but they do!")
+ pyunit_utils.assert_equals(predictions_train_roe.iloc[i, 1], predictions_train_roe2.iloc[i, 1], f"Predictions at position {i} should not differ but they do!")
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(glm_remove_offset_effects)
+else:
+ glm_remove_offset_effects()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_glm.py b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_glm.py
new file mode 100644
index 000000000000..eef2310e97ba
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_glm.py
@@ -0,0 +1,47 @@
+from builtins import range
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+
+
+def remove_offset_glm():
+
+ cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+ cars = cars[cars["economy_20mpg"].isna() == 0]
+ cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
+
+ offset_col = "offset"
+ offset = h2o.H2OFrame([[.5]]*398)
+ offset.set_names([offset_col])
+ cars = cars.cbind(offset)
+
+ # offset_column passed in the train method
+ glm_model = H2OGeneralizedLinearEstimator(family="binomial")
+ glm_model.train(x=list(range(2,8)),y="economy_20mpg", training_frame=cars, offset_column=offset_col)
+
+ # predict with offset
+ predictions_train = glm_model.predict(cars).as_data_frame()
+ print(predictions_train)
+
+ # metrics with offset
+ perf = glm_model.model_performance(cars)
+ print(perf)
+
+ # set offset column to zero to remove its effect
+ cars[offset_col] = 0
+
+ # predict with offset effect removed
+ predictions_train_remove_offset = glm_model.predict(cars).as_data_frame()
+ print(predictions_train_remove_offset)
+
+ # metrics with offset effect removed
+ perf = glm_model.model_performance(cars)
+ print(perf)
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(remove_offset_glm)
+else:
+ remove_offset_glm()
diff --git a/h2o-r/h2o-package/R/glm.R b/h2o-r/h2o-package/R/glm.R
index 38796550d9b9..1e076839c09f 100644
--- a/h2o-r/h2o-package/R/glm.R
+++ b/h2o-r/h2o-package/R/glm.R
@@ -116,6 +116,7 @@
#' @param stopping_tolerance Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this
#' much) Defaults to 0.001.
#' @param control_variables A list of predictor column indices which is used for training but removed for scoring. Experimental.
+#' @param remove_offset_effects \code{Logical}. Remove offset effects from scoring and metric calculation. Experimental. Defaults to FALSE.
#' @param balance_classes \code{Logical}. Balance training data class counts via over/under-sampling (for imbalanced data). Defaults to
#' FALSE.
#' @param class_sampling_factors Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
@@ -267,6 +268,7 @@ h2o.glm <- function(x,
stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"),
stopping_tolerance = 0.001,
control_variables = NULL,
+ remove_offset_effects = FALSE,
balance_classes = FALSE,
class_sampling_factors = NULL,
max_after_balance_size = 5.0,
@@ -429,6 +431,8 @@ h2o.glm <- function(x,
parms$stopping_tolerance <- stopping_tolerance
if (!missing(control_variables))
parms$control_variables <- control_variables
+ if (!missing(remove_offset_effects))
+ parms$remove_offset_effects <- remove_offset_effects
if (!missing(balance_classes))
parms$balance_classes <- balance_classes
if (!missing(class_sampling_factors))
@@ -563,6 +567,7 @@ h2o.glm <- function(x,
stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"),
stopping_tolerance = 0.001,
control_variables = NULL,
+ remove_offset_effects = FALSE,
balance_classes = FALSE,
class_sampling_factors = NULL,
max_after_balance_size = 5.0,
@@ -730,6 +735,8 @@ h2o.glm <- function(x,
parms$stopping_tolerance <- stopping_tolerance
if (!missing(control_variables))
parms$control_variables <- control_variables
+ if (!missing(remove_offset_effects))
+ parms$remove_offset_effects <- remove_offset_effects
if (!missing(balance_classes))
parms$balance_classes <- balance_classes
if (!missing(class_sampling_factors))
@@ -832,7 +839,7 @@ h2o.makeGLMModel <- function(model,beta) {
#' @param destination_key a string or a NULL
#' @export
h2o.make_unrestricted_glm_model <- function(model, destination_key = NULL) {
- stopifnot("GLM wasn't trained with control variables." = !is.null(model@params$actual[["control_variables"]]))
+ stopifnot("GLM wasn't trained with control variables or with remove offset effects." = !is.null(model@params$actual[["control_variables"]]) || isTRUE(model@params$actual[["remove_offset_effects"]]))
query <- list(method = "POST", .h2o.__GLMMakeUnrestrictedModel, model = model@model_id)
if (!missing(destination_key) && !is.null(destination_key)) {
query <- c(query, list(dest = destination_key))
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_remove_offset_effects_explain.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_remove_offset_effects_explain.R
new file mode 100644
index 000000000000..367ddeee19f8
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_remove_offset_effects_explain.R
@@ -0,0 +1,40 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+
+
+glm_remove_offset_effects_explain <- function() {
+ df <- h2o.importFile("https://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")
+ df$CAPSULE <- as.factor(df$CAPSULE)
+ df$RACE <- as.factor(df$RACE)
+ df$DCAPS <- as.factor(df$DCAPS)
+ df$DPROS <- as.factor(df$DPROS)
+
+ response <- "CAPSULE"
+
+ prostate_glm <- h2o.glm(family = "binomial",
+ y = response,
+ training_frame = df,
+ generate_scoring_history = T,
+ score_each_iteration = T,
+ offset_column = "AGE",
+ remove_offset_effects = T
+ )
+
+ summary(prostate_glm)
+
+ # test make unrestricted model
+ unrestricted_prostate_glm <- h2o.make_unrestricted_glm_model(prostate_glm)
+ expect_false(is.null(unrestricted_prostate_glm))
+ summary(unrestricted_prostate_glm)
+
+ # should pass
+ h2o.learning_curve_plot(prostate_glm)
+ h2o.learning_curve_plot(unrestricted_prostate_glm)
+
+ # should pass
+ h2o.explain(prostate_glm, df)
+ h2o.explain(unrestricted_prostate_glm, df)
+}
+
+doTest("GLM: Remove offset effects works with explain", glm_remove_offset_effects_explain)