diff --git a/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java b/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java
index 495c3e6c6552..e5ed544900b8 100644
--- a/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java
+++ b/h2o-algos/src/main/java/hex/api/MakeGLMModelHandler.java
@@ -56,15 +56,29 @@ public GLMModelV3 make_model(int version, MakeGLMModelV3 args){
public GLMModelV3 make_unrestricted_model(int version, MakeUnrestrictedGLMModelV3 args){
GLMModel model = DKV.getGet(args.model.key());
- if(model == null)
+ if (model == null)
throw new IllegalArgumentException("Missing source model " + args.model);
- if(model._parms._control_variables == null){
- throw new IllegalArgumentException("Source model is not trained with control variables.");
+ if (model._parms._control_variables == null && !model._parms._remove_offset_effects) {
+ throw new IllegalArgumentException("Source model is not trained with control variables or remove offset effects.");
}
- Key generatedKey = Key.make(model._key.toString()+"_unrestricted_model");
+ Key generatedKey;
+ if (args.control_variables_enabled && args.remove_offset_effects_enabled) {
+ throw new IllegalArgumentException("The control_variables_enabled and remove_offset_effects_enabled feature " +
+ "cannot be used together. It produces the same model as the main model.");
+ } else if((args.control_variables_enabled || args.remove_offset_effects_enabled) &&
+ (model._parms._control_variables == null || !model._parms._remove_offset_effects)) {
+ throw new IllegalArgumentException("You can set control_variables_enabled to true or " +
+ "remove_offset_effects_enabled to true only if control_variables and remove_offset_effects are both set.");
+ } else if (args.remove_offset_effects_enabled) {
+ generatedKey = Key.make(model._key.toString() + "_remove_offset_effects_enabled");
+ } else if (args.control_variables_enabled) {
+ generatedKey = Key.make(model._key.toString() + "_control_variables_enabled");
+ } else {
+ generatedKey = Key.make(model._key.toString()+"_unrestricted_model");
+ }
Key key = args.dest != null ? Key.make(args.dest) : generatedKey;
- GLMModel modelContrVars = DKV.getGet(key);
- if(modelContrVars != null) {
+ GLMModel modelUnrestricted = DKV.getGet(key);
+ if (modelUnrestricted != null) {
throw new IllegalArgumentException("Model with "+key+" already exists.");
}
GLMModel.GLMParameters parms = (GLMModel.GLMParameters) model._parms.clone();
@@ -72,22 +86,58 @@ public GLMModelV3 make_unrestricted_model(int version, MakeUnrestrictedGLMModelV
GLMModel m = new GLMModel(key, parms,null, model._ymu,
Double.NaN, Double.NaN, -1);
m.setInputParms(inputParms);
- m._input_parms._control_variables = null;
- m._parms._control_variables = null;
+ if (args.control_variables_enabled){
+ m._input_parms._control_variables = model._parms._control_variables;
+ m._parms._control_variables = model._parms._control_variables;
+ m._input_parms._remove_offset_effects = false;
+ m._parms._remove_offset_effects = false;
+ } else if(args.remove_offset_effects_enabled){
+ m._input_parms._remove_offset_effects = true;
+ m._parms._remove_offset_effects = true;
+ m._input_parms._control_variables = null;
+ m._parms._control_variables = null;
+ } else {
+ m._input_parms._control_variables = null;
+ m._parms._control_variables = null;
+ m._input_parms._remove_offset_effects = false;
+ m._parms._remove_offset_effects = false;
+ }
DataInfo dinfo = model.dinfo();
dinfo.setPredictorTransform(TransformType.NONE);
m._output = new GLMOutput(model.dinfo(), model._output._names, model._output._column_types, model._output._domains,
model._output.coefficientNames(), model._output.beta(), model._output._binomial, model._output._multinomial,
model._output._ordinal, null);
- ModelMetrics mt = model._output._training_metrics_unrestricted_model;
- ModelMetrics mv = model._output._validation_metrics_unrestricted_model;
- m._output._training_metrics = mt;
- m._output._validation_metrics = mv;
- m._output._scoring_history = model._output._scoring_history_unrestricted_model;
+ if (args.control_variables_enabled) {
+ ModelMetrics mt = model._output._training_metrics_restricted_model_cv;
+ ModelMetrics mv = model._output._validation_metrics_restricted_model_cv;
+ m._output._training_metrics = mt;
+ m._output._validation_metrics = mv;
+ m._output._scoring_history = model._output._scoring_history_restricted_model_cv;
+ m.resetThreshold(model.defaultThreshold());
+ m._output._variable_importances = model._output._variable_importances;
+ m._output.setAndMapControlVariablesNames(model._parms._control_variables);
+ } else if (args.remove_offset_effects_enabled) {
+ ModelMetrics mt = model._output._training_metrics_restricted_model_ro;
+ ModelMetrics mv = model._output._validation_metrics_restricted_model_ro;
+ m._output._training_metrics = mt;
+ m._output._validation_metrics = mv;
+ m._output._scoring_history = model._output._scoring_history_restricted_model_ro;
+ m.resetThreshold(model.defaultThreshold());
+ m._output._variable_importances = model._output._variable_importances_unrestricted_model;
+ } else {
+ ModelMetrics mt = model._output._training_metrics_unrestricted_model;
+ ModelMetrics mv = model._output._validation_metrics_unrestricted_model;
+ m._output._training_metrics = mt;
+ m._output._validation_metrics = mv;
+ m._output._scoring_history = model._output._scoring_history_unrestricted_model;
+ m.resetThreshold(model.defaultThreshold());
+ m._output._variable_importances = model._output._variable_importances_unrestricted_model;
+ }
m._output._model_summary = model._output._model_summary;
- m.resetThreshold(model.defaultThreshold());
- m._output._variable_importances = model._output._variable_importances_unrestricted_model;
m._key = key;
+ // setting these flags is important for correct scoring
+ m._useControlVariables = args.control_variables_enabled;
+ m._useRemoveOffsetEffects = args.remove_offset_effects_enabled;
DKV.put(key, m);
GLMModelV3 res = new GLMModelV3();
diff --git a/h2o-algos/src/main/java/hex/glm/GLM.java b/h2o-algos/src/main/java/hex/glm/GLM.java
index 06158a7601aa..2bb52a0d7b62 100644
--- a/h2o-algos/src/main/java/hex/glm/GLM.java
+++ b/h2o-algos/src/main/java/hex/glm/GLM.java
@@ -759,6 +759,8 @@ void restoreFromCheckpoint(TwoDimTable sHist, int[] colIndices) {
private transient ScoringHistory _scoringHistory;
private transient ScoringHistory _scoringHistoryUnrestrictedModel;
+ private transient ScoringHistory _scoringHistoryRemoveOffsetEnabled;
+ private transient ScoringHistory _scoringHistoryControlValEnabled;
private transient LambdaSearchScoringHistory _lambdaSearchScoringHistory;
long _t0 = System.currentTimeMillis();
@@ -950,6 +952,12 @@ public void init(boolean expensive) {
_parms._generate_scoring_history);
_scoringHistoryUnrestrictedModel = new ScoringHistory(_parms._valid != null,_parms._nfolds > 1,
_parms._generate_scoring_history);
+ if(_parms._control_variables != null && _parms._remove_offset_effects){
+ _scoringHistoryControlValEnabled = new ScoringHistory(_parms._valid != null,_parms._nfolds > 1,
+ _parms._generate_scoring_history);
+ _scoringHistoryRemoveOffsetEnabled = new ScoringHistory(_parms._valid != null,_parms._nfolds > 1,
+ _parms._generate_scoring_history);
+ }
_train.bulkRollups(); // make sure we have all the rollups computed in parallel
_t0 = System.currentTimeMillis();
if ((_parms._lambda_search || !_parms._intercept || _parms._lambda == null || _parms._lambda[0] > 0))
@@ -1430,9 +1438,9 @@ private void restoreScoringHistoryFromCheckpoint() {
else {
_scoringHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex);
}
- if (_model._parms._control_variables != null) {
- TwoDimTable scoringHistoryControlVal = _model._output._scoring_history_unrestricted_model;
- _scoringHistoryUnrestrictedModel.restoreFromCheckpoint(scoringHistoryControlVal, colHeadersIndex);
+ if (_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
+ TwoDimTable scoringHistoryUnrestricted = _model._output._scoring_history_unrestricted_model;
+ _scoringHistoryUnrestrictedModel.restoreFromCheckpoint(scoringHistoryUnrestricted, colHeadersIndex);
}
}
@@ -3384,76 +3392,176 @@ private void scoreAndUpdateModel() {
Frame train = DKV.getGet(_parms._train); // need to keep this frame to get scoring metrics back
_model.score(_parms.train(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
scorePostProcessing(train, t1);
- if (_model._parms._control_variables != null){
+ if (_model._parms._control_variables != null || _model._parms._remove_offset_effects){
try {
- _model._useControlVariables = true;
+ _model._useControlVariables = _model._parms._control_variables != null;
+ _model._useRemoveOffsetEffects = _model._parms._remove_offset_effects;
long t2 = System.currentTimeMillis();
_model.score(train, null, CFuncRef.from(_parms._custom_metric_func)).delete();
- scorePostProcessingControlVal(train, t2);
+ scorePostProcessingRestrictedModel(train, t2);
+ if (_model._parms._control_variables != null && _model._parms._remove_offset_effects) {
+ _model._useControlVariables = true;
+ _model._useRemoveOffsetEffects = false;
+ t2 = System.currentTimeMillis();
+ _model.score(train, null, CFuncRef.from(_parms._custom_metric_func)).delete();
+ scorePostProcessingRestrictedModelCVEnabled(train, t2);
+ _model._useControlVariables = false;
+ _model._useRemoveOffsetEffects = true;
+ t2 = System.currentTimeMillis();
+ _model.score(train, null, CFuncRef.from(_parms._custom_metric_func)).delete();
+ scorePostProcessingRestrictedModelROEnabled(train, t2);
+ }
} finally {
_model._useControlVariables = false;
+ _model._useRemoveOffsetEffects = false;
}
}
+ _lastScore = System.currentTimeMillis();
+ long scoringTime = System.currentTimeMillis() - t1;
+ _scoringInterval = Math.max(_scoringInterval, 20 * scoringTime); // at most 5% overhead for scoring
+ _model.update(_job._key);
+ _model.generateSummary(_parms._train, _state._iter);
}
- private void scorePostProcessingControlVal(Frame train, long t1) {
+ private void scorePostProcessingRestrictedModel(Frame train, long t1) {
ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
long t2 = System.currentTimeMillis();
if (mtrain != null) {
_model._output._training_metrics = mtrain;
- _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
+ _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
ScoreKeeper trainScore = new ScoreKeeper(Double.NaN);
trainScore.fillFrom(mtrain);
Log.info(LogMsg(mtrain.toString()));
} else {
Log.info(LogMsg("ModelMetrics mtrain is null"));
}
- Log.info(LogMsg("Control values training metrics computed in " + (t2 - t1) + "ms"));
+ Log.info(LogMsg("Restricted model training metrics computed in " + (t2 - t1) + "ms"));
if (_valid != null) {
Frame valid = DKV.getGet(_parms._valid);
- try {
- _model._useControlVariables = true;
- _model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
- } finally {
- _model._useControlVariables = true;
- }
+ _model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
_model._output._validation_metrics = ModelMetrics.getFromDKV(_model, valid); //updated by model.scoreAndUpdateModel
ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
validScore.fillFrom(_model._output._validation_metrics);
}
_model.addScoringInfo(_parms, nclasses(), t2, _state._iter); // add to scoringInfo for early stopping
+ if (_parms._generate_scoring_history) { // update scoring history with deviance train and valid if available
+ if(_model._useControlVariables) {
+ double[] betaContrVal = _model._output.getControlValBeta(_state.expandBeta(_state.beta()).clone());
+ GLMResDevTask task = new GLMResDevTask(_job._key, _dinfo, _parms, betaContrVal).doAll(_dinfo._adaptedFrame);
+ double objectiveControlVal = _state.objective(betaContrVal, task._likelihood);
+
+ if ((mtrain != null) && (_valid != null)) {
+ _scoringHistory.addIterationScore(true, true, _state._iter, task._likelihood,
+ objectiveControlVal, _state.deviance(task._likelihood), ((GLMMetrics) _model._output._validation_metrics).residual_deviance(),
+ mtrain._nobs, _model._output._validation_metrics._nobs, _state.lambda(), _state.alpha());
+ } else { // only doing training deviance
+ _scoringHistory.addIterationScore(true, false, _state._iter, task._likelihood,
+ objectiveControlVal, _state.deviance(task._likelihood), Double.NaN, mtrain._nobs, 1, _state.lambda(),
+ _state.alpha());
+ }
+ } else if (_model._useRemoveOffsetEffects) {
+ if ((mtrain != null) && (_valid != null)) {
+ _scoringHistory.addIterationScore(true, true, _state._iter, _state.likelihood(),
+ _state.objective(), _state.deviance(), ((GLMMetrics) _model._output._validation_metrics).residual_deviance(),
+ mtrain._nobs, _model._output._validation_metrics._nobs, _state.lambda(), _state.alpha());
+ } else { // only doing training deviance
+ _scoringHistory.addIterationScore(true, false, _state._iter, _state.likelihood(),
+ _state.objective(), _state.deviance(), Double.NaN, mtrain._nobs, 1, _state.lambda(),
+ _state.alpha());
+ }
+ }
+ }
+ _model._output._scoring_history = _scoringHistory != null ? _scoringHistory.to2dTable(_parms, null, null) : null;
+ }
+
+ private void scorePostProcessingRestrictedModelCVEnabled(Frame train, long t1) {
+ ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
+ long t2 = System.currentTimeMillis();
+ if (mtrain != null) {
+ _model._output._training_metrics_restricted_model_cv = mtrain;
+ _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
+ ScoreKeeper trainScore = new ScoreKeeper(Double.NaN);
+ trainScore.fillFrom(mtrain);
+ Log.info(LogMsg(mtrain.toString()));
+ } else {
+ Log.info(LogMsg("ModelMetrics mtrain is null"));
+ }
+ Log.info(LogMsg("Restricted model where control variables feature is enabled training metrics computed in " + (t2 - t1) + "ms"));
+ if (_valid != null) {
+ Frame valid = DKV.getGet(_parms._valid);
+ _model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
+ _model._output._validation_metrics_restricted_model_cv = ModelMetrics.getFromDKV(_model, valid); //updated by model.scoreAndUpdateModel
+ ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
+ validScore.fillFrom(_model._output._validation_metrics_restricted_model_cv);
+ }
+ _model.addRestrictedModelScoringInfoCV(_parms, nclasses(), t2, _state._iter); // add to scoringInfo for early stopping
+
if (_parms._generate_scoring_history) { // update scoring history with deviance train and valid if available
double[] betaContrVal = _model._output.getControlValBeta(_state.expandBeta(_state.beta()).clone());
GLMResDevTask task = new GLMResDevTask(_job._key, _dinfo, _parms, betaContrVal).doAll(_dinfo._adaptedFrame);
double objectiveControlVal = _state.objective(betaContrVal, task._likelihood);
-
+
if ((mtrain != null) && (_valid != null)) {
- _scoringHistory.addIterationScore(true, true, _state._iter, task._likelihood,
- objectiveControlVal, _state.deviance(task._likelihood), ((GLMMetrics) _model._output._validation_metrics).residual_deviance(),
- mtrain._nobs, _model._output._validation_metrics._nobs, _state.lambda(), _state.alpha());
+ _scoringHistoryControlValEnabled.addIterationScore(true, true, _state._iter, task._likelihood,
+ objectiveControlVal, _state.deviance(task._likelihood), ((GLMMetrics) _model._output._validation_metrics_restricted_model_cv).residual_deviance(),
+ mtrain._nobs, _model._output._validation_metrics_restricted_model_cv._nobs, _state.lambda(), _state.alpha());
} else { // only doing training deviance
- _scoringHistory.addIterationScore(true, false, _state._iter, task._likelihood,
+ _scoringHistoryControlValEnabled.addIterationScore(true, false, _state._iter, task._likelihood,
objectiveControlVal, _state.deviance(task._likelihood), Double.NaN, mtrain._nobs, 1, _state.lambda(),
_state.alpha());
+
}
- _job.update(_workPerIteration, _state.toString());
+ _model._output._scoring_history_restricted_model_cv = _scoringHistoryControlValEnabled != null ? _scoringHistoryControlValEnabled.to2dTable(_parms, null, null) : null;
}
- _model._output._scoring_history = _scoringHistory != null ? _scoringHistory.to2dTable(_parms, null, null) : null;
- _model.update(_job._key);
+ }
+
+ private void scorePostProcessingRestrictedModelROEnabled(Frame train, long t1) {
+ ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
+ long t2 = System.currentTimeMillis();
+ if (mtrain != null) {
+ _model._output._training_metrics_restricted_model_ro = mtrain;
+ _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
+ ScoreKeeper trainScore = new ScoreKeeper(Double.NaN);
+ trainScore.fillFrom(mtrain);
+ Log.info(LogMsg(mtrain.toString()));
+ } else {
+ Log.info(LogMsg("ModelMetrics mtrain is null"));
+ }
+ Log.info(LogMsg("Restricted model where remove offset feature is enabled training metrics computed in " + (t2 - t1) + "ms"));
+ if (_valid != null) {
+ Frame valid = DKV.getGet(_parms._valid);
+ _model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
+ _model._output._validation_metrics_restricted_model_ro = ModelMetrics.getFromDKV(_model, valid); //updated by model.scoreAndUpdateModel
+ ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
+ validScore.fillFrom(_model._output._validation_metrics_restricted_model_ro);
+ }
+ _model.addRestrictedModelScoringInfoRO(_parms, nclasses(), t2, _state._iter); // add to scoringInfo for early stopping
+
+ if (_parms._generate_scoring_history) { // update scoring history with deviance train and valid if available
+ if ((mtrain != null) && (_valid != null)) {
+ _scoringHistoryRemoveOffsetEnabled.addIterationScore(true, true, _state._iter, _state.likelihood(),
+ _state.objective(), _state.deviance(), ((GLMMetrics) _model._output._validation_metrics_restricted_model_ro).residual_deviance(),
+ mtrain._nobs, _model._output._validation_metrics_restricted_model_ro._nobs, _state.lambda(), _state.alpha());
+ } else { // only doing training deviance
+ _scoringHistoryRemoveOffsetEnabled.addIterationScore(true, false, _state._iter, _state.likelihood(),
+ _state.objective(), _state.deviance(), Double.NaN, mtrain._nobs, 1, _state.lambda(),
+ _state.alpha());
+ }
+ }
+ _model._output._scoring_history_restricted_model_ro = _scoringHistoryRemoveOffsetEnabled != null ? _scoringHistoryRemoveOffsetEnabled.to2dTable(_parms, null, null) : null;
}
private void scorePostProcessing(Frame train, long t1) {
ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
long t2 = System.currentTimeMillis();
if (mtrain != null) {
- if (_model._parms._control_variables != null){
+ if (_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_model._output._training_metrics_unrestricted_model = mtrain;
- _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
} else {
_model._output._training_metrics = mtrain;
- _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
- }
+ }
+ _model._output._training_time_ms = t2 - _model._output._start_time; // remember training time
ScoreKeeper trainScore = new ScoreKeeper(Double.NaN);
trainScore.fillFrom(mtrain);
Log.info(LogMsg(mtrain.toString()));
@@ -3463,16 +3571,18 @@ private void scorePostProcessing(Frame train, long t1) {
Log.info(LogMsg("Training metrics computed in " + (t2 - t1) + "ms"));
if (_valid != null) {
Frame valid = DKV.getGet(_parms._valid);
+ ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
_model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete();
- if(_model._parms._control_variables != null){
+ if(_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_model._output._validation_metrics_unrestricted_model = ModelMetrics.getFromDKV(_model, valid);
+ validScore.fillFrom(_model._output._validation_metrics_unrestricted_model);
} else {
_model._output._validation_metrics = ModelMetrics.getFromDKV(_model, valid); //updated by model.scoreAndUpdateModel
+ validScore.fillFrom(_model._output._validation_metrics);
}
- ScoreKeeper validScore = new ScoreKeeper(Double.NaN);
- validScore.fillFrom(_model._output._validation_metrics);
+
}
- if(_model._parms._control_variables != null) {
+ if(_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
_model.addUnrestrictedModelScoringInfo(_parms, nclasses(), t2, _state._iter);
} else {
_model.addScoringInfo(_parms, nclasses(), t2, _state._iter);
@@ -3495,7 +3605,7 @@ private void scorePostProcessing(Frame train, long t1) {
_model._output._validation_metrics._nobs;
_lambdaSearchScoringHistory.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()),
_state.lambda(), trainDev, validDev, xval_deviance, xval_se, _state.alpha());
- } else if(_model._parms._control_variables != null){
+ } else if(_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_scoringHistoryUnrestrictedModel.addIterationScore(true, true, _state._iter, _state.likelihood(),
_state.objective(), _state.deviance(), ((GLMMetrics) _model._output._validation_metrics_unrestricted_model).residual_deviance(),
mtrain._nobs, _model._output._validation_metrics_unrestricted_model._nobs, _state.lambda(), _state.alpha());
@@ -3509,7 +3619,7 @@ private void scorePostProcessing(Frame train, long t1) {
_lambdaSearchScoringHistory.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()),
_state.lambda(), _state.deviance() / mtrain._nobs, Double.NaN, xval_deviance,
xval_se, _state.alpha());
- } else if(_model._parms._control_variables != null) {
+ } else if(_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
_scoringHistoryUnrestrictedModel.addIterationScore(true, false, _state._iter, _state.likelihood(),
_state.objective(), _state.deviance(), Double.NaN, mtrain._nobs, 1, _state.lambda(),
_state.alpha());
@@ -3519,23 +3629,16 @@ private void scorePostProcessing(Frame train, long t1) {
_state.alpha());
}
}
- _job.update(_workPerIteration, _state.toString());
}
if (_parms._lambda_search) {
_model._output._scoring_history = _lambdaSearchScoringHistory.to2dTable();
- } else if(_model._parms._control_variables != null){
+ } else if (_model._parms._control_variables != null || _model._parms._remove_offset_effects){
_model._output._scoring_history_unrestricted_model = _scoringHistoryUnrestrictedModel.to2dTable(_parms, _xval_deviances_generate_SH,
_xval_sd_generate_SH);
} else {
_model._output._scoring_history = _scoringHistory.to2dTable(_parms, _xval_deviances_generate_SH,
_xval_sd_generate_SH);
}
-
- _model.update(_job._key);
- _model.generateSummary(_parms._train, _state._iter);
- _lastScore = System.currentTimeMillis();
- long scoringTime = System.currentTimeMillis() - t1;
- _scoringInterval = Math.max(_scoringInterval, 20 * scoringTime); // at most 5% overhead for scoring
}
private void coldStart(double[] devHistoryTrain, double[] devHistoryTest) {
@@ -3664,7 +3767,9 @@ public void computeImpl() {
 if (_parms._keepBetaDiffVar)
 keepFrameKeys(keep, _model._output._betadiff_var);
 Scope.untrack(keep.toArray(new Key[keep.size()]));
- }
+ }
+ _model.update(_job._key);
+ _model.generateSummary(_parms._train, _state._iter);
_model.unlock(_job);
}
}
@@ -3846,27 +3950,30 @@ private void doCompute() {
if (_parms._generate_variable_inflation_factors) {
_model._output._vif_predictor_names = _model.buildVariableInflationFactors(_train, _dinfo);
}// build variable inflation factors for numerical predictors
- if(_model._parms._control_variables != null) {
- // create combination of scoring history with control variables enabled and disabled
- // keep unrestricted model scoring history in _model._output._control_val_scoring_history
+ if(_model._parms._control_variables != null || _model._parms._remove_offset_effects) {
+ // create combination of scoring history with control variables or remove offset effects enabled and disabled
+ // keep unrestricted model scoring history in _model._output._scoring_history_unrestricted_model
TwoDimTable scoringHistoryEarlyStop = ScoringInfo.createScoringHistoryTable(_model.getScoringInfo(),
(null != _parms._valid), false, _model._output.getModelCategory(), false, _parms.hasCustomMetricFunc());
- TwoDimTable scoringHistoryEarlyStopControlVal = ScoringInfo.createScoringHistoryTable(_model.getUnrestrictedModelScoringInfo(),
+ TwoDimTable scoringHistoryEarlyStopRestricted = ScoringInfo.createScoringHistoryTable(_model.getUnrestrictedModelScoringInfo(),
(null != _parms._valid), false, _model._output.getModelCategory(), false, _parms.hasCustomMetricFunc());
- scoringHistoryEarlyStopControlVal.setTableHeader("Scoring history with control variables enabled");
ScoreKeeper.StoppingMetric sm = _model._parms._stopping_metric.name().equals("AUTO") ? _model._output.isClassifier() ?
ScoreKeeper.StoppingMetric.logloss : ScoreKeeper.StoppingMetric.deviance : _model._parms._stopping_metric;
- _model._output._scoring_history = combineScoringHistoryControlVariables(_model._output._scoring_history, _model._output._scoring_history_unrestricted_model,
- scoringHistoryEarlyStop, scoringHistoryEarlyStopControlVal, sm, null != _parms._valid);
- _model._output._scoring_history_unrestricted_model = combineScoringHistory(_model._output._scoring_history_unrestricted_model, scoringHistoryEarlyStopControlVal);
+ _model._output._scoring_history = combineScoringHistoryRestricted(_model._output._scoring_history, _model._output._scoring_history_unrestricted_model,
+ scoringHistoryEarlyStop, scoringHistoryEarlyStopRestricted, sm, null != _parms._valid);
+ _model._output._scoring_history_unrestricted_model = combineScoringHistory(_model._output._scoring_history_unrestricted_model, scoringHistoryEarlyStopRestricted);
_model._output._scoring_history_unrestricted_model.setTableHeader(_model._output._scoring_history_unrestricted_model.getTableHeader()+" unrestricted model");
- // set control variables flag to true for scoring after training
- _model._useControlVariables = true;
- _model._output._varimp = _model._output.calculateVarimp(true);
- _model._output._variable_importances_unrestricted_model = calcVarImp(_model._output.calculateVarimp(false));
- _model._output._variable_importances_unrestricted_model.setTableHeader(_model._output._variable_importances_unrestricted_model.getTableHeader()+" unrestricted model");
+ // set control variables and remove offset effects flag to true for scoring after training
+ _model._useControlVariables = _model._parms._control_variables != null;
+ _model._useRemoveOffsetEffects = _model._parms._remove_offset_effects;
+ // calculate varimp of the restricted model
+ _model._output._varimp = _model._output.calculateVarimp(_model._useControlVariables);
_model._output._variable_importances = calcVarImp(_model._output._varimp);
+ // calculate varimp of the unrestricted model
+ _model._output._variable_importances_unrestricted_model = calcVarImp(_model._output.calculateVarimp(false));
+ _model._output._variable_importances_unrestricted_model.setTableHeader(
+ _model._output._variable_importances_unrestricted_model.getTableHeader()+" unrestricted model");
} else {
TwoDimTable scoring_history_early_stop = ScoringInfo.createScoringHistoryTable(_model.getScoringInfo(),
(null != _parms._valid), false, _model._output.getModelCategory(), false, _parms.hasCustomMetricFunc());
@@ -4063,6 +4170,9 @@ protected void updateProgress(boolean canScore) {
GLMResDevTask task = new GLMResDevTask(_job._key,_dinfo,_parms, betaContrVal).doAll(_state._dinfo._adaptedFrame);
double objectiveControlVal = _state.objective(betaContrVal, task._likelihood);
_scoringHistory.addIterationScore(_state._iter, task._likelihood, objectiveControlVal);
+ } else if (_model._parms._remove_offset_effects) {
+ _scoringHistoryUnrestrictedModel.addIterationScore(_state._iter, _state.likelihood(), _state.objective());
+ _scoringHistory.addIterationScore(_state._iter, _state.likelihood(), _state.objective());
} else {
_scoringHistory.addIterationScore(_state._iter, _state.likelihood(), _state.objective());
}
@@ -4079,7 +4189,7 @@ protected void updateProgress(boolean canScore) {
}
private boolean updateEarlyStop() {
- ScoreKeeper[] sk = _parms._control_variables != null ? _model.unrestritedModelScoreKeepers() : _model.scoreKeepers();
+ ScoreKeeper[] sk = _parms._control_variables != null || _parms._remove_offset_effects ? _model.unrestritedModelScoreKeepers() : _model.scoreKeepers();
return _earlyStop || ScoreKeeper.stopEarly(sk,
_parms._stopping_rounds, ScoreKeeper.ProblemType.forSupervised(_nclass > 1), _parms._stopping_metric,
_parms._stopping_tolerance, "model's last", true);
diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java
index d705905f7e19..a9794b0f67f5 100755
--- a/h2o-algos/src/main/java/hex/glm/GLMModel.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMModel.java
@@ -4,6 +4,7 @@
import hex.DataInfo.TransformType;
import hex.api.MakeGLMModelHandler;
import hex.deeplearning.DeepLearningModel;
+import hex.genmodel.descriptor.ModelDescriptor;
import hex.genmodel.utils.DistributionFamily;
import hex.glm.GLMModel.GLMParameters.Family;
import hex.glm.GLMModel.GLMParameters.Link;
@@ -50,8 +51,13 @@ public class GLMModel extends Model= iter)) { // no duplication
+ return;
+ }
+ GLMScoringInfo currInfo = new GLMScoringInfo();
+ currInfo.is_classification = nclasses > 1;
+ currInfo.validation = parms.valid() != null;
+ currInfo.cross_validation = parms._nfolds > 1;
+ currInfo.iterations = iter;
+ currInfo.time_stamp_ms = currTime;
+ currInfo.total_training_time_ms = _output._training_time_ms;
+ if (_output._training_metrics_restricted_model_cv != null) {
+ currInfo.scored_train = new ScoreKeeper(Double.NaN);
+ currInfo.scored_train.fillFrom(_output._training_metrics_restricted_model_cv);
+ }
+ if (_output._validation_metrics_restricted_model_cv != null) {
+ currInfo.scored_valid = new ScoreKeeper(Double.NaN);
+ currInfo.scored_valid.fillFrom(_output._validation_metrics_restricted_model_cv);
+ }
+ _restrictedModelScoringInfoCV = ScoringInfo.prependScoringInfo(currInfo, _restrictedModelScoringInfoCV);
+ }
+
+ public ScoringInfo[] getRestrictedModelScoringInfoRO() { return _restrictedModelScoringInfoRO;}
+
+ public ScoreKeeper[] restrictedModelROScoreKeepers() {
+ int size = _restrictedModelScoringInfoRO ==null? 0: _restrictedModelScoringInfoRO.length;
+ ScoreKeeper[] sk = new ScoreKeeper[size];
+ for (int i=0;i= iter)) { // no duplication
+ return;
+ }
+ GLMScoringInfo currInfo = new GLMScoringInfo();
+ currInfo.is_classification = nclasses > 1;
+ currInfo.validation = parms.valid() != null;
+ currInfo.cross_validation = parms._nfolds > 1;
+ currInfo.iterations = iter;
+ currInfo.time_stamp_ms = scoringInfo==null?_output._start_time:currTime;
+ currInfo.total_training_time_ms = _output._training_time_ms;
+ if (_output._training_metrics_restricted_model_ro != null) {
+ currInfo.scored_train = new ScoreKeeper(Double.NaN);
+ currInfo.scored_train.fillFrom(_output._training_metrics_restricted_model_ro);
+ }
+ if (_output._validation_metrics_restricted_model_ro != null) {
+ currInfo.scored_valid = new ScoreKeeper(Double.NaN);
+ currInfo.scored_valid.fillFrom(_output._validation_metrics_restricted_model_ro);
+ }
+ _restrictedModelScoringInfoRO = ScoringInfo.prependScoringInfo(currInfo, _restrictedModelScoringInfoRO);
+ }
public void setVcov(double[][] inv) {_output._vcov = inv;}
@@ -579,6 +661,7 @@ public enum Constraints {EqualTo, LessThanEqualTo};
public double _constraint_beta = 0.9; // eta_k+1 = eta_k/pow(c_k, beta)
public double _constraint_c0 = 10; // set initial epsilon k as 1/c0
public String[] _control_variables; // control variables definition, list of column names
+ public boolean _remove_offset_effects; // when true, remove the offset effect from prediction and metric calculation
public void validate(GLM glm) {
if (_remove_collinear_columns) {
@@ -724,10 +807,10 @@ public void validate(GLM glm) {
glm.error("_control_variables", "Control variables option is not supported with interactions.");
}
if(_lambda_search) {
- glm.error("_control_variables", "Control variables option is not supported with lambda search.");
+ glm.error("_control_variables", "Control variables option is not supported with Lambda search.");
}
if(_fold_column != null || _nfolds > 0){
- glm.error("_control_variables", "Control variables option is not supported with cross validation.");
+ glm.error("_control_variables", "Control variables option is not supported with cross-validation.");
}
for(String col: _control_variables){
Vec v = train().vec(col);
@@ -756,6 +839,23 @@ public void validate(GLM glm) {
}
}
}
+ if (_remove_offset_effects) {
+ if (_offset_column == null) {
+ glm.error("_remove_offset_effects", "The offset_column is missing.");
+ }
+ if (_distribution.equals(DistributionFamily.multinomial) || _distribution.equals(DistributionFamily.ordinal) || _distribution.equals(DistributionFamily.custom)){
+ glm.error("_remove_offset_effects", "The "+_distribution.name()+ " distribution is not supported with remove offset effects.");
+ }
+ if (_interactions != null || _interaction_pairs != null) {
+ glm.error("_remove_offset_effects", "Remove offset effects option is not supported with interactions.");
+ }
+ if (_lambda_search) {
+ glm.error("_remove_offset_effects", "Remove offset effects option is not supported with Lambda search.");
+ }
+ if (_fold_column != null || _nfolds > 0) {
+ glm.error("_remove_offset_effects", "Remove offset effects option is not supported with cross-validation.");
+ }
+ }
}
public GLMParameters() {
@@ -1531,6 +1631,7 @@ public Submodel(double lambda, double alpha, double[] beta, int iteration, doubl
public double[] _betaCndCheckpoint; // store temporary beta coefficients for checkpointing purposes
public boolean _finalScoring = false; // used while scoring to indicate if it is a final or partial scoring
public boolean _useControlVariables = false;
+ public boolean _useRemoveOffsetEffects = false;
private static String[] binomialClassNames = new String[]{"0", "1"};
@@ -1611,21 +1712,36 @@ public double lambda_selected(){
public boolean hasVIF() { return _vif_predictor_names != null; }
private int[] _control_values_idxs_in_adapted_frame;
- private String[] _control_values_names;
+ private String[] _control_variables_names;
- // Unrestricted model is produced when control variables are used.
+ // Unrestricted model is produced when control variables or remove offset features are used.
public TwoDimTable _scoring_history_unrestricted_model;
public ModelMetrics _training_metrics_unrestricted_model;
public ModelMetrics _validation_metrics_unrestricted_model;
+
+ // Two additional restricted-model outputs are produced when the control variables and remove-offset features are used together
+ // Output for restricted model where control variables feature is enabled
+ public TwoDimTable _scoring_history_restricted_model_cv;
+ public ModelMetrics _training_metrics_restricted_model_cv;
+ public ModelMetrics _validation_metrics_restricted_model_cv;
+ // Output for restricted model where remove offset feature is enabled
+ public TwoDimTable _scoring_history_restricted_model_ro;
+ public ModelMetrics _training_metrics_restricted_model_ro;
+ public ModelMetrics _validation_metrics_restricted_model_ro;
+
+ public void setAndMapControlVariablesNames(String[] controlVariablesNames){
+ this._control_variables_names = controlVariablesNames;
+ mapControlVariables();
+ }
public void mapControlVariables() {
- if(_control_values_names == null || _names == null) {
+ if(_control_variables_names == null || _names == null) {
return;
}
- _control_values_idxs_in_adapted_frame = new int[_control_values_names.length];
- for(int i = 0; i < _control_values_names.length; i++) {
+ _control_values_idxs_in_adapted_frame = new int[_control_variables_names.length];
+ for(int i = 0; i < _control_variables_names.length; i++) {
for(int j = 0; j < _names.length; j++) {
- if(_control_values_names[i].equals(_names[j]) ) {
+ if(_control_variables_names[i].equals(_names[j]) ) {
_control_values_idxs_in_adapted_frame[i] = j; break;
}
}
@@ -1718,7 +1834,6 @@ public double[] variableInflationFactors() {
public boolean _binomial;
public boolean _multinomial;
public boolean _ordinal;
- public boolean _score_control_vals_used_but_disabled;
public void setLambdas(GLMParameters parms) {
if (parms._lambda_search) {
@@ -1796,7 +1911,7 @@ public GLMOutput(DataInfo dinfo, String[] column_names, String[] column_types, S
_global_beta=beta;
_submodels = new Submodel[]{new Submodel(0, 0, beta, -1, Double.NaN, Double.NaN,
_totalBetaLength, null, false)};
- _control_values_names = controlVarNames;
+ _control_variables_names = controlVarNames;
mapControlVariables();
}
@@ -1852,7 +1967,7 @@ public GLMOutput(GLM glm) {
_multinomial = glm._parms._family == Family.multinomial;
_ordinal = glm._parms._family == Family.ordinal;
// setup control variables idxs from model parameters
- _control_values_names = glm._parms._control_variables;
+ _control_variables_names = glm._parms._control_variables;
mapControlVariables();
}
@@ -2202,7 +2317,10 @@ else if (_output.bestSubmodel().alpha_value == 1)
}
} else {
double[] b = beta();
- double eta = b[b.length - 1] + o; // intercept + offset
+ double eta = b[b.length - 1]; // intercept
+ if (!this._useRemoveOffsetEffects){ // offset
+ eta += o;
+ }
double[] bcv = b.clone();
if (this._useControlVariables)
bcv = _output.getControlValBeta(bcv); // make beta connected to control variables zero
@@ -2412,7 +2530,7 @@ protected ModelMetrics.MetricBuilder scoreMetrics(Frame adaptFrm) {
@Override
public boolean haveMojo() {
- if (_parms._control_variables != null && _parms._control_variables.length > 0)
+ if ((_parms._control_variables != null && _parms._control_variables.length > 0) || _parms._remove_offset_effects)
return _parms.interactionSpec() == null &&
!_parms._family.equals(Family.multinomial) &&
!_parms._family.equals(Family.ordinal) &&
@@ -2424,14 +2542,16 @@ public boolean haveMojo() {
@Override
public boolean havePojo() {
- if (_parms._control_variables != null && _parms._control_variables.length > 0)
+ // POJO doesn't support offset; only allow when offset is absent or remove_offset_effects strips it
+ if ((_parms._control_variables != null && _parms._control_variables.length > 0) || _parms._remove_offset_effects)
return _parms.interactionSpec() == null &&
- _parms._offset_column == null &&
+ (_parms._offset_column == null || _parms._remove_offset_effects) &&
!_parms._family.equals(Family.multinomial) &&
!_parms._family.equals(Family.ordinal) &&
super.havePojo();
- if (_parms.interactionSpec() == null && _parms._offset_column == null) return super.havePojo();
- else return false;
+ if (_parms.interactionSpec() == null && _parms._offset_column == null)
+ return super.havePojo();
+ return false;
}
@Override
@@ -2439,6 +2559,18 @@ public GLMMojoWriter getMojo() {
return new GLMMojoWriter(this);
}
+ @Override
+ public ModelDescriptor modelDescriptor() {
+ if (!_parms._remove_offset_effects)
+ return super.modelDescriptor();
+ return new H2OModelDescriptor() {
+ @Override
+ public String offsetColumn() {
+ return null;
+ }
+ };
+ }
+
private boolean isFeatureUsedInPredict(int featureIdx, double[] beta) {
if (_useControlVariables && _output._control_values_idxs_in_adapted_frame != null && Arrays.binarySearch(_output._control_values_idxs_in_adapted_frame, featureIdx) >= 0) {
return false;
diff --git a/h2o-algos/src/main/java/hex/glm/GLMScore.java b/h2o-algos/src/main/java/hex/glm/GLMScore.java
index ea6390c422c9..8456dcd7ebdd 100644
--- a/h2o-algos/src/main/java/hex/glm/GLMScore.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMScore.java
@@ -94,10 +94,7 @@ public GLMScore(Job j, GLMModel m, DataInfo dinfo, String[] domain, boolean comp
}
_beta_multinomial = null;
}
-
_dinfo._valid = true; // marking dinfo as validation data set disables an assert on unseen levels (which should not happen in train)
-
- m._output._score_control_vals_used_but_disabled = m._parms._control_variables != null && !m._useControlVariables;
_defaultThreshold = m.defaultThreshold();
}
@@ -132,8 +129,11 @@ public GLMScore(Job j, GLMModel m, DataInfo dinfo, String[] domain, boolean comp
preds[c + 1] = eta[c] * sumExp;
preds[0] = ArrayUtils.maxIndex(eta);
} else {
- double mu = _m._parms.linkInv(r.innerProduct(_beta) + o);
-
+ double x = r.innerProduct(_beta);
+ if(!_m._useRemoveOffsetEffects) {
+ x += o;
+ }
+ double mu = _m._parms.linkInv(x);
if (_m._parms._family == GLMModel.GLMParameters.Family.binomial
|| _m._parms._family == GLMModel.GLMParameters.Family.quasibinomial
|| _m._parms._family == GLMModel.GLMParameters.Family.fractionalbinomial) { // threshold for prediction
diff --git a/h2o-algos/src/main/java/hex/glm/GLMUtils.java b/h2o-algos/src/main/java/hex/glm/GLMUtils.java
index d0e1c52272fb..f8a1e9489317 100644
--- a/h2o-algos/src/main/java/hex/glm/GLMUtils.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMUtils.java
@@ -162,18 +162,18 @@ public static List<Integer> getStoppingMetricIndices(ScoreKeeper.StoppingMetric
* timestamp duration iterations Unrestricted negative_log_likelihood Unrestricted objective = training metrics calculated during optimization with control variables included (in glmSc)
* Training RMSE Training LogLoss Training r2 Training AUC Training pr_auc Training Lift Training Classification Error = early stopping training metrics with control variables excluded (in earlyStopSc)
* Validation RMSE Validation LogLoss Validation r2 Validation AUC Validation pr_auc Validation Lift Validation Classification Error = early stopping validation metrics with control variables excluded (in earlyStopSc)
- * Unrestricted Training AUC Unrestricted Validation AUC = stopping metrics with control variables included (in earlyStopScControlVariables)
+ * Unrestricted Training AUC Unrestricted Validation AUC = stopping metrics with control variables included (in earlyStopScRestricted)
* @param glmSc
* @param earlyStopSc
* @param stoppingMetric
- * @param earlyStopScControlVariables
+ * @param earlyStopScRestricted
* @return Combined scoring history table
*/
- public static TwoDimTable combineScoringHistoryControlVariables(TwoDimTable glmSc, TwoDimTable glmScControlVariables,
- TwoDimTable earlyStopSc,
- TwoDimTable earlyStopScControlVariables,
- ScoreKeeper.StoppingMetric stoppingMetric,
- boolean hasValidationMetrics) {
+ public static TwoDimTable combineScoringHistoryRestricted(TwoDimTable glmSc, TwoDimTable glmScRestricted,
+ TwoDimTable earlyStopSc,
+ TwoDimTable earlyStopScRestricted,
+ ScoreKeeper.StoppingMetric stoppingMetric,
+ boolean hasValidationMetrics) {
String[] esColTypes = earlyStopSc.getColTypes();
String[] esColFormats = earlyStopSc.getColFormats();
List<String> finalColHeaders = new ArrayList<>(Arrays.asList(glmSc.getColHeaders()));
@@ -186,13 +186,13 @@ public static TwoDimTable combineScoringHistoryControlVariables(TwoDimTable glmS
List<String> finalColTypes = new ArrayList<>(Arrays.asList(glmSc.getColTypes()));
List<String> finalColFormats = new ArrayList<>(Arrays.asList(glmSc.getColFormats()));
List<Integer> earlyStopColIndices = new ArrayList<>();
- List<Integer> earlyStopColIndicesContrVals = getStoppingMetricIndices(stoppingMetric, earlyStopScControlVariables.getColHeaders());
+ List<Integer> earlyStopColIndicesContrVals = getStoppingMetricIndices(stoppingMetric, earlyStopScRestricted.getColHeaders());
int colCounter = 0;
- String[] glmSCContrColTypes = glmScControlVariables.getColTypes();
- String[] glmSCContrColFormats = glmScControlVariables.getColFormats();
+ String[] glmSCContrColTypes = glmScRestricted.getColTypes();
+ String[] glmSCContrColFormats = glmScRestricted.getColFormats();
List<Integer> glmScContrValsColIndices = new ArrayList<>();
- String[] glmScControlVariablesHeaders = glmScControlVariables.getColHeaders();
+ String[] glmScControlVariablesHeaders = glmScRestricted.getColHeaders();
for(int i=0; i < glmScControlVariablesHeaders.length; i++){
String colName = glmScControlVariablesHeaders[i];
String colNameLower = colName.toLowerCase();
@@ -233,7 +233,7 @@ public static TwoDimTable combineScoringHistoryControlVariables(TwoDimTable glmS
TwoDimTable res = new TwoDimTable("Scoring History", "",
rowHeaders, finalColHeaders.toArray(new String[tableSize]), finalColTypes.toArray(new String[tableSize]),
finalColFormats.toArray(new String[tableSize]), "");
- res = combineTableContentsControlVariables(glmSc, glmScControlVariables, earlyStopSc, earlyStopScControlVariables, res, glmScContrValsColIndices, earlyStopColIndices, earlyStopColIndicesContrVals, indexOfIter, earlyStopSCIterIndex,
+ res = combineTableContentsControlVariables(glmSc, glmScRestricted, earlyStopSc, earlyStopScRestricted, res, glmScContrValsColIndices, earlyStopColIndices, earlyStopColIndicesContrVals, indexOfIter, earlyStopSCIterIndex,
overlapSize);
return res;
}
diff --git a/h2o-algos/src/main/java/hex/schemas/GLMV3.java b/h2o-algos/src/main/java/hex/schemas/GLMV3.java
index 655431d0791e..30b7cc1893ad 100644
--- a/h2o-algos/src/main/java/hex/schemas/GLMV3.java
+++ b/h2o-algos/src/main/java/hex/schemas/GLMV3.java
@@ -74,6 +74,7 @@ public static final class GLMParametersV3 extends ModelParametersSchemaV3<GLMParameters>
// NOTE(review): the body of this GLMV3.java hunk (presumably the new remove_offset_effects
// schema field) and the start of the following test-file diff were lost to markup stripping
// (a '<...>' span was eaten as a tag); recover them from the original patch.
+ assertTrue(differ > threshold);
System.out.println("Scoring history control val enabled");
TwoDimTable glmSH = glm._output._scoring_history;
@@ -138,9 +138,9 @@ public void compareModelWithControlVariablesEnabledAndDisabled() {
TwoDimTable vi = glm._output._variable_importances;
TwoDimTable vi_unrestricted = glm._output._variable_importances_unrestricted_model;
TwoDimTable vi_unrestristed_2 = glm2._output._variable_importances;
-
- assertNotEquals(vi, vi_unrestricted);
- assertArrayEquals(vi_unrestricted.getRowHeaders(), vi_unrestristed_2.getRowHeaders());
+
+ assertFalse(Arrays.equals(vi.getRowHeaders(), vi_unrestricted.getRowHeaders()));
+ assertTrue(Arrays.equals(vi_unrestricted.getRowHeaders(), vi_unrestristed_2.getRowHeaders()));
} finally {
if(train != null) train.remove();
@@ -416,7 +416,6 @@ public void testControlVariableMultinomial() {
GLMModel glm = null;
try {
Scope.enter();
-
Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"black","red"},Vec.newKey());
Vec cat2 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"a","b"},Vec.newKey());
Vec res = Vec.makeVec(new double[]{1,1,2,0,0},cat1.group().addVec());
@@ -695,8 +694,8 @@ public void testBasicDataBinomial(){
-0.1541507, 0.1541507, 0.1541507, 0.8109302, 0.8109302, 1.1192316, 0.8109302, 0.1541507, -0.1541507},Vec.newKey());
predsR = new Frame(Key.make("predsR"),new String[]{"predict"},new Vec[]{predsRVec});
- Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, train, "manualPredsR", null, true);
- Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, train, "manualPredsH2o", null, true);
+ Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, train, "manualPredsR", true);
+ Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, train, "manualPredsH2o", true);
Frame manualPredsControl = scoreManualWithCoefficients(coefficientsControl, train, "manualPredsControl", new int[]{0}, true);
Frame manualPredsRControl = scoreManualWithCoefficients(coefficientsR, train, "manualPredsR", new int[]{0}, true);
@@ -737,14 +736,26 @@ public void testBasicDataBinomial(){
}
private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName){
- return scoreManualWithCoefficients(coefficients, data, frameName, null, false);
+ return scoreManualWithCoefficients(coefficients, data, frameName, null, false, null);
+ }
+
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, boolean binomial){
+ return scoreManualWithCoefficients(coefficients, data, frameName, null, binomial, null);
}
private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdx){
- return scoreManualWithCoefficients(coefficients, data, frameName, controlVariablesIdx, false);
+ return scoreManualWithCoefficients(coefficients, data, frameName, controlVariablesIdx, false, null);
+ }
+
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdx, boolean binomial){
+ return scoreManualWithCoefficients(coefficients, data, frameName, controlVariablesIdx, binomial, null);
+ }
+
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, boolean binomial, Vec offset){
+ return scoreManualWithCoefficients(coefficients, data, frameName, null, binomial, offset);
}
- private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdxs, boolean binomial){
+ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, String frameName, int[] controlVariablesIdxs, boolean binomial, Vec offset){
Vec predictions = Vec.makeZero(data.numRows(), Vec.T_NUM);
for (long i = 0; i < data.numRows(); i++) {
double prediction = 0;
@@ -756,12 +767,694 @@ private Frame scoreManualWithCoefficients(Double[] coefficients, Frame data, Str
}
}
prediction += coefficients[coefficients.length-1];
- if(binomial){
+ if (offset != null) prediction += offset.at(i);
+ if (binomial){
prediction = 1.0 / (Math.exp(-prediction) + 1.0);
}
predictions.set(i, prediction);
}
return new Frame(Key.make(frameName),new String[]{"predict"},new Vec[]{predictions});
}
-
+
+ @Test
+ public void compareModelWithOffsetEnabledAndDisabled() {
+ Frame train = null;
+ Frame test = null;
+ Frame preds = null;
+ GLMModel glm = null;
+ Frame preds2 = null;
+ GLMModel glm2 = null;
+ try {
+ Scope.enter();
+ train = parseTestFile("smalldata/glm_test/binomial_20_cols_10KRows.csv");
+ GLMModel.GLMParameters.Family family = GLMModel.GLMParameters.Family.binomial;
+ String responseColumn = "C21";
+
+ // set cat columns
+ int numCols = train.numCols();
+ int enumCols = (numCols - 1) / 2;
+ for (int cindex = 0; cindex < enumCols; cindex++) {
+ train.replace(cindex, train.vec(cindex).toCategoricalVec()).remove();
+ }
+ int response_index = numCols - 1;
+
+ train.replace((response_index), train.vec(response_index).toCategoricalVec()).remove();
+
+ DKV.put(train);
+ Scope.track_generic(train);
+
+ test = new Frame(train);
+ test.remove(responseColumn);
+
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters(family);
+ params._response_column = responseColumn;
+ params._train = train._key;
+ params._score_each_iteration = true;
+ params._offset_column = "C20";
+ params._remove_offset_effects = true;
+
+ // train model with remove offset effects enabled
+ glm = new GLM(params).trainModel().get();
+ Scope.track_generic(glm);
+
+ System.out.println("_________________________________");
+ System.out.println(glm);
+ System.out.println("______");
+
+ preds = glm.score(test);
+ Scope.track_generic(preds);
+
+ // train model with offset effect removed
+ params._remove_offset_effects = false;
+
+ glm2 = new GLM(params).trainModel().get();
+ Scope.track_generic(glm2);
+
+ preds2 = glm2.score(test);
+ Scope.track_generic(preds2);
+
+ // check result training metrics are not the same
+ double delta = 10e-10;
+ assertNotEquals(glm.auc(), glm2.auc(), delta);
+ assertNotEquals(glm.mse(), glm2.mse(), delta);
+ //assertNotEquals(glm.logloss(), glm2.logloss(), delta);
+
+ double tMse = glm._output._training_metrics._MSE;
+ double tMse2 = glm2._output._training_metrics._MSE;
+ System.out.println(tMse+" "+tMse2);
+ assertNotEquals(tMse, tMse2, delta);
+
+ // check result training metrics unrestricted model and glm model with remove offset effects disabled are the same
+ assertEquals(glm2._output._training_metrics.auc_obj()._auc, glm._output._training_metrics_unrestricted_model.auc_obj()._auc, delta);
+ assertEquals(glm2._output._training_metrics.mse(), glm._output._training_metrics_unrestricted_model.mse(), delta);
+ assertEquals(glm2._output._training_metrics.rmse(), glm._output._training_metrics_unrestricted_model.rmse(), delta);
+
+ // check preds differ
+ int differ = 0;
+ int testRowNumber = 100;
+ double threshold = (2 * testRowNumber)/1.1;
+ for (int i = 0; i < testRowNumber; i++) {
+ if(preds.vec(1).at(i) != preds2.vec(1).at(i)) differ++;
+ if(preds.vec(2).at(i) != preds2.vec(2).at(i)) differ++;
+ }
+
+ assertTrue("Expected number of differing predictions to exceed threshold", differ > threshold);
+
+ System.out.println("Scoring history remove offset enabled");
+ TwoDimTable glmSH = glm._output._scoring_history;
+ System.out.println(glmSH);
+ System.out.println("Scoring history remove offset disabled");
+ TwoDimTable glm2SH = glm2._output._scoring_history;
+ System.out.println(glm2SH);
+ System.out.println("Scoring history remove offset enabled unrestricted model");
+ TwoDimTable glmSHROE = glm._output._scoring_history_unrestricted_model;
+ System.out.println(glmSHROE);
+ System.out.println("Scoring history remove offset disabled unrestricted model");
+ TwoDimTable glm2SHROE = glm2._output._scoring_history_unrestricted_model;
+ System.out.println(glm2SHROE);
+
+ // check scoring history is the same (instead of timestamp and duration column)
+ // change table header because it contains " unrestricted model"
+ glm2SH.setTableHeader(glmSHROE.getTableHeader());
+ assertTwoDimTableEquals(glmSHROE, glm2SH, new int[]{0,1});
+
+      // check unrestricted-model scoring history is not null when the remove offset effects feature is enabled
+ assertNotNull(glmSHROE);
+
+      // check unrestricted-model scoring history is null when the remove offset effects feature is disabled
+ assertNull(glm2SHROE);
+
+ //check variable importance
+ TwoDimTable vi = glm._output._variable_importances;
+ TwoDimTable vi_unrestricted = glm._output._variable_importances_unrestricted_model;
+ TwoDimTable vi_unrestristed_2 = glm2._output._variable_importances;
+
+ assertTrue(Arrays.equals(vi.getRowHeaders(), vi_unrestricted.getRowHeaders()));
+ assertTrue(Arrays.equals(vi_unrestricted.getRowHeaders(), vi_unrestristed_2.getRowHeaders()));
+
+ } finally {
+ if(train != null) train.remove();
+ if(test != null) test.remove();
+ if(preds != null) preds.remove();
+ if(glm != null) glm.remove();
+ if(preds2 != null) preds2.remove();
+ if(glm2 != null) glm2.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test
+ public void compareModelWithOffsetAndControlVariablesEnabledAndDisabled() {
+ Frame train = null;
+ Frame test = null;
+ Frame preds = null;
+ GLMModel glm = null;
+ Frame preds2 = null;
+ GLMModel glm2 = null;
+ try {
+ Scope.enter();
+ train = parseTestFile("smalldata/glm_test/binomial_20_cols_10KRows.csv");
+ GLMModel.GLMParameters.Family family = GLMModel.GLMParameters.Family.binomial;
+ String responseColumn = "C21";
+
+ // set cat columns
+ int numCols = train.numCols();
+ int enumCols = (numCols - 1) / 2;
+ for (int cindex = 0; cindex < enumCols; cindex++) {
+ train.replace(cindex, train.vec(cindex).toCategoricalVec()).remove();
+ }
+ int response_index = numCols - 1;
+
+ train.replace((response_index), train.vec(response_index).toCategoricalVec()).remove();
+
+ DKV.put(train);
+ Scope.track_generic(train);
+
+ test = new Frame(train);
+ test.remove(responseColumn);
+
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters(family);
+ params._response_column = responseColumn;
+ params._train = train._key;
+ params._score_each_iteration = true;
+ params._offset_column = "C20";
+ params._remove_offset_effects = true;
+ params._control_variables = new String[]{"C5"};
+
+ // train model with remove offset effects enabled
+ glm = new GLM(params).trainModel().get();
+ Scope.track_generic(glm);
+
+ System.out.println("_________________________________");
+ System.out.println(glm);
+ System.out.println("______");
+
+ preds = glm.score(test);
+ Scope.track_generic(preds);
+
+ // train model with offset effect removed
+ params._remove_offset_effects = false;
+ params._control_variables = null;
+
+ glm2 = new GLM(params).trainModel().get();
+ Scope.track_generic(glm2);
+
+ preds2 = glm2.score(test);
+ Scope.track_generic(preds2);
+
+ // check result training metrics are not the same
+ double delta = 10e-10;
+ assertNotEquals(glm.auc(), glm2.auc(), delta);
+ assertNotEquals(glm.mse(), glm2.mse(), delta);
+ //assertNotEquals(glm.logloss(), glm2.logloss(), delta);
+
+ double tMse = glm._output._training_metrics._MSE;
+ double tMse2 = glm2._output._training_metrics._MSE;
+ System.out.println(tMse+" "+tMse2);
+ assertNotEquals(tMse, tMse2, delta);
+
+ // check result training metrics unrestricted model and glm model with remove offset effects disabled are the same
+ assertEquals(glm2._output._training_metrics.auc_obj()._auc, glm._output._training_metrics_unrestricted_model.auc_obj()._auc, delta);
+ assertEquals(glm2._output._training_metrics.mse(), glm._output._training_metrics_unrestricted_model.mse(), delta);
+ assertEquals(glm2._output._training_metrics.rmse(), glm._output._training_metrics_unrestricted_model.rmse(), delta);
+
+ // check preds differ
+ int differ = 0;
+ int testRowNumber = 100;
+ double threshold = (2 * testRowNumber)/1.1;
+ for (int i = 0; i < testRowNumber; i++) {
+ if(preds.vec(1).at(i) != preds2.vec(1).at(i)) differ++;
+ if(preds.vec(2).at(i) != preds2.vec(2).at(i)) differ++;
+ }
+ System.out.println(differ + " " + threshold);
+ assertTrue(differ > threshold);
+
+ System.out.println("Scoring history remove offset enabled");
+ TwoDimTable glmSH = glm._output._scoring_history;
+ System.out.println(glmSH);
+ System.out.println("Scoring history remove offset disabled");
+ TwoDimTable glm2SH = glm2._output._scoring_history;
+ System.out.println(glm2SH);
+ System.out.println("Scoring history remove offset enabled unrestricted model");
+ TwoDimTable glmSHCV = glm._output._scoring_history_unrestricted_model;
+ System.out.println(glmSHCV);
+ System.out.println("Scoring history remove offset disabled unrestricted model");
+ TwoDimTable glm2SHCV = glm2._output._scoring_history_unrestricted_model;
+ System.out.println(glm2SHCV);
+
+ // check scoring history is the same (instead of timestamp and duration column)
+ // change table header because it contains " unrestricted model"
+ glm2SH.setTableHeader(glmSHCV.getTableHeader());
+ assertTwoDimTableEquals(glmSHCV, glm2SH, new int[]{0,1});
+
+      // check unrestricted-model scoring history is not null when the restricted-model features are enabled
+ assertNotNull(glmSHCV);
+
+      // check unrestricted-model scoring history is null when the restricted-model features are disabled
+ assertNull(glm2SHCV);
+
+ //check variable importance
+ TwoDimTable vi = glm._output._variable_importances;
+ TwoDimTable vi_unrestricted = glm._output._variable_importances_unrestricted_model;
+ TwoDimTable vi_unrestristed_2 = glm2._output._variable_importances;
+
+ assertFalse(Arrays.equals(vi.getRowHeaders(), vi_unrestricted.getRowHeaders()));
+ assertTrue(Arrays.equals(vi_unrestricted.getRowHeaders(), vi_unrestristed_2.getRowHeaders()));
+ } finally {
+ if(train != null) train.remove();
+ if(test != null) test.remove();
+ if(preds != null) preds.remove();
+ if(glm != null) glm.remove();
+ if(preds2 != null) preds2.remove();
+ if(glm2 != null) glm2.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetEffectsMissingOffsetColumn() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"black","red"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"a","b"},Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,2,0,0},cat1.group().addVec());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "y"},new Vec[]{cat1, cat2,res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._distribution = DistributionFamily.bernoulli;
+ glm = new GLM(params).trainModel().get();
+
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
+
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetEffectsMultinomial() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"black","red"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new double[]{1,1,1,0,0}, cat1.group().addVec());
+ Vec res = Vec.makeVec(new double[]{1,1,2,0,0},cat1.group().addVec());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "y"},new Vec[]{cat1, cat2, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._offset_column = "x2";
+ params._distribution = DistributionFamily.multinomial;
+ glm = new GLM(params).trainModel().get();
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test
+ public void testBasicDataBinomialOffset(){
+ /** Test against GLM in R
+ * cat1 <- factor(c(1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0))
+ * cat2 <- factor(c(1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0))
+ * offset <- c(0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0)
+ * res <- factor(c(1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1))
+ * data <- data.frame(cat1, cat2, offset, res)
+ * glm <- glm(res ~ cat1 + cat2 + offset(offset), data=data, family = binomial)
+ * summary(glm)
+ * predict(glm)
+ *
+ * Call:
+ * glm(formula = res ~ cat1 + cat2 + offset(offset), family = binomial,
+ * data = data)
+ *
+ * Coefficients:
+ * Estimate Std. Error z value Pr(>|z|)
+ * (Intercept) -0.3310 0.7256 -0.456 0.648
+ * cat11 0.9780 0.8467 1.155 0.248
+ * cat21 0.2295 0.8586 0.267 0.789
+ *
+ * (Dispersion parameter for binomial family taken to be 1)
+ *
+ * Null deviance: 33.557 on 25 degrees of freedom
+ * Residual deviance: 32.173 on 23 degrees of freedom
+ * AIC: 38.173
+ *
+ * Number of Fisher Scoring iterations: 4
+ *
+ * 1 2 3 4 5 6
+ * 0.976506946 0.847045758 1.076506946 -0.130997049 -0.230997049 0.647045758
+ * 7 8 9 10 11 12
+ * 0.647045758 0.098464139 0.198464139 1.147045758 0.198464139 1.047045758
+ * 13 14 15 16 17 18
+ * 0.469002951 1.276506946 1.047045758 1.376506946 -0.330997049 -0.330997049
+ * 19 20 21 22 23 24
+ * 0.398464139 -0.001535861 0.647045758 0.647045758 0.976506946 0.647045758
+ * 25 26
+ * -0.001535861 -0.330997049
+ **/
+ Frame train = null;
+ GLMModel glm = null;
+ GLMModel glmOffset = null;
+ Frame preds = null;
+ Frame predsOffset = null;
+ Frame predsR = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"cat1", "cat2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._lambda = new double[]{0};
+ params._alpha = new double[]{0};
+ params._standardize = false;
+ params._non_negative = true;
+ params._intercept = true;
+ params._objective_epsilon = 1e-10;
+ params._gradient_epsilon = 1e-6;
+ params._response_column = "y";
+ params._distribution = DistributionFamily.bernoulli;
+ params._link = GLMModel.GLMParameters.Link.logit;
+ params._max_iterations = 4;
+ params._dispersion_epsilon = 1;
+ params._offset_column = "offset";
+ glm = new GLM(params).trainModel().get();
+ preds = glm.score(train);
+ System.out.println(preds.toTwoDimTable().toString());
+
+ System.out.println(glm._output._variable_importances);
+ System.out.println(glm.coefficients().toString());
+ Double[] coefficients = glm.coefficients().values().toArray(new Double[0]);
+
+ params._remove_offset_effects = true;
+ glmOffset = new GLM(params).trainModel().get();
+ predsOffset = glmOffset.score(train);
+ System.out.println(predsOffset.toTwoDimTable().toString());
+ Double[] coefficientsOffset = glmOffset.coefficients().values().toArray(new Double[0]);
+
+ Double[] coefficientsR = new Double[]{0.9780, 0.2295, -0.3310};
+ Vec predsRVec = Vec.makeVec(new double[]{0.976506946, 0.847045758, 1.076506946, -0.130997049, -0.230997049,
+ 0.647045758, 0.647045758, 0.098464139, 0.198464139, 1.147045758, 0.198464139, 1.047045758,
+ 0.469002951, 1.276506946, 1.047045758, 1.376506946, -0.330997049, -0.330997049, 0.398464139,
+ -0.001535861, 0.647045758, 0.647045758, 0.976506946, 0.647045758, -0.001535861, -0.330997049},
+ Vec.newKey());
+ predsR = new Frame(Key.make("predsR"),new String[]{"predict"},new Vec[]{predsRVec});
+
+ Frame trainWithoutOffset = train.deepCopy("trainWithoutOffset");
+ Vec offsetVec = trainWithoutOffset.remove("offset");
+ Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", true, offsetVec);
+ Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, trainWithoutOffset, "manualPredsH2o", true, offsetVec);
+ Frame manualPredsRemoveOffset = scoreManualWithCoefficients(coefficientsOffset, trainWithoutOffset, "manualPredsRemoveOffset", true);
+ Frame manualPredsRRemoveOffset = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", true);
+
+ double tol = 1e-3;
+ for (long i = 0; i < manualPredsH2o.numRows(); i++) {
+ double h2o = preds.vec(2).at(i);
+ double manualH2o = manualPredsH2o.vec(0).at(i);
+ // predict output from glm in R is not in logit
+ double r = (1.0 / (Math.exp(-predsR.vec(0).at(i)) + 1.0));
+ double manualR = manualPredsR.vec(0).at(i);
+ double h2oOffset = predsOffset.vec(2).at(i);
+ double manualH2oOffset = manualPredsRemoveOffset.vec(0).at(i);
+ double manualROffset = manualPredsRRemoveOffset.vec(0).at(i);
+
+ System.out.println(i+" h2o: "+h2o+ " h2o manual:" +manualH2o+
+ " R: "+r+" R manual: "+manualR +
+ " h2o remove offset: "+h2oOffset+" h2o remove offset manual "+manualH2oOffset+
+ " R remove offset manual: "+manualROffset);
+
+ // glm score calculation check
+ Assert.assertEquals(h2o, manualH2o, tol);
+ Assert.assertEquals(h2o, r, tol);
+ Assert.assertEquals(h2o, manualR, tol);
+
+ // offset calculation check
+ Assert.assertEquals(h2oOffset, manualH2oOffset, tol);
+ Assert.assertEquals(h2oOffset, manualROffset, tol);
+ }
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ if (glmOffset != null) glmOffset.remove();
+ if (preds != null) preds.remove();
+ if (predsOffset != null) predsOffset.remove();
+ if (predsR != null) predsR.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test
+ public void testBasicDataBinomialControlValuesAndOffset(){
+ /** Test against GLM in R
+ * cat1 <- factor(c(1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0))
+ * cat2 <- factor(c(1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0))
+ * offset <- c(0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0)
+ * res <- factor(c(1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1))
+ * data <- data.frame(cat1, cat2, offset, res)
+ * glm <- glm(res ~ cat1 + cat2 + offset(offset), data=data, family = binomial)
+ * summary(glm)
+ * predict(glm)
+ *
+ * Call:
+ * glm(formula = res ~ cat1 + cat2 + offset(offset), family = binomial,
+ * data = data)
+ *
+ * Coefficients:
+ * Estimate Std. Error z value Pr(>|z|)
+ * (Intercept) -0.3310 0.7256 -0.456 0.648
+ * cat11 0.9780 0.8467 1.155 0.248
+ * cat21 0.2295 0.8586 0.267 0.789
+ *
+ * (Dispersion parameter for binomial family taken to be 1)
+ *
+ * Null deviance: 33.557 on 25 degrees of freedom
+ * Residual deviance: 32.173 on 23 degrees of freedom
+ * AIC: 38.173
+ *
+ * Number of Fisher Scoring iterations: 4
+ *
+ * 1 2 3 4 5 6
+ * 0.976506946 0.847045758 1.076506946 -0.130997049 -0.230997049 0.647045758
+ * 7 8 9 10 11 12
+ * 0.647045758 0.098464139 0.198464139 1.147045758 0.198464139 1.047045758
+ * 13 14 15 16 17 18
+ * 0.469002951 1.276506946 1.047045758 1.376506946 -0.330997049 -0.330997049
+ * 19 20 21 22 23 24
+ * 0.398464139 -0.001535861 0.647045758 0.647045758 0.976506946 0.647045758
+ * 25 26
+ * -0.001535861 -0.330997049
+ **/
+ Frame train = null;
+ GLMModel glm = null;
+ GLMModel glmCVOffset = null;
+ Frame preds = null;
+ Frame predsCVOffset = null;
+ Frame predsR = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"cat1", "cat2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._lambda = new double[]{0};
+ params._alpha = new double[]{0};
+ params._standardize = false;
+ params._non_negative = true;
+ params._intercept = true;
+ params._objective_epsilon = 1e-10;
+ params._gradient_epsilon = 1e-6;
+ params._response_column = "y";
+ params._distribution = DistributionFamily.bernoulli;
+ params._link = GLMModel.GLMParameters.Link.logit;
+ params._max_iterations = 4;
+ params._dispersion_epsilon = 1;
+ params._offset_column = "offset";
+ glm = new GLM(params).trainModel().get();
+ preds = glm.score(train);
+ System.out.println(preds.toTwoDimTable().toString());
+
+ System.out.println(glm._output._variable_importances);
+ System.out.println(glm.coefficients().toString());
+ Double[] coefficients = glm.coefficients().values().toArray(new Double[0]);
+
+ params._control_variables = new String[]{"cat1"};
+ params._remove_offset_effects = true;
+
+ glmCVOffset = new GLM(params).trainModel().get();
+ predsCVOffset = glmCVOffset.score(train);
+ System.out.println(predsCVOffset.toTwoDimTable().toString());
+ Double[] coefficientsOffset = glmCVOffset.coefficients().values().toArray(new Double[0]);
+
+ Double[] coefficientsR = new Double[]{0.9780, 0.2295, -0.3310};
+ Vec predsRVec = Vec.makeVec(new double[]{0.976506946, 0.847045758, 1.076506946, -0.130997049, -0.230997049,
+ 0.647045758, 0.647045758, 0.098464139, 0.198464139, 1.147045758, 0.198464139, 1.047045758,
+ 0.469002951, 1.276506946, 1.047045758, 1.376506946, -0.330997049, -0.330997049, 0.398464139,
+ -0.001535861, 0.647045758, 0.647045758, 0.976506946, 0.647045758, -0.001535861, -0.330997049},
+ Vec.newKey());
+ predsR = new Frame(Key.make("predsR"),new String[]{"predict"},new Vec[]{predsRVec});
+
+ Frame trainWithoutOffset = train.deepCopy("trainWithoutOffset");
+ Vec offsetVec = trainWithoutOffset.remove("offset");
+ Frame manualPredsR = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", true, offsetVec);
+ Frame manualPredsH2o = scoreManualWithCoefficients(coefficients, trainWithoutOffset, "manualPredsH2o", true, offsetVec);
+ Frame manualPredsRemoveCVOffset = scoreManualWithCoefficients(coefficientsOffset, trainWithoutOffset, "manualPredsCVRemoveOffset", new int[]{0}, true);
+ Frame manualPredsRRemoveCVOffset = scoreManualWithCoefficients(coefficientsR, trainWithoutOffset, "manualPredsR", new int[]{0}, true);
+
+ double tol = 1e-3;
+ for (long i = 0; i < manualPredsH2o.numRows(); i++) {
+ double h2o = preds.vec(2).at(i);
+ double manualH2o = manualPredsH2o.vec(0).at(i);
+ // predict output from glm in R is not in logit
+ double r = (1.0 / (Math.exp(-predsR.vec(0).at(i)) + 1.0));
+ double manualR = manualPredsR.vec(0).at(i);
+ double h2oCVOffset = predsCVOffset.vec(2).at(i);
+ double manualH2oCVOffset = manualPredsRemoveCVOffset.vec(0).at(i);
+ double manualRCVOffset = manualPredsRRemoveCVOffset.vec(0).at(i);
+
+ System.out.println(i+" h2o: "+h2o+ " h2o manual:" +manualH2o+
+ " R: "+r+" R manual: "+manualR +
+ " h2o control and remove offset: "+h2oCVOffset+" h2o control variables and remove offset manual "+manualH2oCVOffset+
+ " R control variables and remove offset manual: "+manualRCVOffset);
+
+ // glm score calculation check
+ Assert.assertEquals(h2o, manualH2o, tol);
+ Assert.assertEquals(h2o, r, tol);
+ Assert.assertEquals(h2o, manualR, tol);
+
+ // offset calculation check
+ Assert.assertEquals(h2oCVOffset, manualH2oCVOffset, tol);
+ Assert.assertEquals(h2oCVOffset, manualRCVOffset, tol);
+ }
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ if (glmCVOffset != null) glmCVOffset.remove();
+ if (preds != null) preds.remove();
+ if (predsCVOffset != null) predsCVOffset.remove();
+ if (predsR != null) predsR.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetWithInteraction() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._offset_column = "offset";
+ params._interactions = new String[]{"x1", "x2"};
+ glm = new GLM(params).trainModel().get();
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetWithLambdaSearch() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._offset_column = "offset";
+ params._lambda_search = true;
+ glm = new GLM(params).trainModel().get();
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
+
+ @Test(expected = H2OModelBuilderIllegalArgumentException.class)
+ public void testRemoveOffsetWithCrossValidation() {
+ Frame train = null;
+ GLMModel glm = null;
+ try {
+ Scope.enter();
+
+ Vec cat1 = Vec.makeVec(new long[]{1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0},new String[]{"0","1"},Vec.newKey());
+ Vec cat2 = Vec.makeVec(new long[]{1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0},new String[]{"0","1"},Vec.newKey());
+ Vec offset = Vec.makeVec(new double[]{0.1,0.2,0.2,0.2,0.1,0,0,0.2,0.3,0.5,0.3,0.4,0.8,0.4,0.4,0.5,0,0,0.5,0.1,0,0,0.1,0,0.1,0}, Vec.newKey());
+ Vec res = Vec.makeVec(new double[]{1,1,0,0,0,1,0,1,0,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1}, new String[]{"0","1"},Vec.newKey());
+ train = new Frame(Key.make("train"),new String[]{"x1", "x2", "offset", "y"},new Vec[]{cat1, cat2, offset, res});
+ DKV.put(train);
+
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._train = train._key;
+ params._alpha = new double[]{0};
+ params._response_column = "y";
+ params._intercept = false;
+ params._remove_offset_effects = true;
+ params._offset_column = "offset";
+ params._nfolds = 3;
+ glm = new GLM(params).trainModel().get();
+ } finally {
+ if (train != null) train.remove();
+ if (glm != null) glm.remove();
+ Scope.exit();
+ }
+ }
}
diff --git a/h2o-algos/src/test/java/hex/glm/GLMMojoControlVarsOffsetTest.java b/h2o-algos/src/test/java/hex/glm/GLMMojoControlVarsOffsetTest.java
new file mode 100644
index 000000000000..dbdf04b1cf37
--- /dev/null
+++ b/h2o-algos/src/test/java/hex/glm/GLMMojoControlVarsOffsetTest.java
@@ -0,0 +1,244 @@
+package hex.glm;
+
+import hex.generic.Generic;
+import hex.generic.GenericModel;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import water.Scope;
+import water.TestUtil;
+import water.fvec.Frame;
+import water.fvec.TestFrameBuilder;
+import water.fvec.Vec;
+import water.runner.CloudSize;
+import water.runner.H2ORunner;
+
+import java.io.File;
+import java.io.FileOutputStream;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests GLM MOJO scoring for all combinations of remove_offset_effects and control_variables
+ * across binomial, gaussian, and tweedie families.
+ */
+@RunWith(H2ORunner.class)
+@CloudSize(1)
+public class GLMMojoControlVarsOffsetTest extends TestUtil {
+
+ // -- Binomial --
+
+ @Test
+ public void testMojoBinomialBaseline() throws Exception {
+ testCombination(buildBinomialFrame(), GLMModel.GLMParameters.Family.binomial,
+ false, false, "binomial_baseline");
+ }
+
+ @Test
+ public void testMojoBinomialRemoveOffsetOnly() throws Exception {
+ testCombination(buildBinomialFrame(), GLMModel.GLMParameters.Family.binomial,
+ true, false, "binomial_roe");
+ }
+
+ @Test
+ public void testMojoBinomialControlVarsOnly() throws Exception {
+ testCombination(buildBinomialFrame(), GLMModel.GLMParameters.Family.binomial,
+ false, true, "binomial_cv");
+ }
+
+ @Test
+ public void testMojoBinomialBoth() throws Exception {
+ testCombination(buildBinomialFrame(), GLMModel.GLMParameters.Family.binomial,
+ true, true, "binomial_both");
+ }
+
+ // -- Gaussian --
+
+ @Test
+ public void testMojoGaussianBaseline() throws Exception {
+ testCombination(buildRegressionFrame(false), GLMModel.GLMParameters.Family.gaussian,
+ false, false, "gaussian_baseline");
+ }
+
+ @Test
+ public void testMojoGaussianRemoveOffsetOnly() throws Exception {
+ testCombination(buildRegressionFrame(false), GLMModel.GLMParameters.Family.gaussian,
+ true, false, "gaussian_roe");
+ }
+
+ @Test
+ public void testMojoGaussianControlVarsOnly() throws Exception {
+ testCombination(buildRegressionFrame(false), GLMModel.GLMParameters.Family.gaussian,
+ false, true, "gaussian_cv");
+ }
+
+ @Test
+ public void testMojoGaussianBoth() throws Exception {
+ testCombination(buildRegressionFrame(false), GLMModel.GLMParameters.Family.gaussian,
+ true, true, "gaussian_both");
+ }
+
+ // -- Tweedie --
+
+ @Test
+ public void testMojoTweedieBaseline() throws Exception {
+ testCombination(buildRegressionFrame(true), GLMModel.GLMParameters.Family.tweedie,
+ false, false, "tweedie_baseline");
+ }
+
+ @Test
+ public void testMojoTweedieRemoveOffsetOnly() throws Exception {
+ testCombination(buildRegressionFrame(true), GLMModel.GLMParameters.Family.tweedie,
+ true, false, "tweedie_roe");
+ }
+
+ @Test
+ public void testMojoTweedieControlVarsOnly() throws Exception {
+ testCombination(buildRegressionFrame(true), GLMModel.GLMParameters.Family.tweedie,
+ false, true, "tweedie_cv");
+ }
+
+ @Test
+ public void testMojoTweedieBoth() throws Exception {
+ testCombination(buildRegressionFrame(true), GLMModel.GLMParameters.Family.tweedie,
+ true, true, "tweedie_both");
+ }
+
+ // -- Feature-effect tests: verify all combinations produce different predictions --
+
+ @Test
+ public void testAllCombinationsDifferGaussian() throws Exception {
+ assertAllCombinationsDiffer(buildRegressionFrame(false), GLMModel.GLMParameters.Family.gaussian);
+ }
+
+ // -- Test logic --
+
+ private void assertAllCombinationsDiffer(Frame train, GLMModel.GLMParameters.Family family) throws Exception {
+ try {
+ Scope.enter();
+ Scope.track(train);
+
+ // Train all four combinations
+ Frame predsBase = trainAndScore(train, family, false, false);
+ Frame predsRO = trainAndScore(train, family, true, false);
+ Frame predsCV = trainAndScore(train, family, false, true);
+ Frame predsBoth = trainAndScore(train, family, true, true);
+
+ // Each combination should produce different predictions from every other
+ assertPredictionsDiffer(predsBase, predsRO, "baseline vs RO");
+ assertPredictionsDiffer(predsBase, predsCV, "baseline vs CV");
+ assertPredictionsDiffer(predsBase, predsBoth, "baseline vs RO+CV");
+ assertPredictionsDiffer(predsRO, predsCV, "RO vs CV");
+ assertPredictionsDiffer(predsRO, predsBoth, "RO vs RO+CV");
+ assertPredictionsDiffer(predsCV, predsBoth, "CV vs RO+CV");
+ } finally {
+ Scope.exit();
+ }
+ }
+
+ private Frame trainAndScore(Frame train, GLMModel.GLMParameters.Family family,
+ boolean removeOffsetEffects, boolean useControlVars) {
+ GLMModel.GLMParameters params = makeParams(train, family, removeOffsetEffects, useControlVars);
+ GLMModel model = new GLM(params).trainModel().get();
+ Scope.track_generic(model);
+ Frame preds = model.score(train);
+ Scope.track(preds);
+ return preds;
+ }
+
+ private GLMModel.GLMParameters makeParams(Frame train, GLMModel.GLMParameters.Family family,
+ boolean removeOffsetEffects, boolean useControlVars) {
+ GLMModel.GLMParameters params = new GLMModel.GLMParameters();
+ params._response_column = "response";
+ params._train = train._key;
+ params._family = family;
+ params._offset_column = "offset";
+ params._lambda = new double[]{0};
+ if (removeOffsetEffects) params._remove_offset_effects = true;
+ if (useControlVars) params._control_variables = new String[]{"x3"};
+ if (family == GLMModel.GLMParameters.Family.tweedie) {
+ params._tweedie_variance_power = 1.5;
+ params._tweedie_link_power = 0;
+ }
+ return params;
+ }
+
+ private void assertPredictionsDiffer(Frame pred1, Frame pred2, String label) {
+ int colIdx = pred1.numCols() > 1 ? 1 : 0;
+ long nrows = Math.min(pred1.numRows(), 100);
+ int differ = 0;
+ for (int i = 0; i < nrows; i++) {
+ if (Math.abs(pred1.vec(colIdx).at(i) - pred2.vec(colIdx).at(i)) > 1e-10) differ++;
+ }
+ assertTrue(label + ": predictions should differ (only " + differ + "/" + nrows + " rows differed)",
+ differ > nrows / 10);
+ }
+
+ private void testCombination(Frame train, GLMModel.GLMParameters.Family family,
+ boolean removeOffsetEffects, boolean useControlVars, String label) throws Exception {
+ try {
+ Scope.enter();
+ Scope.track(train);
+
+ GLMModel.GLMParameters params = makeParams(train, family, removeOffsetEffects, useControlVars);
+ GLMModel model = new GLM(params).trainModel().get();
+ Scope.track_generic(model);
+ assertTrue(label + ": should support MOJO", model.haveMojo());
+
+ Frame h2oPreds = model.score(train);
+ Scope.track(h2oPreds);
+
+ // Save MOJO, reimport as GenericModel, score, and compare
+ File mojoFile = File.createTempFile("glm_mojo", ".zip");
+ mojoFile.deleteOnExit();
+ try (FileOutputStream fos = new FileOutputStream(mojoFile)) {
+ model.getMojo().writeTo(fos);
+ }
+
+ GenericModel genericModel = Generic.importMojoModel(mojoFile.getAbsolutePath(), false);
+ Scope.track_generic(genericModel);
+
+ Frame mojoPreds = genericModel.score(train);
+ Scope.track(mojoPreds);
+
+ assertFrameEquals(h2oPreds, mojoPreds, 1e-8);
+ } finally {
+ Scope.exit();
+ }
+ }
+
+ // -- Frame builders --
+
+ private Frame buildBinomialFrame() {
+ double[] x1 = {1.2, 2.3, 3.1, 0.5, 1.8, 2.7, 3.5, 0.9, 1.5, 2.1, 3.0, 0.7, 1.1, 2.5, 3.3, 0.4, 1.6, 2.9, 3.7, 0.8};
+ double[] x2 = {0.5, 1.5, 0.8, 1.2, 0.3, 1.8, 0.6, 1.1, 0.9, 1.4, 0.7, 1.6, 0.4, 1.3, 0.2, 1.7, 0.1, 1.9, 0.5, 1.0};
+ double[] x3 = {3.0, 1.0, 2.0, 4.0, 3.5, 1.5, 2.5, 3.2, 1.8, 2.8, 3.8, 1.2, 2.2, 3.6, 1.6, 2.6, 4.2, 0.8, 3.4, 1.4};
+ double[] offset = {0.1, -0.2, 0.3, -0.1, 0.2, -0.3, 0.15, -0.15, 0.25, -0.25, 0.05, -0.05, 0.12, -0.12, 0.22, -0.22, 0.08, -0.08, 0.18, -0.18};
+ String[] response = {"1", "1", "1", "0", "1", "1", "1", "0", "1", "1", "1", "0", "0", "1", "1", "0", "1", "1", "1", "0"};
+
+ Frame f = new TestFrameBuilder()
+ .withColNames("x1", "x2", "x3", "offset", "response")
+ .withVecTypes(Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_CAT)
+ .withDataForCol(0, x1).withDataForCol(1, x2).withDataForCol(2, x3)
+ .withDataForCol(3, offset).withDataForCol(4, response)
+ .build();
+ return f;
+ }
+
+ private Frame buildRegressionFrame(boolean positiveResponse) {
+ double[] x1 = {1.2, 2.3, 3.1, 0.5, 1.8, 2.7, 3.5, 0.9, 1.5, 2.1, 3.0, 0.7, 1.1, 2.5, 3.3, 0.4, 1.6, 2.9, 3.7, 0.8};
+ double[] x2 = {0.5, 1.5, 0.8, 1.2, 0.3, 1.8, 0.6, 1.1, 0.9, 1.4, 0.7, 1.6, 0.4, 1.3, 0.2, 1.7, 0.1, 1.9, 0.5, 1.0};
+ double[] x3 = {3.0, 1.0, 2.0, 4.0, 3.5, 1.5, 2.5, 3.2, 1.8, 2.8, 3.8, 1.2, 2.2, 3.6, 1.6, 2.6, 4.2, 0.8, 3.4, 1.4};
+ double[] offset = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.15, 0.15, 0.25, 0.25, 0.05, 0.05, 0.12, 0.12, 0.22, 0.22, 0.08, 0.08, 0.18, 0.18};
+ double[] response = positiveResponse
+ ? new double[]{2.5, 4.1, 5.8, 1.2, 3.3, 5.0, 6.5, 1.8, 2.9, 4.5, 5.6, 1.5, 2.1, 4.8, 6.2, 0.9, 3.0, 5.3, 7.0, 1.6}
+ : new double[]{2.5, 4.1, 5.8, -1.2, 3.3, 5.0, 6.5, -1.8, 2.9, 4.5, 5.6, -1.5, 2.1, 4.8, 6.2, -0.9, 3.0, 5.3, 7.0, -1.6};
+
+ Frame f = new TestFrameBuilder()
+ .withColNames("x1", "x2", "x3", "offset", "response")
+ .withVecTypes(Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM)
+ .withDataForCol(0, x1).withDataForCol(1, x2).withDataForCol(2, x3)
+ .withDataForCol(3, offset).withDataForCol(4, response)
+ .build();
+ return f;
+ }
+}
diff --git a/h2o-bindings/bin/custom/R/gen_glm.py b/h2o-bindings/bin/custom/R/gen_glm.py
index e682a89ea7c7..67d3b5fe38b2 100644
--- a/h2o-bindings/bin/custom/R/gen_glm.py
+++ b/h2o-bindings/bin/custom/R/gen_glm.py
@@ -72,12 +72,26 @@ def update_param(name, param):
#' Make unrestricted GLM model when control variables are defined.
#'
#' Needs source model trained with control variables enabled.
-#' @param model a GLM \linkS4class{H2OModel} trained with control variable
+#' @param model a GLM \linkS4class{H2OModel} trained with control variable or with remove offset effects
#' @param destination_key a string or a NULL
+#' @param control_variables_enabled a logical flag set control variables flag to get model affected only by
+#' this feature (available only if control_variables and remove_offset_effects parameters are both set)
+#' @param remove_offset_effects_enabled a logical flag set remove offset effects flag to get model affected only by
+#' this feature (available only if control_variables and remove_offset_effects parameters are both set)
#' @export
-h2o.make_unrestricted_glm_model <- function(model, destination_key = NULL) {
- stopifnot("GLM wasn't trained with control variables." = !is.null(model@params$actual[["control_variables"]]))
- query <- list(method = "POST", .h2o.__GLMMakeUnrestrictedModel, model = model@model_id)
+h2o.make_unrestricted_glm_model <- function(model, destination_key = NULL, control_variables_enabled = FALSE, remove_offset_effects_enabled = FALSE) {
+ stopifnot("GLM wasn't trained with control variables or with remove offset effects." =
+ !is.null(model@params$actual[["control_variables"]]) || isTRUE(model@params$actual[["remove_offset_effects"]]))
+ if ((is.null(model@params$actual[["control_variables"]]) || isFALSE(model@params$actual[["remove_offset_effects"]]))
+ && (isTRUE(control_variables_enabled) || isTRUE(remove_offset_effects_enabled))) {
+ stop("GLM wasn't trained with both control variables and with remove offset effects feature set, the control_variables_enabled and remove_offset_effects_enabled features cannot be used.")
+ }
+ if ((!is.null(model@params$actual[["control_variables"]]) || isTRUE(model@params$actual[["remove_offset_effects"]]))
+ && (isTRUE(control_variables_enabled) && isTRUE(remove_offset_effects_enabled))){
+ stop("The control_variables_enabled and remove_offset_effects_enabled feature cannot be used together. It produces the same model as the main model.")
+ }
+ query <- list(method = "POST", .h2o.__GLMMakeUnrestrictedModel, model = model@model_id,
+ control_variables_enabled=control_variables_enabled, remove_offset_effects_enabled=remove_offset_effects_enabled)
if (!missing(destination_key) && !is.null(destination_key)) {
query <- c(query, list(dest = destination_key))
}
diff --git a/h2o-bindings/bin/custom/python/gen_glm.py b/h2o-bindings/bin/custom/python/gen_glm.py
index f54a66f15900..86b924a40c2b 100644
--- a/h2o-bindings/bin/custom/python/gen_glm.py
+++ b/h2o-bindings/bin/custom/python/gen_glm.py
@@ -308,20 +308,24 @@ def allConstraintsPassed(model):
else:
raise H2OValueError("allConstraintsPassed can only be called when there are linear constraints.")
-
- def make_unrestricted_glm_model(self, dest=None):
+ def make_unrestricted_glm_model(self, dest=None, control_variables_enabled=False, remove_offset_effects_enabled=False):
"""
Make unrestricted GLM model when control variables are defined.
Needs to be passed source model trained with control variables enabled.
:param dest: (optional) destination key
+ :param control_variables_enabled: (optional) set control variables flag to get model affected only
+ by this feature (available only if control_variables and remove_offset_effects parameters are both set)
+ :param remove_offset_effects_enabled: (optional) set remove offset effects flag to get model affected only
+ by this feature (available only if control_variables and remove_offset_effects parameters are both set)
:examples:
>>> d = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> m = H2OGeneralizedLinearEstimator(family='binomial',
... solver='COORDINATE_DESCENT',
+ ... remove_offset_effects=True,
... control_variables=["PSA"])
>>> m.train(training_frame=d,
... x=[2,3,4,5,6,7,8],
@@ -331,11 +335,22 @@ def make_unrestricted_glm_model(self, dest=None):
>>> m2 = m.make_unrestricted_glm_model(dest="unrestricted_glm")
>>> p2 = m2.model_performance(d)
>>> print(p2)
+ >>> m3 = m.make_unrestricted_glm_model(dest="unrestricted_glm_cv", control_variables_enabled=True)
+ >>> p3 = m3.model_performance(d)
+ >>> print(p3)
"""
+ if self.actual_params["control_variables"] is None and not(self.actual_params["remove_offset_effects"]):
+ raise H2OValueError("GLM wasn't trained with control variables or with remove offset effects.")
+ if (self.actual_params["control_variables"] is None or not(self.actual_params["remove_offset_effects"])) and (control_variables_enabled or remove_offset_effects_enabled):
+ raise H2OValueError("GLM wasn't trained with both control variables and with remove offset effects feature set, the control_variables_enabled and remove_offset_effects_enabled features cannot be used.")
+ if self.actual_params["control_variables"] is not None and self.actual_params["remove_offset_effects"] and (control_variables_enabled and remove_offset_effects_enabled):
+ raise H2OValueError("The control_variables_enabled and remove_offset_effects_enabled feature cannot be used together. It produces the same model as the main model.")
model_json = h2o.api(
"POST /3/MakeUnrestrictedGLMModel",
data={"model": self._model_json["model_id"]["name"],
- "dest": dest}
+ "dest": dest,
+ "control_variables_enabled": control_variables_enabled,
+ "remove_offset_effects_enabled": remove_offset_effects_enabled}
)
m = H2OGeneralizedLinearEstimator()
if dest is None:
diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java
index cacb20b7dc73..f5928eabab3e 100755
--- a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java
+++ b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java
@@ -186,7 +186,7 @@ public static class MetricBuilderBinomial> ex
// Passed a float[] sized nclasses+1; ds[0] must be a prediction. ds[1...nclasses-1] must be a class
// distribution;
@Override public double[] perRow(double ds[], float[] yact, Model m) {return perRow(ds, yact, 1, 0, m);}
- @Override public double[] perRow(double ds[], float[] yact, double w, double o, Model m) {
+ @Override public double[] perRow(double ds[], float[] yact, double w, double offset, Model m) {
if( Float .isNaN(yact[0]) ) return ds; // No errors if actual is missing
if(ArrayUtils.hasNaNs(ds)) return ds; // No errors if prediction has missing values (can happen for GLM)
if(w == 0 || Double.isNaN(w)) return ds;
diff --git a/h2o-docs/src/product/data-science/algo-params/control_variables.rst b/h2o-docs/src/product/data-science/algo-params/control_variables.rst
index 75222c3a5e22..5d61ca7f3807 100644
--- a/h2o-docs/src/product/data-science/algo-params/control_variables.rst
+++ b/h2o-docs/src/product/data-science/algo-params/control_variables.rst
@@ -17,10 +17,13 @@ Common use cases include:
When control variables are specified, GLM will exclude them during scoring. Model metrics and scoring history are calculated for both the restricted model (with control variables excluded) and the unrestricted model (with control variables included).
-To get the unrestricted model with its own metrics use ``glm.make_unrestriced_glm_model()``/``h2o.make_unrestricted_glm_model(glm)``.
+To get the unrestricted model with its own metrics use ``glm.make_unrestricted_glm_model()`` / ``h2o.make_unrestricted_glm_model(glm)``.
The control variables' coefficients are set to zero in the variable importance table. Use the unrestricted model to get the variable importance table with all variables included.
+If you set ``control_variables`` together with the ``remove_offset_effects`` feature, model metrics and scoring history are calculated with both effects enabled.
+If you need a model with only one of the features enabled, you can obtain it using ``glm.make_unrestricted_glm_model(control_variables_enabled=True)`` or ``glm.make_unrestricted_glm_model(remove_offset_effects_enabled=True)``.
+
**Notes**:
@@ -33,7 +36,7 @@ The control variables' coefficients are set to zero in the variable importance t
Related Parameters
~~~~~~~~~~~~~~~~~~
-- None
+- `remove_offset_effects <remove_offset_effects.html>`__
Example
~~~~~~~
diff --git a/h2o-docs/src/product/data-science/algo-params/remove_offset_effects.rst b/h2o-docs/src/product/data-science/algo-params/remove_offset_effects.rst
new file mode 100644
index 000000000000..8ffb53e38921
--- /dev/null
+++ b/h2o-docs/src/product/data-science/algo-params/remove_offset_effects.rst
@@ -0,0 +1,124 @@
+``remove_offset_effects``
+-------------------------
+
+- Available in: GLM
+- Hyperparameter: no
+
+Description
+~~~~~~~~~~~
+
+This feature allows you to remove offset effects during scoring and metric calculation.
+
+Model metrics and scoring history are calculated for both the restricted model (with offset effects removed) and the unrestricted model (with offset effects included).
+
+To get the unrestricted model with its own metrics use ``glm.make_unrestricted_glm_model()`` / ``h2o.make_unrestricted_glm_model(glm)``.
+
+If you set the ``remove_offset_effects`` parameter together with the ``control_variables`` feature, model metrics and scoring history are calculated with both effects enabled.
+If you need a model with only one feature enabled, you can get it using ``glm.make_unrestricted_glm_model(control_variables_enabled=True)`` or ``glm.make_unrestricted_glm_model(remove_offset_effects_enabled=True)``.
+
+**Notes**:
+
+- This option is experimental.
+- This option is applicable only for regression and binomial distribution.
+- This option is not available when cross validation is enabled.
+- This option is not available when Lambda search is enabled.
+- This option is not available when interactions are enabled.
+
+Related Parameters
+~~~~~~~~~~~~~~~~~~
+
+- `control_variables <control_variables.html>`__
+
+Example
+~~~~~~~
+
+.. tabs::
+ .. code-tab:: r R
+
+ library(h2o)
+ h2o.init()
+ # import the airlines dataset:
+ # This dataset is used to classify whether a flight will be delayed 'YES' or not "NO"
+ # original data can be found at http://www.transtats.bts.gov/
+ airlines <- h2o.importFile("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
+
+ # convert columns to factors
+ airlines["Year"] <- as.factor(airlines["Year"])
+ airlines["Month"] <- as.factor(airlines["Month"])
+ airlines["DayOfWeek"] <- as.factor(airlines["DayOfWeek"])
+ airlines["Cancelled"] <- as.factor(airlines["Cancelled"])
+ airlines['FlightNum'] <- as.factor(airlines['FlightNum'])
+
+ # set the predictor names and the response column name
+ predictors <- c("Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "FlightNum")
+ response <- "IsDepDelayed"
+
+ # split into train and validation
+ airlines_splits <- h2o.splitFrame(data = airlines, ratios = 0.8)
+ train <- airlines_splits[[1]]
+ valid <- airlines_splits[[2]]
+
+ # try using the `remove_offset_effects` parameter:
+ airlines_glm <- h2o.glm(family = 'binomial', x = predictors, y = response, training_frame = train,
+ validation_frame = valid,
+ remove_collinear_columns = FALSE,
+ score_each_iteration = TRUE,
+ generate_scoring_history = TRUE,
+ offset_column = "Distance",
+ remove_offset_effects = TRUE)
+
+ # print the AUC for the validation data
+ print(h2o.auc(airlines_glm, valid = TRUE))
+
+ # take a look at the learning curve
+ h2o.learning_curve_plot(airlines_glm)
+
+ # get the unrestricted GLM model
+ unrestricted_airlines_glm <- h2o.make_unrestricted_glm_model(airlines_glm)
+
+
+ .. code-tab:: python
+
+ import h2o
+ from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+ h2o.init()
+
+ # import the airlines dataset:
+ # This dataset is used to classify whether a flight will be delayed 'YES' or not "NO"
+ # original data can be found at http://www.transtats.bts.gov/
+ airlines= h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
+
+ # convert columns to factors
+ airlines["Year"]= airlines["Year"].asfactor()
+ airlines["Month"]= airlines["Month"].asfactor()
+ airlines["DayOfWeek"] = airlines["DayOfWeek"].asfactor()
+ airlines["Cancelled"] = airlines["Cancelled"].asfactor()
+ airlines['FlightNum'] = airlines['FlightNum'].asfactor()
+
+ # set the predictor names and the response column name
+ predictors = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "FlightNum"]
+ response = "IsDepDelayed"
+
+ # split into train and validation sets
+ train, valid= airlines.split_frame(ratios = [.8])
+
+ # try using the `remove_offset_effects` parameter:
+ # initialize your estimator
+ airlines_glm = H2OGeneralizedLinearEstimator(family = 'binomial',
+ remove_collinear_columns = True,
+ score_each_iteration = True,
+ generate_scoring_history = True,
+ offset_column = "Distance",
+ remove_offset_effects = True)
+
+ # then train your model
+ airlines_glm.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
+
+ # print the auc for the validation data
+ print(airlines_glm.auc(valid=True))
+
+ # take a look at the learning curve
+ airlines_glm.learning_curve_plot()
+
+ # get the unrestricted GLM model
+ unrestricted_airlines_glm = airlines_glm.make_unrestricted_glm_model()
diff --git a/h2o-genmodel/src/main/java/hex/genmodel/AbstractMojoWriter.java b/h2o-genmodel/src/main/java/hex/genmodel/AbstractMojoWriter.java
index f5649dd1d5db..7585a6304833 100644
--- a/h2o-genmodel/src/main/java/hex/genmodel/AbstractMojoWriter.java
+++ b/h2o-genmodel/src/main/java/hex/genmodel/AbstractMojoWriter.java
@@ -16,7 +16,7 @@
public abstract class AbstractMojoWriter {
/**
- * Reference to the model being written. Use this in the subclasses to retreive information from your model.
+ * Reference to the model being written. Use this in the subclasses to retrieve information from your model.
*/
private ModelDescriptor model;
diff --git a/h2o-py/h2o/estimators/glm.py b/h2o-py/h2o/estimators/glm.py
index 4d09e146c6ba..8704ca83f09d 100644
--- a/h2o-py/h2o/estimators/glm.py
+++ b/h2o-py/h2o/estimators/glm.py
@@ -95,6 +95,7 @@ def __init__(self,
stopping_metric="auto", # type: Literal["auto", "deviance", "logloss", "mse", "rmse", "mae", "rmsle", "auc", "aucpr", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"]
stopping_tolerance=0.001, # type: float
control_variables=None, # type: Optional[List[str]]
+ remove_offset_effects=False, # type: bool
balance_classes=False, # type: bool
class_sampling_factors=None, # type: Optional[List[float]]
max_after_balance_size=5.0, # type: float
@@ -341,6 +342,9 @@ def __init__(self,
Experimental.
Defaults to ``None``.
:type control_variables: List[str], optional
+ :param remove_offset_effects: Remove offset effects from scoring and metric calculation. Experimental.
+ Defaults to ``False``.
+ :type remove_offset_effects: bool
:param balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
Defaults to ``False``.
:type balance_classes: bool
@@ -504,6 +508,7 @@ def __init__(self,
self.stopping_metric = stopping_metric
self.stopping_tolerance = stopping_tolerance
self.control_variables = control_variables
+ self.remove_offset_effects = remove_offset_effects
self.balance_classes = balance_classes
self.class_sampling_factors = class_sampling_factors
self.max_after_balance_size = max_after_balance_size
@@ -2056,6 +2061,20 @@ def control_variables(self, control_variables):
assert_is_type(control_variables, None, [str])
self._parms["control_variables"] = control_variables
+ @property
+ def remove_offset_effects(self):
+ """
+ Remove offset effects from scoring and metric calculation. Experimental.
+
+ Type: ``bool``, defaults to ``False``.
+ """
+ return self._parms.get("remove_offset_effects")
+
+ @remove_offset_effects.setter
+ def remove_offset_effects(self, remove_offset_effects):
+ assert_is_type(remove_offset_effects, None, bool)
+ self._parms["remove_offset_effects"] = remove_offset_effects
+
@property
def balance_classes(self):
"""
@@ -2802,20 +2821,24 @@ def allConstraintsPassed(model):
else:
raise H2OValueError("allConstraintsPassed can only be called when there are linear constraints.")
-
- def make_unrestricted_glm_model(self, dest=None):
+ def make_unrestricted_glm_model(self, dest=None, control_variables_enabled=False, remove_offset_effects_enabled=False):
"""
Make unrestricted GLM model when control variables are defined.
Needs to be passed source model trained with control variables enabled.
:param dest: (optional) destination key
+ :param control_variables_enabled: (optional) set control variables flag to get model affected only
+ by this feature (available only if control_variables and remove_offset_effects parameters are both set)
+ :param remove_offset_effects_enabled: (optional) set remove offset effects flag to get model affected only
+ by this feature (available only if control_variables and remove_offset_effects parameters are both set)
:examples:
>>> d = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
>>> m = H2OGeneralizedLinearEstimator(family='binomial',
... solver='COORDINATE_DESCENT',
+ ... remove_offset_effects = True,
... control_variables=["PSA"])
>>> m.train(training_frame=d,
... x=[2,3,4,5,6,7,8],
@@ -2825,11 +2848,22 @@ def make_unrestricted_glm_model(self, dest=None):
>>> m2 = m.make_unrestricted_glm_model(dest="unrestricted_glm")
>>> p2 = m2.model_performance(d)
>>> print(p2)
- """
+ >>> m3 = m.make_unrestricted_glm_model(dest="unrestricted_glm_cv", control_variables_enabled=True)
+ >>> p3 = m3.model_performance(d)
+ >>> print(p3)
+ """
+ if self.actual_params["control_variables"] is None and not(self.actual_params["remove_offset_effects"]):
+ raise H2OValueError("GLM wasn't trained with control variables or with remove offset effects.")
+ if (self.actual_params["control_variables"] is None or not(self.actual_params["remove_offset_effects"])) and (control_variables_enabled or remove_offset_effects_enabled):
+ raise H2OValueError("GLM wasn't trained with both control variables and with remove offset effects feature set, the control_variables_enabled and remove_offset_effects_enabled features cannot be used.")
+ if self.actual_params["control_variables"] is not None and self.actual_params["remove_offset_effects"] and (control_variables_enabled and remove_offset_effects_enabled):
+ raise H2OValueError("The control_variables_enabled and remove_offset_effects_enabled feature cannot be used together. It produces the same model as the main model.")
model_json = h2o.api(
"POST /3/MakeUnrestrictedGLMModel",
data={"model": self._model_json["model_id"]["name"],
- "dest": dest}
+ "dest": dest,
+ "control_variables_enabled": control_variables_enabled,
+ "remove_offset_effects_enabled": remove_offset_effects_enabled}
)
m = H2OGeneralizedLinearEstimator()
if dest is None:
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_glm_make_unrestricted_model.py b/h2o-py/tests/testdir_algos/glm/pyunit_glm_make_unrestricted_model.py
new file mode 100644
index 000000000000..cdbaf08345fd
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_glm_make_unrestricted_model.py
@@ -0,0 +1,174 @@
+import sys
+
+from h2o.exceptions import H2OResponseError
+
+sys.path.insert(1, "../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+
+
+def glm_unrestricted_model():
+ cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+ cars = cars[cars["economy_20mpg"].isna() == 0]
+ cars["name"] = cars["name"].asfactor()
+ cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
+ offset_col = "offset"
+ offset = h2o.H2OFrame([[.5]] * cars.nrows)
+ offset.set_names([offset_col])
+ cars = cars.cbind(offset)
+
+ print("-- Model without control variables and remove offset effects --")
+ glm_model = H2OGeneralizedLinearEstimator(family="binomial", score_each_iteration=True,
+ generate_scoring_history=True, seed=0xC0FFEE)
+ glm_model.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars, offset_column=offset_col)
+ metrics = glm_model.training_model_metrics()
+ print(metrics)
+ print(glm_model)
+
+ print("-- Model with control variables --")
+ glm_model_cv = H2OGeneralizedLinearEstimator(family="binomial", control_variables=["year"],
+ score_each_iteration=True, generate_scoring_history=True,
+ seed=0xC0FFEE)
+ glm_model_cv.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars, offset_column=offset_col)
+ print(glm_model_cv)
+ metrics_cv = glm_model_cv.training_model_metrics()
+ print(metrics_cv)
+
+ print("-- Unrestricted model with control variables --")
+ glm_model_unrestricted_cv = glm_model_cv.make_unrestricted_glm_model(dest="unrestricted_cv")
+ print(glm_model_unrestricted_cv)
+ metrics_unrestricted_cv = glm_model_unrestricted_cv.training_model_metrics()
+ print(metrics_unrestricted_cv)
+
+ print("-- Model with remove offset effects --")
+ glm_model_ro = H2OGeneralizedLinearEstimator(family="binomial", remove_offset_effects=True,
+ generate_scoring_history=True,
+ score_each_iteration=True, seed=0xC0FFEE)
+ glm_model_ro.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars, offset_column=offset_col)
+ print(glm_model_ro)
+ metrics_ro = glm_model_ro.training_model_metrics()
+ print(metrics_ro)
+
+ print("-- Unrestricted model with remove offset effects --")
+ glm_model_unrestricted_ro = glm_model_ro.make_unrestricted_glm_model(dest="unrestricted_ro")
+ print(glm_model_unrestricted_ro)
+ metrics_unrestricted_ro = glm_model_unrestricted_ro.training_model_metrics()
+ print(metrics_unrestricted_ro)
+
+ print("-- Model with control variables and remove offset effects --")
+ glm_model_cv_ro = H2OGeneralizedLinearEstimator(family="binomial", control_variables=["year"],
+ remove_offset_effects=True, generate_scoring_history=True,
+ score_each_iteration=True, seed=0xC0FFEE)
+ glm_model_cv_ro.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars, offset_column=offset_col)
+ print(glm_model_cv_ro)
+ metrics_cv_ro = glm_model_cv_ro.training_model_metrics()
+ print(metrics_cv_ro)
+
+ print("-- Unrestricted model with control variables and remove offset effects disabled --")
+ glm_model_unrestricted_cv_ro = glm_model_cv_ro.make_unrestricted_glm_model(dest="all_false")
+ print(glm_model_unrestricted_cv_ro)
+ metrics_unrestricted_cv_ro = glm_model_unrestricted_cv_ro.training_model_metrics()
+ print(metrics_unrestricted_cv_ro)
+
+ print("-- Unrestricted model with control variables enabled and remove offset effects disabled --")
+ glm_model_unrestricted_cv_true_ro_false = glm_model_cv_ro.make_unrestricted_glm_model(dest="cv_true",
+ control_variables_enabled=True)
+ print(glm_model_unrestricted_cv_true_ro_false)
+ metrics_unrestricted_cv_true_ro_false = glm_model_unrestricted_cv_true_ro_false.training_model_metrics()
+ print(metrics_unrestricted_cv_true_ro_false)
+
+ print("-- Unrestricted model with control variables disabled and remove offset effects enabled --")
+ glm_model_unrestricted_cv_false_ro_true = glm_model_cv_ro.make_unrestricted_glm_model(dest="ro_true",
+ remove_offset_effects_enabled=True)
+ print(glm_model_unrestricted_cv_false_ro_true)
+ metrics_unrestricted_cv_false_ro_true = glm_model_unrestricted_cv_false_ro_true.training_model_metrics()
+ print(metrics_unrestricted_cv_false_ro_true)
+
+ # predictions with basic model
+ predictions = glm_model.predict(cars).as_data_frame()
+ # predictions with control variables enabled
+ predictions_cv = glm_model_cv.predict(cars).as_data_frame()
+ # predict with unrestricted model
+ predictions_unrestricted_cv = glm_model_unrestricted_cv.predict(cars).as_data_frame()
+    # predictions with remove offset effects enabled
+ predictions_ro = glm_model_ro.predict(cars).as_data_frame()
+ # predict with unrestricted model
+ predictions_unrestricted_ro = glm_model_unrestricted_ro.predict(cars).as_data_frame()
+ # predictions with control variables and remove offset effects enabled
+ predictions_cv_ro = glm_model_cv_ro.predict(cars).as_data_frame()
+    # predictions with unrestricted model (control variables and remove offset effects both disabled)
+ predictions_unrestricted_cv_ro = glm_model_unrestricted_cv_ro.predict(cars).as_data_frame()
+ # predict with unrestricted model control variables enabled and remove offset effect disabled
+ predictions_unrestricted_cv_true_ro_false = glm_model_unrestricted_cv_true_ro_false.predict(cars).as_data_frame()
+ # predict with unrestricted model control variables disabled and remove offset effect enabled
+ predictions_unrestricted_cv_false_ro_true = glm_model_unrestricted_cv_false_ro_true.predict(cars).as_data_frame()
+
+ # check the coefficients
+ for k in glm_model.coef().keys():
+ pyunit_utils.assert_equals(glm_model.coef()[k], glm_model_unrestricted_cv.coef().get(k, float("NaN")),
+ f"Coefficient {k} differs!")
+
+ # check predictions are the same
+ for i in range(predictions.shape[0]):
+ pyunit_utils.assert_equals(predictions.iloc[i, 1], predictions_unrestricted_cv.iloc[i, 1],
+ f"{i}th prediction differs!")
+ pyunit_utils.assert_equals(predictions.iloc[i, 1], predictions_unrestricted_ro.iloc[i, 1],
+ f"{i}th prediction differs!")
+ pyunit_utils.assert_equals(predictions.iloc[i, 1], predictions_unrestricted_cv_ro.iloc[i, 1],
+ f"{i}th prediction differs!")
+ pyunit_utils.assert_equals(predictions_cv.iloc[i, 1], predictions_unrestricted_cv_true_ro_false.iloc[i, 1],
+ f"{i}th prediction differs!")
+ pyunit_utils.assert_equals(predictions_ro.iloc[i, 1], predictions_unrestricted_cv_false_ro_true.iloc[i, 1],
+ f"{i}th prediction differs!")
+
+ # check predictions differ
+ for i in range(predictions.shape[0]):
+ pyunit_utils.assert_not_equal(predictions.iloc[i, 1], predictions_cv.iloc[i, 1],
+ f"Predictions at position {i} should differ but they don't!")
+ pyunit_utils.assert_not_equal(predictions.iloc[i, 1], predictions_ro.iloc[i, 1],
+ f"Predictions at position {i} should differ but they don't!")
+ pyunit_utils.assert_not_equal(predictions.iloc[i, 1], predictions_cv_ro.iloc[i, 1],
+ f"Predictions at position {i} should differ but they don't!")
+ pyunit_utils.assert_not_equal(predictions_unrestricted_cv_false_ro_true.iloc[i, 1],
+ predictions_unrestricted_cv_true_ro_false.iloc[i, 1],
+ f"Predictions at position {i} should differ but they don't!")
+
+ print(glm_model_cv.scoring_history())
+ print(glm_model_unrestricted_cv_true_ro_false.scoring_history())
+
+ # check scoring history are the same
+ pyunit_utils.assert_equal_scoring_history(glm_model, glm_model_unrestricted_cv,
+ ["objective", "negative_log_likelihood"])
+ pyunit_utils.assert_equal_scoring_history(glm_model_cv, glm_model_unrestricted_cv_true_ro_false,
+ ["objective", "negative_log_likelihood", "deviance_train", "lambda"])
+ pyunit_utils.assert_equal_scoring_history(glm_model_ro, glm_model_unrestricted_cv_false_ro_true,
+ ["objective", "negative_log_likelihood", "deviance_train", "lambda"])
+ pyunit_utils.assert_equal_scoring_history(glm_model_unrestricted_cv, glm_model_unrestricted_cv_ro,
+ ["objective", "negative_log_likelihood", "deviance_train", "lambda"])
+
+ # should fail
+ try:
+ glm_model_ro.make_unrestricted_glm_model(dest="ro_true", remove_offset_effects_enabled=True)
+ assert False, "Should have throw exception."
+ except Exception as ex:
+ print(ex)
+ temp = str(ex)
+ assert "GLM wasn't trained with both control variables and with remove offset effects feature set, the control_variables_enabled and remove_offset_effects_enabled features cannot be used." in temp, \
+ "Wrong exception was received."
+
+ try:
+ glm_model_cv_ro.make_unrestricted_glm_model(dest="ro_true", remove_offset_effects_enabled=True,
+ control_variables_enabled=True)
+ assert False, "Should have throw exception."
+ except Exception as ex:
+ print(ex)
+ temp = str(ex)
+ assert "The control_variables_enabled and remove_offset_effects_enabled feature cannot be used together. It produces the same model as the main model." in temp, \
+ "Wrong exception was received."
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(glm_unrestricted_model)
+else:
+ glm_unrestricted_model()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_glm_mojo_control_vars_offset.py b/h2o-py/tests/testdir_algos/glm/pyunit_glm_mojo_control_vars_offset.py
new file mode 100644
index 000000000000..0e38123e325b
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_glm_mojo_control_vars_offset.py
@@ -0,0 +1,109 @@
+import sys
+sys.path.insert(1, "../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+import tempfile
+
+
+def glm_mojo_control_vars_offset():
+ """
+ Test GLM MOJO for all combinations of remove_offset_effects and control_variables
+ across binomial, gaussian, and tweedie families.
+ """
+ train = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
+ train["CAPSULE"] = train["CAPSULE"].asfactor()
+ train["RACE"] = train["RACE"].asfactor()
+ train["DCAPS"] = train["DCAPS"].asfactor()
+ train["DPROS"] = train["DPROS"].asfactor()
+ train["offset_col"] = train["AGE"] / 100.0
+
+ combinations = [
+ ("baseline", False, None),
+ ("remove_offset_effects", True, None),
+ ("control_variables", False, ["PSA"]),
+ ("both", True, ["PSA"]),
+ ]
+
+ # Binomial
+ for label, roe, cv in combinations:
+ params = dict(family="binomial", offset_column="offset_col", lambda_=0)
+ if roe:
+ params["remove_offset_effects"] = True
+ if cv is not None:
+ params["control_variables"] = cv
+ compare_mojo("binomial_" + label, "CAPSULE",
+ ["RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"], train, params)
+
+ # Gaussian
+ for label, roe, cv in combinations:
+ params = dict(family="gaussian", offset_column="offset_col", lambda_=0)
+ if roe:
+ params["remove_offset_effects"] = True
+ if cv is not None:
+ params["control_variables"] = cv
+ compare_mojo("gaussian_" + label, "VOL",
+ ["RACE", "DCAPS", "PSA", "DPROS", "GLEASON"], train, params)
+
+ # Verify that features actually change predictions (gaussian as representative family)
+ verify_features_change_predictions("gaussian", "VOL",
+ ["RACE", "DCAPS", "PSA", "DPROS", "GLEASON"], train,
+ dict(family="gaussian", offset_column="offset_col", lambda_=0))
+
+ # Tweedie (response must be positive)
+ train["positive_vol"] = abs(train["VOL"]) + 1
+ for label, roe, cv in combinations:
+ params = dict(family="tweedie", offset_column="offset_col", lambda_=0,
+ tweedie_variance_power=1.5, tweedie_link_power=0)
+ if roe:
+ params["remove_offset_effects"] = True
+ if cv is not None:
+ params["control_variables"] = cv
+ compare_mojo("tweedie_" + label, "positive_vol",
+ ["RACE", "DCAPS", "PSA", "DPROS", "GLEASON"], train, params)
+
+
+def compare_mojo(label, y, x, data, params):
+ print("=== {} ===".format(label))
+ model = H2OGeneralizedLinearEstimator(**params)
+ model.train(x=x, y=y, training_frame=data)
+
+ pred_h2o = model.predict(data)
+
+ mojo_path = model.save_mojo(path=tempfile.mkdtemp())
+ mojo_model = h2o.import_mojo(mojo_path)
+ pred_mojo = mojo_model.predict(data)
+
+ pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, prob=1, tol=1e-8)
+ print(" PASSED: {}".format(label))
+ return pred_h2o
+
+
+def assert_predictions_differ(pred1, pred2, label):
+ col = "p0" if "p0" in pred1.columns else "predict"
+ max_diff = (pred1[col].asnumeric() - pred2[col].asnumeric()).abs().max()
+ assert max_diff > 1e-10, \
+ "{}: predictions should differ but max diff = {}".format(label, max_diff)
+ print(" DIFFER OK: {} (max_diff={})".format(label, max_diff))
+
+
+def verify_features_change_predictions(family_label, y, x, data, base_params):
+ """Verify that all combinations of RO and CV produce different predictions from each other."""
+ pred_base = compare_mojo(family_label + "_base_check", y, x, data, base_params)
+ pred_ro = compare_mojo(family_label + "_ro_check", y, x, data, dict(base_params, remove_offset_effects=True))
+ pred_cv = compare_mojo(family_label + "_cv_check", y, x, data, dict(base_params, control_variables=["PSA"]))
+ pred_both = compare_mojo(family_label + "_both_check", y, x, data,
+ dict(base_params, remove_offset_effects=True, control_variables=["PSA"]))
+
+ assert_predictions_differ(pred_base, pred_ro, family_label + " baseline vs RO")
+ assert_predictions_differ(pred_base, pred_cv, family_label + " baseline vs CV")
+ assert_predictions_differ(pred_base, pred_both, family_label + " baseline vs RO+CV")
+ assert_predictions_differ(pred_ro, pred_cv, family_label + " RO vs CV")
+ assert_predictions_differ(pred_ro, pred_both, family_label + " RO vs RO+CV")
+ assert_predictions_differ(pred_cv, pred_both, family_label + " CV vs RO+CV")
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(glm_mojo_control_vars_offset)
+else:
+ glm_mojo_control_vars_offset()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects.py b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects.py
new file mode 100644
index 000000000000..f426683fcdab
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects.py
@@ -0,0 +1,67 @@
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+
+
+def glm_remove_offset_effects():
+
+ cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+ cars = cars[cars["economy_20mpg"].isna() == 0]
+ cars["name"] = cars["name"].asfactor()
+ cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
+ offset = h2o.H2OFrame([[.5]] * cars.nrows)
+ offset.set_names(["offset"])
+ cars = cars.cbind(offset)
+
+ glm_model = H2OGeneralizedLinearEstimator(family="binomial")
+ glm_model.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+
+ predictions_train = glm_model.predict(cars).as_data_frame()
+ print(glm_model._model_json["output"]["scoring_history"])
+
+ glm_model_2 = H2OGeneralizedLinearEstimator(family="binomial", generate_scoring_history=True)
+ glm_model_2.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+
+ predictions_train_2 = glm_model_2.predict(cars).as_data_frame()
+ print(glm_model_2._model_json["output"]["scoring_history"])
+
+ glm_model_roe = H2OGeneralizedLinearEstimator(family="binomial", offset_column="offset", remove_offset_effects=True)
+ glm_model_roe.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+
+ predictions_train_roe = glm_model_roe.predict(cars).as_data_frame()
+ print(glm_model_roe._model_json["output"]["scoring_history"])
+
+ glm_model_roe_2 = H2OGeneralizedLinearEstimator(family="binomial", offset_column="offset", remove_offset_effects=True,
+ generate_scoring_history=True)
+ glm_model_roe_2.train(x=["name", "power", "year"], y="economy_20mpg", training_frame=cars)
+ predictions_train_roe2 = glm_model_roe_2.predict(cars).as_data_frame()
+ print(glm_model_roe_2._model_json["output"]["scoring_history"])
+
+ # check model metrics are not the same
+ try:
+ pyunit_utils.check_model_metrics(glm_model, glm_model_roe, "")
+ except AssertionError as err:
+ assert "Scoring history is not the same" in str(err)
+ else:
+ assert False, "Expected check_model_metrics to fail because scoring history should differ " \
+ "between glm_model and glm_model_roe"
+
+ # check predictions are different
+ for i in range(predictions_train.shape[0]):
+ pyunit_utils.assert_not_equal(predictions_train.iloc[i, 1], predictions_train_roe.iloc[i, 1],
+ f"Predictions at position {i} should differ but they don't!")
+
+ # check predictions are the same with and without generate_scoring history
+ for i in range(predictions_train.shape[0]):
+ pyunit_utils.assert_equals(predictions_train.iloc[i, 1], predictions_train_2.iloc[i, 1],
+ f"Predictions at position {i} should not differ but they do!")
+ pyunit_utils.assert_equals(predictions_train_roe.iloc[i, 1], predictions_train_roe2.iloc[i, 1],
+ f"Predictions at position {i} should not differ but they do!")
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(glm_remove_offset_effects)
+else:
+ glm_remove_offset_effects()
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects_compare.py b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects_compare.py
new file mode 100644
index 000000000000..72d48ee5a68b
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_remove_offset_effects_compare.py
@@ -0,0 +1,75 @@
+from builtins import range
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+
+
+def glm_remove_offset_effects():
+
+ cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+ cars = cars[cars["economy_20mpg"].isna() == 0]
+ cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
+
+ offset_col = "offset"
+ offset = h2o.H2OFrame([[.5]]*cars.nrows)
+ offset.set_names([offset_col])
+ cars = cars.cbind(offset)
+
+ # offset_column passed in the train method
+ glm_model = H2OGeneralizedLinearEstimator(family="binomial")
+ glm_model.train(x=list(range(2,8)),y="economy_20mpg", training_frame=cars, offset_column=offset_col)
+
+ # predict with offset
+ predictions_train = glm_model.predict(cars).as_data_frame()
+ print(predictions_train)
+
+ # metrics with offset
+ perf = glm_model.model_performance(cars)
+ print(perf)
+
+ # offset_column passed in the train method
+ glm_model_remove_offset_effects = H2OGeneralizedLinearEstimator(family="binomial", remove_offset_effects=True)
+ glm_model_remove_offset_effects.train(x=list(range(2,8)),y="economy_20mpg", training_frame=cars,
+ offset_column=offset_col)
+
+ predictions_train_remove_offset_effects = glm_model_remove_offset_effects.predict(cars).as_data_frame()
+ print(predictions_train_remove_offset_effects)
+
+ # metrics with remove offset effects enabled
+ perf_remove_offset_effects = glm_model_remove_offset_effects.model_performance(cars)
+ print(perf_remove_offset_effects)
+
+ # setup offset column to zero to remove its effect
+ cars[offset_col] = 0
+
+ # predict with offset effects removed
+ predictions_train_remove_offset_manual = glm_model.predict(cars).as_data_frame()
+ print(predictions_train_remove_offset_manual)
+
+ # metrics with offset effects removed
+ perf_remove_offset_manual = glm_model.model_performance(cars)
+ print(perf_remove_offset_manual)
+
+ mse_with_offset = perf.mse()
+ mse_remove_offset_manual = perf_remove_offset_manual.mse()
+ mse_remove_offset_effects = perf_remove_offset_effects.mse()
+ # use tolerance-based comparisons to avoid brittleness with floating point values
+ assert abs(mse_with_offset - mse_remove_offset_manual) > 1e-6, \
+ "MSE with offset should differ from MSE with offset effects manually removed"
+ pyunit_utils.assert_equals(mse_remove_offset_manual, mse_remove_offset_effects, delta=1e-6)
+
+ # check predictions are different
+ for i in range(predictions_train.shape[0]):
+ pyunit_utils.assert_not_equal(predictions_train.iloc[i, 1], predictions_train_remove_offset_effects.iloc[i, 1],
+ f"Predictions at position {i} should differ but they don't!")
+ pyunit_utils.assert_equals(predictions_train_remove_offset_manual.iloc[i, 1],
+ predictions_train_remove_offset_effects.iloc[i, 1],
+ f"Predictions at position {i} should equal but they don't!")
+
+
+if __name__ == "__main__":
+ pyunit_utils.standalone_test(glm_remove_offset_effects)
+else:
+ glm_remove_offset_effects()
diff --git a/h2o-r/h2o-package/R/glm.R b/h2o-r/h2o-package/R/glm.R
index 38796550d9b9..ec76bbe64d25 100644
--- a/h2o-r/h2o-package/R/glm.R
+++ b/h2o-r/h2o-package/R/glm.R
@@ -116,6 +116,7 @@
#' @param stopping_tolerance Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this
#' much) Defaults to 0.001.
#' @param control_variables A list of predictor column indices which is used for training but removed for scoring. Experimental.
+#' @param remove_offset_effects \code{Logical}. Remove offset effects from scoring and metric calculation. Experimental. Defaults to FALSE.
#' @param balance_classes \code{Logical}. Balance training data class counts via over/under-sampling (for imbalanced data). Defaults to
#' FALSE.
#' @param class_sampling_factors Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
@@ -267,6 +268,7 @@ h2o.glm <- function(x,
stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"),
stopping_tolerance = 0.001,
control_variables = NULL,
+ remove_offset_effects = FALSE,
balance_classes = FALSE,
class_sampling_factors = NULL,
max_after_balance_size = 5.0,
@@ -429,6 +431,8 @@ h2o.glm <- function(x,
parms$stopping_tolerance <- stopping_tolerance
if (!missing(control_variables))
parms$control_variables <- control_variables
+ if (!missing(remove_offset_effects))
+ parms$remove_offset_effects <- remove_offset_effects
if (!missing(balance_classes))
parms$balance_classes <- balance_classes
if (!missing(class_sampling_factors))
@@ -563,6 +567,7 @@ h2o.glm <- function(x,
stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"),
stopping_tolerance = 0.001,
control_variables = NULL,
+ remove_offset_effects = FALSE,
balance_classes = FALSE,
class_sampling_factors = NULL,
max_after_balance_size = 5.0,
@@ -730,6 +735,8 @@ h2o.glm <- function(x,
parms$stopping_tolerance <- stopping_tolerance
if (!missing(control_variables))
parms$control_variables <- control_variables
+ if (!missing(remove_offset_effects))
+ parms$remove_offset_effects <- remove_offset_effects
if (!missing(balance_classes))
parms$balance_classes <- balance_classes
if (!missing(class_sampling_factors))
@@ -828,12 +835,26 @@ h2o.makeGLMModel <- function(model,beta) {
#' Make unrestricted GLM model when control variables are defined.
#'
#' Needs source model trained with control variables enabled.
-#' @param model a GLM \linkS4class{H2OModel} trained with control variable
+#' @param model a GLM \linkS4class{H2OModel} trained with control variables or with remove offset effects enabled
#' @param destination_key a string or a NULL
+#' @param control_variables_enabled a logical flag; when TRUE, the returned model is affected only by the
+#' control variables feature (available only if the control_variables and remove_offset_effects parameters are both set)
+#' @param remove_offset_effects_enabled a logical flag; when TRUE, the returned model is affected only by the
+#' remove offset effects feature (available only if the control_variables and remove_offset_effects parameters are both set)
#' @export
-h2o.make_unrestricted_glm_model <- function(model, destination_key = NULL) {
- stopifnot("GLM wasn't trained with control variables." = !is.null(model@params$actual[["control_variables"]]))
- query <- list(method = "POST", .h2o.__GLMMakeUnrestrictedModel, model = model@model_id)
+h2o.make_unrestricted_glm_model <- function(model, destination_key = NULL, control_variables_enabled = FALSE, remove_offset_effects_enabled = FALSE) {
+ stopifnot("GLM wasn't trained with control variables or with remove offset effects." =
+ !is.null(model@params$actual[["control_variables"]]) || isTRUE(model@params$actual[["remove_offset_effects"]]))
+ if ((is.null(model@params$actual[["control_variables"]]) || isFALSE(model@params$actual[["remove_offset_effects"]]))
+ && (isTRUE(control_variables_enabled) || isTRUE(remove_offset_effects_enabled))) {
+ stop("GLM wasn't trained with both control variables and with remove offset effects feature set, the control_variables_enabled and remove_offset_effects_enabled features cannot be used.")
+ }
+ if ((!is.null(model@params$actual[["control_variables"]]) || isTRUE(model@params$actual[["remove_offset_effects"]]))
+ && (isTRUE(control_variables_enabled) && isTRUE(remove_offset_effects_enabled))){
+ stop("The control_variables_enabled and remove_offset_effects_enabled feature cannot be used together. It produces the same model as the main model.")
+ }
+ query <- list(method = "POST", .h2o.__GLMMakeUnrestrictedModel, model = model@model_id,
+ control_variables_enabled=control_variables_enabled, remove_offset_effects_enabled=remove_offset_effects_enabled)
if (!missing(destination_key) && !is.null(destination_key)) {
query <- c(query, list(dest = destination_key))
}
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_make_unrestricted_model.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_make_unrestricted_model.R
new file mode 100644
index 000000000000..03913ceb4b78
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_make_unrestricted_model.R
@@ -0,0 +1,66 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+
+
+glm_make_unrestricted_model_test <- function() {
+  df <- h2o.importFile("https://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")
+  df$CAPSULE <- as.factor(df$CAPSULE)
+  df$RACE <- as.factor(df$RACE)
+  df$DCAPS <- as.factor(df$DCAPS)
+  df$DPROS <- as.factor(df$DPROS)
+
+  response <- "CAPSULE"
+
+  prostate_glm <- h2o.glm(family = "binomial",
+                          y = response,
+                          training_frame = df,
+                          generate_scoring_history = TRUE,
+                          score_each_iteration = TRUE,
+                          offset_column = "AGE",
+                          remove_offset_effects = TRUE,
+                          control_variables = c("PSA")
+  )
+
+  summary(prostate_glm)
+
+  # test make unrestricted model (neither enable flag set)
+  unrestricted_prostate_glm <- h2o.make_unrestricted_glm_model(prostate_glm)
+  expect_false(is.null(unrestricted_prostate_glm))
+  summary(unrestricted_prostate_glm)
+
+  # test make unrestricted model with control variables enabled
+  unrestricted_prostate_glm_cv_enabled <- h2o.make_unrestricted_glm_model(prostate_glm,
+                                                                          control_variables_enabled=TRUE)
+  expect_false(is.null(unrestricted_prostate_glm_cv_enabled))
+  summary(unrestricted_prostate_glm_cv_enabled)
+
+  # test make unrestricted model with remove offset effects enabled
+  unrestricted_prostate_glm_ro_enabled <- h2o.make_unrestricted_glm_model(prostate_glm,
+                                                                          remove_offset_effects_enabled=TRUE)
+  expect_false(is.null(unrestricted_prostate_glm_ro_enabled))
+  summary(unrestricted_prostate_glm_ro_enabled)
+
+  # should pass
+  h2o.learning_curve_plot(prostate_glm)
+  h2o.learning_curve_plot(unrestricted_prostate_glm)
+  h2o.learning_curve_plot(unrestricted_prostate_glm_cv_enabled)
+  h2o.learning_curve_plot(unrestricted_prostate_glm_ro_enabled)
+
+  # should fail: both enable flags together are rejected by the backend
+  assertError(h2o.make_unrestricted_glm_model(prostate_glm, remove_offset_effects_enabled=TRUE,
+                                              control_variables_enabled=TRUE))
+
+  prostate_glm_2 <- h2o.glm(family = "binomial",
+                            y = response,
+                            training_frame = df,
+                            generate_scoring_history = TRUE,
+                            score_each_iteration = TRUE,
+                            offset_column = "AGE",
+                            control_variables = c("PSA"))
+
+  assertError(h2o.make_unrestricted_glm_model(prostate_glm_2, remove_offset_effects_enabled=TRUE))
+  assertError(h2o.make_unrestricted_glm_model(prostate_glm_2, control_variables_enabled=TRUE))
+}
+
+doTest("GLM: Test make unrestricted model", glm_make_unrestricted_model_test)
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_mojo_control_vars_offset.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_mojo_control_vars_offset.R
new file mode 100644
index 000000000000..ee5471d6d0ba
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_mojo_control_vars_offset.R
@@ -0,0 +1,104 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+test.glm_mojo_control_vars_offset <- function() {
+  # Test GLM MOJO for all combinations of remove_offset_effects and control_variables
+  # across binomial, gaussian, and tweedie families (MOJO must reproduce in-H2O predictions).
+
+  h2o.data <- h2o.uploadFile(locate("smalldata/logreg/prostate.csv"))
+  h2o.data$CAPSULE <- as.factor(h2o.data$CAPSULE)
+  h2o.data$RACE <- as.factor(h2o.data$RACE)
+  h2o.data$DCAPS <- as.factor(h2o.data$DCAPS)
+  h2o.data$DPROS <- as.factor(h2o.data$DPROS)
+  h2o.data$offset_col <- h2o.data$AGE / 100.0  # synthetic offset on a 0-1 scale (AGE/100)
+  h2o.data$positive_vol <- abs(h2o.data$VOL) + 1  # strictly positive response used for the tweedie runs
+
+  binomial_x <- c("RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON")
+  regression_x <- c("RACE", "DCAPS", "PSA", "DPROS", "GLEASON")
+
+  # Binomial: baseline / remove-offset-effects (roe) / control-variables (cv) / both
+  for (combo in list(
+    list(label = "binomial_baseline", roe = FALSE, cv = NULL),
+    list(label = "binomial_roe", roe = TRUE, cv = NULL),
+    list(label = "binomial_cv", roe = FALSE, cv = c("PSA")),
+    list(label = "binomial_both", roe = TRUE, cv = c("PSA"))
+  )) {
+    args <- list(family = "binomial", offset_column = "offset_col", lambda = 0)
+    if (combo$roe) args$remove_offset_effects <- TRUE
+    if (!is.null(combo$cv)) args$control_variables <- combo$cv
+    compare_mojo(combo$label, binomial_x, "CAPSULE", h2o.data, args)
+  }
+
+  # Gaussian: same four combinations
+  for (combo in list(
+    list(label = "gaussian_baseline", roe = FALSE, cv = NULL),
+    list(label = "gaussian_roe", roe = TRUE, cv = NULL),
+    list(label = "gaussian_cv", roe = FALSE, cv = c("PSA")),
+    list(label = "gaussian_both", roe = TRUE, cv = c("PSA"))
+  )) {
+    args <- list(family = "gaussian", offset_column = "offset_col", lambda = 0)
+    if (combo$roe) args$remove_offset_effects <- TRUE
+    if (!is.null(combo$cv)) args$control_variables <- combo$cv
+    compare_mojo(combo$label, regression_x, "VOL", h2o.data, args)
+  }
+
+  # Verify that features actually change predictions (gaussian as representative family)
+  verify_features_change_predictions("gaussian", regression_x, "VOL", h2o.data,
+                                     list(family = "gaussian", offset_column = "offset_col", lambda = 0))
+
+  # Tweedie: same four combinations, with variance/link powers fixed
+  for (combo in list(
+    list(label = "tweedie_baseline", roe = FALSE, cv = NULL),
+    list(label = "tweedie_roe", roe = TRUE, cv = NULL),
+    list(label = "tweedie_cv", roe = FALSE, cv = c("PSA")),
+    list(label = "tweedie_both", roe = TRUE, cv = c("PSA"))
+  )) {
+    args <- list(family = "tweedie", offset_column = "offset_col", lambda = 0,
+                 tweedie_variance_power = 1.5, tweedie_link_power = 0)
+    if (combo$roe) args$remove_offset_effects <- TRUE
+    if (!is.null(combo$cv)) args$control_variables <- combo$cv
+    compare_mojo(combo$label, regression_x, "positive_vol", h2o.data, args)
+  }
+}
+
+compare_mojo <- function(label, x, y, data, args) {  # trains, round-trips through MOJO, asserts prediction parity; returns in-H2O predictions
+  Log.info(label)
+  model <- do.call(h2o.glm, c(list(x = x, y = y, training_frame = data), args))
+  pred_h2o <- h2o.predict(model, data)
+
+  mojo_path <- h2o.save_mojo(model, path = tempdir(), force = TRUE)
+  mojo_model <- h2o.import_mojo(mojo_path)
+  pred_mojo <- h2o.predict(mojo_model, data)
+
+  compareFrames(pred_h2o, pred_mojo, prob = 1, tolerance = 1e-8)
+  Log.info(paste("  PASSED:", label))
+  return(pred_h2o)
+}
+
+assert_predictions_differ <- function(pred1, pred2, label) {  # fails when two prediction frames are numerically (near-)identical
+  col <- if ("p0" %in% colnames(pred1)) "p0" else "predict"  # classification frames expose p0; regression frames only predict
+  max_diff <- max(abs(as.data.frame(pred1[[col]]) - as.data.frame(pred2[[col]])))
+  if (max_diff <= 1e-10) {
+    stop(paste0(label, ": predictions should differ but max diff = ", max_diff))
+  }
+  Log.info(paste("  DIFFER OK:", label, "max_diff =", max_diff))
+}
+
+verify_features_change_predictions <- function(family_label, x, y, data, base_args) {  # all four feature combos must yield pairwise-different predictions
+  pred_base <- compare_mojo(paste0(family_label, "_base_check"), x, y, data, base_args)
+  pred_ro <- compare_mojo(paste0(family_label, "_ro_check"), x, y, data,
+                          c(base_args, list(remove_offset_effects = TRUE)))
+  pred_cv <- compare_mojo(paste0(family_label, "_cv_check"), x, y, data,
+                          c(base_args, list(control_variables = c("PSA"))))
+  pred_both <- compare_mojo(paste0(family_label, "_both_check"), x, y, data,
+                            c(base_args, list(remove_offset_effects = TRUE, control_variables = c("PSA"))))
+
+  assert_predictions_differ(pred_base, pred_ro, paste(family_label, "baseline vs RO"))
+  assert_predictions_differ(pred_base, pred_cv, paste(family_label, "baseline vs CV"))
+  assert_predictions_differ(pred_base, pred_both, paste(family_label, "baseline vs RO+CV"))
+  assert_predictions_differ(pred_ro, pred_cv, paste(family_label, "RO vs CV"))
+  assert_predictions_differ(pred_ro, pred_both, paste(family_label, "RO vs RO+CV"))
+  assert_predictions_differ(pred_cv, pred_both, paste(family_label, "CV vs RO+CV"))
+}
+
+doTest("GLM MOJO with remove_offset_effects and control_variables", test.glm_mojo_control_vars_offset)
diff --git a/h2o-r/tests/testdir_algos/glm/runit_GLM_remove_offset_effects_explain.R b/h2o-r/tests/testdir_algos/glm/runit_GLM_remove_offset_effects_explain.R
new file mode 100644
index 000000000000..209c63c74a19
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/glm/runit_GLM_remove_offset_effects_explain.R
@@ -0,0 +1,40 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+
+
+glm_remove_offset_effects_explain <- function() {
+  df <- h2o.importFile("https://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")
+  df$CAPSULE <- as.factor(df$CAPSULE)
+  df$RACE <- as.factor(df$RACE)
+  df$DCAPS <- as.factor(df$DCAPS)
+  df$DPROS <- as.factor(df$DPROS)
+
+  response <- "CAPSULE"
+
+  prostate_glm <- h2o.glm(family = "binomial",
+                          y = response,
+                          training_frame = df,
+                          generate_scoring_history = TRUE,
+                          score_each_iteration = TRUE,
+                          offset_column = "AGE",
+                          remove_offset_effects = TRUE
+  )
+
+  summary(prostate_glm)
+
+  # test make unrestricted model from a remove_offset_effects-only source model
+  unrestricted_prostate_glm <- h2o.make_unrestricted_glm_model(prostate_glm)
+  expect_false(is.null(unrestricted_prostate_glm))
+  summary(unrestricted_prostate_glm)
+
+  # should pass
+  h2o.learning_curve_plot(prostate_glm)
+  h2o.learning_curve_plot(unrestricted_prostate_glm)
+
+  # should pass
+  h2o.explain(prostate_glm, df)
+  h2o.explain(unrestricted_prostate_glm, df)
+}
+
+doTest("GLM: Remove offset effects works with explain", glm_remove_offset_effects_explain)