-
Notifications
You must be signed in to change notification settings - Fork 16
Open
Description
The noise standard deviation/variance does not exist for logistic or Poisson regression. It seems that part of it is computed in coef_cov_quad_form, but the result is not used to calculate noise_std, whereas the sample_sparse_lin_reg function does use it to calculate the variance.
https://github.com/yaglm/yaglm/blob/c6b55ea70b7f8c4e4f017acfdb1fbf0546911dee/yaglm/toy_data.py#L219
Mini example below:
"""
Mini example for training LASSO with information criterion under logistic regression
"""
from time import time
import pandas as pd
import numpy as np
from yaglm.GlmTuned import GlmTrainMetric
from yaglm.config.penalty import Lasso
from yaglm.toy_data import sample_sparse_log_reg
from yaglm.metrics.info_criteria import InfoCriteria
from yaglm.infer.Inferencer import Inferencer
# create a python package that supports the simulations
from glm_sims.utils import sample_seeds
from glm_sims.metrics import get_results_log_reg
###############
# Sample data #
###############

# Draw three seeds, one per data split (train, validation, test); each seed
# is used to sample exactly one of the data sets below.
sampling_seeds = sample_seeds(n_seeds=3, random_state=3482)

# NOTE: if the true data distribution has a random component (e.g. beta is
# randomly generated), one more seed is needed so that the distribution is
# the same for the train, validation and test data.

# Record when the simulation started (high-level bookkeeping).
sim_start_time = time()

# Keyword arguments shared by every sampling call; they specify the
# underlying distribution.
data_dis_kws = dict(
    beta_type=23,
    beta_random_state=68,
    n_features=10,
    corr=0.5,
)

# Training split.
X_train, y_train, model_info = sample_sparse_log_reg(
    n_samples=100,
    random_state=sampling_seeds[0],  # train seed
    **data_dis_kws
)

# True coefficient reported by the sampler.
coef_true = model_info['coef']

# Validation split.
X_val, y_val, _ = sample_sparse_log_reg(
    n_samples=100,
    random_state=sampling_seeds[1],  # val seed
    **data_dis_kws
)

# Test split.
X_test, y_test, _ = sample_sparse_log_reg(
    n_samples=1000,
    random_state=sampling_seeds[2],  # test seed
    **data_dis_kws
)
################
# Setup models #
################

# Stack the validation rows beneath the training rows so the models are fit
# on both splits together.
X_train_val = np.append(X_train, X_val, axis=0)
y_train_val = np.append(y_train, y_val)

# NOTE(review): cv_kws is not referenced by any statement visible in this
# example -- confirm whether it should be passed to an estimator.
cv_kws = dict(loss='log_reg', cv=5)

# Keyword arguments forwarded to the estimator constructor.
est_kws = dict(standardize=False, fit_intercept=False)

# Estimators to fit, keyed by a descriptive name.
# NOTE(review): no `loss` argument is given here although the data comes from
# sample_sparse_log_reg -- confirm the estimator's default loss is intended.
models = {
    'lasso__tune=AIC': GlmTrainMetric(
        penalty=Lasso(),
        scorer=InfoCriteria(crit='aic'),
        inferencer=Inferencer(dof='support'),
        **est_kws
    ),
}
# Fit every model in `models`, evaluate it, and collect one result row each.
results = []
for name, model in models.items():
    print(name)

    # fit model
    start_time = time()
    model.fit(X_train_val, y_train_val)

    pen_val = model.best_tune_params_['penalty__pen_val']
    # Not every penalty has a mixing parameter; only a missing key is
    # expected here, so any other error is allowed to propagate.
    try:
        mix_val = model.best_tune_params_['penalty__mix_val']
    except KeyError:
        mix_val = np.nan
    runtime = time() - start_time

    # sklearn saves the coefficient as an ndarray of shape (1, n_features),
    # while get_results_log_reg assumes shape (n_features,); flatten without
    # hard-coding the number of features.
    if name in ('sklasso__tune=cv', 'skridge__tune=cv'):
        model.coef_ = np.reshape(model.coef_, (-1,))

    # compute evaluation metrics
    # this outputs a dict where each key is the name of a metric
    # e.g. res['L1_to_truth'] = 1.2, res['test_error'] = ...
    res = get_results_log_reg(model,
                              X_train=X_train, y_train=y_train,
                              X_test=X_test, y_test=y_test,
                              coef_true=coef_true, intercept_true=0)
    res['runtime'] = runtime

    # store information identifying this row of the results data frame;
    # values are read from the sampling setup instead of being repeated
    # as literals
    res['model'] = name
    res['mc_idx'] = 1
    res['n_samples_train'] = X_train.shape[0]
    res['n_features'] = data_dis_kws['n_features']
    res['beta_type'] = data_dis_kws['beta_type']
    # NOTE(review): kept as the literal from the original example -- confirm
    # it matches the number of non-zero entries of coef_true.
    res['n_nonzero'] = 10
    res['best_pen_val'] = pen_val
    res['best_mix_val'] = mix_val
    # possibly other information e.g. n_samples if we are varying
    # the number of samples for each simulation

    results.append(res)

# convert list of dicts to data frame
results = pd.DataFrame(results)
Metadata
Assignees
Labels
No labels