Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
216 changes: 96 additions & 120 deletions msdbook/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,89 +3,116 @@
import pandas as pd
import matplotlib.pyplot as plt
from msdbook.utils import fit_logit, plot_contour_map
from statsmodels.base.wrapper import ResultsWrapper
import warnings
from statsmodels.tools.sm_exceptions import HessianInversionWarning
warnings.simplefilter("ignore", HessianInversionWarning)

warnings.simplefilter("ignore", HessianInversionWarning)


@pytest.fixture
def sample_data():
    """Provide a reproducible random DataFrame for the fit/plot tests.

    Columns: a binary ``Success`` outcome, two continuous predictors, and an
    independent ``Interaction`` column (not the product of the predictors —
    tests that need a real product build it themselves).
    """
    np.random.seed(0)  # fixed seed so fitted coefficients are reproducible

    n = 100  # number of samples

    df = pd.DataFrame(
        {
            "Success": np.random.randint(0, 2, size=n),  # binary outcome (0 or 1)
            "Predictor1": np.random.randn(n),
            "Predictor2": np.random.randn(n),
            "Interaction": np.random.randn(n),
        }
    )

    return df
# Column-name constants shared by the parametrized cases below, so a rename
# of a DataFrame column only has to happen in one place.
predictor1 = "Predictor1"
predictor2 = "Predictor2"
interaction = "Interaction"
intercept = "Intercept"
success = "Success"


@pytest.mark.filterwarnings("ignore:Inverting hessian failed, no bse or cov_params available")
@pytest.mark.parametrize(
    "sample_data, df_resid, df_model, llf",
    [
        (
            pd.DataFrame(
                {
                    predictor1: [1.0, 2.0, 3.0],
                    predictor2: [3.0, 4.0, 5.0],
                    interaction: [2.0, 4.0, 6.0],
                    intercept: [1.0, 1.0, 1.0],
                    success: [1.0, 1.0, 0.0],
                }
            ),
            0.0,
            2.0,
            -6.691275315650184e-06,
        ),
        (
            pd.DataFrame(
                {
                    predictor1: [5.0, 6.0, 7.0],
                    predictor2: [7.0, 8.0, 9.0],
                    interaction: [3.0, 6.0, 9.0],
                    intercept: [1.0, 1.0, 1.0],
                    success: [1.0, 0.0, 1.0],
                }
            ),
            0.0,
            2.0,
            -2.4002923915238235e-06,
        ),
        (
            pd.DataFrame(
                {
                    predictor1: [0.5, 1.5, 2.5],
                    predictor2: [1.0, 2.0, 3.0],
                    interaction: [0.2, 0.4, 0.6],
                    intercept: [1.0, 1.0, 1.0],
                    success: [0.0, 1.0, 1.0],
                }
            ),
            0.0,
            2.0,
            -1.7925479970021486e-05,
        ),
    ],
)
def test_fit_logit(sample_data, df_resid, df_model, llf):
    """Check fit_logit's degrees of freedom and log-likelihood on tiny fixed inputs.

    The parametrization shadows the ``sample_data`` fixture on purpose, feeding
    hand-built 3-row frames with known fit results.
    """
    predictors = [predictor1, predictor2]
    result = fit_logit(sample_data, predictors)
    assert result.df_resid == df_resid
    assert result.df_model == df_model
    assert result.llf == llf


def test_plot_contour_map(sample_data):
"""Test the plot_contour_map function."""
fig, ax = plt.subplots()

# Fit a logit model for the purpose of plotting
result = fit_logit(sample_data, ['Predictor1', 'Predictor2'])
result = fit_logit(sample_data, ["Predictor1", "Predictor2"])

# Dynamically generate grid and levels based on input data to reflect the data range
xgrid_min, xgrid_max = sample_data['Predictor1'].min(), sample_data['Predictor1'].max()
ygrid_min, ygrid_max = sample_data['Predictor2'].min(), sample_data['Predictor2'].max()
xgrid_min, xgrid_max = sample_data["Predictor1"].min(), sample_data["Predictor1"].max()
ygrid_min, ygrid_max = sample_data["Predictor2"].min(), sample_data["Predictor2"].max()
xgrid = np.linspace(xgrid_min - 1, xgrid_max + 1, 50)
ygrid = np.linspace(ygrid_min - 1, ygrid_max + 1, 50)
levels = np.linspace(0, 1, 10)
contour_cmap = 'viridis'
dot_cmap = 'coolwarm'

contour_cmap = "viridis"
dot_cmap = "coolwarm"

# Call the plot function
contourset = plot_contour_map(
ax,
Expand All @@ -96,114 +123,63 @@ def test_plot_contour_map(sample_data):
levels,
xgrid,
ygrid,
'Predictor1',
'Predictor2',
base=0,
"Predictor1",
"Predictor2",
)

# Verify the plot and axis limits/labels are correct
assert contourset is not None
assert ax.get_xlim() == (xgrid.min(), xgrid.max())
assert ax.get_ylim() == (ygrid.min(), ygrid.max())
assert ax.get_xlabel() == 'Predictor1'
assert ax.get_ylabel() == 'Predictor2'
assert ax.get_xlabel() == "Predictor1"
assert ax.get_ylabel() == "Predictor2"

# Verify that scatter plot is present by checking the number of points
assert len(ax.collections) > 0
assert len(ax.collections) > 0
plt.close(fig)


def test_empty_data():
    """Fitting on an empty DataFrame must raise ValueError rather than fit silently."""
    empty_df = pd.DataFrame({"Success": [], "Predictor1": [], "Predictor2": [], "Interaction": []})

    # Fitting with no rows cannot converge on anything meaningful; expect a hard error.
    with pytest.raises(ValueError):
        fit_logit(empty_df, ["Predictor1", "Predictor2"])

    # Close any figures opened as a side effect so state doesn't leak between tests.
    plt.close("all")


def test_invalid_predictors(sample_data):
    """Referencing columns that don't exist in the data must raise KeyError."""
    invalid_predictors = ["InvalidPredictor1", "InvalidPredictor2"]

    with pytest.raises(KeyError):
        fit_logit(sample_data, invalid_predictors)


def test_logit_with_interaction(sample_data):
    """Logistic regression picks up an Interaction column when one is present."""
    # Work on a copy so the shared fixture is not mutated for other tests.
    data = sample_data.copy()
    data["Interaction"] = data["Predictor1"] * data["Predictor2"]
    result = fit_logit(data, ["Predictor1", "Predictor2"])

    # The interaction term must appear among the fitted coefficients.
    assert "Interaction" in result.params.index


def test_fit_logit_comprehensive(sample_data):
    """Broad sanity checks on a fit: coefficient magnitude, terms present, p-values.

    Deliberately avoids asserting exact coefficient values — on random fixture
    data those are brittle across statsmodels/numpy versions.
    """
    result = fit_logit(sample_data, ["Predictor1", "Predictor2"])

    # Coefficients should stay in a sane range (no blow-up from a failed fit).
    assert np.all(np.abs(result.params) < 10)

    # All expected terms must be present in the fitted model.
    for predictor in ["Intercept", "Predictor1", "Predictor2", "Interaction"]:
        assert predictor in result.params.index

    # P-values should be finite numbers...
    assert np.all(np.isfinite(result.pvalues))
    # ...and at least one coefficient should look significant at the 10% level.
    assert np.any(result.pvalues < 0.1)


def test_fit_logit_without_interaction():
    """fit_logit must succeed on data that has no Interaction column."""
    np.random.seed(42)
    n_samples = 100

    # Deliberately omit the Interaction column from this frame.
    frame = pd.DataFrame(
        {
            "Success": np.random.randint(0, 2, size=n_samples),
            "Predictor1": np.random.randn(n_samples),
            "Predictor2": np.random.randn(n_samples),
        }
    )

    # Must not raise a KeyError for the missing column.
    fitted = fit_logit(frame, ["Predictor1", "Predictor2"])

    assert fitted is not None
    assert hasattr(fitted, "params")

    # Exactly three parameters — intercept plus the two predictors, no Interaction.
    assert len(fitted.params) == 3
    for name in ("Intercept", "Predictor1", "Predictor2"):
        assert name in fitted.params.index
    assert "Interaction" not in fitted.params.index

35 changes: 17 additions & 18 deletions msdbook/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import statsmodels.api as sm


def fit_logit(dta, predictors):
    """Logistic regression with optional interaction term.

    Fits ``Success ~ Intercept + predictors [+ Interaction]`` via statsmodels.

    Parameters
    ----------
    dta : pandas.DataFrame
        Data containing a binary ``Success`` column and the predictor columns.
        NOTE: an ``Intercept`` column of ones is added to this frame in place.
    predictors : list of str
        Names of the predictor columns to include.

    Returns
    -------
    statsmodels results wrapper for the fitted logit model.
    """
    # Add intercept column of 1s
    dta["Intercept"] = np.ones(np.shape(dta)[0])

    # Columns for the design matrix, starting with the intercept.
    cols = dta.columns.tolist()[-1:] + predictors

    # Only include the interaction term when the column actually exists —
    # unconditionally appending "Interaction" raises KeyError on data
    # without that column (see test_fit_logit_without_interaction).
    if "Interaction" in dta.columns:
        cols = cols + ["Interaction"]

    # Fit logistic regression without the deprecated 'disp' argument
    logit = sm.Logit(dta["Success"], dta[cols])
    result = logit.fit(method="bfgs")  # bfgs is a currently supported optimizer

    return result

def plot_contour_map(
ax, result, dta, contour_cmap, dot_cmap, levels, xgrid, ygrid, xvar, yvar, base
):

def plot_contour_map(ax, result, dta, contour_cmap, dot_cmap, levels, xgrid, ygrid, xvar, yvar):
"""Plot the contour map"""

# Ignore tight layout warnings
Expand All @@ -60,12 +56,15 @@ def plot_contour_map(
Z = np.reshape(z, np.shape(X))

contourset = ax.contourf(X, Y, Z, levels, cmap=contour_cmap, aspect="auto")

# Plot scatter points based on the data
xpoints = np.mean(dta[xvar].values.reshape(-1, 10), axis=1)
ypoints = np.mean(dta[yvar].values.reshape(-1, 10), axis=1)
colors = np.round(np.mean(dta["Success"].values.reshape(-1, 10), axis=1), 0)

# Trim data to ensure it's divisible by 10 for reshaping
n = len(dta[xvar].values)
n_trim = (n // 10) * 10
xpoints = np.mean(dta[xvar].values[:n_trim].reshape(-1, 10), axis=1)
ypoints = np.mean(dta[yvar].values[:n_trim].reshape(-1, 10), axis=1)
colors = np.round(np.mean(dta["Success"].values[:n_trim].reshape(-1, 10), axis=1), 0)

ax.scatter(xpoints, ypoints, s=10, c=colors, edgecolor="none", cmap=dot_cmap)
ax.set_xlim(np.min(xgrid), np.max(xgrid))
ax.set_ylim(np.min(ygrid), np.max(ygrid))
Expand Down
7 changes: 4 additions & 3 deletions notebooks/basin_users_logistic_regression.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -302,15 +302,16 @@
" # plot contour map\n",
" contourset = plot_contour_map(ax, result, dta, contour_cmap,\n",
" dot_cmap, contour_levels, xgrid,\n",
" ygrid, all_predictors[0], all_predictors[1], base)\n",
" ygrid, all_predictors[0], all_predictors[1])\n",
" \n",
" ax.set_title(usernames[i])\n",
" \n",
"# set up colorbar\n",
"cbar_ax = fig.add_axes([0.98, 0.15, 0.05, 0.7])\n",
"cbar = fig.colorbar(contourset, cax=cbar_ax)\n",
"cbar_ax.set_ylabel('Probability of Success', fontsize=16)\n",
"cbar_ax.tick_params(axis='y', which='major', labelsize=12)\n"
"cbar_ax.tick_params(axis='y', which='major', labelsize=12)\n",
""
]
},
{
Expand Down Expand Up @@ -382,4 +383,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}