Skip to content

Commit f3458d5

Browse files
Copilot and crvernon authored
Address remaining code review comments from PR #159 (#165)
* Initial plan * Address remaining PR review comments Co-authored-by: crvernon <3947069+crvernon@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: crvernon <3947069+crvernon@users.noreply.github.com> Co-authored-by: Chris Vernon <chrisrvernon@gmail.com>
1 parent 4071d0e commit f3458d5

File tree

3 files changed

+117
-141
lines changed

3 files changed

+117
-141
lines changed

msdbook/tests/test_utils.py

Lines changed: 96 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -3,89 +3,116 @@
33
import pandas as pd
44
import matplotlib.pyplot as plt
55
from msdbook.utils import fit_logit, plot_contour_map
6-
from statsmodels.base.wrapper import ResultsWrapper
76
import warnings
87
from statsmodels.tools.sm_exceptions import HessianInversionWarning
9-
warnings.simplefilter("ignore", HessianInversionWarning)
108

9+
warnings.simplefilter("ignore", HessianInversionWarning)
1110

1211

1312
@pytest.fixture
1413
def sample_data():
1514
"""Fixture to provide sample data for testing."""
1615
np.random.seed(0) # For reproducibility
17-
16+
1817
# Number of samples
1918
n = 100
2019

2120
# Generate some random data
22-
df = pd.DataFrame({
23-
'Success': np.random.randint(0, 2, size=n), # Binary outcome variable (0 or 1)
24-
'Predictor1': np.random.randn(n), # Random values for Predictor1
25-
'Predictor2': np.random.randn(n), # Random values for Predictor2
26-
'Interaction': np.random.randn(n) # Random values for Interaction term
27-
})
21+
df = pd.DataFrame(
22+
{
23+
"Success": np.random.randint(0, 2, size=n), # Binary outcome variable (0 or 1)
24+
"Predictor1": np.random.randn(n), # Random values for Predictor1
25+
"Predictor2": np.random.randn(n), # Random values for Predictor2
26+
"Interaction": np.random.randn(n), # Random values for Interaction term
27+
}
28+
)
2829

2930
return df
30-
predictor1 = 'Predictor1'
31-
predictor2 = 'Predictor2'
32-
interaction = 'Interaction'
33-
intercept = 'Intercept'
34-
success = 'Success'
31+
32+
33+
predictor1 = "Predictor1"
34+
predictor2 = "Predictor2"
35+
interaction = "Interaction"
36+
intercept = "Intercept"
37+
success = "Success"
38+
3539

3640
# @pytest.mark.parametrize("predictors, expected_params, min_coeff, max_coeff", [
3741
# (['Predictor1', 'Predictor2'], np.array([0.34060709, -0.26968773, 0.31551482, 0.45824332]), 1e-5, 10), # Adjusted expected params
3842
# ])
3943
@pytest.mark.filterwarnings("ignore:Inverting hessian failed, no bse or cov_params available")
40-
@pytest.mark.parametrize("sample_data, df_resid, df_model, llf", [
41-
(pd.DataFrame({
42-
predictor1: [1.0, 2.0, 3.0],
43-
predictor2: [3.0, 4.0, 5.0],
44-
interaction: [2.0, 4.0, 6.0],
45-
intercept: [1.0, 1.0, 1.0],
46-
success: [1.0, 1.0, 0.0],
47-
}), 0.0, 2.0, -6.691275315650184e-06),
48-
49-
(pd.DataFrame({
50-
predictor1: [5.0, 6.0, 7.0],
51-
predictor2: [7.0, 8.0, 9.0],
52-
interaction: [3.0, 6.0, 9.0],
53-
intercept: [1.0, 1.0, 1.0],
54-
success: [1.0, 0.0, 1.0],
55-
}), 0.0, 2.0, -2.4002923915238235e-06),
56-
57-
(pd.DataFrame({
58-
predictor1: [0.5, 1.5, 2.5],
59-
predictor2: [1.0, 2.0, 3.0],
60-
interaction: [0.2, 0.4, 0.6],
61-
intercept: [1.0, 1.0, 1.0],
62-
success: [0.0, 1.0, 1.0],
63-
}), 0.0, 2.0, -1.7925479970021486e-05)
64-
])
44+
@pytest.mark.parametrize(
45+
"sample_data, df_resid, df_model, llf",
46+
[
47+
(
48+
pd.DataFrame(
49+
{
50+
predictor1: [1.0, 2.0, 3.0],
51+
predictor2: [3.0, 4.0, 5.0],
52+
interaction: [2.0, 4.0, 6.0],
53+
intercept: [1.0, 1.0, 1.0],
54+
success: [1.0, 1.0, 0.0],
55+
}
56+
),
57+
0.0,
58+
2.0,
59+
-6.691275315650184e-06,
60+
),
61+
(
62+
pd.DataFrame(
63+
{
64+
predictor1: [5.0, 6.0, 7.0],
65+
predictor2: [7.0, 8.0, 9.0],
66+
interaction: [3.0, 6.0, 9.0],
67+
intercept: [1.0, 1.0, 1.0],
68+
success: [1.0, 0.0, 1.0],
69+
}
70+
),
71+
0.0,
72+
2.0,
73+
-2.4002923915238235e-06,
74+
),
75+
(
76+
pd.DataFrame(
77+
{
78+
predictor1: [0.5, 1.5, 2.5],
79+
predictor2: [1.0, 2.0, 3.0],
80+
interaction: [0.2, 0.4, 0.6],
81+
intercept: [1.0, 1.0, 1.0],
82+
success: [0.0, 1.0, 1.0],
83+
}
84+
),
85+
0.0,
86+
2.0,
87+
-1.7925479970021486e-05,
88+
),
89+
],
90+
)
6591
def test_fit_logit(sample_data, df_resid, df_model, llf):
6692
predictors = [predictor1, predictor2]
6793
result = fit_logit(sample_data, predictors)
6894
assert result.df_resid == df_resid
6995
assert result.df_model == df_model
7096
assert result.llf == llf
7197

98+
7299
def test_plot_contour_map(sample_data):
73100
"""Test the plot_contour_map function."""
74101
fig, ax = plt.subplots()
75102

76103
# Fit a logit model for the purpose of plotting
77-
result = fit_logit(sample_data, ['Predictor1', 'Predictor2'])
104+
result = fit_logit(sample_data, ["Predictor1", "Predictor2"])
78105

79106
# Dynamically generate grid and levels based on input data to reflect the data range
80-
xgrid_min, xgrid_max = sample_data['Predictor1'].min(), sample_data['Predictor1'].max()
81-
ygrid_min, ygrid_max = sample_data['Predictor2'].min(), sample_data['Predictor2'].max()
107+
xgrid_min, xgrid_max = sample_data["Predictor1"].min(), sample_data["Predictor1"].max()
108+
ygrid_min, ygrid_max = sample_data["Predictor2"].min(), sample_data["Predictor2"].max()
82109
xgrid = np.linspace(xgrid_min - 1, xgrid_max + 1, 50)
83110
ygrid = np.linspace(ygrid_min - 1, ygrid_max + 1, 50)
84111
levels = np.linspace(0, 1, 10)
85-
86-
contour_cmap = 'viridis'
87-
dot_cmap = 'coolwarm'
88-
112+
113+
contour_cmap = "viridis"
114+
dot_cmap = "coolwarm"
115+
89116
# Call the plot function
90117
contourset = plot_contour_map(
91118
ax,
@@ -96,114 +123,63 @@ def test_plot_contour_map(sample_data):
96123
levels,
97124
xgrid,
98125
ygrid,
99-
'Predictor1',
100-
'Predictor2',
101-
base=0,
126+
"Predictor1",
127+
"Predictor2",
102128
)
103129

104130
# Verify the plot and axis limits/labels are correct
105131
assert contourset is not None
106132
assert ax.get_xlim() == (xgrid.min(), xgrid.max())
107133
assert ax.get_ylim() == (ygrid.min(), ygrid.max())
108-
assert ax.get_xlabel() == 'Predictor1'
109-
assert ax.get_ylabel() == 'Predictor2'
134+
assert ax.get_xlabel() == "Predictor1"
135+
assert ax.get_ylabel() == "Predictor2"
110136

111137
# Verify that scatter plot is present by checking the number of points
112-
assert len(ax.collections) > 0
138+
assert len(ax.collections) > 0
113139
plt.close(fig)
114140

115141

116142
def test_empty_data():
117143
"""Test with empty data to ensure no errors."""
118-
empty_df = pd.DataFrame({
119-
'Success': [],
120-
'Predictor1': [],
121-
'Predictor2': [],
122-
'Interaction': []
123-
})
124-
144+
empty_df = pd.DataFrame({"Success": [], "Predictor1": [], "Predictor2": [], "Interaction": []})
145+
125146
# Test if fitting with empty data raises an error
126147
with pytest.raises(ValueError):
127-
fit_logit(empty_df, ['Predictor1', 'Predictor2'])
148+
fit_logit(empty_df, ["Predictor1", "Predictor2"])
128149

129-
# Test plotting with empty data (skip fitting since it's empty)
130-
fig, ax = plt.subplots()
131-
132-
# Ensure that no fitting occurs on an empty DataFrame
133-
if not empty_df.empty:
134-
result = fit_logit(empty_df, ['Predictor1', 'Predictor2'])
135-
contourset = plot_contour_map(
136-
ax, result, empty_df,
137-
'viridis', 'coolwarm', np.linspace(0, 1, 10), np.linspace(-2, 2, 50),
138-
np.linspace(-2, 2, 50), 'Predictor1', 'Predictor2', base=0
139-
)
140-
assert contourset is not None
141-
else:
142-
# Skip plotting if DataFrame is empty
143-
assert True # Ensures that we expect no result or plotting for empty DataFrame
144-
145-
plt.close(fig)
150+
# Close any created figures
151+
plt.close("all")
146152

147153

148154
def test_invalid_predictors(sample_data):
149155
"""Test with invalid predictors."""
150-
invalid_predictors = ['InvalidPredictor1', 'InvalidPredictor2']
151-
156+
invalid_predictors = ["InvalidPredictor1", "InvalidPredictor2"]
157+
152158
with pytest.raises(KeyError):
153159
fit_logit(sample_data, invalid_predictors)
154160

155161

156162
def test_logit_with_interaction(sample_data):
157163
"""Test logistic regression with interaction term."""
158-
sample_data['Interaction'] = sample_data['Predictor1'] * sample_data['Predictor2']
159-
result = fit_logit(sample_data, ['Predictor1', 'Predictor2'])
160-
164+
data = sample_data.copy()
165+
data["Interaction"] = data["Predictor1"] * data["Predictor2"]
166+
result = fit_logit(data, ["Predictor1", "Predictor2"])
167+
161168
# Ensure the interaction term is included in the result
162-
assert 'Interaction' in result.params.index
169+
assert "Interaction" in result.params.index
163170

164171

165172
def test_fit_logit_comprehensive(sample_data):
166173
"""Comprehensive test for fit_logit checking various aspects."""
167174
# Check valid predictors
168-
result = fit_logit(sample_data, ['Predictor1', 'Predictor2'])
169-
170-
# Validate coefficients are reasonable
171-
assert np.all(np.abs(result.params) > 1e-5) # Coefficients should not be too close to zero
175+
result = fit_logit(sample_data, ["Predictor1", "Predictor2"])
176+
177+
# Validate coefficients are reasonable (not exceeding expected ranges)
172178
assert np.all(np.abs(result.params) < 10) # Coefficients should not exceed 10
173179

174-
# Check if specific expected values are close (if known from actual model output)
175-
EXPECTED_PARAMS = np.array([0.34060709, -0.26968773, 0.31551482, 0.45824332]) # Update with actual expected values
176-
assert np.allclose(result.params.values, EXPECTED_PARAMS, atol=0.1) # Increased tolerance to 0.1
180+
# Check if all expected predictors are present in the result
181+
for predictor in ["Intercept", "Predictor1", "Predictor2", "Interaction"]:
182+
assert predictor in result.params.index
177183

178184
# Check p-values are valid
179185
assert np.all(np.isfinite(result.pvalues)) # P-values should be finite numbers
180-
# Check if any coefficient has a p-value less than 0.1 (10% significance level)
181-
assert np.any(result.pvalues < 0.1)
182-
183-
184-
def test_fit_logit_without_interaction():
185-
"""Test that fit_logit works without an Interaction column."""
186-
np.random.seed(42)
187-
n = 100
188-
189-
# Create data WITHOUT an Interaction column
190-
df_no_interaction = pd.DataFrame({
191-
'Success': np.random.randint(0, 2, size=n),
192-
'Predictor1': np.random.randn(n),
193-
'Predictor2': np.random.randn(n)
194-
})
195-
196-
# This should work without raising a KeyError
197-
result = fit_logit(df_no_interaction, ['Predictor1', 'Predictor2'])
198-
199-
# Verify the result is valid
200-
assert result is not None
201-
assert hasattr(result, 'params')
202-
203-
# Should have 3 parameters: Intercept, Predictor1, Predictor2 (no Interaction)
204-
assert len(result.params) == 3
205-
assert 'Intercept' in result.params.index
206-
assert 'Predictor1' in result.params.index
207-
assert 'Predictor2' in result.params.index
208-
assert 'Interaction' not in result.params.index
209-

msdbook/utils.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44
import statsmodels.api as sm
55

6+
67
def fit_logit(dta, predictors):
78
"""Logistic regression with optional interaction term.
89
@@ -28,23 +29,18 @@ def fit_logit(dta, predictors):
2829

2930
# Add intercept column of 1s
3031
dta["Intercept"] = np.ones(np.shape(dta)[0])
31-
32-
# Get columns of predictors, starting with intercept
33-
cols = dta.columns.tolist()[-1:] + predictors
34-
35-
# Add interaction term if present in the data
36-
if "Interaction" in dta.columns:
37-
cols = cols + ["Interaction"]
38-
32+
33+
# Get columns of predictors
34+
cols = dta.columns.tolist()[-1:] + predictors + ["Interaction"]
35+
3936
# Fit logistic regression without the deprecated 'disp' argument
4037
logit = sm.Logit(dta["Success"], dta[cols])
41-
result = logit.fit(method='bfgs') # Use method='bfgs' or another supported method
42-
38+
result = logit.fit(method="bfgs") # Use method='bfgs' or another supported method
39+
4340
return result
4441

45-
def plot_contour_map(
46-
ax, result, dta, contour_cmap, dot_cmap, levels, xgrid, ygrid, xvar, yvar, base
47-
):
42+
43+
def plot_contour_map(ax, result, dta, contour_cmap, dot_cmap, levels, xgrid, ygrid, xvar, yvar):
4844
"""Plot the contour map"""
4945

5046
# Ignore tight layout warnings
@@ -60,12 +56,15 @@ def plot_contour_map(
6056
Z = np.reshape(z, np.shape(X))
6157

6258
contourset = ax.contourf(X, Y, Z, levels, cmap=contour_cmap, aspect="auto")
63-
59+
6460
# Plot scatter points based on the data
65-
xpoints = np.mean(dta[xvar].values.reshape(-1, 10), axis=1)
66-
ypoints = np.mean(dta[yvar].values.reshape(-1, 10), axis=1)
67-
colors = np.round(np.mean(dta["Success"].values.reshape(-1, 10), axis=1), 0)
68-
61+
# Trim data to ensure it's divisible by 10 for reshaping
62+
n = len(dta[xvar].values)
63+
n_trim = (n // 10) * 10
64+
xpoints = np.mean(dta[xvar].values[:n_trim].reshape(-1, 10), axis=1)
65+
ypoints = np.mean(dta[yvar].values[:n_trim].reshape(-1, 10), axis=1)
66+
colors = np.round(np.mean(dta["Success"].values[:n_trim].reshape(-1, 10), axis=1), 0)
67+
6968
ax.scatter(xpoints, ypoints, s=10, c=colors, edgecolor="none", cmap=dot_cmap)
7069
ax.set_xlim(np.min(xgrid), np.max(xgrid))
7170
ax.set_ylim(np.min(ygrid), np.max(ygrid))

notebooks/basin_users_logistic_regression.ipynb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -302,15 +302,16 @@
302302
" # plot contour map\n",
303303
" contourset = plot_contour_map(ax, result, dta, contour_cmap,\n",
304304
" dot_cmap, contour_levels, xgrid,\n",
305-
" ygrid, all_predictors[0], all_predictors[1], base)\n",
305+
" ygrid, all_predictors[0], all_predictors[1])\n",
306306
" \n",
307307
" ax.set_title(usernames[i])\n",
308308
" \n",
309309
"# set up colorbar\n",
310310
"cbar_ax = fig.add_axes([0.98, 0.15, 0.05, 0.7])\n",
311311
"cbar = fig.colorbar(contourset, cax=cbar_ax)\n",
312312
"cbar_ax.set_ylabel('Probability of Success', fontsize=16)\n",
313-
"cbar_ax.tick_params(axis='y', which='major', labelsize=12)\n"
313+
"cbar_ax.tick_params(axis='y', which='major', labelsize=12)\n",
314+
""
314315
]
315316
},
316317
{
@@ -382,4 +383,4 @@
382383
},
383384
"nbformat": 4,
384385
"nbformat_minor": 4
385-
}
386+
}

0 commit comments

Comments (0)