smaller test dataset for bgbb (#1039)

ColtAllen · web-flow · commit aa5fa1d280ea · 2024-09-15T22:38:37.000+02:00
diff --git a/tests/clv/models/test_beta_geo_beta_binom.py b/tests/clv/models/test_beta_geo_beta_binom.py
@@ -40,13 +40,10 @@ def setup_class(cls):
         cls.gamma_true = 0.6567
 
         # Use Quickstart dataset (the CDNOW_sample research data) for testing
-        test_data = pd.read_csv("data/bgbb_donations.csv")
+        cls.data = pd.read_csv("data/bgbb_donations.csv")
 
-        cls.data = test_data
-        # cls.customer_id = test_data["customer_id"]
-        # cls.frequency = test_data["frequency"]
-        # cls.recency = test_data["recency"]
-        # cls.T = test_data["T"]
+        # sample from full dataset for tests involving model fits
+        cls.sample_data = cls.data.sample(n=1000, random_state=45)
 
         # take sample of all unique recency/frequency/T combinations to test predictive methods
         test_customer_ids = [
@@ -74,8 +71,8 @@ def setup_class(cls):
             11103,
         ]
 
-        cls.sample_data = test_data.query("customer_id.isin(@test_customer_ids)")
-        cls.sample_data_N = len(test_customer_ids)
+        cls.pred_data = cls.data.query("customer_id.isin(@test_customer_ids)")
+        cls.pred_data_N = len(test_customer_ids)
 
         # Instantiate model with CDNOW data for testing
         cls.model = BetaGeoBetaBinomModel(cls.data)
@@ -278,13 +275,16 @@ def test_model_repr(self, custom_config):
     @pytest.mark.parametrize(
         "fit_method, rtol",
         [
-            ("mcmc", 0.1),
+            (
+                "mcmc",
+                0.3,
+            ),  # sample_data needs a looser rtol; the full dataset fits within rtol=0.1
             ("map", 0.2),
         ],
     )
     def test_model_convergence(self, fit_method, rtol, model_config):
         model = BetaGeoBetaBinomModel(
-            data=self.data,
+            data=self.sample_data,
             model_config=model_config,
         )
         model.build_model()
@@ -307,7 +307,7 @@ def test_model_convergence(self, fit_method, rtol, model_config):
         )
 
     def test_fit_result_without_fit(self, model_config):
-        model = BetaGeoBetaBinomModel(data=self.data, model_config=model_config)
+        model = BetaGeoBetaBinomModel(data=self.pred_data, model_config=model_config)
         with pytest.raises(RuntimeError, match="The model hasn't been fit yet"):
             model.fit_result
 
@@ -327,20 +327,20 @@ def test_expected_purchases(self, test_t):
         true_purchases = (
             self.lifetimes_model.conditional_expected_number_of_purchases_up_to_time(
                 m_periods_in_future=test_t,
-                frequency=self.sample_data["frequency"],
-                recency=self.sample_data["recency"],
-                n_periods=self.sample_data["T"],
+                frequency=self.pred_data["frequency"],
+                recency=self.pred_data["recency"],
+                n_periods=self.pred_data["T"],
             )
         )
 
         # test parametrization with default data has different dims
         est_num_purchases = self.model.expected_purchases(future_t=test_t)
         assert est_num_purchases.shape == (self.chains, self.draws, self.N)
 
-        data = self.sample_data.assign(future_t=test_t)
+        data = self.pred_data.assign(future_t=test_t)
         est_num_purchases = self.model.expected_purchases(data)
 
-        assert est_num_purchases.shape == (self.chains, self.draws, self.sample_data_N)
+        assert est_num_purchases.shape == (self.chains, self.draws, self.pred_data_N)
         assert est_num_purchases.dims == ("chain", "draw", "customer_id")
 
         np.testing.assert_allclose(
@@ -398,33 +398,33 @@ def test_expected_purchases_new_customer(self):
     def test_expected_probability_alive(self, test_t):
         true_prob_alive = self.lifetimes_model.conditional_probability_alive(
             m_periods_in_future=test_t,
-            frequency=self.sample_data["frequency"],
-            recency=self.sample_data["recency"],
-            n_periods=self.sample_data["T"],
+            frequency=self.pred_data["frequency"],
+            recency=self.pred_data["recency"],
+            n_periods=self.pred_data["T"],
         )
 
         # test parametrization with default data has different dims
         est_prob_alive = self.model.expected_probability_alive(future_t=test_t)
         assert est_prob_alive.shape == (self.chains, self.draws, self.N)
 
-        sample_data = self.sample_data.assign(future_t=test_t)
-        est_prob_alive = self.model.expected_probability_alive(sample_data)
+        pred_data = self.pred_data.assign(future_t=test_t)
+        est_prob_alive = self.model.expected_probability_alive(pred_data)
 
-        assert est_prob_alive.shape == (self.chains, self.draws, self.sample_data_N)
+        assert est_prob_alive.shape == (self.chains, self.draws, self.pred_data_N)
         assert est_prob_alive.dims == ("chain", "draw", "customer_id")
         np.testing.assert_allclose(
             true_prob_alive,
             est_prob_alive.mean(("chain", "draw")),
             rtol=0.01,
         )
 
-        alt_data = self.sample_data.assign(future_t=7.5)
+        alt_data = self.pred_data.assign(future_t=7.5)
         est_prob_alive_t = self.model.expected_probability_alive(alt_data)
         assert est_prob_alive.mean() > est_prob_alive_t.mean()
 
     def test_distribution_new_customer(self) -> None:
         mock_model = BetaGeoBetaBinomModel(
-            data=self.data,
+            data=self.sample_data,
         )
         mock_model.build_model()
         mock_model.idata = az.from_dict(
@@ -444,7 +444,7 @@ def test_distribution_new_customer(self) -> None:
             random_seed=rng
         )
         customer_rec_freq = mock_model.distribution_new_customer_recency_frequency(
-            self.data, T=self.data["T"], random_seed=rng
+            self.sample_data, T=self.sample_data["T"], random_seed=rng
         )
         customer_rec = customer_rec_freq.sel(obs_var="recency")
         customer_freq = customer_rec_freq.sel(obs_var="frequency")
@@ -463,7 +463,7 @@ def test_distribution_new_customer(self) -> None:
                 beta=self.beta_true,
                 delta=self.delta_true,
                 gamma=self.gamma_true,
-                T=self.data["T"],
+                T=self.sample_data["T"],
             ),
             random_seed=rng,
         ).T