Fix/pop in cat refit (#163)

blondered · web-flow · commit 374b4ae3f3a7 · 2024-07-16T13:48:16.000+03:00
- Fixed `PopularInCategoryModel` refit behaviour and `cross_validate` compatibility - Fixed `PopularInCategoryModel` behaviour when a category has no interactions - Added tests. Closes #162
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 - Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141))
 - Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142))
+- `PopularInCategoryModel` fitting multiple times, `cross_validate` compatibility, behaviour with empty category interactions ([#163](https://github.com/MobileTeleSystems/RecTools/pull/163))
 
 
 ## [0.6.0] - 13.05.2024
diff --git a/README.md b/README.md
@@ -16,8 +16,9 @@
   <a href="https://rectools.readthedocs.io/en/stable/">Documentation</a> |
   <a href="https://github.com/MobileTeleSystems/RecTools/tree/main/examples">Examples</a> |
     <a href="https://github.com/MobileTeleSystems/RecTools/tree/main/examples/tutorials">Tutorials</a> |
-  <a href="https://github.com/MobileTeleSystems/RecTools/blob/main/CONTRIBUTING.rst">Contribution Guide</a> |
-  <a href="https://github.com/MobileTeleSystems/RecTools/releases">Release Notes</a>
+  <a href="https://github.com/MobileTeleSystems/RecTools/blob/main/CONTRIBUTING.rst">Contributing</a> |
+  <a href="https://github.com/MobileTeleSystems/RecTools/releases">Releases</a> |
+  <a href="https://github.com/orgs/MobileTeleSystems/projects/1">Developers Board</a>
 </p>
 
 RecTools is an easy-to-use Python library which makes the process of building recommendation systems easier, 
diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py
@@ -160,15 +160,19 @@ def _check_category_feature(self, dataset: Dataset) -> None:
 
     def _calc_category_scores(self, dataset: Dataset, interactions: pd.DataFrame) -> None:
         scores_dict = {}
+        empty_columns = []
         for column_num in self.category_columns:
             item_idx = dataset.item_features.values.getcol(column_num).nonzero()[0]  # type: ignore
-            self.category_interactions[column_num] = interactions[interactions[Columns.Item].isin(item_idx)].copy()
+            category_interactions = interactions[interactions[Columns.Item].isin(item_idx)]
             # Category interactions might be empty
-            if self.category_interactions[column_num].shape[0] == 0:
-                self.category_columns.remove(column_num)
+            if category_interactions.shape[0] == 0:
+                empty_columns.append(column_num)
             else:
+                self.category_interactions[column_num] = category_interactions.copy()
                 col, func = self._get_groupby_col_and_agg_func(self.popularity)
                 scores_dict[column_num] = self.category_interactions[column_num][col].apply(func)
+
+        self.category_columns = [col for col in self.category_columns if col not in empty_columns]
         self.category_scores = pd.Series(scores_dict).sort_values(ascending=False)
 
     def _define_categories_for_analysis(self) -> None:
@@ -177,7 +181,7 @@ def _define_categories_for_analysis(self) -> None:
                 self.n_effective_categories = self.n_categories
                 relevant_categories = self.category_scores.head(self.n_categories).index
                 self.category_scores = self.category_scores.loc[relevant_categories]
-                self.category_columns = relevant_categories
+                self.category_columns = relevant_categories.to_list()
             else:
                 self.n_effective_categories = len(self.category_columns)
                 warnings.warn(
@@ -188,6 +192,13 @@ def _define_categories_for_analysis(self) -> None:
             self.n_effective_categories = len(self.category_columns)
 
     def _fit(self, dataset: Dataset) -> None:  # type: ignore
+
+        self.category_columns = []
+        self.category_interactions = {}
+        self.models = {}
+        self.category_scores = pd.Series()
+        self.n_effective_categories = 0
+
         self._check_category_feature(dataset)
         interactions = self._filter_interactions(dataset.interactions.df)
         self._calc_category_scores(dataset, interactions)
diff --git a/tests/model_selection/test_cross_validate.py b/tests/model_selection/test_cross_validate.py
@@ -28,7 +28,7 @@
 from rectools.metrics.base import MetricAtK
 from rectools.model_selection import LastNSplitter, cross_validate
 from rectools.model_selection.cross_validate import _gen_2x_internal_ids_dataset
-from rectools.models import ImplicitALSWrapperModel, PopularModel, RandomModel
+from rectools.models import ImplicitALSWrapperModel, PopularInCategoryModel, PopularModel, RandomModel
 from rectools.models.base import ModelBase
 from tests.testing_utils import assert_sparse_matrix_equal
 
@@ -146,6 +146,7 @@ def setup_method(self) -> None:
                 [14, "f2", 1],
                 [11, "f1", "y"],
                 [11, "f2", 2],
+                [12, "f1", "y"],
             ],
             columns=["id", "feature", "value"],
         )
@@ -247,6 +248,7 @@ def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -
 
         models: tp.Dict[str, ModelBase] = {
             "als": ImplicitALSWrapperModel(AlternatingLeastSquares(factors=2, iterations=2, random_state=42)),
+            "pop_in_cat": PopularInCategoryModel(category_feature="f1", n_categories=2),
         }
 
         actual = cross_validate(
@@ -282,7 +284,9 @@ def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -
             ],
             "metrics": [
                 {"model": "als", "i_split": 0, "precision@2": 0.5, "recall@1": 0.0},
-                {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25},
+                {"model": "pop_in_cat", "i_split": 0, "precision@2": 0.5, "recall@1": 0.5},
+                {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.0},
+                {"model": "pop_in_cat", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25},
             ],
         }
 
diff --git a/tests/models/test_popular_in_category.py b/tests/models/test_popular_in_category.py
@@ -422,11 +422,25 @@ def test_i2i(
             actual,
         )
 
-    def test_second_fit_refits_model(self, dataset: Dataset) -> None:
+    @pytest.mark.parametrize("popularity", ("mean_weight", "n_users", "n_interactions"))
+    @pytest.mark.parametrize("category_feature", ("f1", "f2"))
+    @pytest.mark.parametrize("mixing_strategy", ("group", "rotate"))
+    @pytest.mark.parametrize("ratio_strategy", ("equal", "proportional"))
+    @pytest.mark.parametrize("n_categories", (2, None))
+    def test_second_fit_refits_model(
+        self,
+        dataset: Dataset,
+        popularity: str,
+        category_feature: str,
+        mixing_strategy: str,
+        ratio_strategy: str,
+        n_categories: tp.Optional[int],
+    ) -> None:
         model = PopularInCategoryModel(
-            category_feature="f2",
-            popularity="mean_weight",
-            mixing_strategy="group",
-            ratio_strategy="proportional",
+            category_feature=category_feature,
+            popularity=popularity,
+            mixing_strategy=mixing_strategy,
+            ratio_strategy=ratio_strategy,
+            n_categories=n_categories,
         )
         assert_second_fit_refits_model(model, dataset)