
Commit 4a4a874

Feature/features and filter update (#267)
- Added `normalize` argument to `CatalogCoverage`
- Fixed `NDCG` doc
- Fixed keeping extra cols in `Dataset.filter` method. Closes #265
1 parent f7160bb

File tree (7 files changed: +61 -48 lines)

- CHANGELOG.md
- rectools/dataset/dataset.py
- rectools/metrics/catalog.py
- rectools/metrics/ranking.py
- tests/dataset/test_dataset.py
- tests/metrics/test_catalog.py
- tests/metrics/test_scoring.py


CHANGELOG.md

Lines changed: 4 additions & 1 deletion
@@ -9,9 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## Unreleased
 
 ### Added
-- `CatalogCoverage` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266))
+- `CatalogCoverage` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266), [#267](https://github.com/MobileTeleSystems/RecTools/pull/267))
 - `divide_by_achievable` argument to `NDCG` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266))
 
+### Changed
+- Interactions extra columns are not dropped in `Dataset.filter_interactions` method [#267](https://github.com/MobileTeleSystems/RecTools/pull/267)
+
 ## [0.11.0] - 17.02.2025
 
 ### Added

rectools/dataset/dataset.py

Lines changed: 2 additions & 1 deletion
@@ -401,7 +401,8 @@ def filter_interactions(
         # 1x internal -> 2x internal
         user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
         item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
-        interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
+        # We shouldn't drop extra columns if they are present
+        interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map, keep_extra_cols=True)
 
         def _handle_features(
             features: tp.Optional[Features], target_id_map: IdMap, dataset_id_map: IdMap
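
For context, a minimal usage sketch of what this fix changes (not part of the commit): an interactions frame with an extra column now keeps that column after `Dataset.filter_interactions`. The `keep_extra_cols` argument of `Dataset.construct` and the exact `filter_interactions` call below are assumptions inferred from this diff and the tests, so treat them as illustrative only.

import numpy as np
import pandas as pd

from rectools import Columns
from rectools.dataset import Dataset

# Interactions with an extra "source" column on top of the standard ones.
interactions = pd.DataFrame(
    {
        Columns.User: [10, 10, 11],
        Columns.Item: [1, 2, 1],
        Columns.Weight: [1, 1, 1],
        Columns.Datetime: ["2021-09-01", "2021-09-02", "2021-09-02"],
        "source": ["app", "web", "app"],
    }
)

# `keep_extra_cols` here is assumed; check the Dataset.construct signature in your version.
dataset = Dataset.construct(interactions, keep_extra_cols=True)

# Keep only the first two interaction rows.
# Before this fix the "source" column was dropped from the filtered dataset;
# with keep_extra_cols=True passed to Interactions.from_raw it is preserved.
filtered = dataset.filter_interactions(np.array([0, 1]))
print(filtered.interactions.df.columns)  # expected to still include "source"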

rectools/metrics/catalog.py

Lines changed: 11 additions & 2 deletions
@@ -16,23 +16,29 @@
 
 import typing as tp
 
+import attr
 import pandas as pd
 
 from rectools import Columns
 
 from .base import Catalog, MetricAtK
 
 
+@attr.s
 class CatalogCoverage(MetricAtK):
     """
-    Share of items in catalog that is present in recommendations for all users.
+    Count (or share) of items from catalog that is present in recommendations for all users.
 
     Parameters
     ----------
     k : int
         Number of items at the top of recommendations list that will be used to calculate metric.
+    normalize: bool, default ``False``
+        Flag, which says whether to normalize metric or not.
     """
 
+    normalize: bool = attr.ib(default=False)
+
     def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
         """
         Calculate metric value.
@@ -49,7 +55,10 @@ def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
         float
             Value of metric (aggregated for all users).
         """
-        return reco.loc[reco[Columns.Rank] <= self.k, Columns.Item].nunique() / len(catalog)
+        res = reco.loc[reco[Columns.Rank] <= self.k, Columns.Item].nunique()
+        if self.normalize:
+            return res / len(catalog)
+        return res
 
 
 CatalogMetric = CatalogCoverage
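
A quick usage sketch of the new `normalize` flag. The `reco` user column mirrors the test data in this commit; the item and rank values are made up for illustration and chosen so that two distinct items appear within the top 2 ranks.

import numpy as np
import pandas as pd

from rectools import Columns
from rectools.metrics import CatalogCoverage

reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 1, 2, 2, 3, 4],
        Columns.Item: [1, 2, 3, 1, 2, 1, 1],  # illustrative values, not from the test
        Columns.Rank: [1, 2, 3, 1, 2, 1, 1],
    }
)
catalog = np.arange(5)  # catalog of 5 items

# Default behaviour: raw count of distinct recommended items within the top k.
print(CatalogCoverage(k=2).calc(reco, catalog))  # 2 distinct items in top-2

# With normalize=True: share of the catalog that is covered.
print(CatalogCoverage(k=2, normalize=True).calc(reco, catalog))  # 0.4 = 2 / 5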

rectools/metrics/ranking.py

Lines changed: 16 additions & 17 deletions
@@ -314,28 +314,27 @@ class NDCG(_RankingMetric):
     r"""
     Normalized Discounted Cumulative Gain at k (NDCG@k).
 
-    Estimates relevance of recommendations taking in account their order.
+    Estimates relevance of recommendations taking in account their order. `"Discounted Gain"`
+    means that original item relevance is being discounted based on this
+    items rank. The closer is item to the top the, the more gain is achieved.
+    `"Cumulative"` means that all items discounted gains from ``k`` ranks are being summed.
+    `"Normalized"` means that the actual value of DCG is being divided by the `"Ideal DCG"` (IDCG).
+    This is the maximum possible value of `DCG@k`, used as normalization coefficient to ensure that
+    `NDCG@k` values lie in ``[0, 1]``.
 
     .. math::
         NDCG@k=\frac{1}{|U|}\sum_{u \in U}\frac{DCG_u@k}{IDCG_u@k}
 
+        DCG_u@k = \sum_{i=1}^{k} \frac{rel_u(i)}{log(i + 1)}
+
     where
-    - :math:`DCG_u@k` is "Discounted Cumulative Gain" at k for user u.
-    - `"Gain"` stands for relevance of item at position i to user. It equals to ``1`` if this item
-      is relevant, ``0`` otherwise
-    - `"Discounted Gain"` means that original item relevance is being discounted based on this
-      items rank. The closer is item to the top the, the more gain is achieved.
-    - `"Discounted Cumulative Gain"` means that discounted gains are summed together.
-    - :math:`IDCG_u@k` is `"Ideal Discounted Cumulative Gain"` at k for user u. This is maximum
-      possible value of `DCG@k`, used as normalization coefficient to ensure that `NDCG@k`
-      values lie in ``[0, 1]``.
-
-    When `divide_by_achievable` is set to ``False`` (default) `IDCG_u@k` is the same value for all
-    users and is equal to:
-    :math:`IDCG_u@k = \sum_{i=1}^{k} \frac{1}{log(i + 1)}`
-    When `divide_by_achievable` is set to ``True``, the formula for IDCG depends
-    on number of each user relevant items in the test set. The formula is:
-    :math:`IDCG_u@k = \sum_{i=1}^{\min (|R(u)|, k)} \frac{1}{log(i + 1)}`
+    - :math:`IDCG_u@k = \sum_{i=1}^{k} \frac{1}{log(i + 1)}` when `divide_by_achievable` is set
+      to ``False`` (default).
+    - :math:`IDCG_u@k = \sum_{i=1}^{\min (|R(u)|, k)} \frac{1}{log(i + 1)}` when
+      `divide_by_achievable` is set to ``True``.
+    - :math:`rel_u(i)` is `"Gain"`. Here it is an indicator function, it equals to ``1`` if the
+      item at rank ``i`` is relevant to user ``u``, ``0`` otherwise.
+    - :math:`|R_u|` is number of relevant (ground truth) items for user ``u``.
 
     Parameters
     ----------
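
To make the formulas concrete, here is a hand-rolled sketch of DCG/IDCG/NDCG for a single user (not the library implementation). The docstring writes `log(i + 1)` without fixing a base; the sketch assumes base-2 logarithms.

import numpy as np


def dcg_at_k(rels: np.ndarray, k: int) -> float:
    """DCG@k = sum_{i=1..k} rel(i) / log2(i + 1), following the docstring above."""
    rels = np.asarray(rels[:k], dtype=float)
    ranks = np.arange(1, len(rels) + 1)
    return float(np.sum(rels / np.log2(ranks + 1)))


def ndcg_at_k(rels: np.ndarray, k: int, n_relevant: int, divide_by_achievable: bool = False) -> float:
    """NDCG@k for one user; `rels` are 0/1 relevance flags in recommendation order."""
    ideal_len = min(n_relevant, k) if divide_by_achievable else k
    idcg = dcg_at_k(np.ones(ideal_len), ideal_len)  # ideal reco: all top positions relevant
    return dcg_at_k(rels, k) / idcg


# A user with 3 relevant items; the top-4 recommendations hit at ranks 1 and 3.
hits = np.array([1, 0, 1, 0])
print(ndcg_at_k(hits, k=4, n_relevant=3))                             # ~0.59
print(ndcg_at_k(hits, k=4, n_relevant=3, divide_by_achievable=True))  # ~0.70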

tests/dataset/test_dataset.py

Lines changed: 22 additions & 22 deletions
@@ -362,19 +362,19 @@ def dataset_to_filter(self) -> Dataset:
         user_id_map = IdMap.from_values([10, 11, 12, 13, 14])
         df = pd.DataFrame(
             [
-                [0, 0, 1, "2021-09-01"],
-                [4, 2, 1, "2021-09-02"],
-                [2, 1, 1, "2021-09-02"],
-                [2, 2, 1, "2021-09-03"],
-                [3, 2, 1, "2021-09-03"],
-                [3, 3, 1, "2021-09-03"],
-                [3, 4, 1, "2021-09-04"],
-                [1, 2, 1, "2021-09-04"],
-                [3, 1, 1, "2021-09-05"],
-                [4, 2, 1, "2021-09-05"],
-                [3, 3, 1, "2021-09-06"],
+                [0, 0, 1, "2021-09-01", 1],
+                [4, 2, 1, "2021-09-02", 1],
+                [2, 1, 1, "2021-09-02", 1],
+                [2, 2, 1, "2021-09-03", 1],
+                [3, 2, 1, "2021-09-03", 1],
+                [3, 3, 1, "2021-09-03", 1],
+                [3, 4, 1, "2021-09-04", 1],
+                [1, 2, 1, "2021-09-04", 1],
+                [3, 1, 1, "2021-09-05", 1],
+                [4, 2, 1, "2021-09-05", 1],
+                [3, 3, 1, "2021-09-06", 1],
             ],
-            columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
+            columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
         ).astype({Columns.Datetime: "datetime64[ns]"})
         interactions = Interactions(df)
         return Dataset(user_id_map, item_id_map, interactions)
@@ -426,12 +426,12 @@ def test_filter_dataset_interactions_df_rows_without_features(
         )
         expected_interactions_2x_internal_df = pd.DataFrame(
             [
-                [0, 0, 1, "2021-09-01"],
-                [1, 1, 1, "2021-09-02"],
-                [2, 2, 1, "2021-09-02"],
-                [2, 1, 1, "2021-09-03"],
+                [0, 0, 1, "2021-09-01", 1],
+                [1, 1, 1, "2021-09-02", 1],
+                [2, 2, 1, "2021-09-02", 1],
+                [2, 1, 1, "2021-09-03", 1],
             ],
-            columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
+            columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
         ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float})
         np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids)
         np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids)
@@ -464,12 +464,12 @@ def test_filter_dataset_interactions_df_rows_with_features(
        )
         expected_interactions_2x_internal_df = pd.DataFrame(
             [
-                [0, 0, 1, "2021-09-01"],
-                [1, 1, 1, "2021-09-02"],
-                [2, 2, 1, "2021-09-02"],
-                [2, 1, 1, "2021-09-03"],
+                [0, 0, 1, "2021-09-01", 1],
+                [1, 1, 1, "2021-09-02", 1],
+                [2, 2, 1, "2021-09-02", 1],
+                [2, 1, 1, "2021-09-03", 1],
             ],
-            columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
+            columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
         ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float})
         np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids)
         np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids)

tests/metrics/test_catalog.py

Lines changed: 5 additions & 4 deletions
@@ -16,14 +16,14 @@
 
 import numpy as np
 import pandas as pd
+import pytest
 
 from rectools import Columns
 from rectools.metrics import CatalogCoverage
 
 
 class TestCatalogCoverage:
     def setup_method(self) -> None:
-        self.metric = CatalogCoverage(k=2)
         self.reco = pd.DataFrame(
             {
                 Columns.User: [1, 1, 1, 2, 2, 3, 4],
@@ -32,7 +32,8 @@ def setup_method(self) -> None:
             }
         )
 
-    def test_calc(self) -> None:
+    @pytest.mark.parametrize("normalize,expected", ((True, 0.4), (False, 2.0)))
+    def test_calc(self, normalize: bool, expected: float) -> None:
         catalog = np.arange(5)
-        expected = 0.4
-        assert self.metric.calc(self.reco, catalog) == expected
+        metric = CatalogCoverage(k=2, normalize=normalize)
+        assert metric.calc(self.reco, catalog) == expected

tests/metrics/test_scoring.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ def test_success(self) -> None:
             "sufficient": SufficientReco(k=2),
             "unrepeated": UnrepeatedReco(k=2),
             "covered_users": CoveredUsers(k=2),
-            "catalog_coverage": CatalogCoverage(k=2),
+            "catalog_coverage": CatalogCoverage(k=2, normalize=True),
         }
         with pytest.warns(UserWarning, match="Custom metrics are not supported"):
             actual = calc_metrics(
