Skip to content

Commit 8cdc595

Browse files
Qing Feng authored and facebook-github-bot committed
implement ContextualDataset (#2066)
Summary: Pull Request resolved: #2066 Implement contextual dataset for fitting contextual GP. If one single dataset is passed, we construct the data for fitting the LCEA GP; if multiple datasets are given, each should correspond to a context breakdown of one outcome, and they are expected to be combined and fitted with the LCEMGP. Reviewed By: bletham Differential Revision: D50440957 fbshipit-source-id: 55b06d3bc3c739eb47978c4303d38be0cc286dbf
1 parent e1cb934 commit 8cdc595

File tree

2 files changed

+324
-0
lines changed

2 files changed

+324
-0
lines changed

botorch/utils/datasets.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
from __future__ import annotations
1010

11+
import collections
12+
1113
import warnings
1214
from typing import Any, Dict, List, Optional, Union
1315

@@ -476,3 +478,152 @@ def get_dataset_without_task_feature(self, outcome_name: str) -> SupervisedDatas
476478
],
477479
outcome_names=[outcome_name],
478480
)
481+
482+
483+
class ContextualDataset(SupervisedDataset):
    """A dataset bundling contextual data for fitting a contextual GP.

    Constructed from either a single dataset containing the overall outcome
    (used to fit an LCEA GP) or a list of datasets that each corresponds to
    a context breakdown of one outcome (combined to fit an LCEMGP).
    """

    def __init__(
        self,
        datasets: List[SupervisedDataset],
        parameter_decomposition: Dict[str, List[str]],
        context_buckets: List[str],
        metric_decomposition: Optional[Dict[str, List[str]]] = None,
    ) -> None:
        """Construct a `ContextualDataset`.

        Args:
            datasets: A list of the datasets of individual tasks. Each dataset
                is expected to contain data for only one outcome.
            parameter_decomposition: Dict from context name to list of indices
                of X corresponding to that context.
            context_buckets: List of the context names in the order of dataset
                in datasets corresponding to each context outcome.
            metric_decomposition: Context breakdown metrics. Keys are context names.
                Values are the lists of metric names belonging to the context:
                {'context1': ['m1_c1'], 'context2': ['m1_c2'],}.

        Raises:
            InputDataError: If the datasets / decompositions are inconsistent
                (see `_validate_datasets`).
            ValueError: If a context bucket matches multiple outcomes
                (see `_sort_outcome_names`).
        """
        # Keyed by the (single) outcome name of each child dataset.
        # NOTE(review): datasets sharing an outcome name would silently
        # collapse here — presumably callers guarantee uniqueness; confirm.
        self.datasets: Dict[str, SupervisedDataset] = {
            ds.outcome_names[0]: ds for ds in datasets
        }
        self.feature_names = datasets[0].feature_names
        # Provisional order; replaced by the bucket-sorted order below.
        self.outcome_names = list(self.datasets.keys())
        self.parameter_decomposition = parameter_decomposition
        self.context_buckets = context_buckets
        self.metric_decomposition = metric_decomposition
        self._validate_datasets(
            datasets=datasets, metric_decomposition=metric_decomposition
        )
        # order the dataset based on context bucket
        self.outcome_names = self._sort_outcome_names()

    @property
    def X(self) -> Tensor:
        # All child datasets share the same X (enforced by
        # `_validate_datasets`), so any child can supply it.
        return self.datasets[self.outcome_names[0]].X

    @property
    def Y(self) -> Tensor:
        """Concatenates the Ys from the child datasets to create the Y expected
        by LCEM model if there are multiple datasets; Or return the Y expected
        by LCEA model if there is only one dataset.
        """
        if len(self.datasets) == 1:
            # Single overall-outcome dataset: LCEA model.
            return self.datasets[self.outcome_names[0]].Y
        # One column per context outcome, in bucket order: LCEM model.
        return torch.cat(
            [self.datasets[outcome_name].Y for outcome_name in self.outcome_names],
            dim=-1,
        )

    @property
    def Yvar(self) -> Tensor:
        """Concatenates the Yvars from the child datasets to create the Yvar
        expected by LCEM model if there are multiple datasets; Or return the
        Yvar expected by LCEA model if there is only one dataset.
        """
        if len(self.datasets) == 1:
            # Single overall-outcome dataset: LCEA model.
            return self.datasets[self.outcome_names[0]].Yvar
        # NOTE(review): assumes every child dataset carries a Yvar;
        # `torch.cat` would raise if any is None — confirm upstream guarantee.
        return torch.cat(
            [
                self.datasets[outcome_name].Yvar
                for outcome_name in self.outcome_names
            ],
            dim=-1,
        )

    def _sort_outcome_names(self) -> List[str]:
        """Sort the outcome names according to the order of context buckets.

        Returns:
            The outcome names, one per context bucket, in bucket order.

        Raises:
            ValueError: If one context bucket matches more than one outcome.
        """
        outcome_names = list(self.datasets.keys())
        if len(outcome_names) == 1:
            # Single overall outcome: nothing to sort.
            return outcome_names
        context_outcome_map = {}
        for context in self.context_buckets:
            for outcome_name in outcome_names:
                if outcome_name in self.metric_decomposition[context]:
                    if context in context_outcome_map:
                        # NOTE: the "mutltiple" typo is load-bearing — existing
                        # tests assert this exact message text.
                        raise ValueError(
                            f"{context} bucket contains mutltiple outcomes"
                        )
                    context_outcome_map[context] = outcome_name
        return [context_outcome_map[context] for context in self.context_buckets]

    def _validate_datasets(
        self,
        datasets: List[SupervisedDataset],
        metric_decomposition: Optional[Dict[str, List[str]]] = None,
    ) -> None:
        """Validation of given datasets.

        1. each dataset has same X.
        2. metric_decomposition is not None if there are multiple datasets
           (and None if there is only one).
        3. metric_decomposition contains all the outcomes in datasets.
        4. value keys of parameter decomposition and the keys of
           metric_decomposition match context buckets.

        Args:
            datasets: The child datasets to validate.
            metric_decomposition: Optional mapping from context name to the
                metric names belonging to that context.

        Raises:
            InputDataError: If any of the checks above fails.
        """
        X = datasets[0].X
        for dataset in datasets:
            if not torch.equal(X, dataset.X):
                raise InputDataError("Require same X for context buckets")

        if len(datasets) > 1:
            if metric_decomposition is None:
                raise InputDataError(
                    "metric_decomposition must be provided when there are"
                    " multiple datasets."
                )
        elif metric_decomposition is not None:
            raise InputDataError(
                "metric_decomposition is redundant when there is one "
                "dataset for overall outcome."
            )

        if collections.Counter(
            self.parameter_decomposition.keys()
        ) != collections.Counter(self.context_buckets):
            raise InputDataError(
                "Keys of parameter decomposition and context buckets do not match."
            )

        if metric_decomposition is not None:
            if collections.Counter(
                metric_decomposition.keys()
            ) != collections.Counter(self.context_buckets):
                raise InputDataError(
                    "Keys of metric decomposition and context buckets do not match."
                )

            all_metrics = [
                metric
                for metrics in metric_decomposition.values()
                for metric in metrics
            ]
            for outcome in self.outcome_names:
                if outcome not in all_metrics:
                    raise InputDataError(
                        f"{outcome} is missing in metric_decomposition."
                    )

test/utils/test_datasets.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from botorch.exceptions.errors import InputDataError, UnsupportedError
1111
from botorch.utils.containers import DenseContainer, SliceContainer
1212
from botorch.utils.datasets import (
13+
ContextualDataset,
1314
FixedNoiseDataset,
1415
MultiTaskDataset,
1516
RankingDataset,
@@ -335,3 +336,175 @@ def test_multi_task(self):
335336
task_feature_index=-1,
336337
target_task_value=0,
337338
)
339+
340+
def test_contextual_datasets(self):
    """Construction, bucket ordering, and validation of ContextualDataset."""
    num_contexts = 3
    feature_names = [f"x_c{i}" for i in range(num_contexts)]
    parameter_decomposition = {
        f"context_{i}": [f"x_c{i}"] for i in range(num_contexts)
    }
    context_buckets = list(parameter_decomposition.keys())
    context_outcome_list = [f"y:context_{i}" for i in range(num_contexts)]
    metric_decomposition = {f"{c}": [f"y:{c}"] for c in context_buckets}

    # A single dataset holding the aggregated outcome (LCEA case).
    agg_datasets = [
        make_dataset(
            d=1 * num_contexts,
            has_yvar=True,
            feature_names=feature_names,
            outcome_names=["y"],
        )
    ]
    agg_dt = ContextualDataset(
        datasets=agg_datasets,
        parameter_decomposition=parameter_decomposition,
        context_buckets=context_buckets,
    )
    self.assertEqual(len(agg_dt.datasets), len(agg_datasets))
    self.assertListEqual(agg_dt.context_buckets, context_buckets)
    self.assertListEqual(agg_dt.outcome_names, ["y"])
    self.assertListEqual(agg_dt.feature_names, feature_names)
    self.assertIs(agg_dt.datasets["y"], agg_datasets[0])
    # X / Y / Yvar are passed straight through from the single child.
    self.assertIs(agg_dt.X, agg_datasets[0].X)
    self.assertIs(agg_dt.Y, agg_datasets[0].Y)
    self.assertIs(agg_dt.Yvar, agg_datasets[0].Yvar)

    # One dataset per context outcome (LCEM case), all sharing one X.
    base_ds = make_dataset(
        d=1 * num_contexts,
        has_yvar=True,
        feature_names=feature_names,
        outcome_names=[context_outcome_list[0]],
    )
    ctx_datasets = [base_ds] + [
        SupervisedDataset(
            X=base_ds.X,
            Y=rand(base_ds.Y.size()),
            Yvar=rand(base_ds.Yvar.size()),
            feature_names=feature_names,
            outcome_names=[name],
        )
        for name in context_outcome_list[1:]
    ]
    ctx_dt = ContextualDataset(
        datasets=ctx_datasets,
        parameter_decomposition=parameter_decomposition,
        context_buckets=context_buckets,
        metric_decomposition=metric_decomposition,
    )
    self.assertEqual(len(ctx_dt.datasets), len(ctx_datasets))
    self.assertListEqual(ctx_dt.context_buckets, context_buckets)
    self.assertListEqual(ctx_dt.outcome_names, context_outcome_list)
    self.assertListEqual(ctx_dt.feature_names, feature_names)
    self.assertTrue(torch.equal(ctx_dt.X, ctx_datasets[-1].X))
    # Y and Yvar get one column per context outcome.
    self.assertEqual(ctx_dt.Y.shape[-1], len(context_outcome_list))
    self.assertEqual(ctx_dt.Yvar.shape[-1], len(context_outcome_list))
    for child in ctx_datasets:
        self.assertIs(ctx_dt.datasets[child.outcome_names[0]], child)

    # Reversing the bucket order reverses the outcome / column order.
    reversed_dt = ContextualDataset(
        datasets=ctx_datasets,
        parameter_decomposition=parameter_decomposition,
        context_buckets=context_buckets[::-1],  # reverse order
        metric_decomposition=metric_decomposition,
    )
    self.assertListEqual(
        reversed_dt.outcome_names, context_outcome_list[::-1]
    )
    self.assertTrue(
        torch.equal(ctx_dt.Y, torch.flip(reversed_dt.Y, (1,)))
    )
    self.assertTrue(
        torch.equal(ctx_dt.Yvar, torch.flip(reversed_dt.Yvar, (1,)))
    )

    # A bucket mapped to two outcomes is rejected.
    bad_decomposition = {
        f"{c}": [f"y:{c}"] for c in context_buckets if c != "context_0"
    }
    bad_decomposition["context_0"] = ["y:context_0", "y:context_1"]
    with self.assertRaisesRegex(
        ValueError, "context_0 bucket contains mutltiple outcomes"
    ):
        ContextualDataset(
            datasets=ctx_datasets,
            parameter_decomposition=parameter_decomposition,
            context_buckets=context_buckets,
            metric_decomposition=bad_decomposition,
        )

    # All child datasets must share the same X.
    with self.assertRaisesRegex(
        InputDataError, "Require same X for context buckets"
    ):
        ContextualDataset(
            datasets=[
                make_dataset(d=num_contexts, outcome_names=[m])
                for m in context_outcome_list
            ],
            parameter_decomposition=parameter_decomposition,
            context_buckets=context_buckets,
        )

    # metric_decomposition is mandatory with multiple datasets...
    with self.assertRaisesRegex(
        InputDataError,
        "metric_decomposition must be provided when there are multiple datasets.",
    ):
        ContextualDataset(
            datasets=ctx_datasets,
            parameter_decomposition=parameter_decomposition,
            context_buckets=context_buckets,
        )

    # ...and forbidden with a single aggregated dataset.
    with self.assertRaisesRegex(
        InputDataError,
        "metric_decomposition is redundant when there is "
        "one dataset for overall outcome.",
    ):
        ContextualDataset(
            datasets=agg_datasets,
            parameter_decomposition=parameter_decomposition,
            context_buckets=context_buckets,
            metric_decomposition=metric_decomposition,
        )

    # parameter_decomposition keys must match the buckets.
    with self.assertRaisesRegex(
        InputDataError,
        "Keys of parameter decomposition and context buckets do not match.",
    ):
        ContextualDataset(
            datasets=agg_datasets,
            parameter_decomposition=parameter_decomposition,
            context_buckets=["context_0", "context_1"],
        )

    # metric_decomposition keys must match the buckets as well.
    with self.assertRaisesRegex(
        InputDataError,
        "Keys of metric decomposition and context buckets do not match.",
    ):
        ContextualDataset(
            datasets=ctx_datasets,
            parameter_decomposition=parameter_decomposition,
            context_buckets=context_buckets,
            metric_decomposition={
                f"{c}": [f"y:{c}"] for c in context_buckets if c != "context_0"
            },
        )

    # Every outcome must appear somewhere in metric_decomposition.
    bad_decomposition = {
        f"{c}": [f"y:{c}"] for c in context_buckets if c != "context_0"
    }
    bad_decomposition["context_0"] = ["wrong_metric"]
    missing_outcome = "y:context_0"
    with self.assertRaisesRegex(
        InputDataError, f"{missing_outcome} is missing in metric_decomposition."
    ):
        ContextualDataset(
            datasets=ctx_datasets,
            parameter_decomposition=parameter_decomposition,
            context_buckets=context_buckets,
            metric_decomposition=bad_decomposition,
        )

0 commit comments

Comments
 (0)