Skip to content

Commit 33d3d5c

Browse files
rohanfb authored and facebook-github-bot committed
Detector Param Tuning
Summary: Minor changes to support metalearning for detection.

Reviewed By: yangbk560

Differential Revision: D28313594

fbshipit-source-id: 9e21ed88b40ed4a9153508a9548652a0638903b9
1 parent b84c91f commit 33d3d5c

File tree

5 files changed

+113
-47
lines changed

5 files changed

+113
-47
lines changed

kats/detectors/bocpd_model.py

Lines changed: 65 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,22 @@
99
algorithm as a DetectorModel, to provide a common interface.
1010
"""
1111

12-
from typing import Optional
13-
import pandas as pd
1412
import json
13+
from typing import Optional
1514

16-
from kats.detectors.detector import DetectorModel
17-
15+
import pandas as pd
1816
from kats.consts import TimeSeriesData
19-
2017
from kats.detectors.bocpd import (
2118
BOCPDetector,
2219
BOCPDModelType,
2320
)
24-
21+
from kats.detectors.detector import DetectorModel
2522
from kats.detectors.detector_consts import (
2623
AnomalyResponse,
2724
ConfidenceBand,
2825
)
2926

27+
3028
class BocpdDetectorModel(DetectorModel):
3129
"""Implements the Bayesian Online Changepoint Detection as a DetectorModel.
3230
@@ -43,16 +41,25 @@ class BocpdDetectorModel(DetectorModel):
4341
>>> anom = bocpd_detector.fit_predict(data=level_ts)
4442
"""
4543

46-
def __init__(self, serialized_model: Optional[bytes] = None, slow_drift: bool = False):
44+
def __init__(
45+
self,
46+
serialized_model: Optional[bytes] = None,
47+
slow_drift: bool = False,
48+
threshold: Optional[float] = None,
49+
):
4750
if serialized_model is None:
4851
self.slow_drift = slow_drift
52+
self.threshold = threshold
4953
else:
5054
model_dict = json.loads(serialized_model)
51-
if 'slow_drift' in model_dict:
52-
self.slow_drift = model_dict['slow_drift']
55+
if "slow_drift" in model_dict:
56+
self.slow_drift = model_dict["slow_drift"]
5357
else:
5458
self.slow_drift = slow_drift
55-
59+
if "threshold" in model_dict:
60+
self.threshold = model_dict["threshold"]
61+
else:
62+
self.threshold = threshold
5663

5764
def serialize(self) -> bytes:
5865
"""Returns the serialzed model.
@@ -64,7 +71,7 @@ def serialize(self) -> bytes:
6471
json containing information about serialized model.
6572
"""
6673

67-
model_dict = {'slow_drift': self.slow_drift}
74+
model_dict = {"slow_drift": self.slow_drift}
6875
return json.dumps(model_dict).encode("utf-8")
6976

7077
def _handle_missing_data_extend(
@@ -76,10 +83,7 @@ def _handle_missing_data_extend(
7683
# but we will remove the interpolated data when we
7784
# evaluate, to make sure that the anomaly score is
7885
# the same length as data
79-
original_time_list = (
80-
list(historical_data.time)
81-
+ list(data.time)
82-
)
86+
original_time_list = list(historical_data.time) + list(data.time)
8387

8488
if historical_data.is_data_missing():
8589
historical_data = historical_data.interpolate()
@@ -90,20 +94,27 @@ def _handle_missing_data_extend(
9094

9195
# extend has been done, now remove the interpolated data
9296
data = TimeSeriesData(
93-
pd.DataFrame({
94-
'time':[
95-
historical_data.time.iloc[i] for i in range(len(historical_data))
96-
if historical_data.time.iloc[i] in original_time_list],
97-
'value':[
98-
historical_data.value.iloc[i] for i in range(len(historical_data))
99-
if historical_data.time.iloc[i] in original_time_list]
100-
}),
101-
use_unix_time=True, unix_time_units="s", tz="US/Pacific"
97+
pd.DataFrame(
98+
{
99+
"time": [
100+
historical_data.time.iloc[i]
101+
for i in range(len(historical_data))
102+
if historical_data.time.iloc[i] in original_time_list
103+
],
104+
"value": [
105+
historical_data.value.iloc[i]
106+
for i in range(len(historical_data))
107+
if historical_data.time.iloc[i] in original_time_list
108+
],
109+
}
110+
),
111+
use_unix_time=True,
112+
unix_time_units="s",
113+
tz="US/Pacific",
102114
)
103115

104116
return data
105117

106-
107118
# pyre-fixme[14]: `fit_predict` overrides method defined in `DetectorModel`
108119
# inconsistently.
109120
def fit_predict(
@@ -129,7 +140,7 @@ def fit_predict(
129140
# pyre-fixme[16]: `BocpdDetectorModel` has no attribute `last_N`.
130141
self.last_N = len(data)
131142

132-
#if there is historical data
143+
# if there is historical data
133144
# we prepend it to data, and run
134145
# the detector as if we only saw data
135146
if historical_data is not None:
@@ -138,20 +149,39 @@ def fit_predict(
138149
bocpd_model = BOCPDetector(data=data)
139150

140151
if not self.slow_drift:
141-
_ = bocpd_model.detector(
142-
model=BOCPDModelType.NORMAL_KNOWN_MODEL, choose_priors=True,
143-
agg_cp=True
144-
)
152+
153+
if self.threshold is not None:
154+
_ = bocpd_model.detector(
155+
model=BOCPDModelType.NORMAL_KNOWN_MODEL,
156+
choose_priors=True,
157+
agg_cp=True,
158+
threshold=self.threshold,
159+
)
160+
else:
161+
_ = bocpd_model.detector(
162+
model=BOCPDModelType.NORMAL_KNOWN_MODEL,
163+
choose_priors=True,
164+
agg_cp=True,
165+
)
145166
else:
146-
_ = bocpd_model.detector(
147-
model=BOCPDModelType.TREND_CHANGE_MODEL, choose_priors=False,
148-
agg_cp=True
149-
)
167+
if self.threshold is not None:
168+
_ = bocpd_model.detector(
169+
model=BOCPDModelType.NORMAL_KNOWN_MODEL,
170+
choose_priors=True,
171+
agg_cp=True,
172+
threshold=self.threshold,
173+
)
174+
else:
175+
_ = bocpd_model.detector(
176+
model=BOCPDModelType.TREND_CHANGE_MODEL,
177+
choose_priors=False,
178+
agg_cp=True,
179+
)
150180

151181
change_prob_dict = bocpd_model.get_change_prob()
152182
change_prob = list(change_prob_dict.values())[0]
153183

154-
#construct the object
184+
# construct the object
155185
N = len(data)
156186
default_ts = TimeSeriesData(time=data.time, value=pd.Series(N * [0.0]))
157187
score_ts = TimeSeriesData(time=data.time, value=pd.Series(change_prob))

kats/detectors/changepoint_evaluator.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,6 @@ def _parse_data(self, df_row: Any):
416416
this_ts = df_row['time_series']
417417
this_anno = df_row['annotation']
418418

419-
print(this_dataset)
420-
421419
this_anno_json_acc = this_anno.replace("'", "\"")
422420
this_anno_dict = json.loads(this_anno_json_acc)
423421

kats/detectors/cusum_model.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import logging
3232
from datetime import datetime
3333
from enum import Enum
34-
from typing import Any, List, Optional
34+
from typing import Any, List, Optional, Union
3535

3636
import numpy as np
3737
import pandas as pd
@@ -49,7 +49,6 @@
4949
CHANGEPOINT_RETENTION = 7 * 24 * 60 * 60 # in seconds
5050
MAX_CHANGEPOINT = 10
5151

52-
5352
def percentage_change(
5453
data: TimeSeriesData, pre_mean: float, **kwargs: Any
5554
) -> TimeSeriesData:
@@ -94,12 +93,18 @@ class CusumScoreFunction(Enum):
9493
percentage_change = "percentage_change"
9594
z_score = "z_score"
9695

97-
96+
# Score Function Constants
9897
SCORE_FUNC_DICT = {
9998
CusumScoreFunction.change.value: change,
10099
CusumScoreFunction.percentage_change.value: percentage_change,
101100
CusumScoreFunction.z_score.value: z_score,
102101
}
102+
DEFAULT_SCORE_FUNCTION = CusumScoreFunction.change
103+
STR_TO_SCORE_FUNC = { # Used for param tuning
104+
"change": CusumScoreFunction.change,
105+
"percentage_change": CusumScoreFunction.percentage_change,
106+
"z_score": CusumScoreFunction.z_score,
107+
}
103108

104109

105110
class CUSUMDetectorModel(DetectorModel):
@@ -139,7 +144,7 @@ def __init__(
139144
magnitude_quantile: float = CUSUM_DEFAULT_ARGS["magnitude_quantile"],
140145
magnitude_ratio: float = CUSUM_DEFAULT_ARGS["magnitude_ratio"],
141146
change_directions: List[str] = CUSUM_DEFAULT_ARGS["change_directions"],
142-
score_func: CusumScoreFunction = CusumScoreFunction.change,
147+
score_func: Union[str, CusumScoreFunction] = DEFAULT_SCORE_FUNCTION,
143148
remove_seasonality: bool = CUSUM_DEFAULT_ARGS["remove_seasonality"],
144149
):
145150
if serialized_model:
@@ -178,8 +183,16 @@ def __init__(
178183
self.magnitude_quantile = magnitude_quantile
179184
self.magnitude_ratio = magnitude_ratio
180185
self.change_directions = change_directions
181-
self.score_func = score_func.value
182186
self.remove_seasonality = remove_seasonality
187+
188+
# We allow score_function to be a str for compatibility with param tuning
189+
if isinstance(score_func, str):
190+
if score_func in STR_TO_SCORE_FUNC:
191+
score_func = STR_TO_SCORE_FUNC[score_func]
192+
else:
193+
score_func = DEFAULT_SCORE_FUNCTION
194+
self.score_func = score_func.value
195+
183196
else:
184197
raise ValueError(
185198
"""

kats/models/metalearner/metalearner_hpt.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@
4949
],
5050
"numerical_idx": [],
5151
},
52+
"cusum": {
53+
"categorical_idx": ["score_func"],
54+
"numerical_idx": ["delta_std_ratio", "scan_window", "historical_window"],
55+
},
56+
"statsig": {
57+
"categorical_idx": [],
58+
"numerical_idx": ["n_control", "n_test"],
59+
}
5260
}
5361

5462
default_model_networks = {
@@ -78,6 +86,16 @@
7886
"n_hidden_cat_combo": [[5], [5], [2], [3], [5], [5], [5]],
7987
"n_hidden_num": [],
8088
},
89+
"cusum": {
90+
"n_hidden_shared": [20],
91+
"n_hidden_cat_combo": [[3]],
92+
"n_hidden_num": [5, 5, 5],
93+
},
94+
"statsig": {
95+
"n_hidden_shared": [20],
96+
"n_hidden_cat_combo": [],
97+
"n_hidden_num": [5, 5],
98+
},
8199
}
82100

83101

@@ -163,7 +181,7 @@ def __init__(
163181
numerical_idx = default_model_params[default_model]["numerical_idx"]
164182

165183
else:
166-
msg = f"default_model={default_model} is not available! Please choose one from 'prophet', 'arima', 'sarima', 'holtwinters', stlf, 'theta'"
184+
msg = f"default_model={default_model} is not available! Please choose one from 'prophet', 'arima', 'sarima', 'holtwinters', 'stlf', 'theta', 'cusum', 'statsig'"
167185
logging.error(msg)
168186
raise ValueError(msg)
169187

@@ -385,7 +403,7 @@ def _prepare_data(
385403
else None
386404
)
387405
y_num = (
388-
torch.from_numpy(self._target_num[train_idx, :]).float()
406+
torch.from_numpy(self._target_num[train_idx, :].astype('float')).float()
389407
if self.numerical_idx
390408
else None
391409
)
@@ -398,7 +416,7 @@ def _prepare_data(
398416
else None
399417
)
400418
y_num_val = (
401-
torch.from_numpy(self._target_num[val_idx, :]).float()
419+
torch.from_numpy(self._target_num[val_idx, :].astype('float')).float()
402420
if self.numerical_idx
403421
else None
404422
)

kats/models/metalearner/metalearner_modelselect.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,16 @@ class MetaLearnModelSelect:
6060
def __init__(
6161
self, metadata: Optional[List[Dict[str, Any]]] = None, load_model: bool = False
6262
) -> None:
63-
if not load_model and metadata is not None:
63+
if not load_model:
64+
# pyre-fixme[6]: Expected `Sized` for 1st param but got
65+
# `Optional[List[typing.Any]]`.
6466
if len(metadata) <= 30:
65-
msg = f"metadata size should be greater than 30 but receives {len(metadata)}."
67+
msg = "Dataset is too small to train a meta learner!"
68+
logging.error(msg)
69+
raise ValueError(msg)
70+
71+
if metadata is None:
72+
msg = "Missing metadata!"
6673
logging.error(msg)
6774
raise ValueError(msg)
6875

0 commit comments

Comments (0)