fix: update how INK makes predictions (#71)

yuxqiu · web-flow · commit 13222c7b0d0e · 2023-05-20T17:34:51.000Z
diff --git a/pyproject.toml b/pyproject.toml
@@ -82,7 +82,7 @@ exclude = ["truelearn.tests*"]  # exclude tests from build artifacts
 # e.g. pip install truelearn[dev] will install the tests dependencies
 [project.optional-dependencies]
 tests = ["pytest>=7.2.1", "pytest-cov>=4.0.0", "pytest-socket>=0.6.0"]
-linters = ["prospector[with_bandit,with_mypy]>=1.8.4"]
+linters = ["prospector[with_bandit,with_mypy]==1.8.4"]
 docs = ["sphinx>=5.3.0", "furo>=2023.03.27","sphinx_copybutton>=0.5.1", "sphinx-gallery>=0.12.2", "Pillow>=9.4.0"]
 dev = ["truelearn[tests, linters, docs]","black>=22.12.0"]
 
diff --git a/truelearn/learning/_ink_classifier.py b/truelearn/learning/_ink_classifier.py
@@ -64,9 +64,9 @@ class INKClassifier(BaseClassifier):
         ...         ink_classifier.predict_proba(event)
         ...     )
         ...
-        True 0.64105...
-        False 0.44438...
-        True 0.64909...
+        True 0.64839...
+        False 0.43767...
+        True 0.65660...
         >>> ink_classifier.get_params(deep=False)  # doctest:+ELLIPSIS
         {...'learner_meta_weights': LearnerMetaWeights(novelty_weights=Weights(\
 mean=0.20461..., variance=0.45871...), interest_weights=Weights(\
@@ -174,9 +174,9 @@ def __eval_matching_quality(
             bias_weights.mean * pred_bias,
         ]
         team_learner_variance = [
-            novelty_weights.variance * pred_novelty,
-            interest_weights.variance * pred_interest,
-            bias_weights.variance * pred_bias,
+            novelty_weights.variance * (pred_novelty**2),
+            interest_weights.variance * (pred_interest**2),
+            bias_weights.variance * (pred_bias**2),
         ]
         team_content_mean = [self._threshold]
         team_content_variance = []
@@ -203,21 +203,13 @@ def __create_env(self):
     def __update_weights(
         self,
         x: EventModel,
-        pred_novelty: float,
-        pred_interest: float,
         pred_actual: float,
     ) -> None:
         """Update the weights of novelty, interest and bias.
 
         Args:
             x:
                 A representation of the learning event.
-            pred_novelty:
-                The predicted probability of the learner's engagement by using
-                NoveltyClassifier.
-            pred_interest:
-                The predicted probability of the learner's engagement by using
-                InterestClassifier.
             pred_actual:
                 Whether the learner actually engages in the given event. This value is
                 either 0 or 1.
@@ -228,6 +220,9 @@ def __update_weights(
         if self._greedy and cur_pred == pred_actual:
             return
 
+        pred_novelty = self._novelty_classifier.predict_proba(x)
+        pred_interest = self._interest_classifier.predict_proba(x)
+
         # train
         env = self.__create_env()
         team_experts = (
@@ -278,11 +273,7 @@ def __update_weights(
     def fit(self, x: EventModel, y: bool) -> Self:
         self._novelty_classifier.fit(x, y)
         self._interest_classifier.fit(x, y)
-
-        pred_novelty = self._novelty_classifier.predict_proba(x)
-        pred_interest = self._interest_classifier.predict_proba(x)
-
-        self.__update_weights(x, pred_novelty, pred_interest, y)
+        self.__update_weights(x, y)
         return self
 
     def predict(self, x: EventModel) -> bool:
diff --git a/truelearn/tests/test_learning.py b/truelearn/tests/test_learning.py
@@ -799,7 +799,11 @@ def test_ink_classifier_customize(self, train_cases, test_events):
         for event, label in zip(train_events, train_labels):
             classifier.fit(event, label)
 
-        expected_results = [0.4155257653300731, 0.3792233211000749, 0.35213145076551466]
+        expected_results = [
+            0.40575267541878457,
+            0.36519542301026875,
+            0.33362493980730495,
+        ]
         actual_results = [classifier.predict_proba(event) for event in test_events]
 
         check_farray_close(actual_results, expected_results)
@@ -849,7 +853,7 @@ def test_ink_classifier(self, train_cases, test_events):
         for event, label in zip(train_events, train_labels):
             classifier.fit(event, label)
 
-        expected_results = [0.3943943468622016, 0.3536982390875026, 0.33082714771211985]
+        expected_results = [0.3844070661899784, 0.3398805698754434, 0.3133264788862059]
         actual_results = [classifier.predict_proba(event) for event in test_events]
 
         check_farray_close(actual_results, expected_results)
diff --git a/truelearn/tests/test_utils_visualisations.py b/truelearn/tests/test_utils_visualisations.py
@@ -1,8 +1,7 @@
-# pylint: disable=missing-function-docstring,missing-class-docstring
+# pylint: disable=missing-function-docstring,missing-class-docstring,line-too-long
 import functools
 import random
 import pathlib
-import filecmp
 import types
 import os
 import sys
@@ -113,28 +112,42 @@ def file_comparison(plotter_type: str, config: Optional[Dict[str, Dict]] = None)
             For matplotlib type, the method will test `.png`.
         config:
             A dictionary containing the configuration for each extension.
+            E.g. config={".png": {...}, ".json": {...}, ...}
     """
     config = config or {}
 
     if plotter_type == "plotly":
         # only support html and json for plotly
         # because the backend engine that plotly uses
-        # to generate imgaes is platform dependent
+        # to generate images is platform dependent
         # Therefore, to be able to provide consistent
         # and replicable tests, we test against json and html.
         extensions = {
-            ".json": config.get(".json", {}),
+            ".json": {
+                **config.get(".json", {}),
+                # to generate files with cross-platform consistent encoding
+                "encoding": "utf-8",
+            },
             ".html": {
                 **config.get(".html", {}),
                 # overwrite settings for div_id and include_plotlyjs
                 # as they directly affect the generated output
                 "div_id": UUID,
                 "include_plotlyjs": "https://cdn.plot.ly/plotly-2.20.0.min.js",
+                # to generate files with cross-platform consistent encoding
+                "encoding": "utf-8",
             },
         }
 
         def file_cmp_func(filename1, filename2):
-            return filecmp.cmp(filename1, filename2)
+            # since we use utf-8 to save all text files
+            # we can safely open them with utf-8 here
+            with open(filename1, "rt", encoding="utf-8") as f1, open(
+                filename2, "rt", encoding="utf-8"
+            ) as f2:
+                # line-by-line comparison; text mode translates newlines on read, so newline differences are ignored
+                # see https://docs.python.org/3/library/functions.html#open-newline-parameter # noqa
+                return f1.readlines() == f2.readlines()
 
     elif plotter_type == "matplotlib":
         extensions = {
diff --git a/truelearn/utils/visualisations/_base.py b/truelearn/utils/visualisations/_base.py
@@ -197,10 +197,16 @@ def savefig(self, file: str, **kargs):
                         The default width of the image in the HTML file.
                     default_height:
                         The default height of the image in the HTML file.
+                    encoding:
+                        The encoding of the saved HTML file. If unspecified
+                        (or None), the encoding will be utf-8.
 
                 If you want to export a JSON file, you can optionally pass in
                     pretty:
                         Whether the saved JSON representation should be pretty-printed.
+                    encoding:
+                        The encoding of the saved JSON file. If unspecified
+                        (or None), the encoding will be utf-8.
 
                 If you want to export an image file, you can optionally pass in
                     width:
@@ -213,10 +219,15 @@ def savefig(self, file: str, **kargs):
             to find out more supported arguments.
         """
         if file.endswith(".html"):
-            self.figure.write_html(file=file, **kargs)
+            encoding = kargs.pop("encoding", None) or "utf-8"
+            with open(file, mode="wt", encoding=encoding) as f:
+                self.figure.write_html(file=f, **kargs)
             return
+
         if file.endswith(".json"):
-            self.figure.write_json(file=file, **kargs)
+            encoding = kargs.pop("encoding", None) or "utf-8"
+            with open(file, mode="wt", encoding=encoding) as f:
+                self.figure.write_json(file=f, **kargs)
             return
 
         self.figure.write_image(file=file, **kargs)