Changed API doc and code to support 0.4 feature set:

roycoding · roycoding · commit 02c769c59419 · 2019-12-28T22:27:35.000-06:00
- Only handle binary outcomes (no variable payouts)
- Handle historical payouts
- `est_payouts` is now `est_probs`
diff --git a/docs/slots-docs.md b/docs/slots-docs.md
@@ -13,46 +13,49 @@ This documents details the current and planned API for slots. Non-implemented fe
     1. Current choice
     2. number of trials completed for each arm
     3. scores for each arm
-    4. average payout per arm (payout*wins/trials?)
+    4. average payout per arm (wins/trials?)
     5. Current regret.  Regret = Trials*mean_max - sum^T_t=1(reward_t)
         - See [ref](http://research.microsoft.com/en-us/um/people/sebubeck/SurveyBCB12.pdf)
 6. Use sane defaults.
 7. Be obvious and clean.
+8. For the time being handle only binary payouts.
 
 ### Library API ideas:
 #### Running slots with a live website
 ```Python
-# Using slots to determine the best of 3 variations on a live website. 3 is the default.
+# Using slots to determine the best of 3 variations on a live website. 3 is the default number of bandits and epsilon greedy is the default strategy.
 mab = slots.MAB(3, live=True)
 
 # Make the first choice randomly, record responses, and input reward
 # 2 was chosen.
-# Run online trial (input most recent result) until test criteria is met.
+# Update online trial (input most recent result) until test criteria is met.
 mab.online_trial(bandit=2,payout=1)
 
 # Response of mab.online_trial() is a dict of the form:
 {'new_trial': boolean, 'choice': int, 'best': int}
 
 # Where:
 #   If the criterion is met, new_trial = False.
-#   choice is the current choice of arm to try.
+#   choice is the current choice of arm to try next.
 #   best is the current best estimate of the highest payout arm.
 ```
 
 #### Creating a MAB test instance:
 
 ```Python
-# Default: 3 bandits with random p_i and pay_i = 1
-mab = slots.MAB(live=False)
+# Default: 3 bandits with random probabilities, p_i.
+mab = slots.MAB()
 
-# Set up 4 bandits with random p_i and pay_i
-mab = slots.MAB(4, live=False)
+# Set up 4 bandits with random p_i.
+mab = slots.MAB(4)
 
 # 4 bandits with specified p_i
-mab = slots.MAB(probs = [0.2,0.1,0.4,0.1], live=False)
+mab = slots.MAB(probs = [0.2,0.1,0.4,0.1])
 
-# 3 bandits with specified pay_i
-mab = slots.MAB(payouts = [1,10,15], live=False)
+# Creating 3 bandits with historical payout data
+mab = slots.MAB(3, hist_payouts = np.array([[0,0,1,...],
+                                            [1,0,0,...],
+                                            [0,0,0,...]]))
 ```
 
 #### Running tests with strategy, S
@@ -98,8 +101,8 @@ mab.bandits.reset()
 
 # Set probabilities or payouts
 # (NOT YET IMPLEMENTED)
-mab.bandits.probs_set([0.1,0.05,0.2,0.15])
-mab.bandits.payouts_set([1,1.5,0.5,0.8])
+mab.bandits.set_probs([0.1,0.05,0.2,0.15])
+mab.bandits.set_hist_payouts([[1,1,0,0],[0,1,0,0]])
 ```
 
 #### Displaying / retrieving test info
@@ -114,10 +117,10 @@ mab.prob_est()
 
 # Retrieve bandit probability estimate of bandit i
 # (NOT YET IMPLEMENTED)
-mab.prob_est(i)
+mab.est_prob(i)
 
-# Retrieve bandit payout estimates (p * payout)
-mab.est_payout()
+# Retrieve bandit probability estimates
+mab.est_probs()
 
 # Retrieve current bandit choice
 # (NOT YET IMPLEMENTED, use mab.choices[-1])
diff --git a/slots/slots.py b/slots/slots.py
@@ -10,8 +10,12 @@
         mab.best  # Bandit with highest probability after T trials
 
     - Run MAB test on "real" payout data (probabilities unknown).
-        mab = slots.MAB(payouts = [0,0,0,1,0,0,0,0,0,....])
-        mab.run(trials = 10000) # Max is length of payouts
+        mab = slots.MAB(hist_payouts = [[0,0,...], [1,0,...], [0,1,...]])
+        mab.run(trials = 10000)
+
+    - Run MAB test on "live" data
+        mab = slots.MAB(num_bandits=3, live=True)
+        mab.online_trial(bandit=1, payout=0)
 """
 
 
@@ -27,66 +31,78 @@ def __init__(
         self,
         num_bandits=3,
         probs=None,
-        payouts=None,
+        hist_payouts=None,
         live=False,
         stop_criterion={"criterion": "regret", "value": 0.1},
     ):
         """
         Parameters
         ----------
-        num_bandits : int
+        num_bandits : int, optional
             default is 3
-        probs : np.array of floats
+        probs : array of floats, optional
             payout probabilities
-        payouts : np.array of floats
-            If `live` is True, `payouts` should be None.
-        live : bool
+        hist_payouts : list of lists of ints, one array per bandit, optional
+            This is for testing on historical data.
+            If you set `probs` or `live` is True, `hist_payouts` should be None.
+        live : bool, optional
             Whether the use is for a live, online trial.
-        stop_criterion : dict
+        stop_criterion : dict, optional
             Stopping criterion (str) and threshold value (float).
         """
 
         self.choices = []
 
         if not probs:
-            if not payouts:
+            if not hist_payouts:
                 if live:
                     # Live trial scenario, where nothing is known except the
                     # number of bandits
                     self.bandits = Bandits(
-                        live=True, payouts=np.zeros(num_bandits), probs=None
+                        live=True, payouts=np.zeros(num_bandits)
                     )
                 else:
-                    # A pure experiment scenario with random probabilities
-                    # and single payout values are 1.
+                    # A pure experiment scenario with random probabilities.
                     self.bandits = Bandits(
-                        probs=[np.random.rand() for x in range(num_bandits)],
-                        payouts=np.ones(num_bandits),
+                        probs=np.random.rand(num_bandits),
+                        payouts=np.zeros(num_bandits),
                         live=False,
                     )
             else:
                 # Run strategies on known historical sequence of payouts. Probabilities are not known.
+                num_bandits = len(hist_payouts)
                 if live:
                     print(
                         "slots: Cannot have a defined array of payouts and live=True. live set to False"
                     )
                 self.bandits = Bandits(
-                    probs=[np.random.rand() for x in range(len(payouts))],
-                    payouts=payouts,
+                    hist_payouts=hist_payouts,
+                    payouts=np.zeros(num_bandits),
                     live=False,
                 )
-                num_bandits = len(payouts)
         else:
-            if payouts:
-                # A pure experiment scenario with known probabilities and known single payout values.
-                self.bandits = Bandits(probs=probs, payouts=payouts, live=False)
-                num_bandits = len(payouts)
+            if hist_payouts:
+                # A pure experiment scenario with known historical payout values. Probabilities will be ignored.
+                num_bandits = len(probs)
+                print(
+                    "slots: Since historical payout data has been supplied, probabilities will be ignored."
+                )
+                if len(probs) == len(payouts):
+                    self.bandits = Bandits(
+                        hist_payouts=hist_payouts,
+                        live=False,
+                        payouts=np.zeros(num_bandits),
+                    )
+                else:
+                    raise Exception(
+                        "slots: Dimensions of probs and payouts mismatched."
+                    )
             else:
-                # A pure experiment scenario with known probabilities and single payout values of 1.
+                # A pure experiment scenario with known probabilities
+                num_bandits = len(probs)
                 self.bandits = Bandits(
-                    probs=probs, payouts=np.ones(len(probs)), live=False
+                    probs=probs, payouts=np.zeros(num_bandits), live=False
                 )
-                num_bandits = len(probs)
 
         self.wins = np.zeros(num_bandits)
         self.pulls = np.zeros(num_bandits)
@@ -123,12 +139,14 @@ def run(self, trials=100, strategy="eps_greedy", parameters=None):
         """
 
         if trials < 1:
-            raise Exception("MAB.run: Number of trials cannot be less than 1!")
+            raise Exception(
+                "slots.MAB.run: Number of trials cannot be less than 1!"
+            )
 
         else:
             if strategy not in self.strategies:
                 raise Exception(
-                    "MAB,run: Strategy name invalid. Choose from:"
+                    "slots.MAB,run: Strategy name invalid. Choose from:"
                     " {}".format(", ".join(self.strategies))
                 )
 
@@ -346,7 +364,7 @@ def best(self):
         else:
             return np.argmax(self.wins / (self.pulls + 0.1))
 
-    def est_payouts(self):
+    def est_probs(self):
         """
         Calculate current estimate of average payout for each bandit.
 
@@ -447,7 +465,11 @@ def online_trial(
             )
 
         if self.crit_met():
-            return {"new_trial": False, "choice": self.best(), "best": self.best()}
+            return {
+                "new_trial": False,
+                "choice": self.best(),
+                "best": self.best(),
+            }
         else:
             return {
                 "new_trial": True,
@@ -463,7 +485,7 @@ def update(self, bandit, payout):
         ----------
         bandit : int
             Bandit index
-        payout : float
+        payout : int (0 or 1)
 
         Returns
         -------
@@ -481,31 +503,27 @@ class Bandits:
     Bandit class.
     """
 
-    def __init__(self, probs, payouts, live=True):
+    def __init__(self, payouts, probs=None, hist_payouts=None, live=False):
         """
         Instantiate Bandit class, determining
             - Probabilities of bandit payouts
             - Bandit payouts
 
         Parameters
         ----------
-        probs: array of floats
-            Probabilities of bandit payouts
-        payouts : array of floats
-            Amount of bandit payouts. If `live` is True, `payouts` should be an
-            N length array of zeros.
-        live : bool
+        payouts : array of ints
+            Cumulative bandit payouts. `payouts` should start as an N
+            length array of zeros, where N is the number of bandits.
+        probs: array of floats, optional
+            Probabilities of bandit payouts.
+        hist_payouts: list of arrays of ints, optional
+        live : bool, optional
         """
 
         if not live:
-            # Only use arrays of equal length
-            if len(probs) != len(payouts):
-                raise Exception(
-                    "Bandits.__init__: Probability and payouts "
-                    "arrays of different lengths!"
-                )
             self.probs = probs
             self.payouts = payouts
+            self.hist_payouts = hist_payouts
             self.live = False
         else:
             self.live = True
@@ -531,11 +549,18 @@ def pull(self, i):
                 return self.payouts[i].pop()
             else:
                 return None
+        elif self.hist_payouts:
+            if not hist[i]:
+                return None
+            else:
+                _p = hist[i][0]
+                hist[i] = hist[i][1:]
+                return _p
         else:
             if np.random.rand() < self.probs[i]:
-                return self.payouts[i]
+                return 1
             else:
-                return 0.0
+                return 0
 
     def info(self):
         pass