@@ -28,8 +28,8 @@ class DBS(Player):
         used when computing discounted frequencies to learn opponent's
         strategy. Must be between 0 and 1. The default is 0.75
     promotion_threshold : int, optional
-        number of observations needed to promote a change in opponent's
-        strategy. The default is 3.
+        number of successive observations needed to promote an
+        opponent behavior as a deterministic rule. The default is 3.
     violation_threshold : int, optional
         number of observations needed to consider that the opponent's
         strategy has changed. You can lower it when noise increases.
@@ -58,35 +58,65 @@ class DBS(Player):
     def __init__(self, discount_factor=.75, promotion_threshold=3,
                  violation_threshold=4, reject_threshold=3, tree_depth=5):
         super().__init__()
-
-        # default opponent's policy is TitForTat
+
+        # The opponent's behavior is represented by 3 dicts:
+        # Rd, Rc, and Rp.
+        # Its behavior is modeled by a set of rules. A rule is the move that
+        # the opponent will play (C or D, or a probability of playing C)
+        # after a given outcome (for instance after (C, D)).
+        # A rule can be deterministic or probabilistic:
+        # - Rc is the set of deterministic rules
+        # - Rp is the set of probabilistic rules
+        # - Rd is the default rule set, which is used for initialization but
+        #   also keeps track of previous policies when a change in the
+        #   opponent's behavior happens, in order to have a smooth transition
+        # - Pi is the rule set that aggregates all the above sets of rules
+        #   in order to fully model the opponent's behavior
+
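+        # For instance, Rc[(C, D)] = 1 would mean that after an outcome where
+        # we played C and the opponent played D, the opponent is believed to
+        # always answer C (rule values code the probability of playing C).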
+        # Default rule set Rd
+        # Default opponent's policy is TitForTat
         self.Rd = create_policy(1, 1, 0, 0)
+        # Set of current deterministic rules Rc
         self.Rc = {}
-        self.Pi = self.Rd  # policy used by MoveGen
+        # Aggregated rule set Pi (the policy used by MoveGen)
+        self.Pi = self.Rd
+        # For each rule in Rc we need to count the number of successive
+        # violations. Those counts are saved in violation_counts.
         self.violation_counts = {}
         self.reject_threshold = reject_threshold
         self.violation_threshold = violation_threshold
         self.promotion_threshold = promotion_threshold
         self.tree_depth = tree_depth
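+        # tree_depth is the depth of the look-ahead tree used by MoveGen
+        # when computing the next move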
+        # v is a violation count used to know when to clean the default
+        # rule set Rd
         self.v = 0
+        # A discount factor for computing the probabilistic rules
         self.alpha = discount_factor
-        self.history_by_cond = {}
-        # to compute the discount frequencies, we need to keep
-        # up to date an history of what has been played for each
-        # condition:
+
+        # The probabilistic rule set Rp is not saved as an attribute; each
+        # rule is computed only when needed.
+        # The rules are computed as discounted frequencies of the opponent's
+        # past moves. To compute the discounted frequencies, we keep an
+        # up-to-date history of what has been played following each
+        # outcome (or condition):
         # We save it as a dict history_by_cond; keys are conditions
-        # (ex (C,C)) and values are a tuple of 2 lists (G,F)
-        # for a condition j:
+        # (e.g. (C, C)) and values are a tuple of 2 lists (G, F).
+        # For a condition j and a turn i in the match:
         # G[i] = 1 if cond j was True at turn i-1 and C has been played
-        # by the opponent; else G[i]=0
-        # F[i] = 1 if cond j was True at turn i-1; else G[i]=0
+        # by the opponent; else G[i] = 0
+        # F[i] = 1 if cond j was True at turn i-1; else F[i] = 0
+        # This representation makes the computation of discounted
+        # frequencies easy and efficient.
         # initial hypothesized policy is TitForTat
-        self.history_by_cond[(C, C)] = ([1], [1])
-        self.history_by_cond[(C, D)] = ([1], [1])
-        self.history_by_cond[(D, C)] = ([0], [1])
-        self.history_by_cond[(D, D)] = ([0], [1])
+        self.history_by_cond = {
+            (C, C): ([1], [1]),
+            (C, D): ([1], [1]),
+            (D, C): ([0], [1]),
+            (D, D): ([0], [1]),
+        }
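+        # This initialization corresponds to a TitForTat opponent: after
+        # outcomes where we played C the pseudo-observation is C (G = 1),
+        # and after outcomes where we played D it is D (G = 0)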
 
     def reset(self):
+        """Reset instance properties."""
         super().reset()
         self.Rd = create_policy(1, 1, 0, 0)
         self.Rc = {}
@@ -101,7 +131,26 @@ def reset(self):
 
     def should_promote(self, r_plus, promotion_threshold=3):
         """
-
+        Determines whether the move r_plus is a deterministic behavior of
+        the opponent, in which case it returns True, or whether r_plus is
+        due to random behavior (or noise), which would require a
+        probabilistic rule, in which case it returns False.
+
+        To do so it looks into the game history: if, the last K times the
+        opponent was in the same situation as in r_plus, it played the
+        same move, then r_plus is considered a deterministic rule (where
+        K is the user-defined promotion_threshold).
+
+        Parameters
+
+        r_plus : tuple of (tuple of actions.Actions, actions.Actions)
+            example: ((C, C), D)
+            r_plus represents one outcome of the history, and the
+            following move played by the opponent
+        promotion_threshold : int, optional
+            number of successive observations needed to promote an
+            opponent behavior as a deterministic rule. Default is 3.
         """
         if r_plus[1] == C:
             opposite_action = 0
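+            # opposite_action is the int coding of the action opposite to
+            # r_plus[1] (here D, coded as 0)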
@@ -127,9 +176,17 @@ def should_promote(self, r_plus, promotion_threshold=3):
         return False
 
     def should_demote(self, r_minus, violation_threshold=4):
+        """
+        Checks if the number of successive violations of a deterministic
+        rule (in the opponent's behavior) exceeds the user-defined
+        violation_threshold.
+        """
         return (self.violation_counts[r_minus[0]] >= violation_threshold)
 
     def update_history_by_cond(self, opponent_history):
+        """
+        Updates self.history_by_cond between two turns of the game.
+        """
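+        # For the outcome observed two moves ago, F gets a 1 appended (and
+        # G gets a 1 if the opponent then played C, 0 otherwise); every
+        # other outcome gets a 0 appended to both G and F.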
         two_moves_ago = (self.history[-2], opponent_history[-2])
         for outcome, GF in self.history_by_cond.items():
             G, F = GF
@@ -143,7 +200,25 @@ def update_history_by_cond(self, opponent_history):
                 G.append(0)
                 F.append(0)
 
-    def compute_prob_rule(self, outcome, alpha):
+    def compute_prob_rule(self, outcome, alpha=1):
+        """
+        Uses the game history to compute the probability that the opponent
+        plays C in the given outcome situation
+        (example: outcome = (C, C)).
+        When alpha = 1, the result is approximately equal to the frequency
+        of the occurrence of outcome -> C.
+        alpha is a discount factor that gives more weight to recent events
+        than to earlier ones.
+
+        Parameters
+
+        outcome : tuple of two actions.Actions
+            in {(C, C), (C, D), (D, C), (D, D)}
+            We want to compute the probability that the opponent plays C
+            following this outcome in the game
+        alpha : float, optional
+            Discount factor. Default is 1.
+        """
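+        # The rule is a ratio of discounted counts: each G[i] and F[i] is
+        # weighted by alpha ** age, where age = 0 for the most recent
+        # observation of this outcome. For example, with alpha = 0.75 and
+        # history_by_cond[(C, C)] = ([1, 0, 1], [1, 1, 1]) (oldest first):
+        #   discounted_g = 0.75 ** 2 * 1 + 0.75 * 0 + 1 * 1 = 1.5625
+        #   discounted_f = 0.75 ** 2 + 0.75 + 1 = 2.3125
+        # and p_cond = 1.5625 / 2.3125, about 0.68.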
         G = self.history_by_cond[outcome][0]
         F = self.history_by_cond[outcome][1]
         discounted_g = 0
@@ -153,25 +228,30 @@ def compute_prob_rule(self, outcome, alpha):
             discounted_g += alpha_k * g
             discounted_f += alpha_k * f
             alpha_k = alpha * alpha_k
-        p_cond = discounted_g / discounted_f
+        p_cond = discounted_g / discounted_f
         return p_cond
 
     def strategy(self, opponent: Player) -> Action:
-
         # First move
         if not self.history:
             return C
 
         if (len(opponent.history) >= 2):
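+            # We need at least two turns of history to know the previous
+            # outcome and the opponent's answer to it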
 
-            # update history_by_cond
+            # We begin by updating history_by_cond
             # (i.e. update Rp)
             self.update_history_by_cond(opponent.history)
 
             two_moves_ago = (self.history[-2], opponent.history[-2])
+            # r_plus pairs the previous outcome two_moves_ago with the move
+            # the opponent just played
             r_plus = (two_moves_ago, opponent.history[-1])
+            # r_minus is the opposite move, following the same outcome
             r_minus = (two_moves_ago, ({C, D} - {opponent.history[-1]}).pop())
 
+            # If the outcome two_moves_ago is not yet covered by a
+            # deterministic rule in Rc, we check whether r_plus should be
+            # promoted to one (following the criterion defined in the
+            # should_promote function)
             if r_plus[0] not in self.Rc.keys():
                 if self.should_promote(r_plus, self.promotion_threshold):
                     self.Rc[r_plus[0]] = action_to_int(r_plus[1])
@@ -187,9 +267,14 @@ def strategy(self, opponent: Player) -> Action:
                     self.violation_counts[r_plus[0]] = 0
                 # (if r- in Rc)
                 elif r_minus[1] == to_check:
-                    # increment violation count of r-
+                    # Increment violation count of r-
                     self.violation_counts[r_plus[0]] += 1
-                    if self.should_demote(r_minus,self.violation_threshold):
+                    # As the opponent's observed behavior contradicts a rule
+                    # modeled in Rc, we check whether the number of
+                    # consecutive violations of this rule exceeds a
+                    # threshold. If it does, we clean Rc, but we keep the
+                    # rules of Rc in Rd for a smooth transition
+                    if self.should_demote(r_minus, self.violation_threshold):
                         self.Rd.update(self.Rc)
                         self.Rc.clear()
                         self.violation_counts.clear()
@@ -206,25 +291,28 @@ def strategy(self, opponent: Player) -> Action:
                 and self.Rd[r_minus[0]] == action_to_int(r_minus[1])
             )
 
+            # Increment the number of violations of Rd rules
             if r_minus_in_Rd:
                 self.v += 1
-
+            # If the number of violations exceeds a threshold, clean Rd
             if (self.v > self.reject_threshold
                     or (r_plus_in_Rc and r_minus_in_Rd)):
                 self.Rd.clear()
                 self.v = 0
 
-            # compute Rp for conditions that are neither in Rc or Rd
+            # Compute Rp for conditions that are neither in Rc nor in Rd
             Rp = {}
             all_cond = [(C, C), (C, D), (D, C), (D, D)]
             for outcome in all_cond:
                 if ((outcome not in self.Rc.keys())
                         and (outcome not in self.Rd.keys())):
-                    # then we need to compute opponent's C answer probability
+                    # Compute opponent's C answer probability
                     Rp[outcome] = self.compute_prob_rule(outcome, self.alpha)
 
+            # We aggregate the rules of Rc, Rd, and Rp into the rule set Pi
             self.Pi = {}
-            # algorithm ensure no duplicate keys -> no key overwriting
+            # The algorithm ensures that a rule cannot be in two different
+            # rule sets, so we do not need to check for duplicates.
             self.Pi.update(self.Rc)
             self.Pi.update(self.Rd)
             self.Pi.update(Rp)
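+            # Pi is the policy used by MoveGen to compute DBS's next move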