-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplaying_kuhn_AI.py
More file actions
179 lines (137 loc) · 7.58 KB
/
playing_kuhn_AI.py
File metadata and controls
179 lines (137 loc) · 7.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from typing import List, Dict
import random
import numpy as np
import sys
random.seed(4)

Actions = ['B', 'C']  # bet/call vs check/fold


class InformationSet():
    """Regret-matching node for one information set (a card plus betting history).

    Stores the cumulative counterfactual regrets and the reach-weighted sum of
    the strategies played, from which CFR recovers the average strategy.
    """

    def __init__(self):
        # One slot per action in Actions.
        self.cumulative_regrets = np.zeros(shape=len(Actions))
        self.strategy_sum = np.zeros(shape=len(Actions))
        self.num_actions = len(Actions)

    def normalize(self, strategy: np.ndarray) -> np.ndarray:
        """Return `strategy` scaled to sum to 1.

        If there is no positive mass (e.g. all regrets clipped to zero), fall
        back to the uniform strategy. Unlike the original in-place `/=`, this
        never mutates the caller's array, so callers need no defensive copy.
        """
        total = strategy.sum()
        if total > 0:
            return strategy / total
        return np.full(self.num_actions, 1.0 / self.num_actions)

    def get_strategy(self, reach_probability: float) -> np.ndarray:
        """Return the regret-matching strategy and accumulate it into
        `strategy_sum`, weighted by this player's reach probability."""
        strategy = self.normalize(np.maximum(0, self.cumulative_regrets))
        self.strategy_sum += reach_probability * strategy
        return strategy

    def get_average_strategy(self) -> np.ndarray:
        """Average strategy over all iterations (the CFR output strategy)."""
        return self.normalize(self.strategy_sum)
class KuhnPoker():
    """Rules helpers for two-player Kuhn poker (card ranks J < Q < K)."""

    # Every betting sequence that ends a hand.
    TERMINAL_HISTORIES = frozenset({'BC', 'BB', 'CC', 'CBB', 'CBC'})

    @staticmethod
    def is_terminal(history: str) -> bool:
        """True once the betting sequence has ended the hand."""
        return history in KuhnPoker.TERMINAL_HISTORIES

    @staticmethod
    def get_payoff(history: str, cards: List[str]) -> int:
        """Payoff for the 'active' player (the one to move) at a terminal history."""
        # A fold ('BC' or 'CBC') gives the active player one chip outright.
        if history in ('BC', 'CBC'):
            return 1
        # Showdown ('CC', 'BB' or 'CBB'): two chips at stake if anyone bet.
        stake = 2 if 'B' in history else 1
        active = len(history) % 2
        mine = cards[active]
        theirs = cards[(active + 1) % 2]
        # With only J/Q/K dealt, holding the K — or facing the J — wins.
        return stake if mine == 'K' or theirs == 'J' else -stake
class KuhnCFRTrainer():
    """Chance-sampling counterfactual regret minimization for Kuhn poker."""

    def __init__(self):
        # Maps "card + betting history" (e.g. 'KCB') to its regret node.
        self.infoset_map: Dict[str, InformationSet] = {}

    def get_information_set(self, card_and_history: str) -> InformationSet:
        """Return the InformationSet for this key, creating it on first visit."""
        if card_and_history not in self.infoset_map:
            self.infoset_map[card_and_history] = InformationSet()
        return self.infoset_map[card_and_history]

    def cfr(self, cards: List[str], history: str, reach_probabilities: np.ndarray, active_player: int) -> float:
        """Recursively compute the counterfactual value of this node for
        `active_player`, accumulating regrets and strategy sums on the way.

        cards: the two dealt cards; cards[i] belongs to player i.
        history: betting actions taken so far ('B'/'C' characters).
        reach_probabilities: per-player probability of reaching this node.
        Returns the node's expected value for the active player.
        """
        if KuhnPoker.is_terminal(history):
            return KuhnPoker.get_payoff(history, cards)

        my_card = cards[active_player]
        info_set = self.get_information_set(my_card + history)
        strategy = info_set.get_strategy(reach_probabilities[active_player])
        opponent = (active_player + 1) % 2

        counterfactual_values = np.zeros(len(Actions))
        for ix, action in enumerate(Actions):
            action_probability = strategy[ix]
            # Only the player who just acted has their reach probability scaled.
            new_reach_probabilities = reach_probabilities.copy()
            new_reach_probabilities[active_player] *= action_probability
            # Zero-sum game: the child's value for the opponent is negated for us.
            counterfactual_values[ix] = -self.cfr(cards, history + action, new_reach_probabilities, opponent)

        # Node value: counterfactual values weighted by the action probabilities.
        node_value = counterfactual_values.dot(strategy)
        for ix in range(len(Actions)):
            # Regret updates are weighted by the opponent's reach probability.
            info_set.cumulative_regrets[ix] += reach_probabilities[opponent] * (counterfactual_values[ix] - node_value)
        return node_value  # counterfactual utility of being at this game node

    def train(self, num_iterations: int) -> float:
        """Run chance-sampling CFR for `num_iterations` deals.

        Returns the total (not average) utility accumulated for player 0.
        """
        util = 0.0
        kuhn_cards = ['J', 'Q', 'K']
        # `_` instead of `iter`: the original shadowed the builtin.
        for _ in range(num_iterations):
            cards = random.sample(kuhn_cards, 2)  # chance node: deal two cards
            reach_probabilities = np.ones(2)
            util += self.cfr(cards, '', reach_probabilities, 0)
        return util
if __name__ == "__main__":
    num_iterations = 200
    # Two fixed decimals, no scientific notation, when printing strategies.
    np.set_printoptions(precision=2, floatmode='fixed', suppress=True)

    cfr_trainer = KuhnCFRTrainer()
    util = cfr_trainer.train(num_iterations)

    # Print every information set's average strategy, shortest histories first.
    for name, info_set in sorted(cfr_trainer.infoset_map.items(), key=lambda s: len(s[0])):
        print(f"{name:3}: {info_set.get_average_strategy()}")
'''
Code to play AI after it has been trained
'''
# NOTE(review): this whole section is not runnable as written — it looks pasted
# from a different (regret-matching tutorial) script. `num_poker_rounds` below
# is a bare name with no assigned value (NameError on import), and `cards`,
# `active_player`, `history`, `reach_probabilities`, `self`, `Action`,
# `strategy_sum`, `getScoreAndPrintWinner`, `get_payoff`, `get_regrets` and
# `cumulative_regrets` are all undefined at module scope. Confirm the intended
# source before relying on any of it.
num_poker_rounds
p0_wins_counter = 0
p1_wins_counter = 0
for _ in range(num_poker_rounds):
    # Deal cards
    # compute the strategy according to regret matching
    my_card = cards[active_player]  # NOTE(review): `cards`/`active_player` undefined here
    #print('active_player: ', active_player, 'my_card + history: ', my_card + history)
    info_set = self.get_information_set(my_card + history)  # NOTE(review): `self` has no meaning at module level
    strategy = info_set.get_strategy(reach_probabilities[active_player])
    print('p0_strategy: ', strategy)
    # add the strategy to p0 running total of strategy probabilities
    strategy_sum += strategy
    # Choose p0 action and p0 opponent's action ----------------------------------
    p1_action = list(Action)[int(input('Enter 0, 1 or 2: '))] #p1_action = random.choices(list(Action), weights=fixed_p1_strategy)[0] # weights=strategy) for both agents to use the regret matching strategy
    print('p1_action: ', p1_action)
    p0_action = random.choices(list(Action), weights=strategy)[0]
    print('p0_action: ',p0_action)
    p0_wins_counter, p1_wins_counter = getScoreAndPrintWinner(p0_action, p1_action, p0_wins_counter, p1_wins_counter)
    # compute the payoff and regrets
    p0_payoff = get_payoff(p0_action, p1_action)
    regrets = get_regrets(p0_payoff, p1_action)
    # add regrets from this round to the cumulative regrets
    cumulative_regrets += regrets