def runValueIteration(self):
    """
    Run self.iterations sweeps of batch value iteration.

    In each sweep, V_k+1 of every state is computed from the V_k held in
    self.values (read through computeQValueFromValues). self.values is
    replaced only after the whole sweep, so updates made during a sweep
    never influence each other.

    Terminal states, and non-terminal states with no available action,
    are given the value 0.
    """
    new_values = util.Counter()
    for _ in range(self.iterations):
        for state in self.mdp.getStates():
            if self.mdp.isTerminal(state):
                new_values[state] = 0.0
                continue
            q_values = [
                self.computeQValueFromValues(state, action)
                for action in self.mdp.getPossibleActions(state)
            ]
            # max() raises ValueError on an empty sequence, so a state
            # with no available action falls back to 0 instead.
            new_values[state] = max(q_values) if q_values else 0.0
        # Publish a snapshot: the next sweep reads self.values while
        # new_values is being overwritten.
        self.values = new_values.copy()
81+
82+
83+
6984 def getValue (self , state ):
7085 """
7186 Return the value of the state (computed in __init__).
def computeQValueFromValues(self, state, action):
    """
    Compute the Q-value of action in state from the
    value function stored in self.values.

    Sums, over every (next state, probability) pair reported by the MDP,
    probability * (reward + discount * value of next state).
    Returns 0 for a terminal state.
    """
    total = 0
    if self.mdp.isTerminal(state):
        return total
    for outcome in self.mdp.getTransitionStatesAndProbs(state, action):
        successor, probability = outcome[0], outcome[1]
        reward = self.mdp.getReward(state, action, successor)
        total += probability * (reward + self.discount * self.values[successor])
    return total
103+
104+
105+
82106
def computeActionFromValues(self, state):
    """
    Return the best action for state according to self.getQValue.

    The action with the highest Q-value is chosen; when several actions
    share the highest Q-value, the first one listed by the MDP wins.
    If there are no legal actions, which is the case at the
    terminal state, None is returned.
    """
    # list() so the emptiness check also works if the MDP hands back an iterator.
    actions = list(self.mdp.getPossibleActions(state))
    if not actions:
        return None
    return max(actions, key=lambda action: self.getQValue(state, action))
94122
95123 def getPolicy (self , state ):
0 commit comments