-import sys
+from collections import defaultdict
+
 import gym
 import numpy as np
-from collections import defaultdict, deque
-import matplotlib.pyplot as plt
 
 import check_test
 from plot_utils import plot_values
 
 env = gym.make('CliffWalking-v0')
 
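+# CliffWalking-v0: 4 x 12 grid (48 states), 4 actions (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3)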
-# print(env.action_space)
-# print(env.observation_space)
-#
-# # define the optimal state-value function
-# V_opt = np.zeros((4, 12))
-# V_opt[0:13][0] = -np.arange(3, 15)[::-1]
-# V_opt[0:13][1] = -np.arange(3, 15)[::-1] + 1
-# V_opt[0:13][2] = -np.arange(3, 15)[::-1] + 2
-# V_opt[3][0] = -13
-#
-# plot_values(V_opt)
-
-
-def sarsa(env, num_episodes, alpha, gamma=1.0, epsilon_start=1.0):
+def q_learning(env, num_episodes, alpha, gamma=1.0, epsilon_start=1.0):
     # decide epsilon
     epsilon = epsilon_start
     epsilon_min = 0.1
-    epsilon_decay = 0.9999
+    epsilon_decay = 0.997
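+    # with this schedule epsilon decays from 1.0 to the 0.1 floor in roughly 770 episodes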
+
+    nA = 4
 
     # initialize action-value function (empty dictionary of arrays)
     Q = defaultdict(lambda: np.zeros(env.nA))
@@ -35,53 +23,35 @@ def sarsa(env, num_episodes, alpha, gamma=1.0, epsilon_start=1.0):
     for i_episode in range(1, num_episodes + 1):
 
         # monitor progress
-        if i_episode % 499999 == 0:
-            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
-            print(str(Q))
+        # if i_episode % 1 == 0:
+        #     print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
+        #
+        #     sys.stdout.flush()
 
+        if i_episode % 100 == 0:
+            print(f"Episode {i_episode}: Epsilon = {epsilon:.4f}")
 
         # set the value of epsilon
         epsilon = max(epsilon * epsilon_decay, epsilon_min)
 
         # generate episode
-        episode = generate_episode(env=env, Q=Q, epsilon=epsilon, nA=4)
-
-        Q = update_q(episode, Q, alpha, gamma)
-    return Q
-
-
-def generate_episode(env, Q, epsilon, nA):
-    episode = []
-    state, _ = env.reset()
-    if isinstance(state, dict):
-        state = tuple(sorted(state.items()))
+        state, _ = env.reset()
 
-    action = np.random.choice(np.arange(nA),
-                              p=get_probs(Q[state], epsilon, nA)) if state in Q else env.action_space.sample()
+        while True:
+            action = np.random.choice(np.arange(nA), p=get_probs(Q[state], epsilon, nA))
 
-    while True:
-        if isinstance(state, tuple):
-            state = state[0]  # Extract actual state if (state, info) is returned
+            next_state, reward, terminated, truncated, _ = env.step(action)  # ✅ New API
+            if next_state not in Q:
+                Q[next_state] = np.zeros(nA)  # ✅ Ensure Q-value initialization
 
-        # ✅ Convert state to tuple if it’s a dictionary
-        if isinstance(state, dict):
-            state = tuple(sorted(state.items()))
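+            # Q-learning (sarsamax) update: unlike the removed SARSA code, bootstrap from
+            # the greedy value max_a Q(next_state, a) rather than from the next action taken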
-        next_state, reward, terminated, truncated, _ = env.step(action)  # ✅ New API
-        if next_state not in Q:
-            Q[next_state] = np.zeros(nA)  # ✅ Ensure Q-value initialization
+            next_Q = 0 if terminated else np.max(Q[next_state])
+            Q[state][action] += alpha * (reward + gamma * next_Q - Q[state][action])
 
-        next_action = np.random.choice(np.arange(nA), p=get_probs(Q[next_state], epsilon, nA))
-
-        episode.append((state, action, reward))
-
-        if terminated or truncated:
-            break
-
-        state = next_state  # ✅ Track next state
-        action = next_action  # ✅ Track next action
-
-    return episode
+            # stop only after the terminal transition has been used to update Q
+            if terminated or truncated:
+                break
 
+            state = next_state
+    return Q
 
 def get_probs(Q_s, epsilon, nA):
     """ obtains the action probabilities corresponding to epsilon-greedy policy """
@@ -90,37 +60,16 @@ def get_probs(Q_s, epsilon, nA):
     policy_states[best_action] = 1 - epsilon + (epsilon / nA)
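+    # e.g. with epsilon = 0.1 and nA = 4, the greedy action gets probability
+    # 1 - 0.1 + 0.1/4 = 0.925 and each of the other three actions 0.1/4 = 0.025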
     return policy_states
 
-def pick_action(epsilon, Q, next_state):
-    if np.random.rand() < epsilon:
-        next_action = env.action_space.sample()  # Explore (random action)
-    else:
-        next_action = np.argmax(Q[next_state])  # Exploit (best action)
-
-    return next_action
-
-def update_q(episode, Q, alpha, gamma):
-    """ updates the action-value function estimate using the most recent episode """
-    states, actions, rewards = zip(*episode)
-    # prepare for discounting
-    for i in range(len(states) - 1):  # Ignore last step
-        state, action = states[i], actions[i]
-        next_state, next_action = states[i + 1], actions[i + 1]  # ✅ Use episode step
-
-        old_Q = Q[state][action]
-        next_Q = Q[next_state][next_action]  # ✅ Correct SARSA update
-        Q[state][action] = old_Q + alpha * (rewards[i] + gamma * next_Q - old_Q)
-    return Q
-
 
 # obtain the estimated optimal policy and corresponding action-value function
-Q_sarsa = sarsa(env, 500000, .01)
+Q_q_learning = q_learning(env, 5000, .01)
 
 # print the estimated optimal policy
-policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]).reshape(4, 12)
-check_test.run_check('td_control_check', policy_sarsa)
+policy_q_learning = np.array([np.argmax(Q_q_learning[key]) if key in Q_q_learning else -1 for key in np.arange(48)]).reshape(4, 12)
+check_test.run_check('td_control_check', policy_q_learning)
 print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
-print(policy_sarsa)
+print(policy_q_learning)
 
 # plot the estimated optimal state-value function
-V_sarsa = ([np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)])
-plot_values(V_sarsa)
+V_q_learning = ([np.max(Q_q_learning[key]) if key in Q_q_learning else 0 for key in np.arange(48)])
+plot_values(V_q_learning)