@@ -0,0 +1,9 @@
Implement the **TD(0) policy evaluation update** for a single episode under a given deterministic policy.
The episode is a list of $(\text{state}, \text{action}, \text{reward}, \text{next state})$ transitions that are all consistent with the provided policy $\pi$.

Use the TD(0) update rule to compute a **single pass of value updates** for each state in the episode.

Assume the **discount factor** $\gamma$ is $1$.

**Constraint**:
All transitions in the episode **must adhere to the given policy** $\pi$, i.e., the action taken in each transition must match $\pi(\text{state})$.
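
Concretely, with the discount factor fixed at $1$, each transition $(s, a, r, s')$ in the episode triggers one update of the form below (a restatement of the standard TD(0) rule, where $\alpha$ is the given step size):

$$
V(s) \leftarrow V(s) + \alpha \left[ r + V(s') - V(s) \right]
$$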
@@ -0,0 +1,6 @@
{
"input": "episode = [\n ('s1', 'a1', 1.0, 's2'),\n ('s2', 'a2', 2.0, 's3'),\n ('s3', 'a3', 3.0, 'terminal')\n]\nV = {'s1': 0.0, 's2': 0.0, 's3': 0.0, 'terminal': 0.0}\npi = {'s1': 'a1', 's2': 'a2', 's3': 'a3'}\nalpha = 0.5\nV_updated = td0_policy_evaluation(episode, V, pi, alpha)\nprint({k: round(v, 2) for k, v in V_updated.items()})",
"output": "{'s1': 0.5, 's2': 1.0, 's3': 1.5, 'terminal': 0.0}",
"reasoning": "Each update uses the current value of the next state:\n- V(s1) = 0.5 * (1 + 0) = 0.5\n- V(s2) = 0.5 * (2 + 0) = 1.0\n- V(s3) = 0.5 * (3 + 0) = 1.5"
}

@@ -0,0 +1,20 @@
# TD(0) Policy Evaluation Algorithm

**Input**: the policy $\pi$ to be evaluated
**Algorithm parameter**: step size $\alpha \in (0, 1]$

**Initialize** $V(s)$, for all $s \in \mathcal{S}^+$, arbitrarily, except that $V(\text{terminal}) = 0$

## Loop for each episode:
1. Initialize state $S$
2. **Loop for each step of the episode**:
   - $A \leftarrow \pi(S)$, the action given by $\pi$ for $S$
   - Take action $A$, observe reward $R$ and next state $S'$
   - Update the value (with the discount factor fixed at $1$):
     $$
     V(S) \leftarrow V(S) + \alpha \left[ R + V(S') - V(S) \right]
     $$
   - $S \leftarrow S'$
3. **Until** $S$ is terminal
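
To make the loop concrete, here is a minimal, self-contained Python sketch of the procedure above. It assumes a hypothetical deterministic environment given as a dict mapping `(state, action)` to `(reward, next_state)`; the function and variable names are illustrative only, not the graded interface.

```python
def td0_prediction(env, pi, V, alpha, num_episodes, start_state):
    """Tabular TD(0) prediction with the discount factor fixed at 1."""
    for _ in range(num_episodes):                    # Loop for each episode
        s = start_state                              # Initialize state S
        while s != 'terminal':                       # Loop for each step of the episode
            a = pi[s]                                # A <- pi(S)
            r, s_next = env[(s, a)]                  # Take action A, observe R and S'
            V[s] += alpha * (r + V[s_next] - V[s])   # TD(0) update
            s = s_next                               # S <- S'
    return V


# Illustrative data (not the graded test cases):
env = {('s1', 'a1'): (1.0, 's2'), ('s2', 'a2'): (2.0, 's3'), ('s3', 'a3'): (3.0, 'terminal')}
pi = {'s1': 'a1', 's2': 'a2', 's3': 'a3'}
V = {'s1': 0.0, 's2': 0.0, 's3': 0.0, 'terminal': 0.0}
print(td0_prediction(env, pi, V, alpha=0.5, num_episodes=1, start_state='s1'))
# {'s1': 0.5, 's2': 1.0, 's3': 1.5, 'terminal': 0.0}
```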


@@ -0,0 +1,15 @@
{
"id": "172",
"title": "TD(0) Value Function Update from a Single Episode",
"difficulty": "medium",
"category": "Reinforcement Learning",
"video": "",
"likes": "0",
"dislikes": "0",
"contributor": [
{
"profile_link": "https://github.com/836hardik-agrawal",
"name": "Hardik Agrawal"
}
]
}
@@ -0,0 +1,4 @@
def td0_policy_evaluation(episode, V, pi, alpha):
    # Single pass of TD(0) updates with discount factor 1. The episode is
    # assumed to be consistent with pi, so the action is not re-checked here.
    # V is updated in place, so later updates in the same pass bootstrap off
    # values already revised earlier in the episode.
    for (s, a, r, s_next) in episode:
        V[s] += alpha * (r + V[s_next] - V[s])  # V(s) <- V(s) + alpha * [r + V(s') - V(s)]
    return V
@@ -0,0 +1,15 @@
def td0_policy_evaluation(episode, V, pi, alpha):
"""
Perform TD(0) policy evaluation update over a single episode.

Parameters:
- episode: list of tuples (s, a, r, s_next), representing the transitions
- V: dict mapping states to their current value estimates
- pi: dict representing the deterministic policy π(s) = a
- alpha: float, step size (0 < alpha <= 1)

Returns:
- dict: updated value function V after applying TD(0) updates
"""
# Your code here
pass
@@ -0,0 +1,11 @@
[
{
"test": "episode = [\n ('s1', 'a1', 1.0, 's2'),\n ('s2', 'a2', 2.0, 's1'), \n ('s1', 'a1', 1.0, 'terminal')\n]\nV = {'s1': 0.0, 's2': 0.0, 'terminal': 0.0}\npi = {'s1': 'a1', 's2': 'a2'}\nalpha = 0.5\nV_updated = td0_policy_evaluation(episode, V, pi, alpha)\nprint({k: round(v, 2) for k, v in V_updated.items()})",
"expected_output": "{'s1': 0.75, 's2': 1.25, 'terminal': 0.0}"
},
{
"test": "episode = [\n ('A', 'left', 5.0, 'B'),\n ('B', 'right', 0.0, 'C'),\n ('C', 'down', 1.0, 'terminal')\n]\nV = {'A': 0.0, 'B': 0.0, 'C': 0.0, 'terminal': 0.0}\npi = {'A': 'left', 'B': 'right', 'C': 'down'}\nalpha = 0.5\nV_updated = td0_policy_evaluation(episode, V, pi, alpha)\nprint({k: round(v, 2) for k, v in V_updated.items()})",
"expected_output": "{'A': 2.5, 'B': 0.0, 'C': 0.5, 'terminal': 0.0}"
}

]
@@ -0,0 +1,6 @@
Implement the **SARSA** algorithm to estimate Q-values for a given set of deterministic transitions using greedy action selection.

- All Q-values are initialized to zero.
- Each episode starts from a given initial state.
- The episode ends when it reaches the `terminal` state or when the number of steps reaches `max_steps`.
- Q-value updates persist across episodes.
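
For reference, the per-step update applied under these assumptions is the standard SARSA rule, with the next action $a'$ chosen greedily from $Q$ at $s'$ and $Q(\text{terminal}, \cdot) = 0$:

$$
Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \, Q(s', a') - Q(s, a) \right]
$$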
@@ -0,0 +1,5 @@
{
"input": "transitions = {\n ('A', 'left'): (5.0, 'B'),\n ('A', 'right'): (1.0, 'C'),\n ('B', 'left'): (2.0, 'A'),\n ('B', 'right'): (0.0, 'C'),\n ('C', 'down'): (1.0, 'terminal')\n}\n\ninitial_states = ['A', 'B']\nalpha = 0.1\ngamma = 0.9\nmax_steps = 10\n\nQ = sarsa_update(transitions, initial_states, alpha, gamma, max_steps)\n\nfor k in sorted(transitions):\n print(f\"Q{str(k):15} = {Q[k]:.4f}\")",
"output": "Q('A', 'left') = 4.2181\nQ('A', 'right') = 0.0000\nQ('B', 'left') = 2.7901\nQ('B', 'right') = 0.0000",
"reasoning": "The SARSA update rule is:\nQ(s,a) <- Q(s,a) + alpha * [reward + gamma * Q(s',a') - Q(s,a)]\n\nStarting from initial Q-values of 0, each episode updates Q-values based on the transitions.\n- Q('A', 'left') increases because it leads to B, and B can eventually return to A or C with additional rewards.\n- Q('A', 'right') and Q('B', 'right') remain 0.0 because the next state C leads directly to terminal with small reward.\n- Q('B', 'left') increases due to cyclic transitions giving non-zero rewards."
}
31 changes: 31 additions & 0 deletions questions/173_implement_the_SARSA_Algorithm_on_policy/learn.md
@@ -0,0 +1,31 @@
# SARSA: On-Policy TD Control

**Goal**: Estimate the action-value function $Q \approx q_*$ using the SARSA algorithm (on-policy temporal-difference control).

## Parameters
- Step size $\alpha \in (0, 1]$
- Discount factor $\gamma \in [0, 1]$

## Initialization
- Initialize $Q(s, a)$ arbitrarily for all $s \in \mathcal{S}^+$, $a \in \mathcal{A}(s)$
- Set $Q(\text{terminal}, \cdot) = 0$

## Algorithm

**Loop for each episode:**
1. Initialize state $S$
2. Choose action $A$ from $S$ using a policy derived from $Q$ (e.g., greedy)

**Loop for each step of the episode:**
1. Take action $A$, observe reward $R$ and next state $S'$
2. Choose next action $A'$ from $S'$ using a policy derived from $Q$ (e.g., greedy)
3. Update the action-value:
   $$
   Q(S, A) \leftarrow Q(S, A) + \alpha \left[ R + \gamma Q(S', A') - Q(S, A) \right]
   $$
4. Set $S \leftarrow S'$, $A \leftarrow A'$
5. Repeat until $S$ is terminal

This algorithm continuously improves the policy as it explores and learns from interaction, making it suitable for online reinforcement learning scenarios.
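
As a quick worked step (the numbers here are chosen for illustration and are not taken from the test cases): with $\alpha = 0.5$, $\gamma = 0.9$, $Q(S, A) = 1.0$, $R = 2.0$, and $Q(S', A') = 0.5$, the update gives

$$
Q(S, A) \leftarrow 1.0 + 0.5 \left[ 2.0 + 0.9 \cdot 0.5 - 1.0 \right] = 1.725
$$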


15 changes: 15 additions & 0 deletions questions/173_implement_the_SARSA_Algorithm_on_policy/meta.json
@@ -0,0 +1,15 @@
{
"id": "173",
"title": "implement the SARSA Algorithm on policy",
"difficulty": "medium",
"category": "Reinforcement Learning",
"video": "",
"likes": "0",
"dislikes": "0",
"contributor": [
{
"profile_link": "https://github.com/836hardik-agrawal",
"name": "Hardik Agrawal"
}
]
}
38 changes: 38 additions & 0 deletions questions/173_implement_the_SARSA_Algorithm_on_policy/solution.py
@@ -0,0 +1,38 @@
from collections import defaultdict


def select_greedy_action(state, action_after_state, Q):
    # Return the greedy action for `state`, breaking ties in favour of the
    # lexicographically smallest action name; return None if the state has
    # no available actions.
    actions = action_after_state.get(state, [])
    if not actions:
        return None
    max_q = max(Q[(state, a)] for a in actions)
    best_actions = [a for a in actions if Q[(state, a)] == max_q]
    return min(best_actions)


def sarsa_update(transitions, initial_states, alpha, gamma, max_steps):
    Q = defaultdict(float)

    # Index the actions available from each state.
    action_after_state = defaultdict(set)
    for (s, a) in transitions:
        action_after_state[s].add(a)

    # Run one episode from each initial state; Q persists across episodes.
    for state in initial_states:
        steps = 0
        s = state
        action = select_greedy_action(s, action_after_state, Q)
        while s != "terminal" and steps < max_steps:
            reward, next_state = transitions[(s, action)]
            steps += 1

            # Choose A' greedily from S'; Q(terminal, .) is treated as 0.
            if next_state == "terminal":
                action_next = None
                next_q = 0
            else:
                action_next = select_greedy_action(next_state, action_after_state, Q)
                next_q = Q[(next_state, action_next)]

            # SARSA update: Q(S,A) <- Q(S,A) + alpha * [R + gamma * Q(S',A') - Q(S,A)]
            Q[(s, action)] += alpha * (reward + gamma * next_q - Q[(s, action)])

            s = next_state
            action = action_next

    return Q
@@ -0,0 +1,16 @@
def sarsa_update(transitions, initial_states, alpha, gamma, max_steps):
"""
Perform SARSA updates on the given environment transitions.

Args:
transitions (dict): mapping (state, action) -> (reward, next_state)
initial_states (list): list of starting states to simulate episodes from
alpha (float): learning rate
gamma (float): discount factor
max_steps (int): maximum steps allowed per episode

Returns:
dict: final Q-table as a dictionary {(state, action): value}
"""
# Your code here
pass
14 changes: 14 additions & 0 deletions questions/173_implement_the_SARSA_Algorithm_on_policy/tests.json
@@ -0,0 +1,14 @@
[
{
"test": "transitions = {\n ('A', 'go'): (1.0, 'B'),\n ('B', 'go'): (2.0, 'C'),\n ('C', 'go'): (3.0, 'terminal')\n}\ninitial_states = ['A']\nalpha = 0.5\ngamma = 0.9\nmax_steps = 5\nQ = sarsa_update(transitions, initial_states, alpha, gamma, max_steps)\nfor k in sorted(Q):\n print(f\"Q{str(k):15} = {Q[k]:.4f}\")",
"expected_output": "Q('A', 'go') = 0.5000\nQ('B', 'go') = 1.0000\nQ('C', 'go') = 1.5000"
},
{
"test": "transitions = {\n ('S1', 'left'): (2.0, 'S2'),\n ('S1', 'right'): (1.0, 'S3'),\n ('S2', 'left'): (0.5, 'terminal'),\n ('S3', 'right'): (0.5, 'terminal')\n}\ninitial_states = ['S1', 'S2', 'S3']\nalpha = 0.1\ngamma = 0.8\nmax_steps = 10\nQ = sarsa_update(transitions, initial_states, alpha, gamma, max_steps)\nfor k in sorted(Q):\n print(f\"Q{str(k):15} = {Q[k]:.4f}\")",
"expected_output": "Q('S1', 'left') = 0.2000\nQ('S1', 'right') = 0.0000\nQ('S2', 'left') = 0.0950\nQ('S3', 'right') = 0.0500"
},
{
"test": "transitions = {\n ('A', 'x'): (0.0, 'terminal'),\n ('A', 'y'): (5.0, 'B'),\n ('B', 'z'): (2.0, 'terminal')\n}\ninitial_states = ['A']\nalpha = 0.4\ngamma = 0.9\nmax_steps = 3\nQ = sarsa_update(transitions, initial_states, alpha, gamma, max_steps)\nfor k in sorted(Q):\n print(f\"Q{str(k):15} = {Q[k]:.4f}\")",
"expected_output": "Q('A', 'x') = 0.0000\nQ('A', 'y') = 0.0000"
}
]