diff --git a/.history/build/101_20251128232115.json b/.history/build/101_20251128232115.json new file mode 100644 index 00000000..21e4e125 --- /dev/null +++ b/.history/build/101_20251128232115.json @@ -0,0 +1,42 @@ +{ + "id": "101", + "title": "Implement the GRPO Objective Function", + "difficulty": "hard", + "category": "Reinforcement Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/moe18", + "name": "Moe Chabot" + } + ], + "description": "Implement the GRPO (Generalized Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability.", + "learn_section": "### Understanding GRPO (Generalized Relative Policy Optimization)\n\nGRPO is an advanced policy optimization algorithm in reinforcement learning that updates policy parameters while ensuring training stability. 
It builds upon Proximal Policy Optimization (PPO) by incorporating a KL divergence penalty to keep the new policy close to a reference policy.\n\n### Mathematical Definition\n\nThe GRPO objective function is defined as:\n\n$$\nJ_{GRPO}(\\theta) = \\mathbb{E}_{q \\sim P(Q), \\{o_i\\}_{i=1}^G \\sim \\pi_{\\theta_{old}}(O|q)} \\left[ \\frac{1}{G} \\sum_{i=1}^G \\min\\left( \\rho_i A_i, \\text{clip}(\\rho_i, 1-\\epsilon, 1+\\epsilon) A_i \\right) - \\beta D_{KL}(\\pi_{\\theta} \\| \\pi_{ref}) \\right]\n$$\n\nWhere:\n\n- $\\rho_i = \\frac{\\pi_{\\theta}(o_i | q)}{\\pi_{\\theta_{old}}(o_i | q)}$ is the likelihood ratio.\n- $A_i$ is the advantage estimate for the $i$-th action.\n- $\\epsilon$ is the clipping parameter.\n- $\\beta$ controls the influence of the KL divergence penalty.\n- $D_{KL}$ is the Kullback-Leibler divergence between the new policy $\\pi_{\\theta}$ and the reference policy $\\pi_{ref}$.\n\n### Key Components\n\n#### Likelihood Ratio $\\rho_i$\n- Measures how much more likely the new policy $\\pi_{\\theta}$ is to produce an output $o_i$ compared to the old policy $\\pi_{\\theta_{old}}$.\n- $$\\rho_i = \\frac{\\pi_{\\theta}(o_i | q)}{\\pi_{\\theta_{old}}(o_i | q)}$$\n\n#### Advantage Function $A_i$\n- Evaluates the benefit of taking action $o_i$ compared to the average action.\n- $$A_i = \\frac{r_i - \\text{mean}(r_1, \\ldots, r_G)}{\\text{std}(r_1, \\ldots, r_G)}$$\n- Where $r_i$ is the reward for the $i$-th action.\n\n#### Clipping Mechanism\n- Restricts the likelihood ratio to the range $[1 - \\epsilon, 1 + \\epsilon]$ to prevent large updates.\n- $$\\text{clip}(\\rho_i, 1 - \\epsilon, 1 + \\epsilon)$$\n\n#### KL Divergence Penalty\n- Ensures the new policy $\\pi_{\\theta}$ does not deviate significantly from the reference policy $\\pi_{ref}$.\n- $$-\\beta D_{KL}(\\pi_{\\theta} \\| \\pi_{ref})$$\n\n### Benefits of GRPO\n\n#### Stability\n- The clipping mechanism prevents drastic policy updates, ensuring stable training.\n\n#### Controlled 
Exploration\n- The KL divergence penalty maintains a balance between exploring new policies and sticking close to a reliable reference policy.\n\n#### Improved Performance\n- By carefully managing policy updates, GRPO can lead to more effective learning and better policy performance.\n\n### Use Cases\n\n#### Reinforcement Learning Tasks\n- Suitable for environments requiring stable and efficient policy updates.\n- also a key component used for the DeepSeek-R1 model\n\n#### Complex Decision-Making Problems\n- Effective in scenarios with high-dimensional action spaces where maintaining policy stability is crucial.\n\n### Conclusion\n\nGRPO enhances policy optimization in reinforcement learning by combining the benefits of PPO with an additional KL divergence penalty. This ensures that policy updates are both effective and stable, leading to more reliable and performant learning agents.", + "starter_code": "import numpy as np\n\ndef grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:\n\t\"\"\"\n\tCompute the GRPO objective function.\n\n\tArgs:\n\t\trhos: List of likelihood ratios (p_i) = pi_theta(o_i | q) / pi_theta_old(o_i | q).\n\t\tA: List of advantage estimates (A_i).\n\t\tpi_theta_old: List representing the old policy probabilities pi_theta_old(o_i | q).\n\t\tpi_theta_ref: List representing the reference policy probabilities pi_ref(o_i | q).\n\t\tepsilon: Clipping parameter (eps).\n\t\tbeta: KL divergence penalty coefficient (beta).\n\n\tReturns:\n\t\tThe computed GRPO objective value.\n\t\"\"\"\n\t# Your code here\n\tpass", + "solution": "import numpy as np\n\ndef grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:\n \"\"\"\n Compute the GRPO objective function.\n\n Args:\n rhos: List of likelihood ratios (ρ_i) = π_theta(o_i | q) / π_theta_old(o_i | q).\n A: List of advantage estimates (A_i).\n pi_theta_old: List representing the old policy probabilities π_theta_old(o_i | q).\n pi_theta_ref: 
List representing the reference policy probabilities π_ref(o_i | q).\n epsilon: Clipping parameter (ϵ).\n beta: KL divergence penalty coefficient (β).\n\n Returns:\n The computed GRPO objective value.\n \"\"\"\n G = len(rhos)\n if not (len(A) == len(pi_theta_old) == len(pi_theta_ref) == G):\n raise ValueError(\"All input lists must have the same length.\")\n \n # Compute clipped likelihood ratios\n clipped_rhos = np.clip(rhos, 1 - epsilon, 1 + epsilon)\n \n # Compute the minimum terms for the objective\n unclipped = np.array(rhos) * np.array(A)\n clipped = clipped_rhos * np.array(A)\n min_terms = np.minimum(unclipped, clipped)\n average_min = np.mean(min_terms)\n \n # Compute pi_theta from rhos and pi_theta_old\n pi_theta = np.array(rhos) * np.array(pi_theta_old)\n \n # Normalize pi_theta and pi_theta_ref to ensure they are valid probability distributions\n pi_theta /= np.sum(pi_theta)\n pi_theta_ref /= np.sum(pi_theta_ref)\n \n # Compute KL divergence D_KL(pi_theta || pi_theta_ref)\n kl_divergence = np.sum(pi_theta * np.log(pi_theta / pi_theta_ref + 1e-10)) # Added epsilon to avoid log(0)\n \n # Compute the final objective\n objective = average_min - beta * kl_divergence\n \n return objective", + "example": { + "input": "grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01)", + "output": "1.032749", + "reasoning": "The function calculates the GRPO objective by first clipping the likelihood ratios, computing the minimum terms, averaging them, and then subtracting the KL divergence penalty scaled by beta." 
+ }, + "test_cases": [ + { + "test": "print(round(grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01),6))", + "expected_output": "1.032749" + }, + { + "test": "print(round(grpo_objective([0.9, 1.1], [1.0, 1.0], [1.0, 1.0], [0.8, 1.2], epsilon=0.1, beta=0.05),6))", + "expected_output": "0.999743" + }, + { + "test": "print(round(grpo_objective([1.5, 0.5, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.2, 0.7, 1.3], epsilon=0.15, beta=0.02),6))", + "expected_output": "0.882682" + }, + { + "test": "print(round(grpo_objective([1.0], [1.0], [1.0], [1.0], epsilon=0.1, beta=0.01),6))", + "expected_output": "1.0" + } + ] +} \ No newline at end of file diff --git a/.history/build/101_20251128233327.json b/.history/build/101_20251128233327.json new file mode 100644 index 00000000..af1b61c7 --- /dev/null +++ b/.history/build/101_20251128233327.json @@ -0,0 +1,42 @@ +{ + "id": "101", + "title": "Implement the GRPO Objective Function", + "difficulty": "hard", + "category": "Reinforcement Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/moe18", + "name": "Moe Chabot" + } + ], + "description": "Implement the GRPO (Group Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability.", + "learn_section": "### Understanding GRPO (Group Relative Policy Optimization)\n\nGRPO is an advanced policy optimization algorithm in reinforcement learning that updates policy parameters while ensuring training stability. 
It builds upon Proximal Policy Optimization (PPO) by incorporating a KL divergence penalty to keep the new policy close to a reference policy.\n\n### Mathematical Definition\n\nThe GRPO objective function is defined as:\n\n$$\nJ_{GRPO}(\\theta) = \\mathbb{E}_{q \\sim P(Q), \\{o_i\\}_{i=1}^G \\sim \\pi_{\\theta_{old}}(O|q)} \\left[ \\frac{1}{G} \\sum_{i=1}^G \\min\\left( \\rho_i A_i, \\text{clip}(\\rho_i, 1-\\epsilon, 1+\\epsilon) A_i \\right) - \\beta D_{KL}(\\pi_{\\theta} \\| \\pi_{ref}) \\right]\n$$\n\nWhere:\n\n- $\\rho_i = \\frac{\\pi_{\\theta}(o_i | q)}{\\pi_{\\theta_{old}}(o_i | q)}$ is the likelihood ratio.\n- $A_i$ is the advantage estimate for the $i$-th action.\n- $\\epsilon$ is the clipping parameter.\n- $\\beta$ controls the influence of the KL divergence penalty.\n- $D_{KL}$ is the Kullback-Leibler divergence between the new policy $\\pi_{\\theta}$ and the reference policy $\\pi_{ref}$.\n\n### Key Components\n\n#### Likelihood Ratio $\\rho_i$\n- Measures how much more likely the new policy $\\pi_{\\theta}$ is to produce an output $o_i$ compared to the old policy $\\pi_{\\theta_{old}}$.\n- $$\\rho_i = \\frac{\\pi_{\\theta}(o_i | q)}{\\pi_{\\theta_{old}}(o_i | q)}$$\n\n#### Advantage Function $A_i$\n- Evaluates the benefit of taking action $o_i$ compared to the average action.\n- $$A_i = \\frac{r_i - \\text{mean}(r_1, \\ldots, r_G)}{\\text{std}(r_1, \\ldots, r_G)}$$\n- Where $r_i$ is the reward for the $i$-th action.\n\n#### Clipping Mechanism\n- Restricts the likelihood ratio to the range $[1 - \\epsilon, 1 + \\epsilon]$ to prevent large updates.\n- $$\\text{clip}(\\rho_i, 1 - \\epsilon, 1 + \\epsilon)$$\n\n#### KL Divergence Penalty\n- Ensures the new policy $\\pi_{\\theta}$ does not deviate significantly from the reference policy $\\pi_{ref}$.\n- $$-\\beta D_{KL}(\\pi_{\\theta} \\| \\pi_{ref})$$\n\n### Benefits of GRPO\n\n#### Stability\n- The clipping mechanism prevents drastic policy updates, ensuring stable training.\n\n#### Controlled 
Exploration\n- The KL divergence penalty maintains a balance between exploring new policies and sticking close to a reliable reference policy.\n\n#### Improved Performance\n- By carefully managing policy updates, GRPO can lead to more effective learning and better policy performance.\n\n### Use Cases\n\n#### Reinforcement Learning Tasks\n- Suitable for environments requiring stable and efficient policy updates.\n- also a key component used for the DeepSeek-R1 model\n\n#### Complex Decision-Making Problems\n- Effective in scenarios with high-dimensional action spaces where maintaining policy stability is crucial.\n\n### Conclusion\n\nGRPO enhances policy optimization in reinforcement learning by combining the benefits of PPO with an additional KL divergence penalty. This ensures that policy updates are both effective and stable, leading to more reliable and performant learning agents.", + "starter_code": "import numpy as np\n\ndef grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:\n\t\"\"\"\n\tCompute the GRPO objective function.\n\n\tArgs:\n\t\trhos: List of likelihood ratios (p_i) = pi_theta(o_i | q) / pi_theta_old(o_i | q).\n\t\tA: List of advantage estimates (A_i).\n\t\tpi_theta_old: List representing the old policy probabilities pi_theta_old(o_i | q).\n\t\tpi_theta_ref: List representing the reference policy probabilities pi_ref(o_i | q).\n\t\tepsilon: Clipping parameter (eps).\n\t\tbeta: KL divergence penalty coefficient (beta).\n\n\tReturns:\n\t\tThe computed GRPO objective value.\n\t\"\"\"\n\t# Your code here\n\tpass", + "solution": "import numpy as np\n\ndef grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:\n \"\"\"\n Compute the GRPO objective function.\n\n Args:\n rhos: List of likelihood ratios (ρ_i) = π_theta(o_i | q) / π_theta_old(o_i | q).\n A: List of advantage estimates (A_i).\n pi_theta_old: List representing the old policy probabilities π_theta_old(o_i | q).\n pi_theta_ref: 
List representing the reference policy probabilities π_ref(o_i | q).\n epsilon: Clipping parameter (ϵ).\n beta: KL divergence penalty coefficient (β).\n\n Returns:\n The computed GRPO objective value.\n \"\"\"\n G = len(rhos)\n if not (len(A) == len(pi_theta_old) == len(pi_theta_ref) == G):\n raise ValueError(\"All input lists must have the same length.\")\n \n # Compute clipped likelihood ratios\n clipped_rhos = np.clip(rhos, 1 - epsilon, 1 + epsilon)\n \n # Compute the minimum terms for the objective\n unclipped = np.array(rhos) * np.array(A)\n clipped = clipped_rhos * np.array(A)\n min_terms = np.minimum(unclipped, clipped)\n average_min = np.mean(min_terms)\n \n # Compute pi_theta from rhos and pi_theta_old\n pi_theta = np.array(rhos) * np.array(pi_theta_old)\n \n # Normalize pi_theta and pi_theta_ref to ensure they are valid probability distributions\n pi_theta /= np.sum(pi_theta)\n pi_theta_ref /= np.sum(pi_theta_ref)\n \n # Compute KL divergence D_KL(pi_theta || pi_theta_ref)\n kl_divergence = np.sum(pi_theta * np.log(pi_theta / pi_theta_ref + 1e-10)) # Added epsilon to avoid log(0)\n \n # Compute the final objective\n objective = average_min - beta * kl_divergence\n \n return objective", + "example": { + "input": "grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01)", + "output": "1.032749", + "reasoning": "The function calculates the GRPO objective by first clipping the likelihood ratios, computing the minimum terms, averaging them, and then subtracting the KL divergence penalty scaled by beta." 
+ }, + "test_cases": [ + { + "test": "print(round(grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01),6))", + "expected_output": "1.032708" + }, + { + "test": "print(round(grpo_objective([0.9, 1.1], [1.0, 1.0], [1.0, 1.0], [0.8, 1.2], epsilon=0.1, beta=0.05),6))", + "expected_output": "0.999736" + }, + { + "test": "print(round(grpo_objective([1.5, 0.5, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.2, 0.7, 1.3], epsilon=0.15, beta=0.02),6))", + "expected_output": "0.882672" + }, + { + "test": "print(round(grpo_objective([1.0], [1.0], [1.0], [1.0], epsilon=0.1, beta=0.01),6))", + "expected_output": "1.0" + } + ] +} \ No newline at end of file diff --git a/.history/build/101_20251128233328.json b/.history/build/101_20251128233328.json new file mode 100644 index 00000000..af1b61c7 --- /dev/null +++ b/.history/build/101_20251128233328.json @@ -0,0 +1,42 @@ +{ + "id": "101", + "title": "Implement the GRPO Objective Function", + "difficulty": "hard", + "category": "Reinforcement Learning", + "video": "", + "likes": "0", + "dislikes": "0", + "contributor": [ + { + "profile_link": "https://github.com/moe18", + "name": "Moe Chabot" + } + ], + "description": "Implement the GRPO (Group Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability.", + "learn_section": "### Understanding GRPO (Group Relative Policy Optimization)\n\nGRPO is an advanced policy optimization algorithm in reinforcement learning that updates policy parameters while ensuring training stability. 
It builds upon Proximal Policy Optimization (PPO) by incorporating a KL divergence penalty to keep the new policy close to a reference policy.\n\n### Mathematical Definition\n\nThe GRPO objective function is defined as:\n\n$$\nJ_{GRPO}(\\theta) = \\mathbb{E}_{q \\sim P(Q), \\{o_i\\}_{i=1}^G \\sim \\pi_{\\theta_{old}}(O|q)} \\left[ \\frac{1}{G} \\sum_{i=1}^G \\min\\left( \\rho_i A_i, \\text{clip}(\\rho_i, 1-\\epsilon, 1+\\epsilon) A_i \\right) - \\beta D_{KL}(\\pi_{\\theta} \\| \\pi_{ref}) \\right]\n$$\n\nWhere:\n\n- $\\rho_i = \\frac{\\pi_{\\theta}(o_i | q)}{\\pi_{\\theta_{old}}(o_i | q)}$ is the likelihood ratio.\n- $A_i$ is the advantage estimate for the $i$-th action.\n- $\\epsilon$ is the clipping parameter.\n- $\\beta$ controls the influence of the KL divergence penalty.\n- $D_{KL}$ is the Kullback-Leibler divergence between the new policy $\\pi_{\\theta}$ and the reference policy $\\pi_{ref}$.\n\n### Key Components\n\n#### Likelihood Ratio $\\rho_i$\n- Measures how much more likely the new policy $\\pi_{\\theta}$ is to produce an output $o_i$ compared to the old policy $\\pi_{\\theta_{old}}$.\n- $$\\rho_i = \\frac{\\pi_{\\theta}(o_i | q)}{\\pi_{\\theta_{old}}(o_i | q)}$$\n\n#### Advantage Function $A_i$\n- Evaluates the benefit of taking action $o_i$ compared to the average action.\n- $$A_i = \\frac{r_i - \\text{mean}(r_1, \\ldots, r_G)}{\\text{std}(r_1, \\ldots, r_G)}$$\n- Where $r_i$ is the reward for the $i$-th action.\n\n#### Clipping Mechanism\n- Restricts the likelihood ratio to the range $[1 - \\epsilon, 1 + \\epsilon]$ to prevent large updates.\n- $$\\text{clip}(\\rho_i, 1 - \\epsilon, 1 + \\epsilon)$$\n\n#### KL Divergence Penalty\n- Ensures the new policy $\\pi_{\\theta}$ does not deviate significantly from the reference policy $\\pi_{ref}$.\n- $$-\\beta D_{KL}(\\pi_{\\theta} \\| \\pi_{ref})$$\n\n### Benefits of GRPO\n\n#### Stability\n- The clipping mechanism prevents drastic policy updates, ensuring stable training.\n\n#### Controlled 
Exploration\n- The KL divergence penalty maintains a balance between exploring new policies and sticking close to a reliable reference policy.\n\n#### Improved Performance\n- By carefully managing policy updates, GRPO can lead to more effective learning and better policy performance.\n\n### Use Cases\n\n#### Reinforcement Learning Tasks\n- Suitable for environments requiring stable and efficient policy updates.\n- also a key component used for the DeepSeek-R1 model\n\n#### Complex Decision-Making Problems\n- Effective in scenarios with high-dimensional action spaces where maintaining policy stability is crucial.\n\n### Conclusion\n\nGRPO enhances policy optimization in reinforcement learning by combining the benefits of PPO with an additional KL divergence penalty. This ensures that policy updates are both effective and stable, leading to more reliable and performant learning agents.", + "starter_code": "import numpy as np\n\ndef grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:\n\t\"\"\"\n\tCompute the GRPO objective function.\n\n\tArgs:\n\t\trhos: List of likelihood ratios (p_i) = pi_theta(o_i | q) / pi_theta_old(o_i | q).\n\t\tA: List of advantage estimates (A_i).\n\t\tpi_theta_old: List representing the old policy probabilities pi_theta_old(o_i | q).\n\t\tpi_theta_ref: List representing the reference policy probabilities pi_ref(o_i | q).\n\t\tepsilon: Clipping parameter (eps).\n\t\tbeta: KL divergence penalty coefficient (beta).\n\n\tReturns:\n\t\tThe computed GRPO objective value.\n\t\"\"\"\n\t# Your code here\n\tpass", + "solution": "import numpy as np\n\ndef grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:\n \"\"\"\n Compute the GRPO objective function.\n\n Args:\n rhos: List of likelihood ratios (ρ_i) = π_theta(o_i | q) / π_theta_old(o_i | q).\n A: List of advantage estimates (A_i).\n pi_theta_old: List representing the old policy probabilities π_theta_old(o_i | q).\n pi_theta_ref: 
List representing the reference policy probabilities π_ref(o_i | q).\n epsilon: Clipping parameter (ϵ).\n beta: KL divergence penalty coefficient (β).\n\n Returns:\n The computed GRPO objective value.\n \"\"\"\n G = len(rhos)\n if not (len(A) == len(pi_theta_old) == len(pi_theta_ref) == G):\n raise ValueError(\"All input lists must have the same length.\")\n \n # Compute clipped likelihood ratios\n clipped_rhos = np.clip(rhos, 1 - epsilon, 1 + epsilon)\n \n # Compute the minimum terms for the objective\n unclipped = np.array(rhos) * np.array(A)\n clipped = clipped_rhos * np.array(A)\n min_terms = np.minimum(unclipped, clipped)\n average_min = np.mean(min_terms)\n \n # Compute pi_theta from rhos and pi_theta_old\n pi_theta = np.array(rhos) * np.array(pi_theta_old)\n \n # Normalize pi_theta and pi_theta_ref to ensure they are valid probability distributions\n pi_theta /= np.sum(pi_theta)\n pi_theta_ref /= np.sum(pi_theta_ref)\n \n # Compute KL divergence D_KL(pi_theta || pi_theta_ref)\n kl_divergence = np.sum(pi_theta * np.log(pi_theta / pi_theta_ref + 1e-10)) # Added epsilon to avoid log(0)\n \n # Compute the final objective\n objective = average_min - beta * kl_divergence\n \n return objective", + "example": { + "input": "grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01)", + "output": "1.032749", + "reasoning": "The function calculates the GRPO objective by first clipping the likelihood ratios, computing the minimum terms, averaging them, and then subtracting the KL divergence penalty scaled by beta." 
+ }, + "test_cases": [ + { + "test": "print(round(grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01),6))", + "expected_output": "1.032708" + }, + { + "test": "print(round(grpo_objective([0.9, 1.1], [1.0, 1.0], [1.0, 1.0], [0.8, 1.2], epsilon=0.1, beta=0.05),6))", + "expected_output": "0.999736" + }, + { + "test": "print(round(grpo_objective([1.5, 0.5, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.2, 0.7, 1.3], epsilon=0.15, beta=0.02),6))", + "expected_output": "0.882672" + }, + { + "test": "print(round(grpo_objective([1.0], [1.0], [1.0], [1.0], epsilon=0.1, beta=0.01),6))", + "expected_output": "1.0" + } + ] +} \ No newline at end of file diff --git a/.history/questions/101_implement-the-grpo-objective-function/description_20251128232117.md b/.history/questions/101_implement-the-grpo-objective-function/description_20251128232117.md new file mode 100644 index 00000000..66e9da43 --- /dev/null +++ b/.history/questions/101_implement-the-grpo-objective-function/description_20251128232117.md @@ -0,0 +1 @@ +Implement the GRPO (Generalized Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability. diff --git a/.history/questions/101_implement-the-grpo-objective-function/description_20251128232318.md b/.history/questions/101_implement-the-grpo-objective-function/description_20251128232318.md new file mode 100644 index 00000000..ea11b8d8 --- /dev/null +++ b/.history/questions/101_implement-the-grpo-objective-function/description_20251128232318.md @@ -0,0 +1 @@ +Implement the GRPO (Group Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. 
Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability. diff --git a/.history/questions/101_implement-the-grpo-objective-function/example_20251128232117.json b/.history/questions/101_implement-the-grpo-objective-function/example_20251128232117.json new file mode 100644 index 00000000..7fc59074 --- /dev/null +++ b/.history/questions/101_implement-the-grpo-objective-function/example_20251128232117.json @@ -0,0 +1,5 @@ +{ + "input": "grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01)", + "output": "1.032749", + "reasoning": "The function calculates the GRPO objective by first clipping the likelihood ratios, computing the minimum terms, averaging them, and then subtracting the KL divergence penalty scaled by beta." +} diff --git a/.history/questions/101_implement-the-grpo-objective-function/example_20251128233252.json b/.history/questions/101_implement-the-grpo-objective-function/example_20251128233252.json new file mode 100644 index 00000000..b47fbd6c --- /dev/null +++ b/.history/questions/101_implement-the-grpo-objective-function/example_20251128233252.json @@ -0,0 +1,5 @@ +{ + "input": "grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01)", + "output": "1.032708", + "reasoning": "The function calculates the GRPO objective by first clipping the likelihood ratios, computing the minimum terms, averaging them, and then subtracting the KL divergence penalty scaled by beta." 
import numpy as np


def grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) -> float:
    """
    Compute the GRPO (Group Relative Policy Optimization) objective.

    The result is the mean clipped surrogate term minus ``beta`` times a KL
    penalty between the current policy and the reference policy.

    Args:
        rhos: Likelihood ratios rho_i = pi_theta(o_i | q) / pi_theta_old(o_i | q).
        A: Advantage estimates A_i, one per ratio.
        pi_theta_old: Old-policy probabilities pi_theta_old(o_i | q).
        pi_theta_ref: Reference-policy probabilities pi_ref(o_i | q).
        epsilon: Clipping parameter; ratios are clipped to
            [1 - epsilon, 1 + epsilon].
        beta: KL divergence penalty coefficient.

    Returns:
        The scalar GRPO objective as a Python float.

    Raises:
        ValueError: If the inputs are empty or do not all have the same length.
    """
    G = len(rhos)
    if not (len(A) == len(pi_theta_old) == len(pi_theta_ref) == G):
        raise ValueError("All input lists must have the same length.")
    if G == 0:
        # The group mean and the normalisations below are undefined for G == 0.
        raise ValueError("Input lists must not be empty.")

    rho = np.asarray(rhos, dtype=float)
    advantages = np.asarray(A, dtype=float)

    # Clipped surrogate: min(rho_i * A_i, clip(rho_i, 1 - eps, 1 + eps) * A_i).
    clipped_rho = np.clip(rho, 1 - epsilon, 1 + epsilon)
    surrogate = np.minimum(rho * advantages, clipped_rho * advantages)
    average_min = np.mean(surrogate)

    # Recover pi_theta from the ratios, then normalise both policies over the
    # group. Out-of-place division keeps caller-supplied arrays untouched and
    # also works when the caller passes integer arrays.
    pi_theta = rho * np.asarray(pi_theta_old, dtype=float)
    pi_theta = pi_theta / np.sum(pi_theta)
    pi_ref = np.asarray(pi_theta_ref, dtype=float)
    pi_ref = pi_ref / np.sum(pi_ref)

    # KL estimator from the GRPO paper (https://arxiv.org/pdf/2402.03300):
    #   pi_ref / pi_theta - log(pi_ref / pi_theta) - 1   (per sample, >= 0)
    # The per-sample terms must be averaged over the group so that the
    # objective is a scalar rather than an array.
    log_ratio = np.log(pi_ref) - np.log(pi_theta)
    kl_divergence = np.mean(np.exp(log_ratio) - log_ratio - 1)

    return float(average_min - beta * kl_divergence)
"expected_output": "1.0" + } +] diff --git a/.history/questions/101_implement-the-grpo-objective-function/tests_20251128233228.json b/.history/questions/101_implement-the-grpo-objective-function/tests_20251128233228.json new file mode 100644 index 00000000..4394dedd --- /dev/null +++ b/.history/questions/101_implement-the-grpo-objective-function/tests_20251128233228.json @@ -0,0 +1,18 @@ +[ + { + "test": "print(round(grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01),6))", + "expected_output": "1.032708" + }, + { + "test": "print(round(grpo_objective([0.9, 1.1], [1.0, 1.0], [1.0, 1.0], [0.8, 1.2], epsilon=0.1, beta=0.05),6))", + "expected_output": "0.999736" + }, + { + "test": "print(round(grpo_objective([1.5, 0.5, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.2, 0.7, 1.3], epsilon=0.15, beta=0.02),6))", + "expected_output": "0.882672" + }, + { + "test": "print(round(grpo_objective([1.0], [1.0], [1.0], [1.0], epsilon=0.1, beta=0.01),6))", + "expected_output": "1.0" + } +] diff --git a/build/101.json b/build/101.json index 21e4e125..af1b61c7 100644 --- a/build/101.json +++ b/build/101.json @@ -24,15 +24,15 @@ "test_cases": [ { "test": "print(round(grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01),6))", - "expected_output": "1.032749" + "expected_output": "1.032708" }, { "test": "print(round(grpo_objective([0.9, 1.1], [1.0, 1.0], [1.0, 1.0], [0.8, 1.2], epsilon=0.1, beta=0.05),6))", - "expected_output": "0.999743" + "expected_output": "0.999736" }, { "test": "print(round(grpo_objective([1.5, 0.5, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.2, 0.7, 1.3], epsilon=0.15, beta=0.02),6))", - "expected_output": "0.882682" + "expected_output": "0.882672" }, { "test": "print(round(grpo_objective([1.0], [1.0], [1.0], [1.0], epsilon=0.1, beta=0.01),6))", diff --git a/questions/101_implement-the-grpo-objective-function/description.md 
b/questions/101_implement-the-grpo-objective-function/description.md index 66e9da43..ea11b8d8 100644 --- a/questions/101_implement-the-grpo-objective-function/description.md +++ b/questions/101_implement-the-grpo-objective-function/description.md @@ -1 +1 @@ -Implement the GRPO (Generalized Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability. +Implement the GRPO (Group Relative Policy Optimization) objective function used to optimize policy parameters in reinforcement learning. Your task is to compute the GRPO objective given the likelihood ratios, advantage estimates, old policy probabilities, reference policy probabilities, and apply the clipping mechanism and KL divergence penalty correctly to maintain training stability. diff --git a/questions/101_implement-the-grpo-objective-function/example.json b/questions/101_implement-the-grpo-objective-function/example.json index 7fc59074..03e9bb29 100644 --- a/questions/101_implement-the-grpo-objective-function/example.json +++ b/questions/101_implement-the-grpo-objective-function/example.json @@ -1,5 +1,5 @@ { "input": "grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01)", - "output": "1.032749", + "output": "1.032708", "reasoning": "The function calculates the GRPO objective by first clipping the likelihood ratios, computing the minimum terms, averaging them, and then subtracting the KL divergence penalty scaled by beta." 
-} +} \ No newline at end of file diff --git a/questions/101_implement-the-grpo-objective-function/solution.py b/questions/101_implement-the-grpo-objective-function/solution.py index 32d5432b..7e0fdbdc 100644 --- a/questions/101_implement-the-grpo-objective-function/solution.py +++ b/questions/101_implement-the-grpo-objective-function/solution.py @@ -35,9 +35,10 @@ def grpo_objective(rhos, A, pi_theta_old, pi_theta_ref, epsilon=0.2, beta=0.01) pi_theta /= np.sum(pi_theta) pi_theta_ref /= np.sum(pi_theta_ref) - # Compute KL divergence D_KL(pi_theta || pi_theta_ref) - kl_divergence = np.sum(pi_theta * np.log(pi_theta / pi_theta_ref + 1e-10)) # Added epsilon to avoid log(0) - + # Compute KL divergence following GRPO paper (https://arxiv.org/pdf/2402.03300) + log_ratio = np.log(pi_theta_ref) - np.log(pi_theta) + kl_divergence = np.exp(log_ratio) - log_ratio - 1 + # Compute the final objective objective = average_min - beta * kl_divergence diff --git a/questions/101_implement-the-grpo-objective-function/tests.json b/questions/101_implement-the-grpo-objective-function/tests.json index 80a7f620..4394dedd 100644 --- a/questions/101_implement-the-grpo-objective-function/tests.json +++ b/questions/101_implement-the-grpo-objective-function/tests.json @@ -1,15 +1,15 @@ [ { "test": "print(round(grpo_objective([1.2, 0.8, 1.1], [1.0, 1.0, 1.0], [0.9, 1.1, 1.0], [1.0, 0.5, 1.5], epsilon=0.2, beta=0.01),6))", - "expected_output": "1.032749" + "expected_output": "1.032708" }, { "test": "print(round(grpo_objective([0.9, 1.1], [1.0, 1.0], [1.0, 1.0], [0.8, 1.2], epsilon=0.1, beta=0.05),6))", - "expected_output": "0.999743" + "expected_output": "0.999736" }, { "test": "print(round(grpo_objective([1.5, 0.5, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.2, 0.7, 1.3], epsilon=0.15, beta=0.02),6))", - "expected_output": "0.882682" + "expected_output": "0.882672" }, { "test": "print(round(grpo_objective([1.0], [1.0], [1.0], [1.0], epsilon=0.1, beta=0.01),6))",