Refactor ProgramOfThought and add unit tests (#7878)

TomeHirata · web-flow · commit 0bb3fd4c31f3 · 2025-03-04T08:14:13.000+09:00
* refactor ProgramOfThought and add tests
diff --git a/dspy/predict/program_of_thought.py b/dspy/predict/program_of_thought.py
@@ -1,14 +1,37 @@
+import logging
 import re
+from typing import Union, Type
 
 import dspy
-from dspy.signatures.signature import ensure_signature
+from dspy.signatures.signature import ensure_signature, Signature
 
 from dspy.primitives.program import Module
 from dspy.primitives.python_interpreter import PythonInterpreter
 
+logger = logging.getLogger(__name__)
 
 class ProgramOfThought(Module):
-    def __init__(self, signature, max_iters=3):
+    """
+    A DSPy module that runs Python programs to solve a problem.
+    This module requires deno to be installed. Please install deno by following https://docs.deno.com/runtime/getting_started/installation/
+
+    Example:
+    ```
+    import dspy
+
+    lm = dspy.LM('openai/gpt-4o-mini')
+    dspy.configure(lm=lm)
+    pot = dspy.ProgramOfThought("question -> answer")
+    pot(question="what is 1+1?")
+    ```
+    """
+
+    def __init__(self, signature: Union[str, Type[Signature]], max_iters=3):
+        """
+        Args:
+            signature: The signature of the module.
+            max_iters: The maximum number of iterations to retry code generation and execution.
+        """
         super().__init__()
         self.signature = signature = ensure_signature(signature)
         self.max_iters = max_iters
@@ -56,6 +79,10 @@ def __init__(self, signature, max_iters=3):
                 self._generate_instruction("answer"),
             ),
         )
+        # Currently, the interpreter class checks for deno availability at execution time.
+        # We may consider checking it at initialization time to give users better instructions.
+        self.interpreter = PythonInterpreter()
+
     def _generate_signature(self, mode):
         signature_dict = dict(self.input_fields)
         fields_for_mode = {
@@ -125,7 +152,7 @@ def _generate_instruction(self, mode):
         return "\n".join(instr)
 
 
-    def parse_code(self, code_data):
+    def _parse_code(self, code_data):
         code = (
             code_data.get("generated_code", "").split("---", 1)[0].split("\n\n\n", 1)[0]
         )
@@ -148,35 +175,42 @@ def parse_code(self, code_data):
             )
         return code_block, None
 
-    def execute_code(self, code):
+    def _execute_code(self, code):
+        """
+        Execute the code using PythonInterpreter and return the output or error.
+        """
         if not code:
-            return code, None, "Error: Empty code before execution."
-        interpreter = PythonInterpreter()
+            return None, "Error: Empty code before execution."
+        
         try:
-            output = str(interpreter.execute(code))
-            return code, output, None
+            output = str(self.interpreter.execute(code))
+            return output, None
         except Exception as e:
-            return code, None, str(e)
+            return None, str(e)
+
     def forward(self, **kwargs):
         input_kwargs = {
             field_name: kwargs[field_name] for field_name in self.input_fields
         }
         code_data = self.code_generate(**input_kwargs)
-        parsed_code, error = self.parse_code(code_data)
-        # FIXME: Don't try to execute the code if it didn't parse
-        code, output, error = self.execute_code(parsed_code)
-        hop = 0
-        while hop < self.max_iters and error:
-            print("Error in code execution")
+        output = None
+        code, error = self._parse_code(code_data)
+        if not error:
+            output, error = self._execute_code(code)
+        hop = 1
+        # Retry code generation and execution until there is no error or max_iters is reached
+        while error is not None:
+            logger.error(f"Error in code execution: {error}")
+            if hop == self.max_iters:
+                self.interpreter.shutdown()
+                raise RuntimeError(f"Max hops reached. Failed to run ProgramOfThought: {error}")
             input_kwargs.update({"previous_code": code, "error": error})
             code_data = self.code_regenerate(**input_kwargs)
-            parsed_code, error = self.parse_code(code_data)
-            # FIXME: Don't try to execute the code if it didn't parse
-            code, output, error = self.execute_code(parsed_code)
+            code, error = self._parse_code(code_data)
+            if not error:
+                output, error = self._execute_code(code)
             hop += 1
-            if hop == self.max_iters:
-                print("Max hops reached. Error persists.")
-                return None
         input_kwargs.update({"final_generated_code": code, "code_output": output})
         answer_gen_result = self.generate_answer(**input_kwargs)
-        return answer_gen_result
+        self.interpreter.shutdown()
+        return answer_gen_result
diff --git a/dspy/primitives/python_interpreter.py b/dspy/primitives/python_interpreter.py
@@ -117,14 +117,14 @@ def execute(
             # If not valid JSON, just return raw text
             result = {"output": output_line}
 
-        # If we have an error, handle SyntaxError vs. other error
+        # If we have an error, determine if it's a SyntaxError or other error using error.errorType.
         if "error" in result:
             error_msg = result["error"]
-            error_type = result.get("errorType", "")
+            error_type = result.get("errorType", "Sandbox Error")
             if error_type == "SyntaxError":
-                raise SyntaxError(error_msg)
+                raise SyntaxError(f"Invalid Python syntax. message: {error_msg}")
             else:
-                raise InterpreterError(f"Sandbox Error: {error_msg}")
+                raise InterpreterError(f"{error_type}: {error_msg}")
 
         # If there's no error, return the "output" field
         return result.get("output", None)
diff --git a/dspy/primitives/runner.js b/dspy/primitives/runner.js
@@ -86,7 +86,9 @@ sys.stderr = old_stderr
     console.log(JSON.stringify({ output }));
   } catch (error) {
     // We have an error => check if it's a SyntaxError or something else
-    const errorType = error.name || "Error";
+    // The Python error class name is stored in error.type: https://pyodide.org/en/stable/usage/api/js-api.html#pyodide.ffi.PythonError
+    const errorType = error.type || "Error";
+    // error.message is mostly blank.
     const errorMessage = (error.message || "").trim();
     console.log(JSON.stringify({
       error: errorMessage,
diff --git a/tests/predict/test_program_of_thought.py b/tests/predict/test_program_of_thought.py
@@ -1,13 +1,20 @@
+from unittest.mock import patch
+import pytest
+import shutil
+
 import dspy
 from dspy import ProgramOfThought, Signature
 from dspy.utils import DummyLM
 
+# This test suite requires deno to be installed. Please install deno following https://docs.deno.com/runtime/getting_started/installation/
+is_deno_available = shutil.which("deno") is not None
 
 class BasicQA(Signature):
     question = dspy.InputField()
     answer = dspy.OutputField(desc="often between 1 and 5 words")
 
 
+@pytest.mark.skipif(not is_deno_available, reason="Deno is not installed or not in PATH")
 def test_pot_code_generation():
     lm = DummyLM(
         [
@@ -19,9 +26,11 @@ def test_pot_code_generation():
     pot = ProgramOfThought(BasicQA)
     res = pot(question="What is 1+1?")
     assert res.answer == "2"
+    assert pot.interpreter.deno_process is None
 
 
-def test_pot_code_generation_with_error():
+@pytest.mark.skipif(not is_deno_available, reason="Deno is not installed or not in PATH")
+def test_pot_code_generation_with_one_error():
     lm = DummyLM(
         [
             {"reasoning": "Reason_A", "generated_code": "```python\nresult = 1+0/0\n```"},
@@ -34,3 +43,35 @@ def test_pot_code_generation_with_error():
     pot = ProgramOfThought(BasicQA)
     res = pot(question="What is 1+1?")
     assert res.answer == "2"
+    assert pot.interpreter.deno_process is None
+
+
+@pytest.mark.skipif(not is_deno_available, reason="Deno is not installed or not in PATH")
+def test_pot_code_generation_persistent_errors():
+    max_iters = 3
+    lm = DummyLM(
+        [
+            {"reasoning": "Reason_A", "generated_code": "```python\nresult = 1+0/0\n```"},
+        ] * max_iters
+    )
+    dspy.settings.configure(lm=lm)
+
+    pot = ProgramOfThought(BasicQA, max_iters=max_iters)
+    with pytest.raises(RuntimeError, match="Max hops reached. Failed to run ProgramOfThought: ZeroDivisionError:"):
+        pot(question="What is 1+1?")
+        assert pot.interpreter.deno_process is None
+
+
+def test_pot_code_parse_error():
+    max_iters = 3
+    lm = DummyLM(
+        [
+            {"reasoning": "Reason_A", "generated_code": "```python\ninvalid=python=code\n```"},
+        ] * max_iters
+    )
+    dspy.settings.configure(lm=lm)
+
+    pot = ProgramOfThought(BasicQA, max_iters=max_iters)
+    with patch("dspy.predict.program_of_thought.ProgramOfThought._execute_code") as mock_execute_code, pytest.raises(RuntimeError, match="Max hops reached. Failed to run ProgramOfThought: Error: Code format is not correct."):
+        pot(question="What is 1+1?")
+    mock_execute_code.assert_not_called()
diff --git a/tests/primitives/test_python_interpreter.py b/tests/primitives/test_python_interpreter.py
@@ -1,4 +1,10 @@
-from dspy.primitives.python_interpreter import PythonInterpreter
+import shutil
+import pytest
+from dspy.primitives.python_interpreter import PythonInterpreter, InterpreterError
+
+# This test suite requires deno to be installed. Please install deno following https://docs.deno.com/runtime/getting_started/installation/
+if shutil.which("deno") is None:
+    pytest.skip(reason="Deno is not installed or not in PATH")
 
 def test_execute_simple_code():
     interpreter = PythonInterpreter()
@@ -16,4 +22,16 @@ def test_user_variable_definitions():
     interpreter = PythonInterpreter()
     code = "result = number + 1\nresult"
     result = interpreter.execute(code, variables={'number': 4})
-    assert result == 5, "User variable assignment should work"
+    assert result == 5, "User variable assignment should work"
+
+def test_failure_syntax_error():
+    interpreter = PythonInterpreter()
+    code = "+++"
+    with pytest.raises(SyntaxError, match="Invalid Python syntax"):
+        interpreter.execute(code)
+
+def test_failure_zero_division():
+    interpreter = PythonInterpreter()
+    code = "1+0/0"
+    with pytest.raises(InterpreterError, match="ZeroDivisionError"):
+        interpreter.execute(code)