tests: map-based gsm8k (#1107)

mandel · web-flow · commit ff19fe5ffaaf · 2025-08-11T18:33:54.000-04:00
Signed-off-by: Louis Mandel <lmandel@us.ibm.com>
diff --git a/examples/gsm8k/gsm8k-loop-fission.pdl b/examples/gsm8k/gsm8k-loop-fission.pdl
@@ -18,6 +18,10 @@ defs:
   # How many problems to evaluate.  The entire dataset is 1319 problems.
   # MAX_ITERATIONS: 1319
   MAX_ITERATIONS: 50
+  # Which model to use
+  # MODEL: ollama/granite-code:8b
+  # MODEL: ollama/granite3.2:8b
+  MODEL: watsonx/ibm/granite-3-2-8b-instruct
 
   # PDL variables that hold statistics
   SUCCESSES: 0
@@ -29,8 +33,7 @@ text:
     TEST: ${ TESTS }
   repeat:
     # Ask the LLM for the answer
-    # - model: ollama/granite-code:8b
-    model: ollama/granite3.2:8b
+    model: ${ MODEL }
       # First, get LLM to answer the question
     input: |
       Question: ${ TEST.question }
@@ -51,8 +54,7 @@ text:
     LLM_FULL_ANSWER: ${ ALL_LLM_FULL_A }
   repeat:
     # Next, get LLM to convert its answer into a single JSON key/value
-    # - model: ollama/granite-code:8b
-    model: ollama/granite3.2:8b
+    model: ${ MODEL }
     input: | # 'input' is the prompt
       Generate the final answer from the conclusion of this text as JSON with a single key named answer.
       ${ LLM_FULL_ANSWER }
diff --git a/examples/gsm8k/gsm8k.pdl b/examples/gsm8k/gsm8k.pdl
@@ -0,0 +1,105 @@
+#!/usr/bin/env pdl
+
+# Grade School Math https://github.com/openai/grade-school-math is an
+# open source AI dataset from 2021.
+# 
+# https://github.com/openai/grade-school-math/blob/master/grade_school_math/data/test.jsonl
+# is a file with 1319 questions and answers.
+#
+#
+
+description: Grade School Math
+defs:
+  # The Grade School Math Dataset
+  ALL_TESTS:
+    read: ./test.jsonl
+    parser: jsonl
+
+  # How many problems to evaluate.  The entire dataset is 1319 problems.
+  # MAX_ITERATIONS: 1319
+  MAX_ITERATIONS: 10
+  # Which model to use
+  # MODEL: ollama/granite-code:8b
+  # MODEL: ollama/granite3.2:8b
+  MODEL: watsonx/ibm/granite-3-2-8b-instruct
+
+  # PDL variables that hold statistics
+  SUCCESSES: 0
+  FAILURES: 0
+  TESTS: ${ ALL_TESTS[:MAX_ITERATIONS] }
+  SOLUTIONS:
+    defs:
+      stats:
+        function:
+          r1: { success: integer, text: string}
+          r2: { success: integer, text: string}
+        return:
+          data:
+            success: ${ r1.success + r2.success }
+            text: ${ r1.text + "\n\n" + r2.text }
+    for:
+      TEST: ${ TESTS }
+    # maxWorkers: 10
+    map:
+      defs:
+        # First phase: ask LLM the Grade School Math questions
+        LLM_FULL_ANSWER:
+          model: ${ MODEL }
+            # First, get LLM to answer the question
+          input: |
+            Question: ${ TEST.question }
+            Answer: 
+        # Next, get LLM to convert its answer into a single JSON key/value
+        SIMPLIFIED_LLM_ANSWER:
+          model: ${ MODEL }
+          input: |
+            Generate the final answer from the conclusion of this text as JSON with a single key named answer.
+            ${ LLM_FULL_ANSWER }
+      # Third phase: compare with the Grade School Math ground truth
+      lastOf:
+        # Convert the JSON string to JSON.  (We do this in a separate step so
+        # we have access to the original for debugging.)
+        - data: ${ SIMPLIFIED_LLM_ANSWER }
+          parser: json
+          def: JSON_SIMPLIFIED_LLM_ANSWER
+
+          # Strip off any prefix or suffix off the number (dollar signs, units, etc)
+          # and place it in the JSON format { "answer": ... }
+        - data: ${ JSON_SIMPLIFIED_LLM_ANSWER.answer|string if 'answer' in JSON_SIMPLIFIED_LLM_ANSWER else ("MISSING 'answer' in " + LLM_FULL_ANSWER) }
+          parser:
+            regex: "[^0-9]*(?P<answer>[0-9]+).*$"
+            spec:
+              answer: string
+          def: EXTRACTED_SIMPLIFIED_LLM_ANSWER
+        # (In case the simplified answer did not contain digits.)
+        - if: ${ EXTRACTED_SIMPLIFIED_LLM_ANSWER == None }
+          then:
+            def: EXTRACTED_SIMPLIFIED_LLM_ANSWER
+            data:
+              answer: "none"
+
+        # Extract the expected answer, which in this test data always follows "#### "
+        # into { "answer": ... }
+        - data: ${ TEST.answer }
+          parser:
+            regex: "(.|\n)*#### (?P<answer>([0-9])*)\n*"
+            spec:
+              answer: string
+          def: EXTRACTED_GROUND_TRUTH
+
+        # Did we get the expected answer?  Yields { success: 1|0, text: report }.
+        - if: ${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer == EXTRACTED_GROUND_TRUTH.answer }
+          then:
+            object:
+              success: 1
+              text: |
+                LLM got right answer for '${ LLM_FULL_ANSWER }' which was simplified to '${ SIMPLIFIED_LLM_ANSWER }' which was extracted to '${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer }'
+          else:
+            object:
+              success: 0
+              text: |
+                WRONG! Wanted ${ EXTRACTED_GROUND_TRUTH.answer } / LLM said '${ LLM_FULL_ANSWER }' which was simplified to '${ SIMPLIFIED_LLM_ANSWER }' which was extracted to '${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer }'
+    join:
+      reduce: ${ stats }
+text:
+  Finished, ${ SOLUTIONS.success } successes on ${ MAX_ITERATIONS } tests
diff --git a/src/pdl/pdl_dumper.py b/src/pdl/pdl_dumper.py
@@ -259,6 +259,8 @@ def block_to_dict(  # noqa: C901
                 d["for"] = expr_to_dict(block.for_, json_compatible)
             if block.index is not None:
                 d["index"] = block.index
+            if block.maxWorkers is not None:
+                d["maxWorkers"] = expr_to_dict(block.maxWorkers, json_compatible)
             d["map"] = block_to_dict(block.map, json_compatible)
             if block.maxIterations is not None:
                 d["maxIterations"] = expr_to_dict(block.maxIterations, json_compatible)
diff --git a/src/pdl/pdl_interpreter.py b/src/pdl/pdl_interpreter.py
@@ -193,17 +193,15 @@ def with_role(self: "InterpreterState", role: RoleType) -> "InterpreterState":
         return self.model_copy(update={"role": role})
 
     def with_id(self: "InterpreterState", n: str) -> "InterpreterState":
-        stack = self.id_stack.copy() if self.id_stack is not None else []
-        stack.append(n)
-        return self.model_copy(update={"id_stack": stack})
+        stack = self.id_stack if self.id_stack is not None else []
+        return self.model_copy(update={"id_stack": stack + [n]})
 
     def with_iter(self: "InterpreterState", i: int) -> "InterpreterState":
         return self.with_id(str(i))
 
     def with_pop(self: "InterpreterState") -> "InterpreterState":
-        stack = self.id_stack.copy() if self.id_stack is not None else []
-        stack.pop()
-        return self.model_copy(update={"id_stack": stack})
+        stack = self.id_stack if self.id_stack is not None else []
+        return self.model_copy(update={"id_stack": stack[:-1]})
 
 
 class ClosureBlock(FunctionBlock):
@@ -961,7 +959,6 @@ def process_block_body(
             block, max_iterations = _evaluate_max_iterations_field(scope, block, loc)
             block = _evaluate_join_field(scope, block, loc)
             map_loc = append(loc, "map")
-            iidx = 0
             try:
                 if max_iterations is not None:
                     index_iterator: Any = range(max_iterations)
diff --git a/tests/test_examples_run.yaml b/tests/test_examples_run.yaml
@@ -13,7 +13,8 @@ check:
 skip:
   - examples/demos/react.pdl
   - examples/cldk/cldk-assistant.pdl
-  - examples/gsm8k/gsm8.pdl
+  - examples/gsm8k/gsm8k.pdl
+  - examples/gsm8k/gsm8k-loop-fission.pdl
   - examples/gsm8k/gsm8k-plan.pdl
   - examples/gsm8k/gsm8k-plan-few-shots.pdl
   - examples/gsm8k/gsm8k-tot-few-shot.pdl