Skip to content

Commit ff19fe5

Browse files
authored
tests: map-based gsm8k (#1107)
Signed-off-by: Louis Mandel <[email protected]>
1 parent c845250 commit ff19fe5

File tree

5 files changed

+119
-12
lines changed

5 files changed

+119
-12
lines changed

examples/gsm8k/gsm8.pdl renamed to examples/gsm8k/gsm8k-loop-fission.pdl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ defs:
1818
# How many problems to evaluate. The entire dataset is 1319 problems.
1919
# MAX_ITERATIONS: 1319
2020
MAX_ITERATIONS: 50
21+
# Which model to use
22+
# MODEL: ollama/granite-code:8b
23+
# MODEL: ollama/granite3.2:8b
24+
MODEL: watsonx/ibm/granite-3-2-8b-instruct
2125

2226
# PDL variables that hold statistics
2327
SUCCESSES: 0
@@ -29,8 +33,7 @@ text:
2933
TEST: ${ TESTS }
3034
repeat:
3135
# Ask the LLM for the answer
32-
# - model: ollama/granite-code:8b
33-
model: ollama/granite3.2:8b
36+
model: ${ MODEL }
3437
# First, get LLM to answer the question
3538
input: |
3639
Question: ${ TEST.question }
@@ -51,8 +54,7 @@ text:
5154
LLM_FULL_ANSWER: ${ ALL_LLM_FULL_A }
5255
repeat:
5356
# Next, get LLM to convert its answer into a single JSON key/value
54-
# - model: ollama/granite-code:8b
55-
model: ollama/granite3.2:8b
57+
model: ${ MODEL }
5658
input: | # 'input' is the prompt
5759
Generate the final answer from the conclusion of this text as JSON with a single key named answer.
5860
${ LLM_FULL_ANSWER }

examples/gsm8k/gsm8k.pdl

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env pdl
2+
3+
# Grade School Math https://github.com/openai/grade-school-math is an
4+
# open source AI dataset from 2021.
5+
#
6+
# https://github.com/openai/grade-school-math/blob/master/grade_school_math/data/test.jsonl
7+
# is a file with 1319 questions and answers.
8+
#
9+
#
10+
11+
description: Grade School Math
12+
defs:
13+
# The Grade School Math Dataset
14+
ALL_TESTS:
15+
read: ./test.jsonl
16+
parser: jsonl
17+
18+
# How many problems to evaluate. The entire dataset is 1319 problems.
19+
# MAX_ITERATIONS: 1319
20+
MAX_ITERATIONS: 10
21+
# Which model to use
22+
# MODEL: ollama/granite-code:8b
23+
# MODEL: ollama/granite3.2:8b
24+
MODEL: watsonx/ibm/granite-3-2-8b-instruct
25+
26+
# PDL variables that hold statistics
27+
SUCCESSES: 0
28+
FAILURES: 0
29+
TESTS: ${ ALL_TESTS[:MAX_ITERATIONS] }
30+
SOLUTIONS:
31+
defs:
32+
stats:
33+
function:
34+
r1: { success: integer, text: string}
35+
r2: { success: integer, text: string}
36+
return:
37+
data:
38+
success: ${ r1.success + r2.success }
39+
text: ${ r1.text + "\n\n" + r2.text }
40+
for:
41+
TEST: ${ TESTS }
42+
# maxWorkers: 10
43+
map:
44+
defs:
45+
# First phase: ask LLM the Grade School Math questions
46+
LLM_FULL_ANSWER:
47+
model: ${ MODEL }
48+
# First, get LLM to answer the question
49+
input: |
50+
Question: ${ TEST.question }
51+
Answer:
52+
# Second phase: get LLM to convert its answer into a single JSON key/value
53+
SIMPLIFIED_LLM_ANSWER:
54+
model: ${ MODEL }
55+
input: |
56+
Generate the final answer from the conclusion of this text as JSON with a single key named answer.
57+
${ LLM_FULL_ANSWER }
58+
# Third phase: Compare with Grade School Math ground truth
59+
lastOf:
60+
# Convert the JSON string to JSON. (We do this in a separate step so
61+
# we have access to the original for debugging.)
62+
- data: ${ SIMPLIFIED_LLM_ANSWER }
63+
parser: json
64+
def: JSON_SIMPLIFIED_LLM_ANSWER
65+
66+
# Strip any prefix or suffix off the number (dollar signs, units, etc.)
67+
# and place it in the JSON format { "answer": ... }
68+
- data: ${ JSON_SIMPLIFIED_LLM_ANSWER.answer|string if 'answer' in JSON_SIMPLIFIED_LLM_ANSWER else ("MISSING 'answer' in " + LLM_FULL_ANSWER) }
69+
parser:
70+
regex: "[^0-9]*(?P<answer>[0-9]+).*$"
71+
spec:
72+
answer: string
73+
def: EXTRACTED_SIMPLIFIED_LLM_ANSWER
74+
# (In case the simplified answer did not contain digits.)
75+
- if: ${ EXTRACTED_SIMPLIFIED_LLM_ANSWER == None }
76+
then:
77+
def: EXTRACTED_SIMPLIFIED_LLM_ANSWER
78+
data:
79+
answer: "none"
80+
81+
# Extract the expected answer, which in this test data always follows "#### "
82+
# into { "answer": ... }
83+
- data: ${ TEST.answer }
84+
parser:
85+
regex: "(.|\n)*#### (?P<answer>([0-9])*)\n*"
86+
spec:
87+
answer: string
88+
def: EXTRACTED_GROUND_TRUTH
89+
90+
# Did we get the expected answer?
91+
- if: ${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer == EXTRACTED_GROUND_TRUTH.answer}
92+
then:
93+
object:
94+
success: 1
95+
text: |
96+
LLM got right answer for '${ LLM_FULL_ANSWER }' which was simplified to '${ SIMPLIFIED_LLM_ANSWER }' which was extracted to '${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer }'
97+
else:
98+
object:
99+
success: 0
100+
text: |
101+
WRONG! Wanted ${ EXTRACTED_GROUND_TRUTH.answer } / LLM said '${ LLM_FULL_ANSWER }' which was simplified to '${ SIMPLIFIED_LLM_ANSWER }' which was extracted to '${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer }'
102+
join:
103+
reduce: ${ stats }
104+
text:
105+
Finished, ${ SOLUTIONS.success } successes on ${ MAX_ITERATIONS } tests

src/pdl/pdl_dumper.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ def block_to_dict( # noqa: C901
259259
d["for"] = expr_to_dict(block.for_, json_compatible)
260260
if block.index is not None:
261261
d["index"] = block.index
262+
if block.maxWorkers is not None:
263+
d["maxWorkers"] = expr_to_dict(block.maxWorkers, json_compatible)
262264
d["map"] = block_to_dict(block.map, json_compatible)
263265
if block.maxIterations is not None:
264266
d["maxIterations"] = expr_to_dict(block.maxIterations, json_compatible)

src/pdl/pdl_interpreter.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -193,17 +193,15 @@ def with_role(self: "InterpreterState", role: RoleType) -> "InterpreterState":
193193
return self.model_copy(update={"role": role})
194194

195195
def with_id(self: "InterpreterState", n: str) -> "InterpreterState":
196-
stack = self.id_stack.copy() if self.id_stack is not None else []
197-
stack.append(n)
198-
return self.model_copy(update={"id_stack": stack})
196+
stack = self.id_stack if self.id_stack is not None else []
197+
return self.model_copy(update={"id_stack": stack + [n]})
199198

200199
def with_iter(self: "InterpreterState", i: int) -> "InterpreterState":
201200
return self.with_id(str(i))
202201

203202
def with_pop(self: "InterpreterState") -> "InterpreterState":
204-
stack = self.id_stack.copy() if self.id_stack is not None else []
205-
stack.pop()
206-
return self.model_copy(update={"id_stack": stack})
203+
stack = self.id_stack if self.id_stack is not None else []
204+
return self.model_copy(update={"id_stack": stack[:-1]})
207205

208206

209207
class ClosureBlock(FunctionBlock):
@@ -961,7 +959,6 @@ def process_block_body(
961959
block, max_iterations = _evaluate_max_iterations_field(scope, block, loc)
962960
block = _evaluate_join_field(scope, block, loc)
963961
map_loc = append(loc, "map")
964-
iidx = 0
965962
try:
966963
if max_iterations is not None:
967964
index_iterator: Any = range(max_iterations)

tests/test_examples_run.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ check:
1313
skip:
1414
- examples/demos/react.pdl
1515
- examples/cldk/cldk-assistant.pdl
16-
- examples/gsm8k/gsm8.pdl
16+
- examples/gsm8k/gsm8k.pdl
17+
- examples/gsm8k/gsm8k-loop-fission.pdl
1718
- examples/gsm8k/gsm8k-plan.pdl
1819
- examples/gsm8k/gsm8k-plan-few-shots.pdl
1920
- examples/gsm8k/gsm8k-tot-few-shot.pdl

0 commit comments

Comments
 (0)