Skip to content

tests: map-based gsm8k #1107

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ defs:
# How many problems to evaluate. The entire dataset is 1319 problems.
# MAX_ITERATIONS: 1319
MAX_ITERATIONS: 50
# Which model to use
# MODEL: ollama/granite-code:8b
# MODEL: ollama/granite3.2:8b
MODEL: watsonx/ibm/granite-3-2-8b-instruct

# PDL variables that hold statistics
SUCCESSES: 0
Expand All @@ -29,8 +33,7 @@ text:
TEST: ${ TESTS }
repeat:
# Ask the LLM for the answer
# - model: ollama/granite-code:8b
model: ollama/granite3.2:8b
model: ${ MODEL }
# First, get LLM to answer the question
input: |
Question: ${ TEST.question }
Expand All @@ -51,8 +54,7 @@ text:
LLM_FULL_ANSWER: ${ ALL_LLM_FULL_A }
repeat:
# Next, get LLM to convert its answer into a single JSON key/value
# - model: ollama/granite-code:8b
model: ollama/granite3.2:8b
model: ${ MODEL }
input: | # 'input' is the prompt
Generate the final answer from the conclusion of this text as JSON with a single key named answer.
${ LLM_FULL_ANSWER }
Expand Down
105 changes: 105 additions & 0 deletions examples/gsm8k/gsm8k.pdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env pdl

# Grade School Math https://github.com/openai/grade-school-math is an
# open source AI dataset from 2021.
#
# https://github.com/openai/grade-school-math/blob/master/grade_school_math/data/test.jsonl
# is a file with 1319 questions and answers.
#
#

description: Grade School Math
defs:
# The Grade School Math Dataset
ALL_TESTS:
read: ./test.jsonl
parser: jsonl

# How many problems to evaluate. The entire dataset is 1319 problems.
# MAX_ITERATIONS: 1319
MAX_ITERATIONS: 5
# Which model to use
# MODEL: ollama/granite-code:8b
# MODEL: ollama/granite3.2:8b
MODEL: watsonx/ibm/granite-3-2-8b-instruct

# PDL variables that hold statistics
SUCCESSES: 0
FAILURES: 0
TESTS: ${ ALL_TESTS[:MAX_ITERATIONS] }
lastOf:
- def: SOLUTIONS
contribute: []
defs:
stats:
function:
r1: { success: integer, text: string}
r2: { success: integer, text: string}
return:
data:
success: ${ r1.success + r2.success }
text: ${ r1.text + "\n\n" + r2.text }
for:
TEST: ${ TESTS }
maxWorkers: 5
map:
lastOf:
# First phase: ask LLM the Grade School Math questions
- def: LLM_FULL_ANSWER
model: ${ MODEL }
input: |
Question: ${ TEST.question }
Answer:
# Next, get LLM to convert its answer into a single JSON key/value
- def: SIMPLIFIED_LLM_ANSWER
model: ${ MODEL }
input: |
Generate the final answer from the conclusion of this text as JSON with a single key named answer.
${ LLM_FULL_ANSWER }
# Third phase: Compare with Grade School Math ground truth
- lastOf:
# Parse the JSON string into a JSON object. (We do this in a separate step
# so we still have access to the original string for debugging.)
- def: JSON_SIMPLIFIED_LLM_ANSWER
data: ${ SIMPLIFIED_LLM_ANSWER }
parser: json

# Strip any prefix or suffix from the number (dollar signs, units, etc.)
# and place it in the JSON format { "answer": ... }
- def: EXTRACTED_SIMPLIFIED_LLM_ANSWER
data: ${ JSON_SIMPLIFIED_LLM_ANSWER.answer|string if 'answer' in JSON_SIMPLIFIED_LLM_ANSWER else ("MISSING 'answer' in " + LLM_FULL_ANSWER) }
parser:
regex: "[^0-9]*(?P<answer>[0-9]+).*$"
spec:
answer: string
# (In case the simplified answer did not contain digits.)
- if: ${ EXTRACTED_SIMPLIFIED_LLM_ANSWER == None }
then:
def: EXTRACTED_SIMPLIFIED_LLM_ANSWER
data:
answer: "none"

# Extract the expected answer (which in this test data always follows "#### ")
# into { "answer": ... }
- data: ${ TEST.answer }
parser:
regex: "(.|\n)*#### (?P<answer>([0-9])*)\n*"
spec:
answer: string
def: EXTRACTED_GROUND_TRUTH

# Did we get the expected answer?
# Each branch yields an object { success, text }: `success` is 1 or 0 so the
# `stats` reducer can sum it, and `text` is a human-readable trace of how the
# LLM output was simplified and extracted.
- if: ${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer == EXTRACTED_GROUND_TRUTH.answer}
  then:
    object:
      success: 1
      text: |
        LLM got right answer for '${ LLM_FULL_ANSWER }' which was simplified to '${ SIMPLIFIED_LLM_ANSWER }' which was extracted to '${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer }'
  else:
    object:
      success: 0
      # (A stray "}" after the interpolation used to be printed literally here.)
      text: |
        WRONG! Wanted ${ EXTRACTED_GROUND_TRUTH.answer } / LLM said '${ LLM_FULL_ANSWER }' which was simplified to '${ SIMPLIFIED_LLM_ANSWER }' which was extracted to '${ EXTRACTED_SIMPLIFIED_LLM_ANSWER.answer }'
join:
reduce: ${ stats }
- Finished, ${ SOLUTIONS.success } successes on ${ MAX_ITERATIONS } tests
2 changes: 2 additions & 0 deletions src/pdl/pdl_dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ def block_to_dict( # noqa: C901
d["for"] = expr_to_dict(block.for_, json_compatible)
if block.index is not None:
d["index"] = block.index
if block.maxWorkers is not None:
d["maxWorkers"] = expr_to_dict(block.maxWorkers, json_compatible)
d["map"] = block_to_dict(block.map, json_compatible)
if block.maxIterations is not None:
d["maxIterations"] = expr_to_dict(block.maxIterations, json_compatible)
Expand Down
11 changes: 4 additions & 7 deletions src/pdl/pdl_interpreter.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,15 @@ def with_role(self: "InterpreterState", role: RoleType) -> "InterpreterState":
return self.model_copy(update={"role": role})

def with_id(self: "InterpreterState", n: str) -> "InterpreterState":
stack = self.id_stack.copy() if self.id_stack is not None else []
stack.append(n)
return self.model_copy(update={"id_stack": stack})
stack = self.id_stack if self.id_stack is not None else []
return self.model_copy(update={"id_stack": stack + [n]})

def with_iter(self: "InterpreterState", i: int) -> "InterpreterState":
return self.with_id(str(i))

def with_pop(self: "InterpreterState") -> "InterpreterState":
stack = self.id_stack.copy() if self.id_stack is not None else []
stack.pop()
return self.model_copy(update={"id_stack": stack})
stack = self.id_stack if self.id_stack is not None else []
return self.model_copy(update={"id_stack": stack[:-1]})


class ClosureBlock(FunctionBlock):
Expand Down Expand Up @@ -961,7 +959,6 @@ def process_block_body(
block, max_iterations = _evaluate_max_iterations_field(scope, block, loc)
block = _evaluate_join_field(scope, block, loc)
map_loc = append(loc, "map")
iidx = 0
try:
if max_iterations is not None:
index_iterator: Any = range(max_iterations)
Expand Down
Loading