Skip to content

AutoPDL Simplification #1088

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/autopdl.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ Finally, we can run the example like so:

``` { .bash .copy .annotate linenums="1" }
cd examples/optimizer
python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified gsm8k.pdl
python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified
```

This will report details about the optimization process, such as the number of candidates evaluated. The output will look something like this:
Expand Down
17 changes: 17 additions & 0 deletions examples/optimizer/bea19.pdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
lastOf:
- "Here are examples of grammatically incorrect sentences and their corrected versions:\n\n"
- for:
example: ${ demonstrations }
repeat:
text: "${ example.broken } -> ${ example.sentence }"
join:
with: "\n\n"
- "Correct the following sentence:\n\n${ broken }\nHere's the corrected sentence:\n\n"
- model: ${ model }
parameters:
max_tokens: 1024
temperature: 0
stop:
- "<|endoftext|>"
- "Question:"
include_stop_sequence: false
37 changes: 37 additions & 0 deletions examples/optimizer/bea19_example.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
pdl_path: examples/optimizer/bea19.pdl # Path to the PDL file to optimize
# benchmark: gretel-math # Name our benchmark
dataset:
train: bea19_jsonl/train.jsonl # Path to the training split in JSONL format
test: bea19_jsonl/test.jsonl # Path to the test split in JSONL format
validation: bea19_jsonl/validation.jsonl # Path to the validation split in JSONL format

demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
demonstration_columns:
- broken # column holding the grammatically incorrect sentence
- sentence # column holding the corrected sentence

instance_columns:
- broken # column holding the grammatically incorrect sentence to be corrected

groundtruth_column: sentence # column name for the ground truth in the dataset

eval_pdl: examples/optimizer/eval_levenshtein.pdl # Path to the PDL file for evaluation

budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
budget_growth: double # double validation set size each iteration
# or to_max: reach max_test_set_size by final iteration
initial_test_set_size: 1 # size of test set in first iteration
max_test_set_size: 1 # maximum test set size
num_candidates: 100 # how many candidates to evaluate
parallelism: 1 # how many threads to run evaluations across
shuffle_test: false # shuffling of test set
test_set_name: test # name of test set
train_set_name: train # name of train set
validation_set_name: validation # name of validation set
variables: # define discrete options to sample from
model: # set ${ model } variable
- watsonx/meta-llama/llama-3-2-3b-instruct
num_demonstrations: # candidate numbers of demonstrations to sample from
- 0
- 3
- 5
11 changes: 11 additions & 0 deletions examples/optimizer/eval_levenshtein.pdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
defs:
score:
function:
document: string
ground_truth: string
return:
lang: python
fallback: 0
code: |
import textdistance
result = textdistance.levenshtein.normalized_similarity(document, ground_truth)
11 changes: 9 additions & 2 deletions examples/optimizer/fever_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,12 @@ def extract_answer(self, document: str) -> bool | None:

return None

def answer_correct(self, document: str, answer: Any, truth: Any) -> bool:
return answer == truth or document.lower().endswith(str(truth).lower())
def score(self, document: str, ground_truth: Any) -> float:
answer = self.extract_answer(document)
if answer is None:
return 0.0

return float(
answer == ground_truth
or document.lower().endswith(str(ground_truth).lower())
)
8 changes: 3 additions & 5 deletions examples/optimizer/gsm8k_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ def get_scope(self) -> ScopeType:
scope["reasoning"] = self.example["reasoning"]
return empty_scope | scope

def extract_answer(self, document: str) -> Any:
return extract_math_answer(document)

def answer_correct(self, document: str, answer: Any, truth: Any) -> bool:
return answer == truth or document.endswith(f" {truth}")
def score(self, document: str, ground_truth: Any) -> float:
answer = extract_math_answer(document)
return float(answer == ground_truth or document.endswith(f" {ground_truth}"))
18 changes: 17 additions & 1 deletion examples/optimizer/gsm8k_optimizer_config.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
benchmark: gsm8k # Name our benchmark
pdl_path: gsm8k.pdl # Path to the PDL file to optimize
dataset: gsm8k # Name of the benchmark dataset
budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
budget_growth: double # double validation set size each iteration
# or to_max: reach max_test_set_size by final iteration
Expand All @@ -12,6 +13,21 @@ test_set_name: test # name of test set
train_set_name: train # name of train set
validation_set_name: validation # name of validation set
demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
demonstration_columns:
- question # column name for the question in the dataset
- reasoning
- answer
- traj_keys
- traj_values
- rewoo_traj_keys
- rewoo_traj_values

instance_columns:
- question
- reasoning

groundtruth_column: answer # column name for the ground truth in the dataset

variables: # define discrete options to sample from
model: # set ${ model } variable
- watsonx/meta-llama/llama-3-2-3b-instruct
Expand Down
20 changes: 13 additions & 7 deletions examples/optimizer/gsmhard_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from pdl.pdl_interpreter import empty_scope


def is_float(s: str) -> str:
def is_float(s: str | float) -> str:
try:
f = float(s)
return f"{f:.2f}"
except Exception:
return s
return str(s)


class GsmHardEvaluator(OptimizerEvaluator):
Expand Down Expand Up @@ -74,10 +74,16 @@ def get_scope(self) -> ScopeType:
scope["question"] = self.example["input"]
return empty_scope | scope

def extract_answer(self, document: str) -> float | int | None:
return extract_math_answer(document)
def score(self, document: str, ground_truth: Any) -> float:
answer = extract_math_answer(document)
if answer is None:
return 0.0

def answer_correct(self, document: str, answer: Any, truth: Any) -> bool:
answerf = is_float(answer)
truthf = is_float(truth)
return answer == truth or answerf == truthf or document.endswith(f" {truth}")
truthf = is_float(ground_truth)

return float(
answer == ground_truth
or answerf == truthf
or document.endswith(f" {ground_truth}")
)
15 changes: 8 additions & 7 deletions examples/optimizer/mbpp_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,10 @@ def extract_answer(self, document: str) -> str:
solution = solution.split("```")[1]
return solution.strip()

def answer_correct(self, document: str, answer: Any, truth: Any) -> bool:
if answer is None or not isinstance(answer, str):
return False
def score(self, document: str, ground_truth: Any) -> float:
answer = self.extract_answer(document)
if not answer:
return 0.0

retry_parse = False
try:
Expand All @@ -78,16 +79,16 @@ def answer_correct(self, document: str, answer: Any, truth: Any) -> bool:

if retry_parse:
pattern = r"```(?:python)?\n(.*?)\n```"
match = re.search(pattern, answer, re.DOTALL)
match = re.search(pattern, document, re.DOTALL)
if match:
answer = match.group(1)
try:
ast.parse(answer)
except Exception as e:
print(e)
return False
return 0.0
else:
return False
return 0.0

task_id = self.example["task_id"]

Expand All @@ -109,4 +110,4 @@ def answer_correct(self, document: str, answer: Any, truth: Any) -> bool:
base_stat, _ = result["base"]
plus_stat, _ = result["plus"]

return base_stat == "pass" and plus_stat == "pass"
return float(base_stat == "pass" and plus_stat == "pass")
47 changes: 29 additions & 18 deletions examples/optimizer/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
from typing import Any

import yaml
from datasets.load import load_from_disk
from datasets import load_dataset, load_from_disk
from fever_evaluator import FEVEREvaluator
from gsm8k_evaluator import Gsm8kEvaluator
from gsmhard_evaluator import GsmHardEvaluator
from mbpp_dataset import MBPPDataset
from mbpp_evaluator import MBPPEvaluator

from pdl.optimize.config_parser import OptimizationConfig
from pdl.optimize.config_parser import JsonlDataset, OptimizationConfig
from pdl.optimize.optimizer_evaluator import OptimizerEvaluator
from pdl.optimize.pdl_evaluator import PdlEvaluator
from pdl.optimize.pdl_optimizer import PDLOptimizer

if __name__ == "__main__":
Expand All @@ -38,7 +40,7 @@
"--dataset-path",
help="Path to the dataset directory",
type=Path,
required=True,
required=False,
)
common_parser.add_argument(
"--experiments-path",
Expand All @@ -56,11 +58,6 @@
action=argparse.BooleanOptionalAction,
default=False,
)
common_parser.add_argument(
"pdl_file",
type=Path,
help="Path to a PDL file to optimize",
)

# Optimize command
optimize_parser = subparsers.add_parser(
Expand All @@ -82,9 +79,6 @@
)

args = parser.parse_args()
if not args.pdl_file.exists():
print("PDL file doesn't exist:", args.pdl_file)
sys.exit(1)

if not args.config.exists():
print("Config file doesn't exist:", args.config)
Expand All @@ -100,35 +94,52 @@
traceback.print_last()
sys.exit(1)

if not Path(config.pdl_path).exists():
print("PDL file doesn't exist:", config.pdl_path)
sys.exit(1)

if args.dry:
sys.exit(0)

# Set up dataset and trial thread based on benchmark
dataset: Any
TrialThread: type[
Gsm8kEvaluator | GsmHardEvaluator | FEVEREvaluator | MBPPEvaluator
Gsm8kEvaluator
| GsmHardEvaluator
| FEVEREvaluator
| MBPPEvaluator
| OptimizerEvaluator
]

if config.benchmark == "gsm8k":
if config.dataset == "gsm8k":
dataset = load_from_disk(args.dataset_path)
TrialThread = Gsm8kEvaluator
elif config.benchmark == "gsmhard":
elif config.dataset == "gsmhard":
dataset = load_from_disk(args.dataset_path)
TrialThread = GsmHardEvaluator
elif config.benchmark == "fever":
elif config.dataset == "fever":
fever = load_from_disk(args.dataset_path)
dataset = fever
TrialThread = FEVEREvaluator
elif config.benchmark == "mbpp":
elif config.dataset == "mbpp":
dataset = MBPPDataset(args.dataset_path)
TrialThread = MBPPEvaluator
elif isinstance(config.dataset, (dict, JsonlDataset)):
dataset = load_dataset(
"json",
data_files={
"train": config.dataset.train,
"validation": config.dataset.validation,
"test": config.dataset.test,
},
)
TrialThread = PdlEvaluator
else:
print(f"Unknown benchmark: {config.benchmark}")
print(f"Unknown dataset: {config.dataset}")
sys.exit(1)

# Create optimizer instance
optimizer = PDLOptimizer(
pdl_path=args.pdl_file,
dataset=dataset,
trial_thread=TrialThread,
yield_output=args.yield_output,
Expand Down
33 changes: 33 additions & 0 deletions examples/optimizer/process_bea19.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
from pathlib import Path

from datasets.dataset_dict import DatasetDict
from datasets.load import load_dataset

# Load dataset
bea19 = load_dataset("juancavallotti/bea-19-corruption")
if not isinstance(bea19, DatasetDict):
raise TypeError(f"Expected bea19 to be a DatasetDict, but got: {type(bea19)}")

# Hold out 1024 train examples as the test split, then carve a validation split out of the remaining train examples
new_split = bea19["train"].train_test_split(test_size=1024)
bea19["test"] = new_split["test"]

val_split = new_split["train"].train_test_split()
bea19["train"] = val_split["train"]
bea19["validation"] = val_split["test"]

# Output dir
out_dir = Path("bea19_jsonl")
out_dir.mkdir(parents=True, exist_ok=True)


# Save to JSONL
def save_jsonl(dataset, path: Path) -> None:
    """Write every record of *dataset* to *path* in JSON Lines format.

    Args:
        dataset: Iterable of JSON-serializable records (one per output line).
        path: Destination file; it is created or truncated.
    """
    # Pin the encoding so the output does not depend on the platform locale.
    with path.open("w", encoding="utf-8") as f:
        # One JSON document per line, each terminated by a newline.
        f.writelines(json.dumps(item) + "\n" for item in dataset)


for split in ["train", "validation", "test"]:
save_jsonl(bea19[split], out_dir / f"{split}.jsonl")
24 changes: 20 additions & 4 deletions src/pdl/optimize/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,28 @@
from pydantic import BaseModel, Field


# Paths to the train / test / validation splits of a dataset stored as JSONL
# files. Accepted as an alternative to a plain dataset name in the `dataset`
# field of OptimizationConfig. All three paths are required.
class JsonlDataset(BaseModel):
train: str = Field(..., description="Path to the training dataset in JSONL format")
test: str = Field(..., description="Path to the test dataset in JSONL format")
validation: str = Field(
..., description="Path to the validation dataset in JSONL format"
)


class OptimizationConfig(BaseModel):
benchmark: str = Field()
pdl_path: str = Field(..., description="Path to the PDL file to optimize")
dataset: str | JsonlDataset = Field()
demonstrations_variable_name: str = Field(default="demonstrations")
demonstration_columns: list[str] = Field()
instance_columns: list[str] = Field()
groundtruth_column: str | None = Field()
eval_pdl: str | None = Field(
default=None, description="Path to the PDL file used for evaluation"
)
num_candidates: int = Field(default=30)
num_demonstrations: int = Field(default=5)
initial_test_set_size: int = Field(default=10)
num_demonstrations: int | None = Field(default=None)
initial_validation_set_size: int = Field(default=10)
max_validation_set_size: int = Field(default=1000)
max_test_set_size: int = Field(default=1000)
timeout: int = Field(default=120)
budget_growth: Literal["double", "to_max"] = Field(default="double")
Expand All @@ -17,7 +34,6 @@ class OptimizationConfig(BaseModel):
train_set_name: str = Field(default="train")
test_set_name: str = Field(default="test")
validation_set_name: str = Field(default="validation")
demonstrations_variable_name: str = Field(default="demonstrations")
variables: dict[str, list] = Field(default={})
experiment_prefix: str = Field(default="")

Expand Down
Loading