diff --git a/docs/autopdl.md b/docs/autopdl.md
index c3b6db7df..3019044a4 100644
--- a/docs/autopdl.md
+++ b/docs/autopdl.md
@@ -127,7 +127,7 @@ Finally, we can run the example like so:
 
 ``` { .bash .copy .annotate linenums="1" }
 cd examples/optimizer
-python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified gsm8k.pdl
+python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified
 ```
 
 This will report details about the optimization process, such as the number of candidates evaluated. The output will look something like this:
diff --git a/examples/optimizer/bea19.pdl b/examples/optimizer/bea19.pdl
new file mode 100644
index 000000000..291bab410
--- /dev/null
+++ b/examples/optimizer/bea19.pdl
@@ -0,0 +1,17 @@
+lastOf:
+  - "Here are examples of grammatically incorrect sentences and their corrected versions:\n\n"
+  - for:
+      example: ${ demonstrations }
+    repeat:
+      text: "${ example.broken } -> ${ example.sentence }"
+    join:
+      with: "\n\n"
+  - "Correct the following sentence:\n\n${ broken }\nHere's the corrected sentence:\n\n"
+  - model: ${ model }
+    parameters:
+      max_tokens: 1024
+      temperature: 0
+      stop:
+        - "<|endoftext|>"
+        - "Question:"
+      include_stop_sequence: false
\ No newline at end of file
diff --git a/examples/optimizer/bea19_example.yml b/examples/optimizer/bea19_example.yml
new file mode 100644
index 000000000..7698bb6af
--- /dev/null
+++ b/examples/optimizer/bea19_example.yml
@@ -0,0 +1,37 @@
+pdl_path: examples/optimizer/bea19.pdl # Path to the PDL file to optimize
+# benchmark: gretel-math # Name our benchmark
+dataset:
+  train: bea19_jsonl/train.jsonl # Path to the training split in JSONL format
+  test: bea19_jsonl/test.jsonl # Path to the test split in JSONL format
+  validation: bea19_jsonl/validation.jsonl # Path to the validation split in JSONL format
+
+demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
+demonstration_columns:
+  - broken # column name for the question in the dataset
+  - sentence # column name for the answer in the dataset
+
+instance_columns:
+  - broken # column name for the question in the dataset
+
+groundtruth_column: sentence # column name for the ground truth in the dataset
+
+eval_pdl: examples/optimizer/eval_levenshtein.pdl # Path to the PDL file for evaluation
+
+budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
+budget_growth: double # double validation set size each iteration
+# or to_max: reach max_test_set_size by final iteration
+initial_test_set_size: 1 # size of test set in first iteration
+max_test_set_size: 1 # maximum test set size
+num_candidates: 100 # how many candidates to evaluate
+parallelism: 1 # how many threads to run evaluations across
+shuffle_test: false # shuffling of test set
+test_set_name: test # name of test set
+train_set_name: train # name of train set
+validation_set_name: validation # name of validation set
+variables: # define discrete options to sample from
+  model: # set ${ model } variable
+    - watsonx/meta-llama/llama-3-2-3b-instruct
+  num_demonstrations: # overrides num demonstrations above
+    - 0
+    - 3
+    - 5
\ No newline at end of file
"2h" +budget_growth: double # double validation set size each iteration +# or to_max: reach max_test_set_size by final iteration +initial_test_set_size: 1 # size of test set in first iteration +max_test_set_size: 1 # maximum test set size +num_candidates: 100 # how many candidates to evaluate +parallelism: 1 # how many threads to run evaluations across +shuffle_test: false # shuffling of test set +test_set_name: test # name of test set +train_set_name: train # name of train set +validation_set_name: validation # name of validation set +variables: # define discrete options to sample from + model: # set ${ model } variable + - watsonx/meta-llama/llama-3-2-3b-instruct + num_demonstrations: # overrides num demonstrations above + - 0 + - 3 + - 5 \ No newline at end of file diff --git a/examples/optimizer/eval_levenshtein.pdl b/examples/optimizer/eval_levenshtein.pdl new file mode 100644 index 000000000..c7c0ef534 --- /dev/null +++ b/examples/optimizer/eval_levenshtein.pdl @@ -0,0 +1,11 @@ +defs: + score: + function: + document: string + ground_truth: string + return: + lang: python + fallback: 0 + code: | + import textdistance + result = textdistance.levenshtein.normalized_similarity(document, ground_truth) \ No newline at end of file diff --git a/examples/optimizer/fever_evaluator.py b/examples/optimizer/fever_evaluator.py index 7bd056dcc..d9465199f 100644 --- a/examples/optimizer/fever_evaluator.py +++ b/examples/optimizer/fever_evaluator.py @@ -82,5 +82,12 @@ def extract_answer(self, document: str) -> bool | None: return None - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - return answer == truth or document.lower().endswith(str(truth).lower()) + def score(self, document: str, ground_truth: Any) -> float: + answer = self.extract_answer(document) + if answer is None: + return 0.0 + + return float( + answer == ground_truth + or document.lower().endswith(str(ground_truth).lower()) + ) diff --git a/examples/optimizer/gsm8k_evaluator.py b/examples/optimizer/gsm8k_evaluator.py index d6c7c1858..8288895b9 100644 --- a/examples/optimizer/gsm8k_evaluator.py +++ b/examples/optimizer/gsm8k_evaluator.py @@ -64,8 +64,6 @@ def get_scope(self) -> ScopeType: scope["reasoning"] = self.example["reasoning"] return empty_scope | scope - def extract_answer(self, document: str) -> Any: - return extract_math_answer(document) - - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - return answer == truth or document.endswith(f" {truth}") + def score(self, document: str, ground_truth: Any) -> float: + answer = extract_math_answer(document) + return float(answer == ground_truth or document.endswith(f" {ground_truth}")) diff --git a/examples/optimizer/gsm8k_optimizer_config.yml b/examples/optimizer/gsm8k_optimizer_config.yml index 49da371ff..822b43217 100644 --- a/examples/optimizer/gsm8k_optimizer_config.yml +++ b/examples/optimizer/gsm8k_optimizer_config.yml @@ -1,4 +1,5 @@ -benchmark: gsm8k # Name our benchmark +pdl_path: gsm8k.pdl # Path to the PDL file to optimize +dataset: gsm8k # Name our benchmark budget: null # Set a budget, can be number of iterations, or a duration string e.g. 
"2h" budget_growth: double # double validation set size each iteration # or to_max: reach max_test_set_size by final iteration @@ -12,6 +13,21 @@ test_set_name: test # name of test set train_set_name: train # name of train set validation_set_name: validation # name of validation set demonstrations_variable_name: demonstrations # variable name to insert demonstrations into +demonstration_columns: + - question # column name for the question in the dataset + - reasoning + - answer + - traj_keys + - traj_values + - rewoo_traj_keys + - rewoo_traj_values + +instance_columns: + - question + - reasoning + +groundtruth_column: answer # column name for the ground truth in the dataset + variables: # define discrete options to sample from model: # set ${ model } variable - watsonx/meta-llama/llama-3-2-3b-instruct diff --git a/examples/optimizer/gsmhard_evaluator.py b/examples/optimizer/gsmhard_evaluator.py index a9bf5a641..18636db00 100644 --- a/examples/optimizer/gsmhard_evaluator.py +++ b/examples/optimizer/gsmhard_evaluator.py @@ -6,12 +6,12 @@ from pdl.pdl_interpreter import empty_scope -def is_float(s: str) -> str: +def is_float(s: str | float) -> str: try: f = float(s) return f"{f:.2f}" except Exception: - return s + return str(s) class GsmHardEvaluator(OptimizerEvaluator): @@ -74,10 +74,16 @@ def get_scope(self) -> ScopeType: scope["question"] = self.example["input"] return empty_scope | scope - def extract_answer(self, document: str) -> float | int | None: - return extract_math_answer(document) + def score(self, document: str, ground_truth: Any) -> float: + answer = extract_math_answer(document) + if answer is None: + return 0.0 - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: answerf = is_float(answer) - truthf = is_float(truth) - return answer == truth or answerf == truthf or document.endswith(f" {truth}") + truthf = is_float(ground_truth) + + return float( + answer == ground_truth + or answerf == truthf + or document.endswith(f" {ground_truth}") + ) diff --git a/examples/optimizer/mbpp_evaluator.py b/examples/optimizer/mbpp_evaluator.py index 4402acefd..e14151c54 100644 --- a/examples/optimizer/mbpp_evaluator.py +++ b/examples/optimizer/mbpp_evaluator.py @@ -65,9 +65,10 @@ def extract_answer(self, document: str) -> str: solution = solution.split("```")[1] return solution.strip() - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - if answer is None or not isinstance(answer, str): - return False + def score(self, document: str, ground_truth: Any) -> float: + answer = self.extract_answer(document) + if not answer: + return 0.0 retry_parse = False try: @@ -78,16 +79,16 @@ def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: if retry_parse: pattern = r"```(?:python)?\n(.*?)\n```" - match = re.search(pattern, answer, re.DOTALL) + match = re.search(pattern, document, re.DOTALL) if match: answer = match.group(1) try: ast.parse(answer) except Exception as e: print(e) - return False + return 0.0 else: - return False + return 0.0 task_id = self.example["task_id"] @@ -109,4 +110,4 @@ def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: base_stat, _ = result["base"] plus_stat, _ = result["plus"] - return base_stat == "pass" and plus_stat == "pass" + return float(base_stat == "pass" and plus_stat == "pass") diff --git a/examples/optimizer/optimize.py b/examples/optimizer/optimize.py index 24307fa26..a2b5190d8 100644 --- a/examples/optimizer/optimize.py +++ b/examples/optimizer/optimize.py @@ -5,14 +5,16 @@ from 
diff --git a/examples/optimizer/process_bea19.py b/examples/optimizer/process_bea19.py
new file mode 100644
index 000000000..3a01b3e61
--- /dev/null
+++ b/examples/optimizer/process_bea19.py
@@ -0,0 +1,33 @@
+import json
+from pathlib import Path
+
+from datasets.dataset_dict import DatasetDict
+from datasets.load import load_dataset
+
+# Load dataset
+bea19 = load_dataset("juancavallotti/bea-19-corruption")
+if not isinstance(bea19, DatasetDict):
+    raise TypeError(f"Expected bea19 to be a DatasetDict, but got: {type(bea19)}")
+
+# Create validation split from train (1024 examples)
+new_split = bea19["train"].train_test_split(test_size=1024)
+bea19["test"] = new_split["test"]
+
+val_split = new_split["train"].train_test_split()
+bea19["train"] = val_split["train"]
+bea19["validation"] = val_split["test"]
+
+# Output dir
+out_dir = Path("bea19_jsonl")
+out_dir.mkdir(parents=True, exist_ok=True)
+
+
+# Save to JSONL
+def save_jsonl(dataset, path: Path) -> None:
+    with path.open("w") as f:
+        for item in dataset:
+            f.write(json.dumps(item) + "\n")
+
+
+for split in ["train", "validation", "test"]:
+    save_jsonl(bea19[split], out_dir / f"{split}.jsonl")
new_split["train"].train_test_split() +bea19["train"] = val_split["train"] +bea19["validation"] = val_split["test"] + +# Output dir +out_dir = Path("bea19_jsonl") +out_dir.mkdir(parents=True, exist_ok=True) + + +# Save to JSONL +def save_jsonl(dataset, path: Path) -> None: + with path.open("w") as f: + for item in dataset: + f.write(json.dumps(item) + "\n") + + +for split in ["train", "validation", "test"]: + save_jsonl(bea19[split], out_dir / f"{split}.jsonl") diff --git a/src/pdl/optimize/config_parser.py b/src/pdl/optimize/config_parser.py index 27b85510b..ec449ee1c 100644 --- a/src/pdl/optimize/config_parser.py +++ b/src/pdl/optimize/config_parser.py @@ -3,11 +3,28 @@ from pydantic import BaseModel, Field +class JsonlDataset(BaseModel): + train: str = Field(..., description="Path to the training dataset in JSONL format") + test: str = Field(..., description="Path to the test dataset in JSONL format") + validation: str = Field( + ..., description="Path to the validation dataset in JSONL format" + ) + + class OptimizationConfig(BaseModel): - benchmark: str = Field() + pdl_path: str = Field(..., description="Path to the PDL file to optimize") + dataset: str | JsonlDataset = Field() + demonstrations_variable_name: str = Field(default="demonstrations") + demonstration_columns: list[str] = Field() + instance_columns: list[str] = Field() + groundtruth_column: str | None = Field() + eval_pdl: str | None = Field( + default=None, description="Path to the PDL file used for evaluation" + ) num_candidates: int = Field(default=30) - num_demonstrations: int = Field(default=5) - initial_test_set_size: int = Field(default=10) + num_demonstrations: int | None = Field(default=None) + initial_validation_set_size: int = Field(default=10) + max_validation_set_size: int = Field(default=1000) max_test_set_size: int = Field(default=1000) timeout: int = Field(default=120) budget_growth: Literal["double", "to_max"] = Field(default="double") @@ -17,7 +34,6 @@ class OptimizationConfig(BaseModel): train_set_name: str = Field(default="train") test_set_name: str = Field(default="test") validation_set_name: str = Field(default="validation") - demonstrations_variable_name: str = Field(default="demonstrations") variables: dict[str, list] = Field(default={}) experiment_prefix: str = Field(default="") diff --git a/src/pdl/optimize/optimizer_evaluator.py b/src/pdl/optimize/optimizer_evaluator.py index 4e809c848..5898087e7 100644 --- a/src/pdl/optimize/optimizer_evaluator.py +++ b/src/pdl/optimize/optimizer_evaluator.py @@ -45,10 +45,7 @@ def __init__( def get_scope(self) -> ScopeType: raise NotImplementedError - def extract_answer(self, document: str) -> Any: - raise NotImplementedError - - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: + def score(self, document: str, ground_truth: Any) -> float: raise NotImplementedError def run( # type: ignore # noqa: C901 @@ -58,7 +55,6 @@ def run( # type: ignore # noqa: C901 answer = None exception: PDLParseError | PDLRuntimeError | Exception | bool | None = None result = None - match = False truth = self.example[self.answer_key] scope: PdlDict = PdlDict({}) @@ -68,6 +64,7 @@ def run( # type: ignore # noqa: C901 end_time = None total_tokens = -1 errored = False + score = 0.0 while retry: if tries > 1: console.log("RETRYING! 
", tries) @@ -93,6 +90,12 @@ def run( # type: ignore # noqa: C901 if isinstance(document, str): document = document.strip() + if document: + errored = False + retry = False + else: + console.log("Empty document returned, retrying...") + answer = document else: raise TypeError( f"Expected document to be a string, got {type(document)}", @@ -102,24 +105,10 @@ def run( # type: ignore # noqa: C901 runtime = end_time - start_time console.log(f"Runtime took seconds: {runtime:.2f}") - errored = False - if errored: - console.log("PDL error occured.") - else: - answer = self.extract_answer(document) - - if answer is None: - last_line = document.splitlines()[-1] - console.log("Couldn't extract answer: ", last_line) - - if answer is None or errored: - retry = True - - if answer is not None and not errored: - retry = False - if tries >= RETRY_COUNT: retry = False + + score = float(self.score(document, truth)) except PDLParseError as exc: console.print_exception(show_locals=False) errored = True @@ -160,11 +149,9 @@ def run( # type: ignore # noqa: C901 if errored and not exception: exception = errored - match = self.answer_correct(document, answer, truth) - return TrialOutput( pdl_program=self.pdl_program, - correct=match, + score=score, exception=exception, scope=scope, pdl_result=result, diff --git a/src/pdl/optimize/pdl_evaluator.py b/src/pdl/optimize/pdl_evaluator.py new file mode 100644 index 000000000..f1f9eeeb3 --- /dev/null +++ b/src/pdl/optimize/pdl_evaluator.py @@ -0,0 +1,57 @@ +from typing import Any + +from pdl.optimize.optimizer_evaluator import OptimizerEvaluator +from pdl.pdl import exec_str +from pdl.pdl_ast import ScopeType +from pdl.pdl_interpreter import empty_scope + + +class PdlEvaluator(OptimizerEvaluator): + def __init__( + self, + # scoring_pdl: str, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.scoring_pdl = self.config.eval_pdl + if self.config.groundtruth_column is None: + raise ValueError("Groundtruth column must be specified") + self.answer_key = self.config.groundtruth_column + + def get_scope(self) -> ScopeType: + demo_var = self.config.demonstrations_variable_name + + scope = {} + + for k in self.config.variables: + if k in self.candidate: + scope[k] = self.candidate[k] + + scope[demo_var] = [ + {k: q[k] for k in self.config.demonstration_columns} + for q in self.candidate[demo_var] + ] + + for k in self.config.instance_columns: + if k in self.example: + scope[k] = self.example[k] + + return empty_scope | scope + + def score(self, document: str, ground_truth: Any) -> float: + scope = empty_scope | {"document": document, "ground_truth": ground_truth} + prog = f"""defs: + scoring: + import: "{self.scoring_pdl}" +lastOf: + - call: ${{ scoring.score }} + args: + document: ${{ document }} + ground_truth: ${{ ground_truth }}""" + result = exec_str(prog=prog, scope=scope, output="result") + + if isinstance(result, str): + result = result.strip() + # Note: this breaks if the result is not a number + return float(result) diff --git a/src/pdl/optimize/pdl_optimizer.py b/src/pdl/optimize/pdl_optimizer.py index b6be331a7..17b4ed011 100644 --- a/src/pdl/optimize/pdl_optimizer.py +++ b/src/pdl/optimize/pdl_optimizer.py @@ -68,22 +68,22 @@ class PDLOptimizer: # pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-positional-arguments def __init__( self, - pdl_path: Path, dataset: DatasetDict, config: OptimizationConfig, trial_thread: type[OptimizerEvaluator], yield_output: bool, experiment_path: Path, ) -> None: - self.pdl_path = 
diff --git a/src/pdl/optimize/util.py b/src/pdl/optimize/util.py
index aceb0d3ff..d331f49d1 100644
--- a/src/pdl/optimize/util.py
+++ b/src/pdl/optimize/util.py
@@ -20,7 +20,7 @@ class TrialOutput:
     pdl_program: Program
     scope: ScopeType
     runtime: float
-    correct: bool = False
+    score: float = 0.0
     exception: BaseException | bool | None = None
     pdl_result: Any = None
     pdl_document: str = ""
@@ -32,7 +32,7 @@ class TrialOutput:
 
     def to_dict(self) -> dict:
         return {
-            "correct": self.correct,
+            "score": self.score,
             "exception": str(self.exception),
             "pdl_document": self.pdl_document,
             "answer": self.answer,
diff --git a/tests/test_examples_run.yaml b/tests/test_examples_run.yaml
index 6ef48a317..9250488ed 100644
--- a/tests/test_examples_run.yaml
+++ b/tests/test_examples_run.yaml
@@ -39,6 +39,8 @@ skip:
   - examples/optimizer/mbpp.pdl
   - examples/optimizer/fever.pdl
   - examples/optimizer/gsm8k.pdl
+  - examples/optimizer/bea19.pdl
+  - examples/optimizer/eval_levenshtein.pdl
   - examples/requirements/email.pdl
 with_inputs:
   examples/tutorial/programs/chatbot.pdl:
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
index 1df9b0834..4ceef96e0 100644
--- a/tests/test_optimizer.py
+++ b/tests/test_optimizer.py
@@ -17,11 +17,14 @@ def test_gsm8k_cot():
     pattern = "cot"
     num_demonstrations = 3
     config = OptimizationConfig(
-        benchmark="gsm8k",
-        initial_test_set_size=1,
+        pdl_path="tests/data/optimizer_gsm8k.pdl",
+        dataset="gsm8k",
+        demonstration_columns=["question", "reasoning", "answer"],
+        instance_columns=["question", "reasoning"],
+        groundtruth_column="answer",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=5,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -349,7 +352,6 @@ def test_gsm8k_cot():
         },
     )
     optim = PDLOptimizer(
-        pdl_path=Path("tests/data/optimizer_gsm8k.pdl"),
         dataset=gsm8k,
         trial_thread=Gsm8kEvaluator,
         yield_output=True,
@@ -375,11 +377,14 @@ def test_gsm8k_cot():
 
 def run_optimizer_gsm8k(pattern, num_demonstrations=0):
     config = OptimizationConfig(
-        benchmark="gsm8k",
-        initial_test_set_size=1,
+        pdl_path="tests/data/optimizer_gsm8k.pdl",
+        dataset="gsm8k",
+        demonstration_columns=["question", "reasoning", "answer"],
+        instance_columns=["question", "reasoning"],
+        groundtruth_column="answer",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=1,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -708,7 +713,6 @@ def run_optimizer_gsm8k(pattern, num_demonstrations=0):
         },
     )
     optim = PDLOptimizer(
-        pdl_path=Path("examples/optimizer/gsm8k.pdl"),
         dataset=gsm8k,
         trial_thread=Gsm8kEvaluator,
         yield_output=True,
@@ -724,11 +728,22 @@ def run_optimizer_gsm8k(pattern, num_demonstrations=0):
 
 def run_optimizer_fever(pattern, num_demonstrations=0):
     config = OptimizationConfig(
-        benchmark="fever",
-        initial_test_set_size=1,
+        pdl_path="examples/optimizer/fever.pdl",
+        dataset="fever",
+        demonstration_columns=[
+            "question",
+            "reasoning",
+            "reasoning",
+            "traj_keys",
+            "traj_values",
+            "rewoo_traj_keys",
+            "rewoo_traj_values",
+        ],
+        instance_columns=["claim"],
+        groundtruth_column="label",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=1,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -1064,7 +1079,6 @@ def run_optimizer_fever(pattern, num_demonstrations=0):
     )
 
     optim = PDLOptimizer(
-        pdl_path=Path("examples/optimizer/fever.pdl"),
         dataset=fever,  # pyright: ignore
         trial_thread=FEVEREvaluator,
         yield_output=True,
@@ -1080,11 +1094,18 @@ def run_optimizer_fever(pattern, num_demonstrations=0):
 
 def run_optimizer_mbpp(pattern, num_demonstrations=0):
     config = OptimizationConfig(
-        benchmark="mbpp",
-        initial_test_set_size=1,
+        pdl_path="examples/optimizer/mbpp.pdl",
+        dataset="mbpp",
+        demonstration_columns=[
+            "prompt",
+            "traj_keys",
+            "traj_values",
+        ],
+        instance_columns=["claim"],
+        groundtruth_column="canonical_solution",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=1,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -1103,7 +1124,6 @@ def run_optimizer_mbpp(pattern, num_demonstrations=0):
     )
 
     optim = PDLOptimizer(
-        pdl_path=Path("examples/optimizer/mbpp.pdl"),
         dataset=mbpp_dataset,  # pyright: ignore
         trial_thread=MBPPEvaluator,
         yield_output=True,
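
With these changes the new JSONL-backed bea19 example is driven entirely from its config file: the train/validation/test paths come from `bea19_example.yml`, so `--dataset-path` is no longer needed for this case. A rough end-to-end sketch (not part of the patch; it assumes the repository root as the working directory, `textdistance` installed for the Levenshtein scorer, and default values for flags such as `--experiments-path`):

```bash
# Sketch only: paths assume the repository root as the working directory.
pip install textdistance                      # used by eval_levenshtein.pdl's scoring function
python examples/optimizer/process_bea19.py    # writes bea19_jsonl/{train,validation,test}.jsonl
python examples/optimizer/optimize.py optimize --config examples/optimizer/bea19_example.yml
```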