diff --git a/docs/autopdl.md b/docs/autopdl.md
index c3b6db7df..3019044a4 100644
--- a/docs/autopdl.md
+++ b/docs/autopdl.md
@@ -127,7 +127,7 @@ Finally, we can run the example like so:
 
 ``` { .bash .copy .annotate linenums="1" }
 cd examples/optimizer
-python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified gsm8k.pdl
+python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified
 ```
 
 This will report details about the optimization process, such as the number of candidates evaluated. The output will look something like this:
diff --git a/examples/optimizer/bea19.pdl b/examples/optimizer/bea19.pdl
new file mode 100644
index 000000000..291bab410
--- /dev/null
+++ b/examples/optimizer/bea19.pdl
@@ -0,0 +1,17 @@
+lastOf:
+  - "Here are examples of grammatically incorrect sentences and their corrected versions:\n\n"
+  - for:
+      example: ${ demonstrations }
+    repeat:
+      text: "${ example.broken } -> ${ example.sentence }"
+    join:
+      with: "\n\n"
+  - "Correct the following sentence:\n\n${ broken }\nHere's the corrected sentence:\n\n"
+  - model: ${ model }
+    parameters:
+      max_tokens: 1024
+      temperature: 0
+      stop:
+        - "<|endoftext|>"
+        - "Question:"
+      include_stop_sequence: false
\ No newline at end of file
diff --git a/examples/optimizer/bea19_example.yml b/examples/optimizer/bea19_example.yml
new file mode 100644
index 000000000..7698bb6af
--- /dev/null
+++ b/examples/optimizer/bea19_example.yml
@@ -0,0 +1,37 @@
+pdl_path: examples/optimizer/bea19.pdl # Path to the PDL file to optimize
+# benchmark: gretel-math # Name our benchmark
+dataset:
+  train: bea19_jsonl/train.jsonl # Path to the training split in JSONL format
+  test: bea19_jsonl/test.jsonl # Path to the test split in JSONL format
+  validation: bea19_jsonl/validation.jsonl # Path to the validation split in JSONL format
+
+demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
+demonstration_columns:
+  - broken # column name for the question in the dataset
+  - sentence # column name for the answer in the dataset
+
+instance_columns:
+  - broken # column name for the question in the dataset
+
+groundtruth_column: sentence # column name for the ground truth in the dataset
+
+eval_pdl: examples/optimizer/eval_levenshtein.pdl # Path to the PDL file for evaluation
+
+budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
+budget_growth: double # double validation set size each iteration
+# or to_max: reach max_test_set_size by final iteration
+initial_test_set_size: 1 # size of test set in first iteration
+max_test_set_size: 1 # maximum test set size
+num_candidates: 100 # how many candidates to evaluate
+parallelism: 1 # how many threads to run evaluations across
+shuffle_test: false # shuffling of test set
+test_set_name: test # name of test set
+train_set_name: train # name of train set
+validation_set_name: validation # name of validation set
+variables: # define discrete options to sample from
+  model: # set ${ model } variable
+    - watsonx/meta-llama/llama-3-2-3b-instruct
+  num_demonstrations: # overrides num demonstrations above
+    - 0
+    - 3
+    - 5
\ No newline at end of file
"2h" +budget_growth: double # double validation set size each iteration +# or to_max: reach max_test_set_size by final iteration +initial_test_set_size: 1 # size of test set in first iteration +max_test_set_size: 1 # maximum test set size +num_candidates: 100 # how many candidates to evaluate +parallelism: 1 # how many threads to run evaluations across +shuffle_test: false # shuffling of test set +test_set_name: test # name of test set +train_set_name: train # name of train set +validation_set_name: validation # name of validation set +variables: # define discrete options to sample from + model: # set ${ model } variable + - watsonx/meta-llama/llama-3-2-3b-instruct + num_demonstrations: # overrides num demonstrations above + - 0 + - 3 + - 5 \ No newline at end of file diff --git a/examples/optimizer/eval_levenshtein.pdl b/examples/optimizer/eval_levenshtein.pdl new file mode 100644 index 000000000..c7c0ef534 --- /dev/null +++ b/examples/optimizer/eval_levenshtein.pdl @@ -0,0 +1,11 @@ +defs: + score: + function: + document: string + ground_truth: string + return: + lang: python + fallback: 0 + code: | + import textdistance + result = textdistance.levenshtein.normalized_similarity(document, ground_truth) \ No newline at end of file diff --git a/examples/optimizer/fever_evaluator.py b/examples/optimizer/fever_evaluator.py index 7bd056dcc..d9465199f 100644 --- a/examples/optimizer/fever_evaluator.py +++ b/examples/optimizer/fever_evaluator.py @@ -82,5 +82,12 @@ def extract_answer(self, document: str) -> bool | None: return None - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - return answer == truth or document.lower().endswith(str(truth).lower()) + def score(self, document: str, ground_truth: Any) -> float: + answer = self.extract_answer(document) + if answer is None: + return 0.0 + + return float( + answer == ground_truth + or document.lower().endswith(str(ground_truth).lower()) + ) diff --git a/examples/optimizer/gsm8k_evaluator.py b/examples/optimizer/gsm8k_evaluator.py index d6c7c1858..8288895b9 100644 --- a/examples/optimizer/gsm8k_evaluator.py +++ b/examples/optimizer/gsm8k_evaluator.py @@ -64,8 +64,6 @@ def get_scope(self) -> ScopeType: scope["reasoning"] = self.example["reasoning"] return empty_scope | scope - def extract_answer(self, document: str) -> Any: - return extract_math_answer(document) - - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - return answer == truth or document.endswith(f" {truth}") + def score(self, document: str, ground_truth: Any) -> float: + answer = extract_math_answer(document) + return float(answer == ground_truth or document.endswith(f" {ground_truth}")) diff --git a/examples/optimizer/gsm8k_optimizer_config.yml b/examples/optimizer/gsm8k_optimizer_config.yml index 49da371ff..822b43217 100644 --- a/examples/optimizer/gsm8k_optimizer_config.yml +++ b/examples/optimizer/gsm8k_optimizer_config.yml @@ -1,4 +1,5 @@ -benchmark: gsm8k # Name our benchmark +pdl_path: gsm8k.pdl # Path to the PDL file to optimize +dataset: gsm8k # Name our benchmark budget: null # Set a budget, can be number of iterations, or a duration string e.g. 
"2h" budget_growth: double # double validation set size each iteration # or to_max: reach max_test_set_size by final iteration @@ -12,6 +13,21 @@ test_set_name: test # name of test set train_set_name: train # name of train set validation_set_name: validation # name of validation set demonstrations_variable_name: demonstrations # variable name to insert demonstrations into +demonstration_columns: + - question # column name for the question in the dataset + - reasoning + - answer + - traj_keys + - traj_values + - rewoo_traj_keys + - rewoo_traj_values + +instance_columns: + - question + - reasoning + +groundtruth_column: answer # column name for the ground truth in the dataset + variables: # define discrete options to sample from model: # set ${ model } variable - watsonx/meta-llama/llama-3-2-3b-instruct diff --git a/examples/optimizer/gsmhard_evaluator.py b/examples/optimizer/gsmhard_evaluator.py index a9bf5a641..18636db00 100644 --- a/examples/optimizer/gsmhard_evaluator.py +++ b/examples/optimizer/gsmhard_evaluator.py @@ -6,12 +6,12 @@ from pdl.pdl_interpreter import empty_scope -def is_float(s: str) -> str: +def is_float(s: str | float) -> str: try: f = float(s) return f"{f:.2f}" except Exception: - return s + return str(s) class GsmHardEvaluator(OptimizerEvaluator): @@ -74,10 +74,16 @@ def get_scope(self) -> ScopeType: scope["question"] = self.example["input"] return empty_scope | scope - def extract_answer(self, document: str) -> float | int | None: - return extract_math_answer(document) + def score(self, document: str, ground_truth: Any) -> float: + answer = extract_math_answer(document) + if answer is None: + return 0.0 - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: answerf = is_float(answer) - truthf = is_float(truth) - return answer == truth or answerf == truthf or document.endswith(f" {truth}") + truthf = is_float(ground_truth) + + return float( + answer == ground_truth + or answerf == truthf + or document.endswith(f" {ground_truth}") + ) diff --git a/examples/optimizer/mbpp_evaluator.py b/examples/optimizer/mbpp_evaluator.py index 4402acefd..e14151c54 100644 --- a/examples/optimizer/mbpp_evaluator.py +++ b/examples/optimizer/mbpp_evaluator.py @@ -65,9 +65,10 @@ def extract_answer(self, document: str) -> str: solution = solution.split("```")[1] return solution.strip() - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: - if answer is None or not isinstance(answer, str): - return False + def score(self, document: str, ground_truth: Any) -> float: + answer = self.extract_answer(document) + if not answer: + return 0.0 retry_parse = False try: @@ -78,16 +79,16 @@ def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: if retry_parse: pattern = r"```(?:python)?\n(.*?)\n```" - match = re.search(pattern, answer, re.DOTALL) + match = re.search(pattern, document, re.DOTALL) if match: answer = match.group(1) try: ast.parse(answer) except Exception as e: print(e) - return False + return 0.0 else: - return False + return 0.0 task_id = self.example["task_id"] @@ -109,4 +110,4 @@ def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: base_stat, _ = result["base"] plus_stat, _ = result["plus"] - return base_stat == "pass" and plus_stat == "pass" + return float(base_stat == "pass" and plus_stat == "pass") diff --git a/examples/optimizer/optimize.py b/examples/optimizer/optimize.py index 24307fa26..a2b5190d8 100644 --- a/examples/optimizer/optimize.py +++ b/examples/optimizer/optimize.py @@ -5,14 +5,16 @@ from 
diff --git a/examples/optimizer/process_bea19.py b/examples/optimizer/process_bea19.py
new file mode 100644
index 000000000..3a01b3e61
--- /dev/null
+++ b/examples/optimizer/process_bea19.py
@@ -0,0 +1,33 @@
+import json
+from pathlib import Path
+
+from datasets.dataset_dict import DatasetDict
+from datasets.load import load_dataset
+
+# Load dataset
+bea19 = load_dataset("juancavallotti/bea-19-corruption")
+if not isinstance(bea19, DatasetDict):
+    raise TypeError(f"Expected bea19 to be a DatasetDict, but got: {type(bea19)}")
+
+# Create validation split from train (1024 examples)
+new_split = bea19["train"].train_test_split(test_size=1024)
+bea19["test"] = new_split["test"]
+
+val_split = new_split["train"].train_test_split()
+bea19["train"] = val_split["train"]
+bea19["validation"] = val_split["test"]
+
+# Output dir
+out_dir = Path("bea19_jsonl")
+out_dir.mkdir(parents=True, exist_ok=True)
+
+
+# Save to JSONL
+def save_jsonl(dataset, path: Path) -> None:
+    with path.open("w") as f:
+        for item in dataset:
+            f.write(json.dumps(item) + "\n")
+
+
+for split in ["train", "validation", "test"]:
+    save_jsonl(bea19[split], out_dir / f"{split}.jsonl")
new_split["train"].train_test_split() +bea19["train"] = val_split["train"] +bea19["validation"] = val_split["test"] + +# Output dir +out_dir = Path("bea19_jsonl") +out_dir.mkdir(parents=True, exist_ok=True) + + +# Save to JSONL +def save_jsonl(dataset, path: Path) -> None: + with path.open("w") as f: + for item in dataset: + f.write(json.dumps(item) + "\n") + + +for split in ["train", "validation", "test"]: + save_jsonl(bea19[split], out_dir / f"{split}.jsonl") diff --git a/src/pdl/optimize/config_parser.py b/src/pdl/optimize/config_parser.py index 27b85510b..ec449ee1c 100644 --- a/src/pdl/optimize/config_parser.py +++ b/src/pdl/optimize/config_parser.py @@ -3,11 +3,28 @@ from pydantic import BaseModel, Field +class JsonlDataset(BaseModel): + train: str = Field(..., description="Path to the training dataset in JSONL format") + test: str = Field(..., description="Path to the test dataset in JSONL format") + validation: str = Field( + ..., description="Path to the validation dataset in JSONL format" + ) + + class OptimizationConfig(BaseModel): - benchmark: str = Field() + pdl_path: str = Field(..., description="Path to the PDL file to optimize") + dataset: str | JsonlDataset = Field() + demonstrations_variable_name: str = Field(default="demonstrations") + demonstration_columns: list[str] = Field() + instance_columns: list[str] = Field() + groundtruth_column: str | None = Field() + eval_pdl: str | None = Field( + default=None, description="Path to the PDL file used for evaluation" + ) num_candidates: int = Field(default=30) - num_demonstrations: int = Field(default=5) - initial_test_set_size: int = Field(default=10) + num_demonstrations: int | None = Field(default=None) + initial_validation_set_size: int = Field(default=10) + max_validation_set_size: int = Field(default=1000) max_test_set_size: int = Field(default=1000) timeout: int = Field(default=120) budget_growth: Literal["double", "to_max"] = Field(default="double") @@ -17,7 +34,6 @@ class OptimizationConfig(BaseModel): train_set_name: str = Field(default="train") test_set_name: str = Field(default="test") validation_set_name: str = Field(default="validation") - demonstrations_variable_name: str = Field(default="demonstrations") variables: dict[str, list] = Field(default={}) experiment_prefix: str = Field(default="") diff --git a/src/pdl/optimize/optimizer_evaluator.py b/src/pdl/optimize/optimizer_evaluator.py index 4e809c848..5898087e7 100644 --- a/src/pdl/optimize/optimizer_evaluator.py +++ b/src/pdl/optimize/optimizer_evaluator.py @@ -45,10 +45,7 @@ def __init__( def get_scope(self) -> ScopeType: raise NotImplementedError - def extract_answer(self, document: str) -> Any: - raise NotImplementedError - - def answer_correct(self, document: str, answer: Any, truth: Any) -> bool: + def score(self, document: str, ground_truth: Any) -> float: raise NotImplementedError def run( # type: ignore # noqa: C901 @@ -58,7 +55,6 @@ def run( # type: ignore # noqa: C901 answer = None exception: PDLParseError | PDLRuntimeError | Exception | bool | None = None result = None - match = False truth = self.example[self.answer_key] scope: PdlDict = PdlDict({}) @@ -68,6 +64,7 @@ def run( # type: ignore # noqa: C901 end_time = None total_tokens = -1 errored = False + score = 0.0 while retry: if tries > 1: console.log("RETRYING! 
", tries) @@ -93,6 +90,12 @@ def run( # type: ignore # noqa: C901 if isinstance(document, str): document = document.strip() + if document: + errored = False + retry = False + else: + console.log("Empty document returned, retrying...") + answer = document else: raise TypeError( f"Expected document to be a string, got {type(document)}", @@ -102,24 +105,10 @@ def run( # type: ignore # noqa: C901 runtime = end_time - start_time console.log(f"Runtime took seconds: {runtime:.2f}") - errored = False - if errored: - console.log("PDL error occured.") - else: - answer = self.extract_answer(document) - - if answer is None: - last_line = document.splitlines()[-1] - console.log("Couldn't extract answer: ", last_line) - - if answer is None or errored: - retry = True - - if answer is not None and not errored: - retry = False - if tries >= RETRY_COUNT: retry = False + + score = float(self.score(document, truth)) except PDLParseError as exc: console.print_exception(show_locals=False) errored = True @@ -160,11 +149,9 @@ def run( # type: ignore # noqa: C901 if errored and not exception: exception = errored - match = self.answer_correct(document, answer, truth) - return TrialOutput( pdl_program=self.pdl_program, - correct=match, + score=score, exception=exception, scope=scope, pdl_result=result, diff --git a/src/pdl/optimize/pdl_evaluator.py b/src/pdl/optimize/pdl_evaluator.py new file mode 100644 index 000000000..f1f9eeeb3 --- /dev/null +++ b/src/pdl/optimize/pdl_evaluator.py @@ -0,0 +1,57 @@ +from typing import Any + +from pdl.optimize.optimizer_evaluator import OptimizerEvaluator +from pdl.pdl import exec_str +from pdl.pdl_ast import ScopeType +from pdl.pdl_interpreter import empty_scope + + +class PdlEvaluator(OptimizerEvaluator): + def __init__( + self, + # scoring_pdl: str, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.scoring_pdl = self.config.eval_pdl + if self.config.groundtruth_column is None: + raise ValueError("Groundtruth column must be specified") + self.answer_key = self.config.groundtruth_column + + def get_scope(self) -> ScopeType: + demo_var = self.config.demonstrations_variable_name + + scope = {} + + for k in self.config.variables: + if k in self.candidate: + scope[k] = self.candidate[k] + + scope[demo_var] = [ + {k: q[k] for k in self.config.demonstration_columns} + for q in self.candidate[demo_var] + ] + + for k in self.config.instance_columns: + if k in self.example: + scope[k] = self.example[k] + + return empty_scope | scope + + def score(self, document: str, ground_truth: Any) -> float: + scope = empty_scope | {"document": document, "ground_truth": ground_truth} + prog = f"""defs: + scoring: + import: "{self.scoring_pdl}" +lastOf: + - call: ${{ scoring.score }} + args: + document: ${{ document }} + ground_truth: ${{ ground_truth }}""" + result = exec_str(prog=prog, scope=scope, output="result") + + if isinstance(result, str): + result = result.strip() + # Note: this breaks if the result is not a number + return float(result) diff --git a/src/pdl/optimize/pdl_optimizer.py b/src/pdl/optimize/pdl_optimizer.py index b6be331a7..17b4ed011 100644 --- a/src/pdl/optimize/pdl_optimizer.py +++ b/src/pdl/optimize/pdl_optimizer.py @@ -68,22 +68,22 @@ class PDLOptimizer: # pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-positional-arguments def __init__( self, - pdl_path: Path, dataset: DatasetDict, config: OptimizationConfig, trial_thread: type[OptimizerEvaluator], yield_output: bool, experiment_path: Path, ) -> None: - self.pdl_path = 
diff --git a/src/pdl/optimize/util.py b/src/pdl/optimize/util.py
index aceb0d3ff..d331f49d1 100644
--- a/src/pdl/optimize/util.py
+++ b/src/pdl/optimize/util.py
@@ -20,7 +20,7 @@ class TrialOutput:
     pdl_program: Program
     scope: ScopeType
     runtime: float
-    correct: bool = False
+    score: float = 0.0
     exception: BaseException | bool | None = None
     pdl_result: Any = None
     pdl_document: str = ""
@@ -32,7 +32,7 @@ class TrialOutput:
 
     def to_dict(self) -> dict:
         return {
-            "correct": self.correct,
+            "score": self.score,
             "exception": str(self.exception),
             "pdl_document": self.pdl_document,
             "answer": self.answer,
diff --git a/tests/test_examples_run.yaml b/tests/test_examples_run.yaml
index 6ef48a317..9250488ed 100644
--- a/tests/test_examples_run.yaml
+++ b/tests/test_examples_run.yaml
@@ -39,6 +39,8 @@ skip:
   - examples/optimizer/mbpp.pdl
   - examples/optimizer/fever.pdl
   - examples/optimizer/gsm8k.pdl
+  - examples/optimizer/bea19.pdl
+  - examples/optimizer/eval_levenshtein.pdl
   - examples/requirements/email.pdl
 with_inputs:
   examples/tutorial/programs/chatbot.pdl:
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
index 1df9b0834..4ceef96e0 100644
--- a/tests/test_optimizer.py
+++ b/tests/test_optimizer.py
@@ -17,11 +17,14 @@ def test_gsm8k_cot():
     pattern = "cot"
     num_demonstrations = 3
     config = OptimizationConfig(
-        benchmark="gsm8k",
-        initial_test_set_size=1,
+        pdl_path="tests/data/optimizer_gsm8k.pdl",
+        dataset="gsm8k",
+        demonstration_columns=["question", "reasoning", "answer"],
+        instance_columns=["question", "reasoning"],
+        groundtruth_column="answer",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=5,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -349,7 +352,6 @@ def test_gsm8k_cot():
         },
     )
     optim = PDLOptimizer(
-        pdl_path=Path("tests/data/optimizer_gsm8k.pdl"),
         dataset=gsm8k,
         trial_thread=Gsm8kEvaluator,
         yield_output=True,
@@ -375,11 +377,14 @@ def test_gsm8k_cot():
 
 def run_optimizer_gsm8k(pattern, num_demonstrations=0):
     config = OptimizationConfig(
-        benchmark="gsm8k",
-        initial_test_set_size=1,
+        pdl_path="tests/data/optimizer_gsm8k.pdl",
+        dataset="gsm8k",
+        demonstration_columns=["question", "reasoning", "answer"],
+        instance_columns=["question", "reasoning"],
+        groundtruth_column="answer",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=1,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -708,7 +713,6 @@ def run_optimizer_gsm8k(pattern, num_demonstrations=0):
         },
     )
     optim = PDLOptimizer(
-        pdl_path=Path("examples/optimizer/gsm8k.pdl"),
         dataset=gsm8k,
         trial_thread=Gsm8kEvaluator,
         yield_output=True,
@@ -724,11 +728,22 @@ def run_optimizer_gsm8k(pattern, num_demonstrations=0):
 
 def run_optimizer_fever(pattern, num_demonstrations=0):
     config = OptimizationConfig(
-        benchmark="fever",
-        initial_test_set_size=1,
+        pdl_path="examples/optimizer/fever.pdl",
+        dataset="fever",
+        demonstration_columns=[
+            "question",
+            "reasoning",
+            "reasoning",
+            "traj_keys",
+            "traj_values",
+            "rewoo_traj_keys",
+            "rewoo_traj_values",
+        ],
+        instance_columns=["claim"],
+        groundtruth_column="label",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=1,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -1064,7 +1079,6 @@ def run_optimizer_fever(pattern, num_demonstrations=0):
     )
 
     optim = PDLOptimizer(
-        pdl_path=Path("examples/optimizer/fever.pdl"),
         dataset=fever,  # pyright: ignore
         trial_thread=FEVEREvaluator,
         yield_output=True,
@@ -1080,11 +1094,18 @@ def run_optimizer_fever(pattern, num_demonstrations=0):
 
 def run_optimizer_mbpp(pattern, num_demonstrations=0):
     config = OptimizationConfig(
-        benchmark="mbpp",
-        initial_test_set_size=1,
+        pdl_path="examples/optimizer/mbpp.pdl",
+        dataset="mbpp",
+        demonstration_columns=[
+            "prompt",
+            "traj_keys",
+            "traj_values",
+        ],
+        instance_columns=["claim"],
+        groundtruth_column="canonical_solution",
+        initial_validation_set_size=1,
         max_test_set_size=1,
         num_candidates=1,
-        num_demonstrations=num_demonstrations,
         parallelism=1,
         shuffle_test=False,
         test_set_name="test",
@@ -1103,7 +1124,6 @@ def run_optimizer_mbpp(pattern, num_demonstrations=0):
     )
 
     optim = PDLOptimizer(
-        pdl_path=Path("examples/optimizer/mbpp.pdl"),
         dataset=mbpp_dataset,  # pyright: ignore
         trial_thread=MBPPEvaluator,
         yield_output=True,
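
With these changes the new JSONL-backed bea19 example is driven entirely from its config file: the train/validation/test paths come from `bea19_example.yml`, so `--dataset-path` is no longer needed for this case. A rough end-to-end sketch (not part of the patch; it assumes the repository root as the working directory, `textdistance` installed for the Levenshtein scorer, and default values for flags such as `--experiments-path`):

```bash
# Sketch only: paths assume the repository root as the working directory.
pip install textdistance                      # used by eval_levenshtein.pdl's scoring function
python examples/optimizer/process_bea19.py    # writes bea19_jsonl/{train,validation,test}.jsonl
python examples/optimizer/optimize.py optimize --config examples/optimizer/bea19_example.yml
```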