From d957fec82eac2a776a8e2bd55a9e449317dada94 Mon Sep 17 00:00:00 2001 From: Claudio Spiess Date: Tue, 24 Jun 2025 16:05:41 +0200 Subject: [PATCH 01/11] Data processing scripts Signed-off-by: Claudio Spiess --- examples/optimizer/process_fever.py | 426 ++++++++++++++++++++++++++++ examples/optimizer/process_gsm8k.py | 167 +++++++++++ examples/optimizer/process_mbpp.py | 80 ++++++ 3 files changed, 673 insertions(+) create mode 100644 examples/optimizer/process_fever.py create mode 100644 examples/optimizer/process_gsm8k.py create mode 100644 examples/optimizer/process_mbpp.py diff --git a/examples/optimizer/process_fever.py b/examples/optimizer/process_fever.py new file mode 100644 index 000000000..db5532209 --- /dev/null +++ b/examples/optimizer/process_fever.py @@ -0,0 +1,426 @@ +# Instructions for running this script: +# 1. Ensure you have the required libraries installed. +# `datasets` should be version 3.0 or higher. +# +# `pip install prompt-declaration-language[all] funcy` +# 2. Download the original FEVER dataset with wiki-pages, and BigBench FEVER task JSON file +# from the respective sources. +# https://fever.ai/dataset/fever.html +# https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/fact_checker/fever/task.json +# +# ``` +# wget https://raw.githubusercontent.com/google/BIG-bench/refs/heads/main/bigbench/benchmark_tasks/fact_checker/fever/task.json +# wget https://fever.ai/download/fever/wiki-pages.zip +# wget https://fever.ai/download/fever/shared_task_dev.jsonl +# ``` +# Place the downloaded files in the `var/fever` directory. +# Extract the `wiki-pages.zip` file into `var/fever/wiki-pages/`. +# 3. Run this script to process the FEVER dataset. +# `python examples/optimizer/process_fever.py` +# +import json +import operator +import re +import unicodedata +import warnings +from functools import cache +from itertools import groupby +from pathlib import Path +from typing import Any + +import pandas as pd +import wikipedia +from datasets.arrow_dataset import Dataset +from datasets.dataset_dict import DatasetDict +from datasets.load import load_dataset, load_from_disk +from funcy import flatten +from tqdm.autonotebook import tqdm + +tqdm.pandas() +warnings.simplefilter("ignore") + +var_dir = Path("var") +var_dir.mkdir(parents=True, exist_ok=True) + + +def clean_fever(text: str) -> str: + mapping = { + "_": " ", + "-LRB- ": "(", + " -RRB-": ")", + "-LSB- ": "[", + " -RSB-": "]", + "-LRB-": "(", + "-RRB-": ")", + "-LSB-": "[", + "-RSB-": "]", + "-COLON-": ":", + } + + for k, v in mapping.items(): + text = text.replace(k, v) + + return text.strip() + + +@cache +def search_new( + subject: str, auto_suggest: bool = False, redirect: bool = False +) -> tuple[str, str]: + try: + result = ( + wikipedia.summary( + subject, auto_suggest=auto_suggest, redirect=redirect + ).strip(), + "success", + ) + except wikipedia.DisambiguationError as d: + result = ( + f'"{subject}" may refer to one of {d.args[1]}. Please retry the search with one of the subjects using Search[].', + "disambg", + ) + except wikipedia.PageError as e: + result = f"{e} Please retry the search using Search[].", "pageerror" + except wikipedia.WikipediaException as e: + print(e, type(e)) + result = str(e), f"other:{type(e)}" + except Exception as e: + print(e, type(e)) + result = str(e), f"other:{type(e)}" + return result + + +def searcher(row: dict[str, Any], auto_suggest: bool): + cleaned = clean_fever(row["article"]) + if "msg" in row: + if row["msg"] != "success": + wiki, msg = search_new(cleaned, auto_suggest=auto_suggest, redirect=True) + else: + wiki = row["wiki"] + msg = row["msg"] + else: + wiki, msg = search_new(cleaned, auto_suggest=auto_suggest, redirect=True) + return {"wiki": wiki, "msg": msg, "cleaned": cleaned} + + +def remove_accents(x: str) -> str: + return unicodedata.normalize("NFD", x) + + +fever = load_dataset("fever/fever", "v1.0") +if not isinstance(fever, DatasetDict): + msg = f"Expected fever to be a DatasetDict, but got: {type(fever)}" + raise TypeError(msg) + +bigbench_fever = json.loads(Path("var/fever/task.json").read_text(encoding="utf-8")) + +fever.save_to_disk("var/fever/fever_original") +print(fever) + +wikipages = load_dataset( + "json", + data_files="var/fever/wiki-pages/wiki-pages/wiki-*.jsonl", + encoding="utf-8", +) +if not isinstance(wikipages, DatasetDict): + msg = f"Expected wikipages to be a DatasetDict, but got: {type(wikipages)}" + raise TypeError(msg) + +print("Loaded wikipages:", wikipages) +print("Mapping wikipages...") +wikipages["train"] = wikipages["train"].map( + lambda x: { + "lines_split": list(filter(lambda x: x != "", re.split(r"\d+\t", x["lines"]))) + }, + num_proc=32, +) +print("Mapping wikipages done.") + +print("Converting wikipages to DataFrame...") +wiki_pages_df = wikipages["train"].to_pandas() +if not isinstance(wiki_pages_df, pd.DataFrame): + msg = f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}" + raise TypeError(msg) +wiki_pages_df = wiki_pages_df.set_index("id") +wiki_pages_df.index = wiki_pages_df.index.map(remove_accents) +print("Wikipages converted to DataFrame.") + + +if isinstance(wiki_pages_df, pd.DataFrame): + wiki_pages_df.to_parquet( + path="var/fever/wiki_pages.parquet", + index=True, + engine="pyarrow", + compression="zstd", + compression_level=10, + ) +else: + msg = f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}" + raise TypeError(msg) + + +df = pd.read_json( + "var/fever/shared_task_dev.jsonl", lines=True, encoding="utf-8" +).set_index("id") +print("Loaded original FEVER", len(df)) +df = df[df.label.isin(["SUPPORTS", "REFUTES"])].copy() +print("Filtered original FEVER", len(df)) + + +def evidence_mapper(evidence: list[tuple]): + evidences = {(x[2], x[3]) for x in evidence[0] if x[2] is not None} + return list(evidences) + + +df["unique_evidence"] = df["evidence"].progress_apply(evidence_mapper) + + +def evidence_mapper_sentence(evidences: list[tuple[str, int]]): + if not isinstance(wiki_pages_df, pd.DataFrame): + msg = ( + f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}" + ) + raise TypeError(msg) + + lines = [] + for title, line in evidences: + if title is None or line is None: + continue + title_no_acc = remove_accents(title) + + if title_no_acc not in wiki_pages_df.index: + print(title_no_acc) + continue + + sentence = wiki_pages_df.loc[title_no_acc] + if sentence["lines_split"] is not None and len(sentence["lines_split"]) > line: + sentence = sentence["lines_split"][line] + lines.append((title_no_acc, line, sentence)) + else: + print(sentence) + return list(lines) + + +df["evidence_sentences"] = df["unique_evidence"].progress_apply( + evidence_mapper_sentence +) + +bigbench = pd.DataFrame.from_records(bigbench_fever["examples"]).set_index("id") + +tqdm.pandas(desc="Mapping claims to (in) bigbench") +df["claim_in_bigbench"] = df["claim"].progress_apply( + lambda x: bigbench.input.str.contains(x).any() +) +tqdm.pandas() + +df["evidence_sentence_count"] = df["evidence_sentences"].map(len) +print("Mapped bigbench") + +train_df = df[(~df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0)] + +test_df = df[ + (df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0) +].drop(columns=["verifiable", "claim_in_bigbench", "evidence"]) +test_df["unique_evidence"] = test_df["unique_evidence"].map( + lambda x: [[str(title), str(sent_id)] for title, sent_id in x] +) +test_df["evidence_sentences"] = test_df["evidence_sentences"].map( + lambda x: [[str(title), str(sent_id), str(sent)] for title, sent_id, sent in x] +) +test_df["label"] = test_df["label"] == "SUPPORTS" +test_df.index = test_df.index.astype(pd.StringDtype()) +test_df.claim = test_df.claim.astype(pd.StringDtype()) +test_df["id"] = test_df.index +print("Saving fever test df") +test_df.to_json("fever_test_df.json", orient="records", lines=True) +print("Saved fever test df") + +train_df = df[ + (~df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0) +].drop(columns=["verifiable", "claim_in_bigbench", "evidence"]) +train_df["unique_evidence"] = train_df["unique_evidence"].map( + lambda x: [[str(title), str(sent_id)] for title, sent_id in x] +) +train_df["evidence_sentences"] = train_df["evidence_sentences"].map( + lambda x: [[str(title), str(sent_id), str(sent)] for title, sent_id, sent in x] +) +train_df["label"] = train_df["label"] == "SUPPORTS" +train_df.index = train_df.index.astype(pd.StringDtype()) +train_df.claim = train_df.claim.astype(pd.StringDtype()) +train_df["id"] = train_df.index +print("Saving fever train df") +train_df.to_json("fever_train_df.json", orient="records", lines=True) +print("Saved fever train df") + +fever_ds = load_dataset( + "json", data_files={"train": "fever_train_df.json", "test": "fever_test_df.json"} +) +if not isinstance(fever_ds, DatasetDict): + msg = f"Expected fever_ds to be a DatasetDict, but got: {type(fever_ds)}" + raise TypeError(msg) +fever_ds.save_to_disk("var/fever/fever_reprocessed") +print(fever_ds) + + +articles = list( + set(flatten([[y[0] for y in x] for x in fever_ds["train"]["unique_evidence"]])) +) +article_ds = Dataset.from_dict({"article": articles}) +print(article_ds) + + +article_ds = article_ds.map(lambda x: searcher(x, True), num_proc=4) +article_ds = article_ds.map(lambda x: searcher(x, False), num_proc=1) +article_df = article_ds.to_pandas() +if not isinstance(article_df, pd.DataFrame): + msg = f"Expected article_df to be a DataFrame, but got: {type(article_df)}" + raise TypeError(msg) +article_df = article_df.set_index("article") +print("Articles that did not return a successful response:") +print(article_df[article_df.msg != "success"]) +article_ds.save_to_disk("var/fever/fever_articles") +article_df.to_parquet( + "var/fever/fever_articles.parquet", + index=True, + engine="pyarrow", + compression="zstd", + compression_level=10, +) + + +def search(query: str) -> tuple: + if not isinstance(article_df, pd.DataFrame): + msg = f"Expected article_df to be a DataFrame, but got: {type(article_df)}" + raise TypeError(msg) + row = article_df.loc[query] + return row["wiki"], row["msg"] + + +def trajectorize(row: dict[str, Any]) -> dict[str, Any]: + evidence_sentences = row["evidence_sentences"] + + claim = row["claim"].strip() + task = f"On June 2017, the following claim was made: {claim}\nQ: Was this claim true or false?" + answer = str(row["label"]).lower() + + article_sentence_group = { + k: list(v) for k, v in groupby(evidence_sentences, operator.itemgetter(0)) + } + + articles = {} + statuses = [] + wiki_worked = True + for article in article_sentence_group: + cleaned_article = clean_fever(article) + wiki, worked = search(article) + wiki = wiki.strip() + if worked != "success": + wiki_worked = False + + articles[cleaned_article] = wiki + statuses.append(worked) + all_wiki_success = all(x in {"success", "fallback"} for x in statuses) + + trajectory = [{"task": task}] + + for article, evidences in article_sentence_group.items(): + cleaned_article = clean_fever(article) + trajectory.extend([ + {"thought": f"I need to search {cleaned_article}."}, + { + "action": '{"name": "Search", "arguments": {"topic": "' + + cleaned_article + + '"}}' + }, + {"observation": f"[Document]\n{articles[cleaned_article]}\n[End]"}, + ]) + + for _title, _line, sent in evidences: + trajectory.append({"observation": clean_fever(sent.split("\t")[0])}) + + trajectory.extend([ + {"thought": f"The claim is {answer}."}, + {"action": '{"name": "Finish", "arguments": {"topic": "' + answer + '"}}'}, + ]) + + traj_keys = [next(iter(t.keys())) for t in trajectory] + traj_values = [next(iter(t.values())) for t in trajectory] + + rewoo_trajectory = [{"task": task}] + + for article, evidences in article_sentence_group.items(): + cleaned_article = clean_fever(article) + rewoo_trajectory.extend([ + {"thought": f"Search for more information about {cleaned_article}."}, + { + "action": '{"name": "Search", "arguments": {"topic": "' + + cleaned_article + + '"}}' + }, + {"observation": f"[Document]\n{articles[cleaned_article]}\n[End]"}, + ]) + + for _title, _line, sent in evidences: + rewoo_trajectory.append({"observation": clean_fever(sent.split("\t")[0])}) + + rewoo_traj_keys = [next(iter(t.keys())) for t in rewoo_trajectory] + rewoo_traj_values = [next(iter(t.values())) for t in rewoo_trajectory] + + return { + "traj_keys": traj_keys, + "traj_values": traj_values, + "rewoo_traj_keys": rewoo_traj_keys, + "rewoo_traj_values": rewoo_traj_values, + "all_wiki_success": all_wiki_success, + "wiki_worked": wiki_worked, + "articles": list(articles.values()), + "statuses": statuses, + } + + +def sentencify(row: dict[str, Any]) -> dict[str, str]: + evidence_sentences = row["evidence_sentences"] + + article_sentence_group = { + clean_fever(k): list(v) + for k, v in groupby(evidence_sentences, operator.itemgetter(0)) + } + + sentences = [] + for evidences in article_sentence_group.values(): + for _title, _line, sent in evidences: + sentences.append(clean_fever(sent.split("\t")[0])) + + return {"cot": " ".join(sentences).strip().replace("\n", " ").strip()} + + +fever_ds = fever_ds.map( + lambda x: {"label": str(x["label"]).lower()}, + num_proc=4, +) + +fever_ds["train"] = ( + fever_ds["train"].map(trajectorize, num_proc=4).map(sentencify, num_proc=4) +) + + +print( + "Wiki lookup failures:", + fever_ds["train"].filter(lambda x: x["all_wiki_success"] is False), +) + + +fever_ds["train"] = fever_ds["train"].filter(lambda x: x["wiki_worked"] is True) +fever_ds["train"] = fever_ds["train"].remove_columns( + column_names=["wiki_worked", "all_wiki_success", "statuses"] +) +print(fever_ds) + +new_split = fever_ds["train"].train_test_split(test_size=1024) +fever_ds["train"] = new_split["train"] +fever_ds["validation"] = new_split["test"] +fever_ds.save_to_disk("var/fever_trajectified") + +# Make sure the saved dataset is loaded correctly +ds = load_from_disk("var/fever_trajectified") +print(ds) diff --git a/examples/optimizer/process_gsm8k.py b/examples/optimizer/process_gsm8k.py new file mode 100644 index 000000000..cb36068d2 --- /dev/null +++ b/examples/optimizer/process_gsm8k.py @@ -0,0 +1,167 @@ +import re +from pathlib import Path +from typing import Any + +from datasets.dataset_dict import DatasetDict +from datasets.load import load_dataset, load_from_disk + +from pdl.optimize.parse_number import parse_number + +# Load original GSM8K dataset and split it into train and validation sets + +var_dir = Path("var") +var_dir.mkdir(parents=True, exist_ok=True) + +gsm8k_orig = load_dataset("openai/gsm8k", "main") +if not isinstance(gsm8k_orig, DatasetDict): + msg = f"Expected gsm8k_orig to be a DatasetDict, but got: {type(gsm8k_orig)}" + raise TypeError(msg) +new_split = gsm8k_orig["train"].train_test_split(test_size=1024) +gsm8k_orig["validation"] = new_split["test"] +gsm8k_orig["train"] = new_split["train"] +gsm8k_orig.save_to_disk("var/gsm8k_split") + +# Make sure the saved dataset is loaded correctly +gsm8k = load_from_disk("var/gsm8k_split") +if not isinstance(gsm8k, DatasetDict): + msg = f"Expected gsm8k to be a DatasetDict, but got: {type(gsm8k)}" + raise TypeError(msg) + + +def parse_answers(row: dict[str, Any]) -> dict[str, Any]: + question = row["question"].strip().replace("’", "'").replace(" ", " ") + parts = row["answer"].split("####") + answer = parse_number(parts[-1]) + reasoning = "####".join(parts[:-1]).strip().replace("’", "'").replace(" ", " ") + return { + "question": question, + "answer": answer, + "reasoning": reasoning, + "raw_answer": row["answer"], + "answer_part": parts[-1], + } + + +gsm8k = gsm8k.map(parse_answers) + + +def react_trajectory(row: dict[str, Any]) -> dict[str, list[str]]: + question = row["question"] + answer = row["answer"] + reasoning = row["reasoning"].splitlines() + trajectory = [{"question": question.strip()}] + res = answer + + for line in reasoning: + pattern = ( + r"(?P
(=(\ )?|equals(\ )?)?(\$)?)<<(?P.*?)=(?P.*?)>>([^\s]*)"
+        )
+        expressions = re.search(pattern, line)
+
+        if expressions is None:
+            trajectory += [
+                {"thought": line.strip().replace("  ", " ")},
+            ]
+        else:
+            thought = re.sub(pattern, "", line)
+            thought = thought.rstrip(".").rstrip(",")
+            exp = expressions.group("exp").strip()
+            res = expressions.group("res").strip()
+
+            trajectory += [
+                {
+                    "thought": f"{thought.strip().replace('  ', ' ')}. I need to calculate {exp}"
+                },
+                {
+                    "action": '{"name": "Calculator", "arguments": {"expr": "'
+                    f"{exp}"
+                    '"}}'
+                },
+                {"observation": res},
+            ]
+    if next(iter(trajectory[-1].keys())) == "observation":
+        trajectory.append({"thought": f"The answer is {answer}"})
+
+    trajectory.append({
+        "action": '{"name": "Finish", "arguments": {"answer": "' + f"{answer}" + '"}}'
+    })
+
+    traj_keys = [next(iter(t.keys())) for t in trajectory]
+    traj_values = [next(iter(t.values())) for t in trajectory]
+
+    return {
+        "traj_keys": traj_keys,
+        "traj_values": traj_values,
+    }
+
+
+gsm8k["train"] = gsm8k["train"].map(react_trajectory)
+
+
+def rewoo_trajectory(row: dict[str, Any]) -> dict[str, list[str]]:
+    question = row["question"]
+    answer = row["answer"]
+    reasoning = row["reasoning"].splitlines()
+    trajectory = [{"question": question.strip().replace("  ", " ")}]
+    res = answer
+
+    for line in reasoning:
+        pattern = (
+            r"(?P
(=(\ )?|equals(\ )?)?(\$)?)<<(?P.*?)=(?P.*?)>>([^\s]*)"
+        )
+        expressions = re.search(pattern, line)
+
+        if expressions is None:
+            trajectory += [
+                {"thought": line.strip().replace("  ", " ")},
+            ]
+        else:
+            thought = re.sub(pattern, "", line)
+            thought = thought.rstrip(".").rstrip(",")
+            exp = expressions.group("exp").strip()
+            res = expressions.group("res").strip()
+
+            trajectory += [
+                {"thought": f"{thought.strip().replace('  ', ' ')}. Calculate {exp}"},
+                {
+                    "action": '{"name": "Calculator", "arguments": {"expr": "'
+                    f"{exp}"
+                    '"}}'
+                },
+                {"observation": res},
+            ]
+
+    evidence_counter = 0
+    for i in range(len(trajectory)):
+        outer = trajectory[i]
+        type_event = next(iter(outer.keys()))
+        value = next(iter(outer.values()))
+
+        if type_event == "action":
+            evidence_counter += 1
+        if type_event == "observation":
+            for j in range(i + 1, len(trajectory)):
+                inner = trajectory[j]
+                inner_type_event = next(iter(inner.keys()))
+                if inner_type_event == "action":
+                    trajectory[j]["action"] = trajectory[j]["action"].replace(
+                        value, f"#E{evidence_counter}"
+                    )
+                elif inner_type_event == "thought":
+                    trajectory[j]["thought"] = trajectory[j]["thought"].replace(
+                        value, f"#E{evidence_counter}"
+                    )
+    traj_keys = [next(iter(t.keys())) for t in trajectory]
+    traj_values = [next(iter(t.values())) for t in trajectory]
+
+    return {"rewoo_traj_keys": traj_keys, "rewoo_traj_values": traj_values}
+
+
+gsm8k["train"] = gsm8k["train"].map(rewoo_trajectory)
+
+# Save the processed dataset
+gsm8k.save_to_disk("var/gsm8k_trajectified")
+
+# Make sure the saved dataset is loaded correctly
+ds = load_from_disk("var/gsm8k_trajectified")
+print(ds)
diff --git a/examples/optimizer/process_mbpp.py b/examples/optimizer/process_mbpp.py
new file mode 100644
index 000000000..999e80cb9
--- /dev/null
+++ b/examples/optimizer/process_mbpp.py
@@ -0,0 +1,80 @@
+# Instructions:
+# 1. Install EvalPlus e.g. `pip install evalplus`
+# 2. Run this script to process the MBPP dataset into a format suitable for evaluation.
+import re
+from pathlib import Path
+from typing import Any
+
+from datasets.dataset_dict import DatasetDict
+from datasets.load import load_dataset, load_from_disk
+from evalplus.data import get_mbpp_plus
+
+var_dir = Path("var")
+var_dir.mkdir(parents=True, exist_ok=True)
+
+mbpp_plus = get_mbpp_plus()
+
+mbpp = load_dataset("google-research-datasets/mbpp", name="full")
+if not isinstance(mbpp, DatasetDict):
+    msg = f"Expected mbpp to be a DatasetDict, but got: {type(mbpp)}"
+    raise TypeError(msg)
+
+mbpp["test"] = mbpp["test"].filter(
+    lambda x: f"Mbpp/{x['task_id']}" in mbpp_plus,
+)
+
+mbpp["validation"] = mbpp["validation"].filter(
+    lambda x: f"Mbpp/{x['task_id']}" in mbpp_plus,
+)
+
+
+def trajectify(row: dict[str, Any]) -> dict[str, list[str]]:
+    code = row["code"].replace("\r\n", "\n").replace("\r", "\n").strip()
+    first_test = row["test_list"][0].strip().lstrip()
+    pattern = r"assert\s+(\w+\(.*?\))\s*==\s*(.+)"
+
+    # Replacement format
+    replacement = r"res = \1\nassert res == \2, \"Expected \2 but got {}\".format(res)"
+
+    # Perform the substitution
+    converted_string = (
+        re.sub(pattern, replacement, first_test)
+        .replace('\\"Expected ', '"Expected ')
+        .replace('{}\\"', '{}"')
+    )
+    code_w_assert = code + "\n" + converted_string.strip()
+    prompt = row["text"].strip() + "\n" + first_test
+
+    trajectory = [
+        {"task": prompt},
+        {
+            "thought": "I should run a solution on the test case before proposing a solution."
+        },
+        {"action": code_w_assert},
+        {"observation": "[Executed Successfully with No Output]"},
+        {"thought": "There is no AssertionError. I can now submit the solution."},
+        {"solution": code},
+    ]
+
+    traj_keys = [next(iter(t.keys())) for t in trajectory]
+    traj_values = [next(iter(t.values())) for t in trajectory]
+
+    return {
+        "react_prompt": prompt,
+        "code": code,
+        "traj_keys": traj_keys,
+        "traj_values": traj_values,
+    }
+
+
+mbpp_trajectified = mbpp.map(trajectify)
+assert len(mbpp_trajectified["train"]) == 374
+assert len(mbpp_trajectified["test"]) == 224
+assert len(mbpp_trajectified["validation"]) == 39
+
+# Save the processed dataset
+mbpp_trajectified.save_to_disk("var/mbpp_trajectified")
+
+# Make sure the saved dataset is loaded correctly
+ds = load_from_disk("var/mbpp_trajectified")
+print(ds)

From 0e33470b4050d4a0fdabb2a3ff6d74312d75e6a6 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 30 Jun 2025 17:08:16 +0200
Subject: [PATCH 02/11] Bump datasets version

Signed-off-by: Claudio Spiess 
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3350229a5..6e7ee39ff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ examples = [
   "pypdf~=5.2",
   "wikipedia~=1.0",
   "textdistance~=4.0",
-  "datasets>2,<4",
+  "datasets>3,<4",
   "sympy~=1.0",
   "scikit-learn>=1.6.1,<1.8.0",
   "faiss-cpu>=1.10,<1.12"

From 2bc34b5c075ac23c1decef4ab1f68ff45cebe78f Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 30 Jun 2025 17:09:06 +0200
Subject: [PATCH 03/11] Address typing/pyright issues

Signed-off-by: Claudio Spiess 
---
 examples/optimizer/mbpp_dataset.py  |  2 +-
 examples/optimizer/process_fever.py | 96 ++++++++++++++++-------------
 examples/optimizer/process_gsm8k.py | 17 +++--
 examples/optimizer/process_mbpp.py  |  3 +-
 src/pdl/optimize/pdl_optimizer.py   |  3 +-
 src/pdl/optimize/util.py            |  2 +-
 tests/test_optimizer.py             |  3 +-
 7 files changed, 69 insertions(+), 57 deletions(-)

diff --git a/examples/optimizer/mbpp_dataset.py b/examples/optimizer/mbpp_dataset.py
index 3265a8b29..61fef67a6 100644
--- a/examples/optimizer/mbpp_dataset.py
+++ b/examples/optimizer/mbpp_dataset.py
@@ -3,7 +3,7 @@
 
 from copy import deepcopy
 
-from datasets import load_from_disk
+from datasets.load import load_from_disk
 from evalplus.data import get_mbpp_plus, get_mbpp_plus_hash
 from evalplus.evaluate import MBPP_OUTPUT_NOT_NONE_TASKS, get_groundtruth
 
diff --git a/examples/optimizer/process_fever.py b/examples/optimizer/process_fever.py
index db5532209..c783fbca2 100644
--- a/examples/optimizer/process_fever.py
+++ b/examples/optimizer/process_fever.py
@@ -109,8 +109,7 @@ def remove_accents(x: str) -> str:
 
 fever = load_dataset("fever/fever", "v1.0")
 if not isinstance(fever, DatasetDict):
-    msg = f"Expected fever to be a DatasetDict, but got: {type(fever)}"
-    raise TypeError(msg)
+    raise TypeError(f"Expected fever to be a DatasetDict, but got: {type(fever)}")
 
 bigbench_fever = json.loads(Path("var/fever/task.json").read_text(encoding="utf-8"))
 
@@ -123,14 +122,15 @@ def remove_accents(x: str) -> str:
     encoding="utf-8",
 )
 if not isinstance(wikipages, DatasetDict):
-    msg = f"Expected wikipages to be a DatasetDict, but got: {type(wikipages)}"
-    raise TypeError(msg)
+    raise TypeError(
+        f"Expected wikipages to be a DatasetDict, but got: {type(wikipages)}"
+    )
 
 print("Loaded wikipages:", wikipages)
 print("Mapping wikipages...")
 wikipages["train"] = wikipages["train"].map(
     lambda x: {
-        "lines_split": list(filter(lambda x: x != "", re.split(r"\d+\t", x["lines"])))
+        "lines_split": [x for x in re.split(r"\d+\t", x["lines"]) if x],
     },
     num_proc=32,
 )
@@ -139,8 +139,9 @@ def remove_accents(x: str) -> str:
 print("Converting wikipages to DataFrame...")
 wiki_pages_df = wikipages["train"].to_pandas()
 if not isinstance(wiki_pages_df, pd.DataFrame):
-    msg = f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}"
-    raise TypeError(msg)
+    raise TypeError(
+        f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}"
+    )
 wiki_pages_df = wiki_pages_df.set_index("id")
 wiki_pages_df.index = wiki_pages_df.index.map(remove_accents)
 print("Wikipages converted to DataFrame.")
@@ -155,8 +156,9 @@ def remove_accents(x: str) -> str:
         compression_level=10,
     )
 else:
-    msg = f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}"
-    raise TypeError(msg)
+    raise TypeError(
+        f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}"
+    )
 
 
 df = pd.read_json(
@@ -177,10 +179,9 @@ def evidence_mapper(evidence: list[tuple]):
 
 def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
     if not isinstance(wiki_pages_df, pd.DataFrame):
-        msg = (
+        raise TypeError(
             f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}"
         )
-        raise TypeError(msg)
 
     lines = []
     for title, line in evidences:
@@ -256,8 +257,7 @@ def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
     "json", data_files={"train": "fever_train_df.json", "test": "fever_test_df.json"}
 )
 if not isinstance(fever_ds, DatasetDict):
-    msg = f"Expected fever_ds to be a DatasetDict, but got: {type(fever_ds)}"
-    raise TypeError(msg)
+    raise TypeError(f"Expected fever_ds to be a DatasetDict, but got: {type(fever_ds)}")
 fever_ds.save_to_disk("var/fever/fever_reprocessed")
 print(fever_ds)
 
@@ -273,8 +273,9 @@ def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
 article_ds = article_ds.map(lambda x: searcher(x, False), num_proc=1)
 article_df = article_ds.to_pandas()
 if not isinstance(article_df, pd.DataFrame):
-    msg = f"Expected article_df to be a DataFrame, but got: {type(article_df)}"
-    raise TypeError(msg)
+    raise TypeError(
+        f"Expected article_df to be a DataFrame, but got: {type(article_df)}"
+    )
 article_df = article_df.set_index("article")
 print("Articles that did not return a successful response:")
 print(article_df[article_df.msg != "success"])
@@ -290,8 +291,9 @@ def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
 
 def search(query: str) -> tuple:
     if not isinstance(article_df, pd.DataFrame):
-        msg = f"Expected article_df to be a DataFrame, but got: {type(article_df)}"
-        raise TypeError(msg)
+        raise TypeError(
+            f"Expected article_df to be a DataFrame, but got: {type(article_df)}"
+        )
     row = article_df.loc[query]
     return row["wiki"], row["msg"]
 
@@ -307,7 +309,7 @@ def trajectorize(row: dict[str, Any]) -> dict[str, Any]:
         k: list(v) for k, v in groupby(evidence_sentences, operator.itemgetter(0))
     }
 
-    articles = {}
+    sample_articles = {}
     statuses = []
     wiki_worked = True
     for article in article_sentence_group:
@@ -317,7 +319,7 @@ def trajectorize(row: dict[str, Any]) -> dict[str, Any]:
         if worked != "success":
             wiki_worked = False
 
-        articles[cleaned_article] = wiki
+        sample_articles[cleaned_article] = wiki
         statuses.append(worked)
     all_wiki_success = all(x in {"success", "fallback"} for x in statuses)
 
@@ -325,23 +327,29 @@ def trajectorize(row: dict[str, Any]) -> dict[str, Any]:
 
     for article, evidences in article_sentence_group.items():
         cleaned_article = clean_fever(article)
-        trajectory.extend([
-            {"thought": f"I need to search {cleaned_article}."},
-            {
-                "action": '{"name": "Search", "arguments": {"topic": "'
-                + cleaned_article
-                + '"}}'
-            },
-            {"observation": f"[Document]\n{articles[cleaned_article]}\n[End]"},
-        ])
+        trajectory.extend(
+            [
+                {"thought": f"I need to search {cleaned_article}."},
+                {
+                    "action": '{"name": "Search", "arguments": {"topic": "'
+                    + cleaned_article
+                    + '"}}'
+                },
+                {
+                    "observation": f"[Document]\n{sample_articles[cleaned_article]}\n[End]"
+                },
+            ]
+        )
 
         for _title, _line, sent in evidences:
             trajectory.append({"observation": clean_fever(sent.split("\t")[0])})
 
-    trajectory.extend([
-        {"thought": f"The claim is {answer}."},
-        {"action": '{"name": "Finish", "arguments": {"topic": "' + answer + '"}}'},
-    ])
+    trajectory.extend(
+        [
+            {"thought": f"The claim is {answer}."},
+            {"action": '{"name": "Finish", "arguments": {"topic": "' + answer + '"}}'},
+        ]
+    )
 
     traj_keys = [next(iter(t.keys())) for t in trajectory]
     traj_values = [next(iter(t.values())) for t in trajectory]
@@ -350,15 +358,19 @@ def trajectorize(row: dict[str, Any]) -> dict[str, Any]:
 
     for article, evidences in article_sentence_group.items():
         cleaned_article = clean_fever(article)
-        rewoo_trajectory.extend([
-            {"thought": f"Search for more information about {cleaned_article}."},
-            {
-                "action": '{"name": "Search", "arguments": {"topic": "'
-                + cleaned_article
-                + '"}}'
-            },
-            {"observation": f"[Document]\n{articles[cleaned_article]}\n[End]"},
-        ])
+        rewoo_trajectory.extend(
+            [
+                {"thought": f"Search for more information about {cleaned_article}."},
+                {
+                    "action": '{"name": "Search", "arguments": {"topic": "'
+                    + cleaned_article
+                    + '"}}'
+                },
+                {
+                    "observation": f"[Document]\n{sample_articles[cleaned_article]}\n[End]"
+                },
+            ]
+        )
 
         for _title, _line, sent in evidences:
             rewoo_trajectory.append({"observation": clean_fever(sent.split("\t")[0])})
@@ -373,7 +385,7 @@ def trajectorize(row: dict[str, Any]) -> dict[str, Any]:
         "rewoo_traj_values": rewoo_traj_values,
         "all_wiki_success": all_wiki_success,
         "wiki_worked": wiki_worked,
-        "articles": list(articles.values()),
+        "articles": list(sample_articles.values()),
         "statuses": statuses,
     }
 
diff --git a/examples/optimizer/process_gsm8k.py b/examples/optimizer/process_gsm8k.py
index cb36068d2..57911a0c7 100644
--- a/examples/optimizer/process_gsm8k.py
+++ b/examples/optimizer/process_gsm8k.py
@@ -14,8 +14,9 @@
 
 gsm8k_orig = load_dataset("openai/gsm8k", "main")
 if not isinstance(gsm8k_orig, DatasetDict):
-    msg = f"Expected gsm8k_orig to be a DatasetDict, but got: {type(gsm8k_orig)}"
-    raise TypeError(msg)
+    raise TypeError(
+        f"Expected gsm8k_orig to be a DatasetDict, but got: {type(gsm8k_orig)}"
+    )
 new_split = gsm8k_orig["train"].train_test_split(test_size=1024)
 gsm8k_orig["validation"] = new_split["test"]
 gsm8k_orig["train"] = new_split["train"]
@@ -24,8 +25,7 @@
 # Make sure the saved dataset is loaded correctly
 gsm8k = load_from_disk("var/gsm8k_split")
 if not isinstance(gsm8k, DatasetDict):
-    msg = f"Expected gsm8k to be a DatasetDict, but got: {type(gsm8k)}"
-    raise TypeError(msg)
+    raise TypeError(f"Expected gsm8k to be a DatasetDict, but got: {type(gsm8k)}")
 
 
 def parse_answers(row: dict[str, Any]) -> dict[str, Any]:
@@ -82,9 +82,9 @@ def react_trajectory(row: dict[str, Any]) -> dict[str, list[str]]:
     if next(iter(trajectory[-1].keys())) == "observation":
         trajectory.append({"thought": f"The answer is {answer}"})
 
-    trajectory.append({
-        "action": '{"name": "Finish", "arguments": {"answer": "' + f"{answer}" + '"}}'
-    })
+    trajectory.append(
+        {"action": '{"name": "Finish", "arguments": {"answer": "' + f"{answer}" + '"}}'}
+    )
 
     traj_keys = [next(iter(t.keys())) for t in trajectory]
     traj_values = [next(iter(t.values())) for t in trajectory]
@@ -132,8 +132,7 @@ def rewoo_trajectory(row: dict[str, Any]) -> dict[str, list[str]]:
             ]
 
     evidence_counter = 0
-    for i in range(len(trajectory)):
-        outer = trajectory[i]
+    for i, outer in enumerate(trajectory):
         type_event = next(iter(outer.keys()))
         value = next(iter(outer.values()))
 
diff --git a/examples/optimizer/process_mbpp.py b/examples/optimizer/process_mbpp.py
index 999e80cb9..c995b9d44 100644
--- a/examples/optimizer/process_mbpp.py
+++ b/examples/optimizer/process_mbpp.py
@@ -16,8 +16,7 @@
 
 mbpp = load_dataset("google-research-datasets/mbpp", name="full")
 if not isinstance(mbpp, DatasetDict):
-    msg = f"Expected mbpp to be a DatasetDict, but got: {type(mbpp)}"
-    raise TypeError(msg)
+    raise TypeError(f"Expected mbpp to be a DatasetDict, but got: {type(mbpp)}")
 
 mbpp["test"] = mbpp["test"].filter(
     lambda x: f"Mbpp/{x['task_id']}" in mbpp_plus,
diff --git a/src/pdl/optimize/pdl_optimizer.py b/src/pdl/optimize/pdl_optimizer.py
index aa6316d53..1fd1320e1 100644
--- a/src/pdl/optimize/pdl_optimizer.py
+++ b/src/pdl/optimize/pdl_optimizer.py
@@ -11,7 +11,8 @@
 from typing import Any
 
 import yaml
-from datasets import Dataset, DatasetDict
+from datasets.arrow_dataset import Dataset
+from datasets.dataset_dict import DatasetDict
 from duration_parser import parse as parse_duration
 from numpy.random import default_rng
 from rich.logging import RichHandler
diff --git a/src/pdl/optimize/util.py b/src/pdl/optimize/util.py
index 9925f8e0e..aceb0d3ff 100644
--- a/src/pdl/optimize/util.py
+++ b/src/pdl/optimize/util.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import yaml
-from datasets import Dataset
+from datasets.arrow_dataset import Dataset
 from rich.console import Console
 
 from pdl.pdl_ast import Program, ScopeType
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
index 76c5ed7ad..1df9b0834 100644
--- a/tests/test_optimizer.py
+++ b/tests/test_optimizer.py
@@ -2,7 +2,8 @@
 from pprint import pprint
 
 import pytest
-from datasets import Dataset, DatasetDict
+from datasets.arrow_dataset import Dataset
+from datasets.dataset_dict import DatasetDict
 
 from examples.optimizer.fever_evaluator import FEVEREvaluator
 from examples.optimizer.gsm8k_evaluator import Gsm8kEvaluator

From dab96d90e421028788cfc292e6bcb74d5a2f3627 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 30 Jun 2025 17:14:27 +0200
Subject: [PATCH 04/11] Add funcy dependency

Signed-off-by: Claudio Spiess 
---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6e7ee39ff..9970b8670 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,8 @@ examples = [
   "datasets>3,<4",
   "sympy~=1.0",
   "scikit-learn>=1.6.1,<1.8.0",
-  "faiss-cpu>=1.10,<1.12"
+  "faiss-cpu>=1.10,<1.12",
+  "funcy>2",
 ]
 docs = [
   "mkdocs~=1.0",

From 03b785d13b65124eb876566503c0ab9198759179 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 30 Jun 2025 17:21:24 +0200
Subject: [PATCH 05/11] Fix impossible version constraint

Signed-off-by: Claudio Spiess 
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9970b8670..e1ee1c5d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ examples = [
   "sympy~=1.0",
   "scikit-learn>=1.6.1,<1.8.0",
   "faiss-cpu>=1.10,<1.12",
-  "funcy>2",
+  "funcy>=2",
 ]
 docs = [
   "mkdocs~=1.0",

From 6cac091d57de4adc7c403c9bb1a03814070ff166 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 30 Jun 2025 21:01:39 +0200
Subject: [PATCH 06/11] pyright ignores

Signed-off-by: Claudio Spiess 
---
 examples/optimizer/process_fever.py | 44 ++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/examples/optimizer/process_fever.py b/examples/optimizer/process_fever.py
index c783fbca2..f2ef82474 100644
--- a/examples/optimizer/process_fever.py
+++ b/examples/optimizer/process_fever.py
@@ -174,7 +174,11 @@ def evidence_mapper(evidence: list[tuple]):
     return list(evidences)
 
 
-df["unique_evidence"] = df["evidence"].progress_apply(evidence_mapper)
+df["unique_evidence"] = df[
+    "evidence"
+].progress_apply(  # pyright: ignore[reportAttributeAccessIssue]
+    evidence_mapper
+)
 
 
 def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
@@ -202,30 +206,44 @@ def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
     return list(lines)
 
 
-df["evidence_sentences"] = df["unique_evidence"].progress_apply(
+df["evidence_sentences"] = df[
+    "unique_evidence"
+].progress_apply(  # pyright: ignore[reportAttributeAccessIssue]
     evidence_mapper_sentence
 )
 
 bigbench = pd.DataFrame.from_records(bigbench_fever["examples"]).set_index("id")
 
 tqdm.pandas(desc="Mapping claims to (in) bigbench")
-df["claim_in_bigbench"] = df["claim"].progress_apply(
+df["claim_in_bigbench"] = df[
+    "claim"
+].progress_apply(  # pyright: ignore[reportAttributeAccessIssue]
     lambda x: bigbench.input.str.contains(x).any()
 )
 tqdm.pandas()
 
-df["evidence_sentence_count"] = df["evidence_sentences"].map(len)
+df["evidence_sentence_count"] = df[
+    "evidence_sentences"
+].map(  # pyright: ignore[reportAttributeAccessIssue]
+    len
+)
 print("Mapped bigbench")
 
 train_df = df[(~df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0)]
 
 test_df = df[
     (df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0)
-].drop(columns=["verifiable", "claim_in_bigbench", "evidence"])
-test_df["unique_evidence"] = test_df["unique_evidence"].map(
+].drop(
+    columns=["verifiable", "claim_in_bigbench", "evidence"]
+)  # pyright: ignore[reportAttributeAccessIssue]
+test_df["unique_evidence"] = test_df[
+    "unique_evidence"
+].map(  # pyright: ignore[reportAttributeAccessIssue]
     lambda x: [[str(title), str(sent_id)] for title, sent_id in x]
 )
-test_df["evidence_sentences"] = test_df["evidence_sentences"].map(
+test_df["evidence_sentences"] = test_df[
+    "evidence_sentences"
+].map(  # pyright: ignore[reportAttributeAccessIssue]
     lambda x: [[str(title), str(sent_id), str(sent)] for title, sent_id, sent in x]
 )
 test_df["label"] = test_df["label"] == "SUPPORTS"
@@ -238,11 +256,17 @@ def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
 
 train_df = df[
     (~df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0)
-].drop(columns=["verifiable", "claim_in_bigbench", "evidence"])
-train_df["unique_evidence"] = train_df["unique_evidence"].map(
+].drop(  # pyright: ignore[reportAttributeAccessIssue]
+    columns=["verifiable", "claim_in_bigbench", "evidence"]
+)
+train_df["unique_evidence"] = train_df[
+    "unique_evidence"
+].map(  # pyright: ignore[reportAttributeAccessIssue]
     lambda x: [[str(title), str(sent_id)] for title, sent_id in x]
 )
-train_df["evidence_sentences"] = train_df["evidence_sentences"].map(
+train_df["evidence_sentences"] = train_df[
+    "evidence_sentences"
+].map(  # pyright: ignore[reportAttributeAccessIssue]
     lambda x: [[str(title), str(sent_id), str(sent)] for title, sent_id, sent in x]
 )
 train_df["label"] = train_df["label"] == "SUPPORTS"

From 9b1fc48a88e3c417ca8bb6ef934b977a4732d8c1 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 30 Jun 2025 23:38:34 +0200
Subject: [PATCH 07/11] pyright ignore

Signed-off-by: Claudio Spiess 
---
 examples/optimizer/process_fever.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/optimizer/process_fever.py b/examples/optimizer/process_fever.py
index f2ef82474..f3419cb7f 100644
--- a/examples/optimizer/process_fever.py
+++ b/examples/optimizer/process_fever.py
@@ -233,9 +233,9 @@ def evidence_mapper_sentence(evidences: list[tuple[str, int]]):
 
 test_df = df[
     (df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0)
-].drop(
+].drop(  # pyright: ignore[reportAttributeAccessIssue]
     columns=["verifiable", "claim_in_bigbench", "evidence"]
-)  # pyright: ignore[reportAttributeAccessIssue]
+)
 test_df["unique_evidence"] = test_df[
     "unique_evidence"
 ].map(  # pyright: ignore[reportAttributeAccessIssue]

From b7cbec5c948794ea590bd6324ca9186d560915fa Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Wed, 2 Jul 2025 14:18:56 +0200
Subject: [PATCH 08/11] Add evalplus dep to examples

Signed-off-by: Claudio Spiess 
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index e1ee1c5d7..089dabae1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ examples = [
   "scikit-learn>=1.6.1,<1.8.0",
   "faiss-cpu>=1.10,<1.12",
   "funcy>=2",
+  "evalplus>=0.3.1",
 ]
 docs = [
   "mkdocs~=1.0",

From d15034125ac14d4ef177021f34ddbc96b91f6ca0 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Wed, 2 Jul 2025 15:39:24 +0200
Subject: [PATCH 09/11] Improve docs

Signed-off-by: Claudio Spiess 
---
 docs/autopdl.md                               | 132 +++++++++++++-----
 examples/optimizer/gsm8k_optimizer_config.yml |  25 ++++
 mkdocs.yml                                    |   2 +-
 3 files changed, 123 insertions(+), 36 deletions(-)
 create mode 100644 examples/optimizer/gsm8k_optimizer_config.yml

diff --git a/docs/autopdl.md b/docs/autopdl.md
index 73b055ad1..601ea601d 100644
--- a/docs/autopdl.md
+++ b/docs/autopdl.md
@@ -7,7 +7,15 @@ hide:
 
 # AutoPDL Tutorial
 
-The following sections show how to use the AutoPDL optimizer to produce optimized PDL programs for specific tasks.
+The following sections show how to use the AutoPDL optimizer introduced by [Spiess et al. (2025)](https://openreview.net/forum?id=CAeISyE3aR) in "AutoPDL: Automatic Prompt Optimization for LLM Agents" ([arXiv](https://arxiv.org/abs/2504.04365)), to produce optimized PDL programs for specific tasks. Please ensure PDL was installed with extras e.g.
+
+``` { .bash .copy .annotate linenums="1" }
+pip install 'prompt-declaration-language[all]'
+# or from source
+git clone git@github.com:IBM/prompt-declaration-language.git
+cd prompt-declaration-language
+pip install -e '.[all]'
+```
 
 To optimize a PDL program, we need the program, an optimizer configuration, a dataset, and an _evaluator_. An evaluator is a Python subclass of `OptimizerEvaluator` that evaluates a candidate, which is a generated configuration instance consisting of e.g. fewshot examples. The evaluator class follows this structure:
 
@@ -52,39 +60,14 @@ class OptimizerEvaluator(Thread):
 
 Let's go through an example for `GSM8K`. Our PDL program uses different prompt patterns from the prompt library, and the variables `prompt_pattern`, `question`, `model`, and `demonstrations` are inserted at runtime by the evaluator.
 
-
 ```yaml title="examples/optimizer/gsm8k.pdl" linenums="1"
 --8<-- "./examples/optimizer/gsm8k.pdl"
 ```
 
-We write a configuration file for the optimizer, see `src/pdl/optimize/config_parser.py` for all fields:
-
-``` { .yaml .copy .annotate title="gsm8k_optimizer_config.yml" linenums="1" }
-benchmark: gsm8k # Name our benchmark
-budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
-budget_growth: double # double validation set size each iteration
-# or to_max: reach max_test_set_size by final iteration
-initial_test_set_size: 2 # size of test set in first iteration
-max_test_set_size: 10 # maximum test set size
-num_candidates: 100 # how many candidates to evaluate
-num_demonstrations: 5 # how many demonstrations to include per candidate
-parallelism: 1 # how many threads to run evaluations across
-shuffle_test: false # shuffling of test set
-test_set_name: test # name of test set
-train_set_name: train # name of train set
-validation_set_name: validation # name of validation set
-demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
-variables: # define discrete options to sample from
-  model: # set ${ model } variable
-    - watsonx/meta-llama/llama-3-1-8b-instruct
-  prompt_pattern: # set ${ prompt_pattern } variable to one of these
-    - cot
-    - react
-    - rewoo
-  num_demonstrations: # overrides num demonstrations above
-    - 0
-    - 3
-    - 5
+We write a configuration file for the optimizer, and save it as `gsm8k_optimizer_config.yml`. See `src/pdl/optimize/config_parser.py` for all fields.
+
+``` { .yaml .copy .annotate title="examples/optimizer/gsm8k_optimizer_config.yml" linenums="1" }
+--8<-- "./examples/optimizer/gsm8k_optimizer_config.yml"
 ```
 
 
@@ -95,20 +78,99 @@ variables: # define discrete options to sample from
 We can see an example of a script to run the optimization process in `examples/optimizer/optimize.py`.
 Usage:
 
-```
+```text
 python optimize.py optimize -h
 usage: optimize.py optimize [-h] --config CONFIG --dataset-path DATASET_PATH [--experiments-path EXPERIMENTS_PATH]
                             [--yield_output | --no-yield_output] [--dry | --no-dry]
                             pdl_file
 ```
 
-We also need a dataset to optimize against, with `train`, `test`, and `validation` splits. To produce such a dataset, we can use HuggingFace Datasets `load_dataset` and `save_to_disk`. This example requires the dataset to have columns `question`, `reasoning`, and `answer`, which can be created from the original `openai/gsm8k` dataset. Processing scripts are under development and will follow shortly.
+We also need a dataset to optimize against, with `train`, `test`, and `validation` splits. To produce such a dataset, we can use HuggingFace Datasets `load_dataset` and `save_to_disk`. This example requires the dataset to have columns `question`, `reasoning`, and `answer`, which can be created from the original `openai/gsm8k` dataset.
+
+We provide three scripts in `examples/optimizer` to create datasets, including the rule based agentic trajectories. These are `process_gsm8k.py`, `process_fever.py`, and `process_mbpp.py`. They load the original datasets, process them, and save them to disk in the required format. Dataset specific instructions may be found in the respective script files. Note that the scripts create a folder named `var` in the current directory, which contains the processed dataset in a format that can be used by the optimizer. Therefore, they should be run in the root of the PDL repository.
 
-We can run an example like so:
+Let's run the GSM8K dataset processing script:
 
+``` { .bash .copy .annotate linenums="1" }
+python examples/optimizer/process_gsm8k.py
 ```
+
+Which should save the processed dataset in `var/gsm8k_trajectified` and output something like:
+
+```text
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 557195.73 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1319/1319 [00:00<00:00, 363559.64 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 271472.56 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 71242.31 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 68826.30 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 22520.85 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 18186.53 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 698328.77 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1319/1319 [00:00<00:00, 232468.57 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 413375.10 examples/s]
+DatasetDict({
+    train: Dataset({
+        features: ['question', 'answer', 'reasoning', 'raw_answer', 'answer_part', 'traj_keys', 'traj_values', 'rewoo_traj_keys', 'rewoo_traj_values'],
+        num_rows: 6449
+    })
+    test: Dataset({
+        features: ['question', 'answer', 'reasoning', 'raw_answer', 'answer_part'],
+        num_rows: 1319
+    })
+    validation: Dataset({
+        features: ['question', 'answer', 'reasoning', 'raw_answer', 'answer_part'],
+        num_rows: 1024
+    })
+})
+```
+
+Finally, we can run the example like so:
+
+``` { .bash .copy .annotate linenums="1" }
 cd examples/optimizer
-python optimize.py optimize --config config.yml --dataset-path datasets/gsm8k gsm8k.pdl
+python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified gsm8k.pdl
+```
+
+This will report details about the optimization process, such as the number of candidates evaluated. The output will look something like this:
+
+```text
+                                           PDL Optimizer                                  pdl_optimizer.py:336
+           ┌──────────────────────────────┬─────────────────────────────────────────────┐
+           │ Config combinations          │ 9                                           │
+           │ Max candidates               │ 100                                         │
+           │ Num. candidates              │ 100                                         │
+           │ Starting validation set size │ 2                                           │
+           │ Max validation set size      │ 10                                          │
+           │ Num. iterations              │ 7                                           │
+           │ Total evaluations            │ 1,200                                       │
+           │ Num. threads                 │ 1                                           │
+           │ Validation set multiplier    │ 2                                           │
+           │ Shuffle validation set       │ False                                       │
+           │ Budget policy                │ None                                        │
+           ├──────────────────────────────┼─────────────────────────────────────────────┤
+           │ model                        │ ['watsonx/meta-llama/llama-3-2-3b-instruct… │
+           │ prompt_pattern               │ ['cot', 'react', 'rewoo']                   │
+           │ num_demonstrations           │ [0, 3, 5]                                   │
+           └──────────────────────────────┴─────────────────────────────────────────────┘
+                     Iteration                                                            pdl_optimizer.py:419
+           ┌─────────────────────┬─────┐
+           │ Index               │ 0   │
+           │ Validation set size │ 2   │
+           │ Num. candidates     │ 100 │
+           └─────────────────────┴─────┘
+                                        Evaluation                                        pdl_optimizer.py:601
+           ┌────────────────────────┬──────────────────────────────────────────┐
+           │ Test set size          │ 2                                        │
+           ├────────────────────────┼──────────────────────────────────────────┤
+           │ model                  │ watsonx/meta-llama/llama-3-2-3b-instruct │
+           │ prompt_pattern         │ cot                                      │
+           │ num_demonstrations     │ 0                                        │
+           │ uuid                   │ enl0ertp                                 │
+           │ demonstrations_indices │ 0                                        │
+           │ demonstrations         │ 0                                        │
+           └────────────────────────┴──────────────────────────────────────────┘
+           Running without parallelism                                                              util.py:74
+   0% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1,200  [ 0:00:01 < -:--:-- , ? it/s ]
 ```
 
-Once the process is complete, a file `optimized_gsm8k.pdl` is written. This file contains the optimal configuration and is directly executable by the standard PDL interpreter.
+Once the process is complete, a file `optimized_gsm8k.pdl` is written in same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.
\ No newline at end of file
diff --git a/examples/optimizer/gsm8k_optimizer_config.yml b/examples/optimizer/gsm8k_optimizer_config.yml
new file mode 100644
index 000000000..49da371ff
--- /dev/null
+++ b/examples/optimizer/gsm8k_optimizer_config.yml
@@ -0,0 +1,25 @@
+benchmark: gsm8k # Name our benchmark
+budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
+budget_growth: double # double validation set size each iteration
+# or to_max: reach max_test_set_size by final iteration
+initial_test_set_size: 2 # size of test set in first iteration
+max_test_set_size: 10 # maximum test set size
+num_candidates: 100 # how many candidates to evaluate
+num_demonstrations: 5 # how many demonstrations to include per candidate
+parallelism: 1 # how many threads to run evaluations across
+shuffle_test: false # shuffling of test set
+test_set_name: test # name of test set
+train_set_name: train # name of train set
+validation_set_name: validation # name of validation set
+demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
+variables: # define discrete options to sample from
+  model: # set ${ model } variable
+    - watsonx/meta-llama/llama-3-2-3b-instruct
+  prompt_pattern: # set ${ prompt_pattern } variable to one of these
+    - cot
+    - react
+    - rewoo
+  num_demonstrations: # overrides num demonstrations above
+    - 0
+    - 3
+    - 5
diff --git a/mkdocs.yml b/mkdocs.yml
index 3d5fe3887..2831e62d5 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -47,7 +47,7 @@ nav:
   - API Reference: api_reference.md
   - Contribute: contrib.md
   - Viewer: viewer.md
-  # - AutoPDL: autopdl.md # Hide documentation for now
+  - AutoPDL: autopdl.md
 
 # Define some IBM colors
 extra_css:

From e4b5ef2008896a10813f1874b36e4fe1852ae0cf Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Wed, 2 Jul 2025 15:40:59 +0200
Subject: [PATCH 10/11] Improve docs & lint

Signed-off-by: Claudio Spiess 
---
 docs/README.md                    | 20 +++++++--------
 docs/autopdl.md                   |  2 +-
 examples/optimizer/optimize.py    |  2 +-
 src/pdl/optimize/pdl_optimizer.py | 41 ++++++++++++++++++++-----------
 4 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index 45863c611..c5bf03b4f 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -23,7 +23,7 @@ PDL provides the following features:
 
 The PDL interpreter takes a PDL program as input and generates data by executing its instructions (calling out to models, code, etc...).
 
-See below for a quick reference, followed by [installation notes](#interpreter_installation) and an [overview](#overview) of the language. A more detailed description of the language features can be found in this [tutorial](https://ibm.github.io/prompt-declaration-language/tutorial).
+See below for a quick reference, followed by [installation notes](#interpreter-installation) and an [overview](#overview) of the language. A more detailed description of the language features can be found in this [tutorial](https://ibm.github.io/prompt-declaration-language/tutorial).
 
 
 ## Quick Reference
@@ -50,13 +50,13 @@ pip install 'prompt-declaration-language[examples]'
 
 The Live Explorer can be installed as follows (MacOS):
 ```
-brew install pdl 
+brew install pdl
 ```
 
 For other platforms, see installation notes.
 
 You can run PDL with LLM models in local using [Ollama](https://ollama.com), or other cloud service.
-See [here](https://ibm.github.io/prompt-declaration-language/tutorial/#using-ollama-models) for 
+See [here](https://ibm.github.io/prompt-declaration-language/tutorial/#using-ollama-models) for
 instructions on how to install an Ollama model locally.
 
 Most examples in this repository use IBM Granite models on [Ollama](https://ollama.com) and some are on [Replicate](https://replicate.com/). In order to run these examples, you need to create a free account
@@ -172,7 +172,7 @@ text:
     temperature: 0
 ```
 
-Notice the syntactic differences. Model ids on watsonx start with `watsonx`. 
+Notice the syntactic differences. Model ids on watsonx start with `watsonx`.
 
 Watsonx also provides a text completion endpoint as shown in the following example. A text completion endpoint does not take chat
 templates into account:
@@ -299,10 +299,10 @@ When we execute this program with the PDL interpreter, we obtain the following t
 @SuppressWarnings("unchecked")
 public static Map deserializeOffsetMap(String lastSourceOffset) throws IOException {
   Map offsetMap;
-  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {    
-    offsetMap = new HashMap<>();  
+  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {
+    offsetMap = new HashMap<>();
   } else {
-    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);  
+    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);
   }
   return offsetMap;
 }
@@ -364,10 +364,10 @@ When we execute this new program, we obtain the following:
 @SuppressWarnings("unchecked")
 public static Map deserializeOffsetMap(String lastSourceOffset) throws IOException {
   Map offsetMap;
-  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {    
-    offsetMap = new HashMap<>();  
+  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {
+    offsetMap = new HashMap<>();
   } else {
-    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);  
+    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);
   }
   return offsetMap;
 }
diff --git a/docs/autopdl.md b/docs/autopdl.md
index 601ea601d..c3ea60d37 100644
--- a/docs/autopdl.md
+++ b/docs/autopdl.md
@@ -173,4 +173,4 @@ This will report details about the optimization process, such as the number of c
    0% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1,200  [ 0:00:01 < -:--:-- , ? it/s ]
 ```
 
-Once the process is complete, a file `optimized_gsm8k.pdl` is written in same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.
\ No newline at end of file
+Once the process is complete, a file `optimized_gsm8k.pdl` is written in same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.
diff --git a/examples/optimizer/optimize.py b/examples/optimizer/optimize.py
index c88b799cb..24307fa26 100644
--- a/examples/optimizer/optimize.py
+++ b/examples/optimizer/optimize.py
@@ -5,7 +5,7 @@
 from typing import Any
 
 import yaml
-from datasets import load_from_disk
+from datasets.load import load_from_disk
 from fever_evaluator import FEVEREvaluator
 from gsm8k_evaluator import Gsm8kEvaluator
 from gsmhard_evaluator import GsmHardEvaluator
diff --git a/src/pdl/optimize/pdl_optimizer.py b/src/pdl/optimize/pdl_optimizer.py
index 1fd1320e1..0dea78ef8 100644
--- a/src/pdl/optimize/pdl_optimizer.py
+++ b/src/pdl/optimize/pdl_optimizer.py
@@ -160,10 +160,13 @@ def sample_candidates(
         demo_name = self.config.demonstrations_variable_name
         candidates = []
 
+        num_demonstrations_set = {
+            int(x) for x in self.config.variables.get("num_demonstrations", set())
+        }
+
         if (
-            "prompt_pattern" in self.config.variables
-            and "cot" in self.config.variables.get("prompt_pattern", [])
-            and 0 in self.config.variables.get("num_demonstrations", [])
+            "cot" in self.config.variables.get("prompt_pattern", [])
+            and 0 in num_demonstrations_set
         ):
             cot_candidate = {
                 k: self.sample_random_index(v) for k, v in self.config.variables.items()
@@ -179,18 +182,18 @@ def sample_candidates(
 
             candidates.append(cot_candidate)
 
-        zero_shots_seen = ["cot"]
+        zero_shots_seen = {"cot"}
         while len(candidates) < num_candidates:
             variable_instance = {
                 k: self.sample_random_index(v) for k, v in self.config.variables.items()
             }
             if (
                 variable_instance.get("num_demonstrations") == 0
-                and variable_instance.get("prompt_pattern") == "cot"
+                and variable_instance.get("prompt_pattern") is not None
             ):
                 if variable_instance["prompt_pattern"] in zero_shots_seen:
                     continue
-                zero_shots_seen.append(variable_instance["prompt_pattern"])
+                zero_shots_seen.add(variable_instance["prompt_pattern"])
 
             num_demonstrations = int(
                 variable_instance.get("num_demonstrations", self.num_demonstrations),
@@ -215,16 +218,26 @@ def sample_candidates(
             candidates.append(candidate)
 
         if (
-            "num_demonstrations"
-            in self.config.variables  # check if is variable in config
-            and len(self.config.variables["num_demonstrations"])
-            > 1  # check more than 1 option
-            and 0 in [int(x) for x in self.config.variables["num_demonstrations"]]
-            # check zeroshot is an option
+            len(num_demonstrations_set) > 1  # check more than 1 option
+            and 0 in num_demonstrations_set  # check zeroshot is an option
         ):
-            zero_shotters = [x for x in candidates if x["num_demonstrations"] == 0]
+            zero_shotters = [
+                x.get("uuid") for x in candidates if x.get("num_demonstrations") == 0
+            ]
+            variables_zs = self.config.variables.copy()
+            variables_zs.pop("num_demonstrations", None)
+
+            max_zs = len(list(itertools.product(*variables_zs.values())))
+
+            if len(zero_shotters) > max_zs:
+                logger.warning(
+                    "More zero-shot candidates (%d) than expected (%d; "
+                    "product of all variables). "
+                    "Identical duplicated candidates may waste compute.",
+                    len(zero_shotters),
+                    max_zs,
+                )
 
-            assert len(zero_shotters) <= 3
         assert len(candidates) == num_candidates
         return candidates
 

From ed3c4fa64cf05a8e2ad98562603ae5b2f7fa21c3 Mon Sep 17 00:00:00 2001
From: Claudio Spiess 
Date: Mon, 7 Jul 2025 16:35:55 +0200
Subject: [PATCH 11/11] Update doc

Signed-off-by: Claudio Spiess 
---
 docs/autopdl.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/docs/autopdl.md b/docs/autopdl.md
index c3ea60d37..c3b6db7df 100644
--- a/docs/autopdl.md
+++ b/docs/autopdl.md
@@ -64,13 +64,12 @@ Let's go through an example for `GSM8K`. Our PDL program uses different prompt p
 --8<-- "./examples/optimizer/gsm8k.pdl"
 ```
 
-We write a configuration file for the optimizer, and save it as `gsm8k_optimizer_config.yml`. See `src/pdl/optimize/config_parser.py` for all fields.
+We write a configuration file for the optimizer, and save it as `gsm8k_optimizer_config.yml`. See `src/pdl/optimize/config_parser.py` for all fields. Please note that this example uses the `watsonx` inference service, so an API key is required, although you can also use a local model or any other inference service.
 
 ``` { .yaml .copy .annotate title="examples/optimizer/gsm8k_optimizer_config.yml" linenums="1" }
 --8<-- "./examples/optimizer/gsm8k_optimizer_config.yml"
 ```
 
-
 ```python title="examples/optimizer/gsm8k_evaluator.py" linenums="1"
 --8<-- "./examples/optimizer/gsm8k_evaluator.py"
 ```
@@ -173,4 +172,17 @@ This will report details about the optimization process, such as the number of c
    0% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1,200  [ 0:00:01 < -:--:-- , ? it/s ]
 ```
 
+Note that it is not unusual to observe PDL exceptions during the optimization process.
+
+```text
+[15:44:14] Type errors during spec checking:
+../../contrib/prompt_library/ReAct.pdl:0 -  should be an object
+../../contrib/prompt_library/ReAct.pdl:0 - Type errors during spec checking:
+../../contrib/prompt_library/ReAct.pdl:0 -  should be an object
+Retrying:  False
+Runtime FAILED and took seconds: 10.21
+```
+
+Such exceptions, here for example in `ReAct.pdl`, are caused by the _typed_ model call in `ReAct.pdl:98`. If the model output does not result in a parsable JSON that matches the expected type `{ name: string, arguments: object }`, the PDL interpreter raises an exception.
+
 Once the process is complete, a file `optimized_gsm8k.pdl` is written in same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.