diff --git a/docs/README.md b/docs/README.md index 45863c611..c5bf03b4f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -23,7 +23,7 @@ PDL provides the following features: The PDL interpreter takes a PDL program as input and generates data by executing its instructions (calling out to models, code, etc...). -See below for a quick reference, followed by [installation notes](#interpreter_installation) and an [overview](#overview) of the language. A more detailed description of the language features can be found in this [tutorial](https://ibm.github.io/prompt-declaration-language/tutorial). +See below for a quick reference, followed by [installation notes](#interpreter-installation) and an [overview](#overview) of the language. A more detailed description of the language features can be found in this [tutorial](https://ibm.github.io/prompt-declaration-language/tutorial). ## Quick Reference @@ -50,13 +50,13 @@ pip install 'prompt-declaration-language[examples]' The Live Explorer can be installed as follows (MacOS): ``` -brew install pdl +brew install pdl ``` For other platforms, see installation notes. You can run PDL with LLM models in local using [Ollama](https://ollama.com), or other cloud service. -See [here](https://ibm.github.io/prompt-declaration-language/tutorial/#using-ollama-models) for +See [here](https://ibm.github.io/prompt-declaration-language/tutorial/#using-ollama-models) for instructions on how to install an Ollama model locally. Most examples in this repository use IBM Granite models on [Ollama](https://ollama.com) and some are on [Replicate](https://replicate.com/). In order to run these examples, you need to create a free account @@ -172,7 +172,7 @@ text: temperature: 0 ``` -Notice the syntactic differences. Model ids on watsonx start with `watsonx`. +Notice the syntactic differences. Model ids on watsonx start with `watsonx`. Watsonx also provides a text completion endpoint as shown in the following example. A text completion endpoint does not take chat templates into account: @@ -299,10 +299,10 @@ When we execute this program with the PDL interpreter, we obtain the following t @SuppressWarnings("unchecked") public static Map deserializeOffsetMap(String lastSourceOffset) throws IOException { Map offsetMap; - if (lastSourceOffset == null || lastSourceOffset.isEmpty()) { - offsetMap = new HashMap<>(); + if (lastSourceOffset == null || lastSourceOffset.isEmpty()) { + offsetMap = new HashMap<>(); } else { - offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class); + offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class); } return offsetMap; } @@ -364,10 +364,10 @@ When we execute this new program, we obtain the following: @SuppressWarnings("unchecked") public static Map deserializeOffsetMap(String lastSourceOffset) throws IOException { Map offsetMap; - if (lastSourceOffset == null || lastSourceOffset.isEmpty()) { - offsetMap = new HashMap<>(); + if (lastSourceOffset == null || lastSourceOffset.isEmpty()) { + offsetMap = new HashMap<>(); } else { - offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class); + offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class); } return offsetMap; } diff --git a/docs/autopdl.md b/docs/autopdl.md index 73b055ad1..c3b6db7df 100644 --- a/docs/autopdl.md +++ b/docs/autopdl.md @@ -7,7 +7,15 @@ hide: # AutoPDL Tutorial -The following sections show how to use the AutoPDL optimizer to produce optimized PDL programs for specific tasks. 
+The following sections show how to use the AutoPDL optimizer introduced by [Spiess et al. (2025)](https://openreview.net/forum?id=CAeISyE3aR) in "AutoPDL: Automatic Prompt Optimization for LLM Agents" ([arXiv](https://arxiv.org/abs/2504.04365)) to produce optimized PDL programs for specific tasks. Please ensure PDL was installed with the extras, e.g.:
+
+``` { .bash .copy .annotate linenums="1" }
+pip install 'prompt-declaration-language[all]'
+# or from source
+git clone git@github.com:IBM/prompt-declaration-language.git
+cd prompt-declaration-language
+pip install -e '.[all]'
+```

To optimize a PDL program, we need the program, an optimizer configuration, a dataset, and an _evaluator_. An evaluator is a Python subclass of `OptimizerEvaluator` that evaluates a candidate, which is a generated configuration instance consisting of e.g. fewshot examples. The evaluator class follows this structure:

@@ -52,41 +60,15 @@ class OptimizerEvaluator(Thread):

Let's go through an example for `GSM8K`. Our PDL program uses different prompt patterns from the prompt library, and the variables `prompt_pattern`, `question`, `model`, and `demonstrations` are inserted at runtime by the evaluator.

-
```yaml title="examples/optimizer/gsm8k.pdl" linenums="1"
--8<-- "./examples/optimizer/gsm8k.pdl"
```

-We write a configuration file for the optimizer, see `src/pdl/optimize/config_parser.py` for all fields:
-
-``` { .yaml .copy .annotate title="gsm8k_optimizer_config.yml" linenums="1" }
-benchmark: gsm8k # Name our benchmark
-budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h"
-budget_growth: double # double validation set size each iteration
-# or to_max: reach max_test_set_size by final iteration
-initial_test_set_size: 2 # size of test set in first iteration
-max_test_set_size: 10 # maximum test set size
-num_candidates: 100 # how many candidates to evaluate
-num_demonstrations: 5 # how many demonstrations to include per candidate
-parallelism: 1 # how many threads to run evaluations across
-shuffle_test: false # shuffling of test set
-test_set_name: test # name of test set
-train_set_name: train # name of train set
-validation_set_name: validation # name of validation set
-demonstrations_variable_name: demonstrations # variable name to insert demonstrations into
-variables: # define discrete options to sample from
-  model: # set ${ model } variable
-    - watsonx/meta-llama/llama-3-1-8b-instruct
-  prompt_pattern: # set ${ prompt_pattern } variable to one of these
-    - cot
-    - react
-    - rewoo
-  num_demonstrations: # overrides num demonstrations above
-    - 0
-    - 3
-    - 5
-```
+We write a configuration file for the optimizer and save it as `gsm8k_optimizer_config.yml`. See `src/pdl/optimize/config_parser.py` for all fields. Please note that this example uses the `watsonx` inference service, so an API key is required, although you can also use a local model or any other inference service.
+
+``` { .yaml .copy .annotate title="examples/optimizer/gsm8k_optimizer_config.yml" linenums="1" }
+--8<-- "./examples/optimizer/gsm8k_optimizer_config.yml"
+```

```python title="examples/optimizer/gsm8k_evaluator.py" linenums="1"
--8<-- "./examples/optimizer/gsm8k_evaluator.py"
```

@@ -95,20 +77,112 @@ variables: # define discrete options to sample from

We can see an example of a script to run the optimization process in `examples/optimizer/optimize.py`.
Usage:

-```
+```text
python optimize.py optimize -h
usage: optimize.py optimize [-h] --config CONFIG --dataset-path DATASET_PATH [--experiments-path EXPERIMENTS_PATH]
                            [--yield_output | --no-yield_output] [--dry | --no-dry]
                            pdl_file
```

-We also need a dataset to optimize against, with `train`, `test`, and `validation` splits. To produce such a dataset, we can use HuggingFace Datasets `load_dataset` and `save_to_disk`. This example requires the dataset to have columns `question`, `reasoning`, and `answer`, which can be created from the original `openai/gsm8k` dataset. Processing scripts are under development and will follow shortly.
+We also need a dataset to optimize against, with `train`, `test`, and `validation` splits. To produce such a dataset, we can use HuggingFace Datasets `load_dataset` and `save_to_disk`. This example requires the dataset to have columns `question`, `reasoning`, and `answer`, which can be created from the original `openai/gsm8k` dataset.
+
+We provide three scripts in `examples/optimizer` to create datasets, including the rule-based agentic trajectories. These are `process_gsm8k.py`, `process_fever.py`, and `process_mbpp.py`. They load the original datasets, process them, and save them to disk in the required format. Dataset-specific instructions may be found in the respective script files. Note that the scripts create a folder named `var` in the current directory, which contains the processed dataset in a format that can be used by the optimizer. Therefore, they should be run in the root of the PDL repository.

-We can run an example like so:
+Let's run the GSM8K dataset processing script:
+
+``` { .bash .copy .annotate linenums="1" }
+python examples/optimizer/process_gsm8k.py
+```
+This should save the processed dataset in `var/gsm8k_trajectified` and output something like:
+
+```text
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 557195.73 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1319/1319 [00:00<00:00, 363559.64 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 271472.56 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 71242.31 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 68826.30 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 22520.85 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 18186.53 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 6449/6449 [00:00<00:00, 698328.77 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1319/1319 [00:00<00:00, 232468.57 examples/s]
+Saving the dataset (1/1 shards): 100%|█████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 413375.10 examples/s]
+DatasetDict({
+    train: Dataset({
+        features: ['question', 'answer', 'reasoning', 'raw_answer', 'answer_part', 'traj_keys', 'traj_values', 'rewoo_traj_keys', 'rewoo_traj_values'],
+        num_rows: 6449
+    })
+    test: Dataset({
+        features: ['question', 'answer', 'reasoning', 'raw_answer', 'answer_part'],
+        num_rows: 1319
+    })
+    validation: Dataset({
+        features: ['question', 'answer', 'reasoning', 'raw_answer', 'answer_part'],
+        num_rows: 1024
+    })
+})
+```

+Finally, we can run the example like so:
+
+``` { .bash .copy .annotate linenums="1" }
cd examples/optimizer
-python optimize.py optimize --config config.yml --dataset-path datasets/gsm8k gsm8k.pdl
+python optimize.py optimize --config gsm8k_optimizer_config.yml --dataset-path ../../var/gsm8k_trajectified gsm8k.pdl
+```
+
+This will report details about the optimization process, such as the number of candidates evaluated. The output will look something like this:
+
+```text
+           PDL Optimizer                                              pdl_optimizer.py:336
+ ┌──────────────────────────────┬─────────────────────────────────────────────┐
+ │ Config combinations          │ 9                                           │
+ │ Max candidates               │ 100                                         │
+ │ Num. candidates              │ 100                                         │
+ │ Starting validation set size │ 2                                           │
+ │ Max validation set size      │ 10                                          │
+ │ Num. iterations              │ 7                                           │
+ │ Total evaluations            │ 1,200                                       │
+ │ Num. threads                 │ 1                                           │
+ │ Validation set multiplier    │ 2                                           │
+ │ Shuffle validation set       │ False                                       │
+ │ Budget policy                │ None                                        │
+ ├──────────────────────────────┼─────────────────────────────────────────────┤
+ │ model                        │ ['watsonx/meta-llama/llama-3-2-3b-instruct… │
+ │ prompt_pattern               │ ['cot', 'react', 'rewoo']                   │
+ │ num_demonstrations           │ [0, 3, 5]                                   │
+ └──────────────────────────────┴─────────────────────────────────────────────┘
+           Iteration                                                  pdl_optimizer.py:419
+ ┌─────────────────────┬─────┐
+ │ Index               │ 0   │
+ │ Validation set size │ 2   │
+ │ Num. candidates     │ 100 │
+ └─────────────────────┴─────┘
+           Evaluation                                                 pdl_optimizer.py:601
+ ┌────────────────────────┬──────────────────────────────────────────┐
+ │ Test set size          │ 2                                        │
+ ├────────────────────────┼──────────────────────────────────────────┤
+ │ model                  │ watsonx/meta-llama/llama-3-2-3b-instruct │
+ │ prompt_pattern         │ cot                                      │
+ │ num_demonstrations     │ 0                                        │
+ │ uuid                   │ enl0ertp                                 │
+ │ demonstrations_indices │ 0                                        │
+ │ demonstrations         │ 0                                        │
+ └────────────────────────┴──────────────────────────────────────────┘
+           Running without parallelism                                        util.py:74
+ 0% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1,200 [ 0:00:01 < -:--:-- , ? it/s ]
+```
+
+Note that it is not unusual to observe PDL exceptions during the optimization process.
+
+```text
+[15:44:14] Type errors during spec checking:
+../../contrib/prompt_library/ReAct.pdl:0 - should be an object
+../../contrib/prompt_library/ReAct.pdl:0 - Type errors during spec checking:
+../../contrib/prompt_library/ReAct.pdl:0 - should be an object
+Retrying: False
+Runtime FAILED and took seconds: 10.21
+```
+
+Such exceptions, as in the `ReAct.pdl` example above, are caused by the _typed_ model call at `ReAct.pdl:98`. If the model output cannot be parsed as JSON matching the expected type `{ name: string, arguments: object }`, the PDL interpreter raises an exception.
+
+Once the process is complete, a file `optimized_gsm8k.pdl` is written in the same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.
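+
+For example, assuming PDL was installed as described above so that the `pdl` command is available, the optimized program can be run directly with the interpreter (a minimal sketch; the exact path depends on where the source program lives):
+
+``` { .bash .copy .annotate linenums="1" }
+pdl examples/optimizer/optimized_gsm8k.pdl
+```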
diff --git a/examples/optimizer/gsm8k_optimizer_config.yml b/examples/optimizer/gsm8k_optimizer_config.yml new file mode 100644 index 000000000..49da371ff --- /dev/null +++ b/examples/optimizer/gsm8k_optimizer_config.yml @@ -0,0 +1,25 @@ +benchmark: gsm8k # Name our benchmark +budget: null # Set a budget, can be number of iterations, or a duration string e.g. "2h" +budget_growth: double # double validation set size each iteration +# or to_max: reach max_test_set_size by final iteration +initial_test_set_size: 2 # size of test set in first iteration +max_test_set_size: 10 # maximum test set size +num_candidates: 100 # how many candidates to evaluate +num_demonstrations: 5 # how many demonstrations to include per candidate +parallelism: 1 # how many threads to run evaluations across +shuffle_test: false # shuffling of test set +test_set_name: test # name of test set +train_set_name: train # name of train set +validation_set_name: validation # name of validation set +demonstrations_variable_name: demonstrations # variable name to insert demonstrations into +variables: # define discrete options to sample from + model: # set ${ model } variable + - watsonx/meta-llama/llama-3-2-3b-instruct + prompt_pattern: # set ${ prompt_pattern } variable to one of these + - cot + - react + - rewoo + num_demonstrations: # overrides num demonstrations above + - 0 + - 3 + - 5 diff --git a/examples/optimizer/mbpp_dataset.py b/examples/optimizer/mbpp_dataset.py index 3265a8b29..61fef67a6 100644 --- a/examples/optimizer/mbpp_dataset.py +++ b/examples/optimizer/mbpp_dataset.py @@ -3,7 +3,7 @@ from copy import deepcopy -from datasets import load_from_disk +from datasets.load import load_from_disk from evalplus.data import get_mbpp_plus, get_mbpp_plus_hash from evalplus.evaluate import MBPP_OUTPUT_NOT_NONE_TASKS, get_groundtruth diff --git a/examples/optimizer/optimize.py b/examples/optimizer/optimize.py index c88b799cb..24307fa26 100644 --- a/examples/optimizer/optimize.py +++ b/examples/optimizer/optimize.py @@ -5,7 +5,7 @@ from typing import Any import yaml -from datasets import load_from_disk +from datasets.load import load_from_disk from fever_evaluator import FEVEREvaluator from gsm8k_evaluator import Gsm8kEvaluator from gsmhard_evaluator import GsmHardEvaluator diff --git a/examples/optimizer/process_fever.py b/examples/optimizer/process_fever.py new file mode 100644 index 000000000..f3419cb7f --- /dev/null +++ b/examples/optimizer/process_fever.py @@ -0,0 +1,462 @@ +# Instructions for running this script: +# 1. Ensure you have the required libraries installed. +# `datasets` should be version 3.0 or higher. +# +# `pip install prompt-declaration-language[all] funcy` +# 2. Download the original FEVER dataset with wiki-pages, and BigBench FEVER task JSON file +# from the respective sources. +# https://fever.ai/dataset/fever.html +# https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/fact_checker/fever/task.json +# +# ``` +# wget https://raw.githubusercontent.com/google/BIG-bench/refs/heads/main/bigbench/benchmark_tasks/fact_checker/fever/task.json +# wget https://fever.ai/download/fever/wiki-pages.zip +# wget https://fever.ai/download/fever/shared_task_dev.jsonl +# ``` +# Place the downloaded files in the `var/fever` directory. +# Extract the `wiki-pages.zip` file into `var/fever/wiki-pages/`. +# 3. Run this script to process the FEVER dataset. 
+# `python examples/optimizer/process_fever.py` +# +import json +import operator +import re +import unicodedata +import warnings +from functools import cache +from itertools import groupby +from pathlib import Path +from typing import Any + +import pandas as pd +import wikipedia +from datasets.arrow_dataset import Dataset +from datasets.dataset_dict import DatasetDict +from datasets.load import load_dataset, load_from_disk +from funcy import flatten +from tqdm.autonotebook import tqdm + +tqdm.pandas() +warnings.simplefilter("ignore") + +var_dir = Path("var") +var_dir.mkdir(parents=True, exist_ok=True) + + +def clean_fever(text: str) -> str: + mapping = { + "_": " ", + "-LRB- ": "(", + " -RRB-": ")", + "-LSB- ": "[", + " -RSB-": "]", + "-LRB-": "(", + "-RRB-": ")", + "-LSB-": "[", + "-RSB-": "]", + "-COLON-": ":", + } + + for k, v in mapping.items(): + text = text.replace(k, v) + + return text.strip() + + +@cache +def search_new( + subject: str, auto_suggest: bool = False, redirect: bool = False +) -> tuple[str, str]: + try: + result = ( + wikipedia.summary( + subject, auto_suggest=auto_suggest, redirect=redirect + ).strip(), + "success", + ) + except wikipedia.DisambiguationError as d: + result = ( + f'"{subject}" may refer to one of {d.args[1]}. Please retry the search with one of the subjects using Search[].', + "disambg", + ) + except wikipedia.PageError as e: + result = f"{e} Please retry the search using Search[].", "pageerror" + except wikipedia.WikipediaException as e: + print(e, type(e)) + result = str(e), f"other:{type(e)}" + except Exception as e: + print(e, type(e)) + result = str(e), f"other:{type(e)}" + return result + + +def searcher(row: dict[str, Any], auto_suggest: bool): + cleaned = clean_fever(row["article"]) + if "msg" in row: + if row["msg"] != "success": + wiki, msg = search_new(cleaned, auto_suggest=auto_suggest, redirect=True) + else: + wiki = row["wiki"] + msg = row["msg"] + else: + wiki, msg = search_new(cleaned, auto_suggest=auto_suggest, redirect=True) + return {"wiki": wiki, "msg": msg, "cleaned": cleaned} + + +def remove_accents(x: str) -> str: + return unicodedata.normalize("NFD", x) + + +fever = load_dataset("fever/fever", "v1.0") +if not isinstance(fever, DatasetDict): + raise TypeError(f"Expected fever to be a DatasetDict, but got: {type(fever)}") + +bigbench_fever = json.loads(Path("var/fever/task.json").read_text(encoding="utf-8")) + +fever.save_to_disk("var/fever/fever_original") +print(fever) + +wikipages = load_dataset( + "json", + data_files="var/fever/wiki-pages/wiki-pages/wiki-*.jsonl", + encoding="utf-8", +) +if not isinstance(wikipages, DatasetDict): + raise TypeError( + f"Expected wikipages to be a DatasetDict, but got: {type(wikipages)}" + ) + +print("Loaded wikipages:", wikipages) +print("Mapping wikipages...") +wikipages["train"] = wikipages["train"].map( + lambda x: { + "lines_split": [x for x in re.split(r"\d+\t", x["lines"]) if x], + }, + num_proc=32, +) +print("Mapping wikipages done.") + +print("Converting wikipages to DataFrame...") +wiki_pages_df = wikipages["train"].to_pandas() +if not isinstance(wiki_pages_df, pd.DataFrame): + raise TypeError( + f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}" + ) +wiki_pages_df = wiki_pages_df.set_index("id") +wiki_pages_df.index = wiki_pages_df.index.map(remove_accents) +print("Wikipages converted to DataFrame.") + + +if isinstance(wiki_pages_df, pd.DataFrame): + wiki_pages_df.to_parquet( + path="var/fever/wiki_pages.parquet", + index=True, + engine="pyarrow", + 
compression="zstd", + compression_level=10, + ) +else: + raise TypeError( + f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}" + ) + + +df = pd.read_json( + "var/fever/shared_task_dev.jsonl", lines=True, encoding="utf-8" +).set_index("id") +print("Loaded original FEVER", len(df)) +df = df[df.label.isin(["SUPPORTS", "REFUTES"])].copy() +print("Filtered original FEVER", len(df)) + + +def evidence_mapper(evidence: list[tuple]): + evidences = {(x[2], x[3]) for x in evidence[0] if x[2] is not None} + return list(evidences) + + +df["unique_evidence"] = df[ + "evidence" +].progress_apply( # pyright: ignore[reportAttributeAccessIssue] + evidence_mapper +) + + +def evidence_mapper_sentence(evidences: list[tuple[str, int]]): + if not isinstance(wiki_pages_df, pd.DataFrame): + raise TypeError( + f"Expected wiki_pages_df to be a DataFrame, but got: {type(wiki_pages_df)}" + ) + + lines = [] + for title, line in evidences: + if title is None or line is None: + continue + title_no_acc = remove_accents(title) + + if title_no_acc not in wiki_pages_df.index: + print(title_no_acc) + continue + + sentence = wiki_pages_df.loc[title_no_acc] + if sentence["lines_split"] is not None and len(sentence["lines_split"]) > line: + sentence = sentence["lines_split"][line] + lines.append((title_no_acc, line, sentence)) + else: + print(sentence) + return list(lines) + + +df["evidence_sentences"] = df[ + "unique_evidence" +].progress_apply( # pyright: ignore[reportAttributeAccessIssue] + evidence_mapper_sentence +) + +bigbench = pd.DataFrame.from_records(bigbench_fever["examples"]).set_index("id") + +tqdm.pandas(desc="Mapping claims to (in) bigbench") +df["claim_in_bigbench"] = df[ + "claim" +].progress_apply( # pyright: ignore[reportAttributeAccessIssue] + lambda x: bigbench.input.str.contains(x).any() +) +tqdm.pandas() + +df["evidence_sentence_count"] = df[ + "evidence_sentences" +].map( # pyright: ignore[reportAttributeAccessIssue] + len +) +print("Mapped bigbench") + +train_df = df[(~df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0)] + +test_df = df[ + (df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0) +].drop( # pyright: ignore[reportAttributeAccessIssue] + columns=["verifiable", "claim_in_bigbench", "evidence"] +) +test_df["unique_evidence"] = test_df[ + "unique_evidence" +].map( # pyright: ignore[reportAttributeAccessIssue] + lambda x: [[str(title), str(sent_id)] for title, sent_id in x] +) +test_df["evidence_sentences"] = test_df[ + "evidence_sentences" +].map( # pyright: ignore[reportAttributeAccessIssue] + lambda x: [[str(title), str(sent_id), str(sent)] for title, sent_id, sent in x] +) +test_df["label"] = test_df["label"] == "SUPPORTS" +test_df.index = test_df.index.astype(pd.StringDtype()) +test_df.claim = test_df.claim.astype(pd.StringDtype()) +test_df["id"] = test_df.index +print("Saving fever test df") +test_df.to_json("fever_test_df.json", orient="records", lines=True) +print("Saved fever test df") + +train_df = df[ + (~df.index.isin(bigbench.index)) & (df["evidence_sentence_count"] > 0) +].drop( # pyright: ignore[reportAttributeAccessIssue] + columns=["verifiable", "claim_in_bigbench", "evidence"] +) +train_df["unique_evidence"] = train_df[ + "unique_evidence" +].map( # pyright: ignore[reportAttributeAccessIssue] + lambda x: [[str(title), str(sent_id)] for title, sent_id in x] +) +train_df["evidence_sentences"] = train_df[ + "evidence_sentences" +].map( # pyright: ignore[reportAttributeAccessIssue] + lambda x: [[str(title), str(sent_id), 
str(sent)] for title, sent_id, sent in x] +) +train_df["label"] = train_df["label"] == "SUPPORTS" +train_df.index = train_df.index.astype(pd.StringDtype()) +train_df.claim = train_df.claim.astype(pd.StringDtype()) +train_df["id"] = train_df.index +print("Saving fever train df") +train_df.to_json("fever_train_df.json", orient="records", lines=True) +print("Saved fever train df") + +fever_ds = load_dataset( + "json", data_files={"train": "fever_train_df.json", "test": "fever_test_df.json"} +) +if not isinstance(fever_ds, DatasetDict): + raise TypeError(f"Expected fever_ds to be a DatasetDict, but got: {type(fever_ds)}") +fever_ds.save_to_disk("var/fever/fever_reprocessed") +print(fever_ds) + + +articles = list( + set(flatten([[y[0] for y in x] for x in fever_ds["train"]["unique_evidence"]])) +) +article_ds = Dataset.from_dict({"article": articles}) +print(article_ds) + + +article_ds = article_ds.map(lambda x: searcher(x, True), num_proc=4) +article_ds = article_ds.map(lambda x: searcher(x, False), num_proc=1) +article_df = article_ds.to_pandas() +if not isinstance(article_df, pd.DataFrame): + raise TypeError( + f"Expected article_df to be a DataFrame, but got: {type(article_df)}" + ) +article_df = article_df.set_index("article") +print("Articles that did not return a successful response:") +print(article_df[article_df.msg != "success"]) +article_ds.save_to_disk("var/fever/fever_articles") +article_df.to_parquet( + "var/fever/fever_articles.parquet", + index=True, + engine="pyarrow", + compression="zstd", + compression_level=10, +) + + +def search(query: str) -> tuple: + if not isinstance(article_df, pd.DataFrame): + raise TypeError( + f"Expected article_df to be a DataFrame, but got: {type(article_df)}" + ) + row = article_df.loc[query] + return row["wiki"], row["msg"] + + +def trajectorize(row: dict[str, Any]) -> dict[str, Any]: + evidence_sentences = row["evidence_sentences"] + + claim = row["claim"].strip() + task = f"On June 2017, the following claim was made: {claim}\nQ: Was this claim true or false?" 
+ answer = str(row["label"]).lower() + + article_sentence_group = { + k: list(v) for k, v in groupby(evidence_sentences, operator.itemgetter(0)) + } + + sample_articles = {} + statuses = [] + wiki_worked = True + for article in article_sentence_group: + cleaned_article = clean_fever(article) + wiki, worked = search(article) + wiki = wiki.strip() + if worked != "success": + wiki_worked = False + + sample_articles[cleaned_article] = wiki + statuses.append(worked) + all_wiki_success = all(x in {"success", "fallback"} for x in statuses) + + trajectory = [{"task": task}] + + for article, evidences in article_sentence_group.items(): + cleaned_article = clean_fever(article) + trajectory.extend( + [ + {"thought": f"I need to search {cleaned_article}."}, + { + "action": '{"name": "Search", "arguments": {"topic": "' + + cleaned_article + + '"}}' + }, + { + "observation": f"[Document]\n{sample_articles[cleaned_article]}\n[End]" + }, + ] + ) + + for _title, _line, sent in evidences: + trajectory.append({"observation": clean_fever(sent.split("\t")[0])}) + + trajectory.extend( + [ + {"thought": f"The claim is {answer}."}, + {"action": '{"name": "Finish", "arguments": {"topic": "' + answer + '"}}'}, + ] + ) + + traj_keys = [next(iter(t.keys())) for t in trajectory] + traj_values = [next(iter(t.values())) for t in trajectory] + + rewoo_trajectory = [{"task": task}] + + for article, evidences in article_sentence_group.items(): + cleaned_article = clean_fever(article) + rewoo_trajectory.extend( + [ + {"thought": f"Search for more information about {cleaned_article}."}, + { + "action": '{"name": "Search", "arguments": {"topic": "' + + cleaned_article + + '"}}' + }, + { + "observation": f"[Document]\n{sample_articles[cleaned_article]}\n[End]" + }, + ] + ) + + for _title, _line, sent in evidences: + rewoo_trajectory.append({"observation": clean_fever(sent.split("\t")[0])}) + + rewoo_traj_keys = [next(iter(t.keys())) for t in rewoo_trajectory] + rewoo_traj_values = [next(iter(t.values())) for t in rewoo_trajectory] + + return { + "traj_keys": traj_keys, + "traj_values": traj_values, + "rewoo_traj_keys": rewoo_traj_keys, + "rewoo_traj_values": rewoo_traj_values, + "all_wiki_success": all_wiki_success, + "wiki_worked": wiki_worked, + "articles": list(sample_articles.values()), + "statuses": statuses, + } + + +def sentencify(row: dict[str, Any]) -> dict[str, str]: + evidence_sentences = row["evidence_sentences"] + + article_sentence_group = { + clean_fever(k): list(v) + for k, v in groupby(evidence_sentences, operator.itemgetter(0)) + } + + sentences = [] + for evidences in article_sentence_group.values(): + for _title, _line, sent in evidences: + sentences.append(clean_fever(sent.split("\t")[0])) + + return {"cot": " ".join(sentences).strip().replace("\n", " ").strip()} + + +fever_ds = fever_ds.map( + lambda x: {"label": str(x["label"]).lower()}, + num_proc=4, +) + +fever_ds["train"] = ( + fever_ds["train"].map(trajectorize, num_proc=4).map(sentencify, num_proc=4) +) + + +print( + "Wiki lookup failures:", + fever_ds["train"].filter(lambda x: x["all_wiki_success"] is False), +) + + +fever_ds["train"] = fever_ds["train"].filter(lambda x: x["wiki_worked"] is True) +fever_ds["train"] = fever_ds["train"].remove_columns( + column_names=["wiki_worked", "all_wiki_success", "statuses"] +) +print(fever_ds) + +new_split = fever_ds["train"].train_test_split(test_size=1024) +fever_ds["train"] = new_split["train"] +fever_ds["validation"] = new_split["test"] +fever_ds.save_to_disk("var/fever_trajectified") + +# Make sure the 
saved dataset is loaded correctly
+ds = load_from_disk("var/fever_trajectified")
+print(ds)
diff --git a/examples/optimizer/process_gsm8k.py b/examples/optimizer/process_gsm8k.py
new file mode 100644
index 000000000..57911a0c7
--- /dev/null
+++ b/examples/optimizer/process_gsm8k.py
@@ -0,0 +1,166 @@
+import re
+from pathlib import Path
+from typing import Any
+
+from datasets.dataset_dict import DatasetDict
+from datasets.load import load_dataset, load_from_disk
+
+from pdl.optimize.parse_number import parse_number
+
+# Load original GSM8K dataset and split it into train and validation sets
+
+var_dir = Path("var")
+var_dir.mkdir(parents=True, exist_ok=True)
+
+gsm8k_orig = load_dataset("openai/gsm8k", "main")
+if not isinstance(gsm8k_orig, DatasetDict):
+    raise TypeError(
+        f"Expected gsm8k_orig to be a DatasetDict, but got: {type(gsm8k_orig)}"
+    )
+new_split = gsm8k_orig["train"].train_test_split(test_size=1024)
+gsm8k_orig["validation"] = new_split["test"]
+gsm8k_orig["train"] = new_split["train"]
+gsm8k_orig.save_to_disk("var/gsm8k_split")
+
+# Make sure the saved dataset is loaded correctly
+gsm8k = load_from_disk("var/gsm8k_split")
+if not isinstance(gsm8k, DatasetDict):
+    raise TypeError(f"Expected gsm8k to be a DatasetDict, but got: {type(gsm8k)}")
+
+
+def parse_answers(row: dict[str, Any]) -> dict[str, Any]:
+    question = row["question"].strip().replace("’", "'").replace(" ", " ")
+    parts = row["answer"].split("####")
+    answer = parse_number(parts[-1])
+    reasoning = "####".join(parts[:-1]).strip().replace("’", "'").replace(" ", " ")
+    return {
+        "question": question,
+        "answer": answer,
+        "reasoning": reasoning,
+        "raw_answer": row["answer"],
+        "answer_part": parts[-1],
+    }
+
+
+gsm8k = gsm8k.map(parse_answers)
+
+
+def react_trajectory(row: dict[str, Any]) -> dict[str, list[str]]:
+    question = row["question"]
+    answer = row["answer"]
+    reasoning = row["reasoning"].splitlines()
+    trajectory = [{"question": question.strip()}]
+    res = answer
+
+    for line in reasoning:
+        pattern = (
+            r"(?P<pre>(=(\ )?|equals(\ )?)?(\$)?)<<(?P<exp>.*?)=(?P<res>.*?)>>([^\s]*)"
+        )
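+        # The named groups capture the calculator annotations GSM8K embeds in
+        # its reasoning, e.g. "<<48/2=24>>": "exp" is the expression and "res"
+        # is its result.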
+        expressions = re.search(pattern, line)
+
+        if expressions is None:
+            trajectory += [
+                {"thought": line.strip().replace("  ", " ")},
+            ]
+        else:
+            thought = re.sub(pattern, "", line)
+            thought = thought.rstrip(".").rstrip(",")
+            exp = expressions.group("exp").strip()
+            res = expressions.group("res").strip()
+
+            trajectory += [
+                {
+                    "thought": f"{thought.strip().replace('  ', ' ')}. I need to calculate {exp}"
+                },
+                {
+                    "action": '{"name": "Calculator", "arguments": {"expr": "'
+                    f"{exp}"
+                    '"}}'
+                },
+                {"observation": res},
+            ]
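+    # If the reasoning ended on a calculator observation, state the final
+    # answer as a thought before emitting the Finish action.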
+    if next(iter(trajectory[-1].keys())) == "observation":
+        trajectory.append({"thought": f"The answer is {answer}"})
+
+    trajectory.append(
+        {"action": '{"name": "Finish", "arguments": {"answer": "' + f"{answer}" + '"}}'}
+    )
+
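+    # Flatten the single-key dicts into parallel key/value lists, which keeps
+    # the dataset schema uniform across rows.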
+    traj_keys = [next(iter(t.keys())) for t in trajectory]
+    traj_values = [next(iter(t.values())) for t in trajectory]
+
+    return {
+        "traj_keys": traj_keys,
+        "traj_values": traj_values,
+    }
+
+
+gsm8k["train"] = gsm8k["train"].map(react_trajectory)
+
+
+def rewoo_trajectory(row: dict[str, Any]) -> dict[str, list[str]]:
+    question = row["question"]
+    answer = row["answer"]
+    reasoning = row["reasoning"].splitlines()
+    trajectory = [{"question": question.strip().replace("  ", " ")}]
+    res = answer
+
+    for line in reasoning:
+        pattern = (
+            r"(?P
(=(\ )?|equals(\ )?)?(\$)?)<<(?P.*?)=(?P.*?)>>([^\s]*)"
+        )
+        expressions = re.search(pattern, line)
+
+        if expressions is None:
+            trajectory += [
+                {"thought": line.strip().replace("  ", " ")},
+            ]
+        else:
+            thought = re.sub(pattern, "", line)
+            thought = thought.rstrip(".").rstrip(",")
+            exp = expressions.group("exp").strip()
+            res = expressions.group("res").strip()
+
+            trajectory += [
+                {"thought": f"{thought.strip().replace('  ', ' ')}. Calculate {exp}"},
+                {
+                    "action": '{"name": "Calculator", "arguments": {"expr": "'
+                    f"{exp}"
+                    '"}}'
+                },
+                {"observation": res},
+            ]
+
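+    # ReWOO plans refer to earlier evidence symbolically: rewrite later
+    # mentions of an observed result as a placeholder #E<n>, where n is the
+    # index of the action that produced it.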
+    evidence_counter = 0
+    for i, outer in enumerate(trajectory):
+        type_event = next(iter(outer.keys()))
+        value = next(iter(outer.values()))
+
+        if type_event == "action":
+            evidence_counter += 1
+        if type_event == "observation":
+            for j in range(i + 1, len(trajectory)):
+                inner = trajectory[j]
+                inner_type_event = next(iter(inner.keys()))
+                if inner_type_event == "action":
+                    trajectory[j]["action"] = trajectory[j]["action"].replace(
+                        value, f"#E{evidence_counter}"
+                    )
+                elif inner_type_event == "thought":
+                    trajectory[j]["thought"] = trajectory[j]["thought"].replace(
+                        value, f"#E{evidence_counter}"
+                    )
+    traj_keys = [next(iter(t.keys())) for t in trajectory]
+    traj_values = [next(iter(t.values())) for t in trajectory]
+
+    return {"rewoo_traj_keys": traj_keys, "rewoo_traj_values": traj_values}
+
+
+gsm8k["train"] = gsm8k["train"].map(rewoo_trajectory)
+
+# Save the processed dataset
+gsm8k.save_to_disk("var/gsm8k_trajectified")
+
+# Make sure the saved dataset is loaded correctly
+ds = load_from_disk("var/gsm8k_trajectified")
+print(ds)
diff --git a/examples/optimizer/process_mbpp.py b/examples/optimizer/process_mbpp.py
new file mode 100644
index 000000000..c995b9d44
--- /dev/null
+++ b/examples/optimizer/process_mbpp.py
@@ -0,0 +1,79 @@
+# Instructions:
+# 1. Install EvalPlus, e.g. `pip install evalplus`
+# 2. Run this script to process the MBPP dataset into a format suitable for evaluation.
+import re
+from pathlib import Path
+from typing import Any
+
+from datasets.dataset_dict import DatasetDict
+from datasets.load import load_dataset, load_from_disk
+from evalplus.data import get_mbpp_plus
+
+var_dir = Path("var")
+var_dir.mkdir(parents=True, exist_ok=True)
+
+mbpp_plus = get_mbpp_plus()
+
+mbpp = load_dataset("google-research-datasets/mbpp", name="full")
+if not isinstance(mbpp, DatasetDict):
+    raise TypeError(f"Expected mbpp to be a DatasetDict, but got: {type(mbpp)}")
+
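+# Keep only the problems that are part of the sanitized MBPP+ benchmark, so
+# the splits line up with EvalPlus task IDs (keys look like "Mbpp/2").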
+mbpp["test"] = mbpp["test"].filter(
+    lambda x: f"Mbpp/{x['task_id']}" in mbpp_plus,
+)
+
+mbpp["validation"] = mbpp["validation"].filter(
+    lambda x: f"Mbpp/{x['task_id']}" in mbpp_plus,
+)
+
+
+def trajectify(row: dict[str, Any]) -> dict[str, list[str]]:
+    code = row["code"].replace("\r\n", "\n").replace("\r", "\n").strip()
+    first_test = row["test_list"][0].strip()
+    pattern = r"assert\s+(\w+\(.*?\))\s*==\s*(.+)"
+
+    # Replacement format
+    replacement = r"res = \1\nassert res == \2, \"Expected \2 but got {}\".format(res)"
+
+    # Perform the substitution
+    converted_string = (
+        re.sub(pattern, replacement, first_test)
+        .replace('\\"Expected ', '"Expected ')
+        .replace('{}\\"', '{}"')
+    )
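+    # e.g. a (hypothetical) test 'assert add(1, 2) == 3' becomes:
+    #   res = add(1, 2)
+    #   assert res == 3, "Expected 3 but got {}".format(res)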
+    code_w_assert = code + "\n" + converted_string.strip()
+    prompt = row["text"].strip() + "\n" + first_test
+
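+    # Build a fixed ReAct-style trajectory: run the reference solution against
+    # the first test, observe that the assertion passes, then submit it.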
+    trajectory = [
+        {"task": prompt},
+        {
+            "thought": "I should run a solution on the test case before proposing a solution."
+        },
+        {"action": code_w_assert},
+        {"observation": "[Executed Successfully with No Output]"},
+        {"thought": "There is no AssertionError. I can now submit the solution."},
+        {"solution": code},
+    ]
+
+    traj_keys = [next(iter(t.keys())) for t in trajectory]
+    traj_values = [next(iter(t.values())) for t in trajectory]
+
+    return {
+        "react_prompt": prompt,
+        "code": code,
+        "traj_keys": traj_keys,
+        "traj_values": traj_values,
+    }
+
+
+mbpp_trajectified = mbpp.map(trajectify)
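+# Sanity-check the split sizes: the train split is untouched, while test and
+# validation were filtered down to the MBPP+ subset above.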
+assert len(mbpp_trajectified["train"]) == 374
+assert len(mbpp_trajectified["test"]) == 224
+assert len(mbpp_trajectified["validation"]) == 39
+
+# Save the processed dataset
+mbpp_trajectified.save_to_disk("var/mbpp_trajectified")
+
+# Make sure the saved dataset is loaded correctly
+ds = load_from_disk("var/mbpp_trajectified")
+print(ds)
diff --git a/mkdocs.yml b/mkdocs.yml
index 3d5fe3887..2831e62d5 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -47,7 +47,7 @@ nav:
   - API Reference: api_reference.md
   - Contribute: contrib.md
   - Viewer: viewer.md
-  # - AutoPDL: autopdl.md # Hide documentation for now
+  - AutoPDL: autopdl.md
 
 # Define some IBM colors
 extra_css:
diff --git a/pyproject.toml b/pyproject.toml
index 3350229a5..089dabae1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,10 +47,12 @@ examples = [
   "pypdf~=5.2",
   "wikipedia~=1.0",
   "textdistance~=4.0",
-  "datasets>2,<4",
+  "datasets>3,<4",
   "sympy~=1.0",
   "scikit-learn>=1.6.1,<1.8.0",
-  "faiss-cpu>=1.10,<1.12"
+  "faiss-cpu>=1.10,<1.12",
+  "funcy>=2",
+  "evalplus>=0.3.1",
 ]
 docs = [
   "mkdocs~=1.0",
diff --git a/src/pdl/optimize/pdl_optimizer.py b/src/pdl/optimize/pdl_optimizer.py
index aa6316d53..0dea78ef8 100644
--- a/src/pdl/optimize/pdl_optimizer.py
+++ b/src/pdl/optimize/pdl_optimizer.py
@@ -11,7 +11,8 @@
 from typing import Any
 
 import yaml
-from datasets import Dataset, DatasetDict
+from datasets.arrow_dataset import Dataset
+from datasets.dataset_dict import DatasetDict
 from duration_parser import parse as parse_duration
 from numpy.random import default_rng
 from rich.logging import RichHandler
@@ -159,10 +160,13 @@ def sample_candidates(
         demo_name = self.config.demonstrations_variable_name
         candidates = []
 
+        num_demonstrations_set = {
+            int(x) for x in self.config.variables.get("num_demonstrations", set())
+        }
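+        # The normalized counts are used below to decide whether zero-shot
+        # (0-demonstration) candidates are possible, regardless of whether the
+        # config lists them as ints or strings.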
+
         if (
-            "prompt_pattern" in self.config.variables
-            and "cot" in self.config.variables.get("prompt_pattern", [])
-            and 0 in self.config.variables.get("num_demonstrations", [])
+            "cot" in self.config.variables.get("prompt_pattern", [])
+            and 0 in num_demonstrations_set
         ):
             cot_candidate = {
                 k: self.sample_random_index(v) for k, v in self.config.variables.items()
@@ -178,18 +182,18 @@ def sample_candidates(
 
             candidates.append(cot_candidate)
 
-        zero_shots_seen = ["cot"]
+        zero_shots_seen = {"cot"}
         while len(candidates) < num_candidates:
             variable_instance = {
                 k: self.sample_random_index(v) for k, v in self.config.variables.items()
             }
             if (
                 variable_instance.get("num_demonstrations") == 0
-                and variable_instance.get("prompt_pattern") == "cot"
+                and variable_instance.get("prompt_pattern") is not None
             ):
                 if variable_instance["prompt_pattern"] in zero_shots_seen:
                     continue
-                zero_shots_seen.append(variable_instance["prompt_pattern"])
+                zero_shots_seen.add(variable_instance["prompt_pattern"])
 
             num_demonstrations = int(
                 variable_instance.get("num_demonstrations", self.num_demonstrations),
@@ -214,16 +218,26 @@ def sample_candidates(
             candidates.append(candidate)
 
         if (
-            "num_demonstrations"
-            in self.config.variables  # check if is variable in config
-            and len(self.config.variables["num_demonstrations"])
-            > 1  # check more than 1 option
-            and 0 in [int(x) for x in self.config.variables["num_demonstrations"]]
-            # check zeroshot is an option
+            len(num_demonstrations_set) > 1  # check more than 1 option
+            and 0 in num_demonstrations_set  # check zeroshot is an option
         ):
-            zero_shotters = [x for x in candidates if x["num_demonstrations"] == 0]
+            zero_shotters = [
+                x.get("uuid") for x in candidates if x.get("num_demonstrations") == 0
+            ]
+            variables_zs = self.config.variables.copy()
+            variables_zs.pop("num_demonstrations", None)
+
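+            # With num_demonstrations removed, the product of the remaining
+            # variable options is an upper bound on the number of distinct
+            # zero-shot candidates.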
+            max_zs = len(list(itertools.product(*variables_zs.values())))
+
+            if len(zero_shotters) > max_zs:
+                logger.warning(
+                    "More zero-shot candidates (%d) than expected (%d; "
+                    "product of all variables). "
+                    "Identical duplicated candidates may waste compute.",
+                    len(zero_shotters),
+                    max_zs,
+                )
 
-            assert len(zero_shotters) <= 3
         assert len(candidates) == num_candidates
         return candidates
 
diff --git a/src/pdl/optimize/util.py b/src/pdl/optimize/util.py
index 9925f8e0e..aceb0d3ff 100644
--- a/src/pdl/optimize/util.py
+++ b/src/pdl/optimize/util.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import yaml
-from datasets import Dataset
+from datasets.arrow_dataset import Dataset
 from rich.console import Console
 
 from pdl.pdl_ast import Program, ScopeType
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
index 76c5ed7ad..1df9b0834 100644
--- a/tests/test_optimizer.py
+++ b/tests/test_optimizer.py
@@ -2,7 +2,8 @@
 from pprint import pprint
 
 import pytest
-from datasets import Dataset, DatasetDict
+from datasets.arrow_dataset import Dataset
+from datasets.dataset_dict import DatasetDict
 
 from examples.optimizer.fever_evaluator import FEVEREvaluator
 from examples.optimizer.gsm8k_evaluator import Gsm8kEvaluator