diff --git a/Makefile b/Makefile index 84cf255aa..52ddc1565 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ install: @echo "--- šŸš€ Installing project dependencies ---" - pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/ + pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/webarenalite -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/ playwright install chromium install-demo: diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index 0a1ff0a12..ab2858d37 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -132,6 +132,20 @@ ), task_metadata=task_metadata("webarena"), ), + "webarena_lite": lambda n_repeats=1: Benchmark( + name="webarena_lite", + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], + is_multi_tab=True, + supports_parallel_seeds=False, + backends=["webarena"], + env_args_list=make_env_args_list_from_repeat_tasks( + task_list=task_list_from_metadata(metadata=task_metadata("webarenalite")), + max_steps=30, + n_repeats=n_repeats, + seeds_rng=np.random.RandomState(42), + ), + task_metadata=task_metadata("webarenalite"), + ), "webarena_tiny": lambda n_repeats=1: Benchmark( name="webarena_tiny", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarenalite.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarenalite.csv new file mode 100644 index 000000000..566e8627a --- /dev/null +++ 
b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarenalite.csv @@ -0,0 +1,166 @@ +task_name,requires_reset,sites,eval_types,task_id,browsergym_split,depends_on +webarenalite.4,False,shopping_admin,string_match,4,train, +webarenalite.7,False,map,string_match,7,train, +webarenalite.15,False,shopping_admin,string_match,15,test, +webarenalite.20,False,map,string_match,20,test, +webarenalite.23,False,shopping,string_match,23,test, +webarenalite.27,False,reddit,string_match,27,test, +webarenalite.33,False,map,string_match,33,test, +webarenalite.37,False,map,string_match,37,train, +webarenalite.43,False,shopping_admin,string_match,43,test, +webarenalite.44,False,gitlab,url_match,44,train, +webarenalite.48,False,shopping,string_match,48,test, +webarenalite.56,False,map,string_match,56,train, +webarenalite.58,False,map,string_match,58,train, +webarenalite.65,False,shopping_admin,string_match,65,train, +webarenalite.69,False,reddit,string_match,69,test, +webarenalite.71,False,map,string_match,71,test, +webarenalite.75,False,map,string_match,75,train, +webarenalite.77,False,shopping_admin,string_match,77,test,webarenalite.65 +webarenalite.82,False,map,string_match,82,train, +webarenalite.88,False,map,string_match,88,train, +webarenalite.93,False,map,string_match,93,train, +webarenalite.95,False,shopping_admin,string_match,95,train, +webarenalite.96,False,shopping,string_match,96,test, +webarenalite.97,False,map wikipedia,string_match,97,test,webarenalite.93 +webarenalite.98,False,map,string_match,98,test,webarenalite.97 +webarenalite.103,False,gitlab,url_match,103,train, +webarenalite.109,False,shopping_admin,string_match,109,test, +webarenalite.115,False,shopping_admin,string_match,115,test, +webarenalite.117,False,shopping,string_match,117,test,webarenalite.96 +webarenalite.118,False,shopping,program_html,118,train,webarenalite.117 +webarenalite.123,False,shopping_admin,string_match,123,train, +webarenalite.125,False,shopping,string_match,125,train, 
+webarenalite.127,False,shopping_admin,string_match,127,train,webarenalite.123 +webarenalite.131,False,shopping_admin,string_match,131,test, +webarenalite.135,False,gitlab,string_match,135,train, +webarenalite.139,False,map,string_match,139,test, +webarenalite.144,False,shopping,string_match,144,test, +webarenalite.149,False,shopping,string_match,149,test, +webarenalite.155,False,map,string_match,155,test, +webarenalite.156,False,gitlab,url_match,156,test, +webarenalite.157,False,shopping_admin,url_match,157,train,webarenalite.131 +webarenalite.162,False,shopping,url_match,162,test, +webarenalite.167,False,shopping,string_match,167,test, +webarenalite.169,False,gitlab,string_match,169,train, +webarenalite.173,False,gitlab,string_match url_match,173,train, +webarenalite.182,False,gitlab,string_match url_match,182,train, +webarenalite.190,False,shopping,string_match,190,train, +webarenalite.196,False,shopping_admin,string_match,196,train, +webarenalite.202,False,shopping_admin,string_match,202,train, +webarenalite.205,False,gitlab,string_match,205,train,webarenalite.182 +webarenalite.211,False,shopping_admin,string_match,211,train, +webarenalite.215,False,shopping_admin,string_match,215,test, +webarenalite.220,False,map,string_match,220,train, +webarenalite.221,False,map,string_match,221,test,webarenalite.220 +webarenalite.225,False,shopping,string_match,225,test, +webarenalite.227,False,shopping,string_match,227,train, +webarenalite.235,False,shopping,string_match,235,train, +webarenalite.236,False,map,string_match,236,train, +webarenalite.240,False,shopping,url_match,240,test, +webarenalite.247,False,shopping_admin,string_match,247,train, +webarenalite.250,False,map,string_match,250,test, +webarenalite.254,False,map,string_match,254,train, +webarenalite.258,False,gitlab,url_match,258,train, +webarenalite.259,False,gitlab,string_match,259,train,webarenalite.258 +webarenalite.268,False,wikipedia map,string_match,268,test, 
+webarenalite.270,False,shopping,url_match,270,train, +webarenalite.276,False,shopping,url_match,276,train, +webarenalite.283,False,shopping,url_match,283,test, +webarenalite.285,False,shopping,url_match,285,train, +webarenalite.287,False,map,string_match,287,test,webarenalite.268 +webarenalite.288,False,shopping_admin,string_match,288,train,webarenalite.247 +webarenalite.296,False,gitlab,string_match,296,train, +webarenalite.300,False,shopping,url_match,300,test, +webarenalite.311,False,gitlab,string_match,311,test, +webarenalite.313,False,shopping,string_match,313,train, +webarenalite.318,False,gitlab,string_match,318,train, +webarenalite.321,False,shopping,string_match,321,train, +webarenalite.324,False,shopping,url_match,324,train, +webarenalite.333,False,shopping,string_match,333,train, +webarenalite.335,False,shopping,string_match,335,train, +webarenalite.348,False,shopping_admin,string_match,348,test, +webarenalite.349,False,gitlab,string_match,349,test, +webarenalite.354,False,shopping,url_match,354,train, +webarenalite.357,False,gitlab,url_match,357,test, +webarenalite.361,False,shopping,string_match,361,train, +webarenalite.367,False,map,string_match,367,train, +webarenalite.368,False,shopping,string_match,368,test, +webarenalite.369,False,map,program_html,369,train,webarenalite.367 +webarenalite.374,False,shopping_admin,url_match,374,train,webarenalite.348 +webarenalite.376,False,shopping,string_match,376,test,webarenalite.368 +webarenalite.381,False,map,url_match,381,train, +webarenalite.382,False,map,string_match,382,test,webarenalite.381 +webarenalite.383,False,map,string_match,383,test,webarenalite.382 +webarenalite.384,False,shopping,string_match,384,test,webarenalite.376 +webarenalite.386,False,shopping,string_match,386,test, +webarenalite.387,False,shopping,string_match,387,train,webarenalite.386 +webarenalite.392,False,gitlab,program_html,392,test, +webarenalite.401,False,reddit,program_html,401,train, 
+webarenalite.404,False,reddit,program_html,404,train, +webarenalite.415,False,gitlab,program_html,415,test, +webarenalite.419,False,gitlab,program_html,419,test, +webarenalite.423,False,shopping_admin,program_html,423,train, +webarenalite.426,False,wikipedia map,program_html,426,test, +webarenalite.431,False,shopping,program_html,431,train, +webarenalite.440,False,shopping,program_html,440,test, +webarenalite.448,False,gitlab,program_html,448,test, +webarenalite.454,False,shopping_admin,program_html,454,test, +webarenalite.458,False,shopping_admin,program_html,458,test, +webarenalite.464,False,shopping_admin,program_html,464,train, +webarenalite.466,False,shopping,program_html,466,train, +webarenalite.470,False,shopping_admin,program_html,470,test,webarenalite.464 +webarenalite.476,False,gitlab,program_html,476,train, +webarenalite.485,False,gitlab,program_html,485,test, +webarenalite.488,False,shopping_admin,program_html,488,test, +webarenalite.491,False,shopping_admin,string_match,491,test, +webarenalite.497,False,shopping_admin,program_html,497,test, +webarenalite.505,False,shopping_admin,program_html,505,train, +webarenalite.506,False,shopping,program_html,506,train, +webarenalite.509,False,shopping,program_html,509,test, +webarenalite.514,False,shopping,program_html,514,test, +webarenalite.516,False,shopping,program_html,516,train, +webarenalite.521,False,shopping,program_html,521,test, +webarenalite.524,False,gitlab,program_html,524,test, +webarenalite.528,False,shopping,program_html,528,train,webarenalite.521 +webarenalite.534,False,gitlab,program_html,534,train, +webarenalite.538,False,shopping_admin,program_html,538,train,webarenalite.505 +webarenalite.548,False,shopping_admin,program_html,548,train, +webarenalite.566,False,gitlab reddit,program_html,566,test, +webarenalite.567,False,gitlab,program_html,567,test,webarenalite.566 +webarenalite.574,False,shopping,program_html,574,test, +webarenalite.577,False,gitlab,program_html,577,train, 
+webarenalite.582,False,reddit,program_html,582,test, +webarenalite.599,False,reddit,url_match program_html,599,test, +webarenalite.601,False,reddit,url_match program_html,601,train, +webarenalite.605,False,reddit,url_match program_html,605,train, +webarenalite.612,False,reddit,url_match program_html,612,test, +webarenalite.619,False,reddit,url_match program_html,619,train, +webarenalite.626,False,reddit,url_match program_html,626,train, +webarenalite.631,False,reddit,url_match program_html,631,train, +webarenalite.641,False,reddit,url_match program_html,641,test, +webarenalite.645,False,reddit,url_match program_html,645,train, +webarenalite.652,False,reddit,url_match program_html,652,train, +webarenalite.657,False,shopping,url_match program_html,657,train, +webarenalite.668,False,gitlab,url_match program_html,668,test, +webarenalite.673,False,shopping reddit,url_match program_html,673,test, +webarenalite.678,False,shopping_admin,url_match program_html,678,train, +webarenalite.682,False,reddit gitlab,url_match program_html,682,train, +webarenalite.686,False,reddit gitlab,url_match program_html,686,train, +webarenalite.693,False,shopping,url_match program_html,693,train, +webarenalite.704,False,shopping_admin,url_match program_html,704,test, +webarenalite.710,False,shopping_admin,url_match program_html,710,test, +webarenalite.714,False,reddit,program_html,714,train, +webarenalite.720,False,reddit,program_html,720,test, +webarenalite.729,False,reddit,program_html,729,train, +webarenalite.733,False,reddit,program_html,733,train, +webarenalite.741,False,wikipedia map,program_html,741,train, +webarenalite.745,False,gitlab,program_html,745,test, +webarenalite.748,False,gitlab,program_html,748,train, +webarenalite.760,False,map shopping_admin,program_html,760,test, +webarenalite.762,False,map,program_html,762,train, +webarenalite.768,False,shopping_admin,program_html,768,test,webarenalite.760 +webarenalite.791,False,gitlab reddit,string_match,791,train, 
+webarenalite.798,False,shopping,string_match,798,train, +webarenalite.809,False,gitlab,url_match program_html,809,train, +webarenalite.811,False,gitlab,program_html,811,test, diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py index 0ebb9e94c..1a36e4c75 100644 --- a/browsergym/experiments/src/browsergym/experiments/loop.py +++ b/browsergym/experiments/src/browsergym/experiments/loop.py @@ -936,6 +936,7 @@ def _get_env_name(task_name: str): import browsergym.workarena elif task_name.startswith("webarena"): import browsergym.webarena + import browsergym.webarenalite elif task_name.startswith("visualwebarena"): import browsergym.visualwebarena elif task_name.startswith("assistantbench"): diff --git a/browsergym/webarenalite/pyproject.toml b/browsergym/webarenalite/pyproject.toml new file mode 100644 index 000000000..902b3c0cf --- /dev/null +++ b/browsergym/webarenalite/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling", "hatch-requirements-txt"] +build-backend = "hatchling.build" + +[project] +name = "browsergym-webarenalite" +description = "WebArena Lite benchmark for BrowserGym" +authors = [ + {name = "Aman Jaiswal"}, +] +requires-python = ">3.7" +license = {text = "Apache-2.0"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Apache Software License", +] +dynamic = ["dependencies", "version"] + +[project.urls] +homepage = "https://github.com/ServiceNow/BrowserGym" + +[tool.hatch.version] +path = "../core/src/browsergym/core/__init__.py" + +[tool.hatch.metadata.hooks.requirements_txt] +files = ["requirements.txt"] + +[tool.hatch.build] +include = [ + "src/browsergym/webarenalite/test_webarena_lite.raw.json" +] + +[tool.hatch.build.targets.wheel] 
# Make sure the NLTK tokenizer data used by the string evaluators is available.
# note: deprecated punkt -> punkt_tab https://github.com/nltk/nltk/issues/3293
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # `nltk.data.find` signals a missing resource with LookupError. Catching only
    # that (instead of a bare `except:`) lets KeyboardInterrupt / SystemExit and
    # unrelated failures propagate instead of silently triggering a download.
    nltk.download("punkt_tab", quiet=True, raise_on_error=True)

# Gym ids ("webarenalite.<task_id>") of every task registered below.
ALL_WEBARENA_TASK_IDS = []

# Register one BrowserGym task per WebArena-Lite task id listed in config.TASK_IDS.
for task_id in config.TASK_IDS:
    gym_id = f"webarenalite.{task_id}"
    register_task(
        gym_id,
        task.WebArenaLiteTask,
        task_kwargs={"task_id": task_id},
    )
    ALL_WEBARENA_TASK_IDS.append(gym_id)
+ 318, + 321, + 324, + 333, + 335, + 348, + 349, + 354, + 357, + 361, + 367, + 368, + 369, + 374, + 376, + 381, + 382, + 383, + 384, + 386, + 387, + 392, + 401, + 404, + 415, + 419, + 423, + 426, + 431, + 440, + 448, + 454, + 458, + 464, + 466, + 470, + 476, + 485, + 488, + 491, + 497, + 505, + 506, + 509, + 514, + 516, + 521, + 524, + 528, + 534, + 538, + 548, + 566, + 567, + 574, + 577, + 582, + 599, + 601, + 605, + 612, + 619, + 626, + 631, + 641, + 645, + 652, + 657, + 668, + 673, + 678, + 682, + 686, + 693, + 704, + 710, + 714, + 720, + 729, + 733, + 741, + 745, + 748, + 760, + 762, + 768, + 791, + 798, + 809, + 811, +] diff --git a/browsergym/webarenalite/src/browsergym/webarenalite/evaluators.py b/browsergym/webarenalite/src/browsergym/webarenalite/evaluators.py new file mode 100644 index 000000000..fc37a2499 --- /dev/null +++ b/browsergym/webarenalite/src/browsergym/webarenalite/evaluators.py @@ -0,0 +1,374 @@ +""" +Adopted from VisualAgentBench https://github.com/THUDM/VisualAgentBench/blob/main/VAB-WebArena-Lite/new/evaluators.py +base class for evaluation +""" + +# answer string match +import collections +import html +import importlib +import json +import time +import urllib +from pathlib import Path +from typing import Union + +from beartype import beartype +from nltk.tokenize import word_tokenize # type: ignore +from playwright.sync_api import CDPSession, Page +from webarena.evaluation_harness.helper_functions import ( + PseudoPage, + gitlab_get_project_memeber_role, + llm_fuzzy_match, + llm_ua_match, + reddit_get_post_url, + shopping_get_latest_order_url, + shopping_get_sku_latest_review_author, + shopping_get_sku_latest_review_rating, +) +from webarena.browser_env import Action, StateInfo + +Trajectory = list[Union[Action, StateInfo]] + + +class Evaluator(object): + def __init__(self, eval_tag: str = "") -> None: + self.eval_tag = eval_tag + + @beartype + def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | 
class StringEvaluator(Evaluator):
    """Score the answer carried by the last action against the reference answers.

    The task config's ``eval.reference_answers`` mapping selects the strategy:
        exact_match: the cleaned answer equals the cleaned reference
        must_include: every listed phrase occurs in the cleaned answer
        fuzzy_match: an LLM judge compares the answer with each reference
    The per-strategy scores are multiplied together.
    """

    @staticmethod
    @beartype
    def clean_answer(answer: str) -> str:
        """Trim whitespace, drop one pair of matching outer quotes, and lowercase."""
        trimmed = answer.strip()
        for quote in ("'", '"'):
            if trimmed.startswith(quote) and trimmed.endswith(quote):
                trimmed = trimmed[1:-1]
                break
        return trimmed.lower()

    @staticmethod
    @beartype
    def exact_match(ref: str, pred: str) -> float:
        """Return 1.0 when the cleaned prediction equals the cleaned reference, else 0.0."""
        cleaned_pred = StringEvaluator.clean_answer(pred)
        cleaned_ref = StringEvaluator.clean_answer(ref)
        return float(cleaned_pred == cleaned_ref)

    @staticmethod
    @beartype
    def must_include(ref: str, pred: str, tokenize: bool = False) -> float:
        """Return 1.0 when the cleaned reference occurs in the cleaned prediction.

        With ``tokenize=True`` and a one-character reference that tokenizes to a
        single token, membership is tested against the prediction's tokens rather
        than as a substring, which avoids false positives such as "0" in "10".
        """
        needle = StringEvaluator.clean_answer(ref)
        haystack = StringEvaluator.clean_answer(pred)
        if tokenize and len(needle) == 1 and len(word_tokenize(needle)) == 1:
            return float(needle in word_tokenize(haystack))
        return float(needle in haystack)

    @staticmethod
    @beartype
    def fuzzy_match(ref: str, pred: str, intent: str) -> float:
        """Delegate to the LLM judge for a semantic comparison."""
        return llm_fuzzy_match(pred, ref, intent)

    @staticmethod
    @beartype
    def ua_match(ref: str, pred: str, intent: str) -> float:
        """Delegate to the LLM judge for comparing "unachievable" reasons."""
        return llm_ua_match(pred, ref, intent)

    def __call__(
        self,
        trajectory: Trajectory,
        config_file: Path | str,
        page: Page | PseudoPage | None = None,
        client: CDPSession | None = None,
    ) -> float:
        """Read the task config and return the product of all reference-answer scores."""
        with open(config_file, "r") as f:
            configs = json.load(f)

        final_action = self.get_last_action(trajectory)
        pred = self.clean_answer(final_action["answer"])

        score = 1.0
        for approach, value in configs["eval"]["reference_answers"].items():
            if approach == "exact_match":
                score *= self.exact_match(ref=value, pred=pred)
            elif approach == "must_include":
                assert isinstance(value, list)
                # token-level matching is only enabled for a lone reference phrase
                single_reference = len(value) == 1
                for phrase in value:
                    score *= self.must_include(ref=phrase, pred=pred, tokenize=single_reference)
            elif approach == "fuzzy_match":
                intent = configs["intent"]
                if value == "N/A":
                    # First accept a literal "N/A" answer ...
                    score *= self.exact_match(ref=value, pred=pred)
                    # ... otherwise the answer must explain why the task is
                    # unachievable; the judge's verdict then replaces the score.
                    if score != 1:
                        score = 1.0 * self.ua_match(
                            intent=configs["intent"],
                            ref=configs["eval"]["string_note"],
                            pred=pred,
                        )
                else:
                    assert isinstance(value, list)
                    for reference in value:
                        score *= self.fuzzy_match(ref=reference, pred=pred, intent=intent)
        return score
class HTMLContentEvaluator(Evaluator):
    """Check whether required contents appear in the page(s) named by the task config.

    Each entry of ``eval.program_html`` names a URL to inspect, a locator that
    selects the content to check, and the ``required_contents`` that content must
    satisfy. The per-entry scores are multiplied together.
    """

    @staticmethod
    @beartype
    def fuzzy_match(ref: str, pred: str, intent: str) -> float:
        """Delegate to the LLM judge for a semantic comparison."""
        return llm_fuzzy_match(pred, ref, intent)

    def __call__(
        self,
        trajectory: Trajectory,
        config_file: Path | str,
        page: Page | PseudoPage,
        client: CDPSession | None = None,
    ) -> float:
        """Evaluate every ``program_html`` target and return the combined score.

        Raises:
            ValueError: if a locator or a ``required_contents`` key is not recognised.
        """
        with open(config_file, "r") as f:
            configs = json.load(f)

        targets = configs["eval"]["program_html"]

        score = 1.0
        for target in targets:
            target_url: str = target["url"]  # which url to check
            if target_url.startswith("func"):
                func = target_url.split("func:")[1]
                # NOTE(security): the current page URL is spliced into source text
                # that is then eval'ed, so a crafted URL could inject code.
                # Kept as-is for compatibility with the upstream config format.
                func = func.replace("__last_url__", page.url)
                target_url = eval(func)

            locator: str = target["locator"]  # js element locator

            # navigate to that url ("last" means: stay on the current page)
            if target_url != "last":
                page.goto(target_url)
                time.sleep(3)  # TODO [shuyanzh]: fix this hard-coded sleep

            # empty, use the full page
            if not locator.strip():
                selected_element = page.content()
            # use JS to select the element
            elif locator.startswith(("document.", "[...document.")):
                if "prep_actions" in target:
                    # best effort: a failing prep action must not abort the evaluation
                    try:
                        for prep_action in target["prep_actions"]:
                            page.evaluate(f"() => {prep_action}")
                    except Exception:
                        pass
                try:
                    selected_element = str(page.evaluate(f"() => {locator}"))
                except Exception:
                    # the page is wrong, return empty
                    selected_element = ""
            elif locator.startswith("lambda:"):
                try:
                    # Remove the literal "lambda:" prefix only. The previous
                    # `lstrip("lambda:")` stripped any leading run of the characters
                    # l/a/m/b/d/: and therefore corrupted expressions such as
                    # "lambda:document.title" (-> "ocument.title").
                    js_expression = locator.removeprefix("lambda:")
                    selected_element = page.evaluate(js_expression)
                    if not selected_element:
                        selected_element = None
                except Exception:
                    # the page is wrong, return empty
                    selected_element = None
            # run program to call API
            elif locator.startswith("func:"):  # a helper function
                func = locator.split("func:")[1]
                func = func.replace("__page__", "page")
                # NOTE(security): eval of a config-supplied expression; only use
                # with trusted task configs.
                selected_element = eval(func)
            else:
                raise ValueError(f"Unknown locator: {locator}")

            # If the selected element is None, then the page is wrong
            if selected_element is None:
                score = 0.0
                break

            if "exact_match" in target["required_contents"]:
                required_contents = target["required_contents"]["exact_match"]
                score *= StringEvaluator.exact_match(ref=required_contents, pred=selected_element)
            elif "must_include" in target["required_contents"]:
                required_contents = target["required_contents"]["must_include"]
                assert isinstance(required_contents, list)
                for content in required_contents:
                    # " |OR| " separates alternatives; one hit is enough
                    content_or = content.split(" |OR| ")
                    score *= any(
                        [
                            StringEvaluator.must_include(ref=content, pred=selected_element)
                            for content in content_or
                        ]
                    )
            elif "fuzzy_match" in target["required_contents"]:
                required_contents = target["required_contents"]["fuzzy_match"]
                intent = configs["intent"]

                assert isinstance(required_contents, list)
                reference = ", ".join(required_contents)
                score *= self.fuzzy_match(ref=reference, pred=selected_element, intent=intent)
            else:
                raise ValueError(f"Unknown required_contents: {target['required_contents'].keys()}")

        return score
def shopping_get_latest_order_url() -> str:
    """Return the order-view URL of the most recently created shopping order.

    Queries the shop's REST API for a single order sorted by ``created_at``
    descending and builds the URL from that order's ``increment_id``.
    """
    auth_headers = {
        "Authorization": f"Bearer {shopping_get_auth_token()}",
        "Content-Type": "application/json",
    }
    # newest order first, one result only
    newest_first = {
        "searchCriteria[sortOrders][0][field]": "created_at",
        "searchCriteria[sortOrders][0][direction]": "DESC",
        "searchCriteria[pageSize]": "1",
    }

    response = requests.get(
        f"{SHOPPING}/rest/V1/orders",
        params=newest_first,
        headers=auth_headers,
    )
    assert response.status_code == 200

    latest_order = response.json()["items"][0]
    order_id = int(latest_order["increment_id"])
    return f"{SHOPPING}/sales/order/view/order_id/{order_id}/"
def reddit_get_post_url(url: str) -> str:
    """Reduce a post or comment URL to the URL of the post itself.

    Expected shape: ``<scheme>://<domain>/f/<subreddit>/<post_id>/...``.
    A URL whose path does not have that shape is returned unchanged.
    """
    parsed = urlparse(url)
    segments = parsed.path.split("/")

    # not a valid post/comment url, return the url as is
    if len(segments) < 4 or segments[1] != "f":
        return url

    subreddit, post_id = segments[2], segments[3]
    return f"{parsed.scheme}://{parsed.netloc}/f/{subreddit}/{post_id}/"
def _normalize_openai_response_text(response: Any) -> str:
    """Pull the text out of whatever the OpenAI wrapper returned.

    Shapes are tried in this order: a tuple (first item is used), an object with
    a non-blank ``output_text``, a dict with an ``output`` list of content
    blocks, an object with ``choices``, a plain dict with ``content`` or
    ``choices``, and a plain string. Anything else is stringified.
    """

    def _lookup(obj: Any, key: str) -> Any:
        # attribute access first; fall back to the dict key when that yields None
        found = getattr(obj, key, None)
        if found is None and isinstance(obj, dict):
            found = obj.get(key)
        return found

    def _gather_text(node: Any) -> list[str]:
        # depth-first walk over dict/list content blocks, keeping document order
        if isinstance(node, dict):
            pieces: list[str] = []
            if isinstance(node.get("text"), str):
                pieces.append(node["text"])
            nested = node.get("content")
            if isinstance(nested, list):
                for child in nested:
                    pieces.extend(_gather_text(child))
            return pieces
        if isinstance(node, list):
            return [piece for item in node for piece in _gather_text(item)]
        return []

    # (text, ...) tuples: only the first item matters
    if isinstance(response, tuple):
        if len(response) == 0:
            return ""
        response = response[0]

    # Responses API object with an aggregated `output_text`
    try:
        aggregated = getattr(response, "output_text", None)
        if isinstance(aggregated, str) and aggregated.strip():
            return aggregated
    except Exception:
        pass

    # Responses API dict: output -> content blocks
    if isinstance(response, dict) and isinstance(response.get("output"), list):
        pieces = _gather_text(response["output"])  # type: ignore[index]
        if pieces:
            return "\n".join(pieces)

    # SDK-like object with `choices`: message.content (chat), then text (completion)
    try:
        choices = getattr(response, "choices", None)
        if choices:
            head = choices[0]
            message = _lookup(head, "message")
            if message is not None:
                content = _lookup(message, "content")
                if content:
                    return content
            text = _lookup(head, "text")
            if text:
                return text
    except Exception:
        pass

    # plain dict with the common keys
    if isinstance(response, dict):
        direct = response.get("content")
        if isinstance(direct, str):
            return direct
        dict_choices = response.get("choices")
        if dict_choices:
            head = dict_choices[0]
            if isinstance(head, dict):
                msg = head.get("message", {})
                if isinstance(msg, dict) and isinstance(msg.get("content"), str):
                    return msg["content"]
                if isinstance(head.get("text"), str):
                    return head["text"]

    # already a string, or fall back to stringifying
    return response if isinstance(response, str) else str(response)
+ messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] + + raw_response = generate_from_openai_chat_completion( + model="gpt-4-1106-preview", + messages=messages, + temperature=0, + max_tokens=768, + top_p=1.0, + context_length=0, + ) + response = _normalize_openai_response_text(raw_response).lower() + if "partially correct" in response or "incorrect" in response: + return 0.0 + else: + assert "correct" in response + return 1.0 + + +def llm_ua_match(pred: str, reference: str, question: str) -> float: + """Check whether the prediction matches the reference with GPT-turbo""" + messages: list[dict[str, Any]] = [] + # construct the question to ask + message = "" + message += f"task: {question}\n" + message += f"actual unachievable reason: {reference}\n" + message += f"reported unachievable reason: {pred}\n" + message += ( + "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. " + "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, " + "which is listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. " + "Determine if the reported reason aligns with the actual reason, even if implicitly. " + "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'." 
+ ) + messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] + + raw_response = generate_from_openai_chat_completion( + model="gpt-4-1106-preview", + messages=messages, + temperature=0, + max_tokens=768, + top_p=1.0, + context_length=0, + ) + response = _normalize_openai_response_text(raw_response).lower() + if "different" in response: + return 0.0 + else: + assert "same" in response + return 1.0 + + +class PseudoPage: + def __init__(self, original_page: Page, url: str): + self.url = url + self.original_page = original_page + + def __getattr__(self, attr: str) -> Any: + # Delegate attribute access to the original page object + if attr not in ["url"]: + return getattr(self.original_page, attr) + else: + return getattr(self, attr) diff --git a/browsergym/webarenalite/src/browsergym/webarenalite/task.py b/browsergym/webarenalite/src/browsergym/webarenalite/task.py new file mode 100644 index 000000000..501c8a29a --- /dev/null +++ b/browsergym/webarenalite/src/browsergym/webarenalite/task.py @@ -0,0 +1,124 @@ +import importlib.resources +import json +import logging +import tempfile +from typing import Optional + +import playwright.sync_api + +from browsergym.webarena.task import GenericWebArenaTask + + +logger = logging.getLogger(__name__) + + +class WebArenaLiteTask(GenericWebArenaTask): + """ + Base class for all WebArena tasks. 
+ + """ + + def __init__( + self, + seed: int, + task_id: Optional[int] = None, + intent_template_id: Optional[int] = None, + with_na_hint: bool = False, + with_homepage_hint: bool = False, + ): + super().__init__( + seed=seed, + task_id=task_id, + intent_template_id=intent_template_id, + with_na_hint=with_na_hint, + with_homepage_hint=with_homepage_hint, + ) + + all_configs_str = ( + importlib.resources.files("browsergym.webarenalite") + .joinpath("test_webarena_lite.raw.json") + .read_text() + ) + # substitute URLs + for pattern, url_key in { + "__GITLAB__": "gitlab", + "__REDDIT__": "reddit", + "__SHOPPING__": "shopping", + "__SHOPPING_ADMIN__": "shopping_admin", + "__WIKIPEDIA__": "wikipedia", + "__MAP__": "map", + }.items(): + all_configs_str = all_configs_str.replace(pattern, self.webarena_instance.urls[url_key]) + + # load all task configs to JSON + all_configs = json.loads(all_configs_str) + + # keep only the desired task configs + if intent_template_id is not None: + task_configs = [ + conf for conf in all_configs if conf["intent_template_id"] == intent_template_id + ] + if not task_configs: + raise ValueError( + f"Could not find any task config with intent_template_id={intent_template_id}." 
+ ) + + elif task_id is not None: + # use old_task_id to filter configs + task_configs = [conf for conf in all_configs if conf["old_task_id"] == task_id] + if not task_configs: + raise ValueError(f"Could not find any task config with old_task_id={task_id}.") + + self.task_configs = task_configs + + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: + # Using the custom evaluator for WebArena Lite + from .evaluators import evaluator_router + + # pick a task at random + self.config = self.random.choice(self.task_configs) + + # hack: dynamically build a config file to read from + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: + json.dump(self.config, f) + f.flush() + self.config_file = f.name + + # build the evaluator + self.evaluator = evaluator_router(self.config_file) + + # authenticate + for site in self.config["sites"]: + self.webarena_instance.ui_login(site=site, page=page) + + # set geolocation + page.context.set_geolocation(self.config["geolocation"]) + + # navigate to the starting url(s) (might need several pages) + # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/browser_env/envs.py#L150 + if self.config["start_url"]: + start_urls = self.config["start_url"].split(" |AND| ") + for i, url in enumerate(start_urls): + page.goto(url) + if i < len(start_urls) - 1: + page = page.context.new_page() + + # recover goal + goal = self.config["intent"] + + # This note is present in all webarena's agent prompts + # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34 + if self.with_homepage_hint: + goal += f""" + +(Note: if you want to visit other websites, check out the homepage at {self.webarena_instance.home_url}. It has a list of websites you can visit. {self.webarena_instance.home_url}/password.html lists all the account name and password for the websites. You can use them to log in to the websites.) 
+""" + + # This note is present in some of webarena's agent prompts + if self.with_na_hint: + goal += """\ + +If you believe the task is impossible to complete, provide the answer "N/A". +""" + + return goal, {} diff --git a/browsergym/webarenalite/src/browsergym/webarenalite/test_webarena_lite.raw.json b/browsergym/webarenalite/src/browsergym/webarenalite/test_webarena_lite.raw.json new file mode 100644 index 000000000..adb1822c0 --- /dev/null +++ b/browsergym/webarenalite/src/browsergym/webarenalite/test_webarena_lite.raw.json @@ -0,0 +1,5838 @@ +[ + { + "sites": [ + "shopping_admin" + ], + "task_id": 0, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product in {{period}}", + "instantiation_dict": { + "n": 3, + "period": "Jan 2023" + }, + "intent": "What are the top-3 best-selling product in Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Impulse Duffle", + "Overnight Duffle", + "Hawkeye Yoga Short-32-Blue" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Impulse Duffle, Overnight Duffle, Hawkeye Yoga Short-32-Blue" + }, + "intent_template_id": 279, + "old_task_id": 4 + }, + { + "sites": [ + "map" + ], + "task_id": 1, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international airports", + "start": "Carnegie Mellon University", + "radius": "50 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University", + "require_reset": false, + "eval": { + 
"eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" + }, + "intent_template_id": 79, + "old_task_id": 7 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 2, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "best" + }, + "intent": "Tell me the the number of reviews that our store received by far that mention term \"best\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288, + "old_task_id": 15 + }, + { + "sites": [ + "map" + ], + "task_id": 3, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the difference in time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "Randyland", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 13min", + "walking: 1h 45min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "driving: 13min, walking: 1h 45min." + }, + "intent_template_id": 73, + "old_task_id": 20 + }, + { + "sites": [ + "shopping" + ], + "task_id": 4, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html", + "geolocation": null, + "intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "good fingerprint resistant" + }, + "intent": "List out reviewers, if exist, who mention about good fingerprint resistant", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rachel", + "T. Gannon" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rachel, T. Gannon, " + }, + "intent_template_id": 222, + "old_task_id": 23 + }, + { + "sites": [ + "reddit" + ], + "task_id": 5, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "Showerthoughts" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33, + "old_task_id": 27 + }, + { + "sites": [ + "map" + ], + "task_id": 6, + "require_login": true, + 
"storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hilton hotel", + "information": "the shortest walking distance", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "1.4km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 1.4km" + }, + "intent_template_id": 78, + "old_task_id": 33 + }, + { + "sites": [ + "map" + ], + "task_id": 7, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "police station", + "location": "gates building at CMU" + }, + "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Yes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77, + "old_task_id": 37 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 8, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": 
"__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "3" + }, + "intent": "List the top 3 search terms in my store", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "hollister", + "Joust Bag", + "nike" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister, Joust Bag, nike" + }, + "intent_template_id": 285, + "old_task_id": 43 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 9, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Check out my todos", + "instantiation_dict": {}, + "intent": "Check out my todos", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/todos", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 303, + "old_task_id": 44 + }, + { + "sites": [ + "shopping" + ], + "task_id": 10, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past three days" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0 order", + "$0 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0 order, $0 total spend" + }, + "intent_template_id": 197, + "old_task_id": 48 + }, + { + "sites": [ + "map" + ], + "task_id": 11, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Museum of Art", + "end": "a library at CMU" + }, + "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "11 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "11 min" + }, + "intent_template_id": 68, + "old_task_id": 56 + }, + { + "sites": [ + "map" + ], + "task_id": 12, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "cafe", + "place2": "CMU Hunt library" + }, + "intent": "Tell me the closest cafe(s) to CMU Hunt library", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "De Fer Coffee & Tea" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "De Fer Coffee & Tea" + }, + "intent_template_id": 69, + "old_task_id": 58 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 13, + "require_login": true, + "storage_state": 
"./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "fifth most" + }, + "intent": "Which customer has completed the fifth most number of orders in the entire history?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Matt Baker" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Matt Baker" + }, + "intent_template_id": 276, + "old_task_id": 65 + }, + { + "sites": [ + "reddit" + ], + "task_id": 14, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "is there any post talks about supporting local book stores? If so, tell me the organizations involved" + }, + "intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? 
If so, tell me the organizations involved", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "bookshop.org" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "bookshop.org" + }, + "intent_template_id": 17, + "old_task_id": 69 + }, + { + "sites": [ + "map" + ], + "task_id": 15, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Chatham University" + }, + "intent": "What is the zip code of Chatham University?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "15232" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "15232" + }, + "intent_template_id": 70, + "old_task_id": 71 + }, + { + "sites": [ + "map" + ], + "task_id": 16, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Massachusetts Institute of Technology", + "Harvard University", + "Boston Logan International Airport" + ] + }, + "intent": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" + }, + "intent_template_id": 65, + "old_task_id": 75 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 17, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Pending" + }, + "intent": "What is the total count of Pending reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5" + }, + "intent_template_id": 277, + "old_task_id": 77 + }, + { + "sites": [ + "map" + ], + "task_id": 18, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Massachusetts Institute of Technology", + "place_B": "Harvard University", + "place_C": "Boston Logan International Airport" + }, + "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": [ + "64 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "63 min" + }, + "intent_template_id": 72, + "old_task_id": 82 + }, + { + "sites": [ + "map" + ], + "task_id": 19, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "Homewood Suites Southpointe", + "place": "PPG Paints Arena" + }, + "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "34 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "34 minutes" + }, + "intent_template_id": 64, + "old_task_id": 88 + }, + { + "sites": [ + "map" + ], + "task_id": 20, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "New Hampshire" + }, + "intent": "Which US states border New Hampshire?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Massachusetts", + "Vermont", + "Maine" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Massachusetts, Vermont, Maine" + }, + "intent_template_id": 67, + "old_task_id": 93 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 21, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Telll me the grand total of invoice {{id}}.", + "instantiation_dict": 
{ + "id": "000000002" + }, + "intent": "Telll me the grand total of invoice 000000002.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "39.64" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$39.64" + }, + "intent_template_id": 274, + "old_task_id": 95 + }, + { + "sites": [ + "shopping" + ], + "task_id": 22, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the status of my latest order and when will it arrive", + "instantiation_dict": {}, + "intent": "Tell me the status of my latest order and when will it arrive", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The last order was canceled. It will never arrive." + ] + }, + "reference_url": "", + "program_html": [], + "reference_answer_raw_annotation": "The last order was canceled. 
It will never arrive.", + "string_note": "" + }, + "intent_template_id": 193, + "old_task_id": 96 + }, + { + "sites": [ + "map", + "wikipedia" + ], + "task_id": 23, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "instantiation_dict": {}, + "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "914" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "914 km" + }, + "intent_template_id": 120, + "old_task_id": 97 + }, + { + "sites": [ + "map" + ], + "task_id": 24, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "tea cafe", + "start": "University of Pittsburgh" + }, + "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Fuku Tea", + "3716", + "Forbes Avenue", + "Central Oakland", + "Pittsburgh", + "653m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n653m" + }, + "intent_template_id": 66, + "old_task_id": 98 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 25, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + 
"geolocation": null, + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "questions", + "repo": "kkroening/ffmpeg-python" + }, + "intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?sort=created_date&state=opened&label_name%5B%5D=question&first_page_size=20", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349, + "old_task_id": 103 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 26, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from Jan to December 2022" + }, + "intent": "Presents the monthly count of successful orders from Jan to December 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "01:11", + "02:16", + "03:14", + "04:8", + "05:8", + "06:13", + "07:9", + "08:8", + "09:10", + "10:4", + "11:5", + "12:10" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "01:11 02:16 03:14 04:8 05:8 06:13 07:9 08:8 09:10 10:4 11:5 12:10" + }, + "intent_template_id": 270, + "old_task_id": 109 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 27, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}", + 
"instantiation_dict": { + "product": "Chloe tank" + }, + "intent": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no negative review for Chloe tank", + "reference_answer_raw_annotation": "" + }, + "intent_template_id": 245, + "old_task_id": 115 + }, + { + "sites": [ + "shopping" + ], + "task_id": 28, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the date when I made my first purchase on this site?", + "instantiation_dict": {}, + "intent": "What is the date when I made my first purchase on this site?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3/2/22" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3/2/22" + }, + "intent_template_id": 161, + "old_task_id": 117 + }, + { + "sites": [ + "shopping" + ], + "task_id": 29, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "instantiation_dict": {}, + "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "", + "required_contents": { + "must_include": [ + "jaw bruxism", + "mouth guard" + ] + } + } + ] + }, + "intent_template_id": 151, + "old_task_id": 118 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 30, + 
"require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Circe's products" + }, + "intent": "Tell me the reasons why customers like Circe's products", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Warm and comfortable. True to size." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Warm and comfortable. True to size." + }, + "intent_template_id": 250, + "old_task_id": 123 + }, + { + "sites": [ + "shopping" + ], + "task_id": 31, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": { + "product": "teeth grinding mouth guard" + }, + "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1.46", + "85" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$1.46 - $85" + }, + "intent_template_id": 159, + "old_task_id": 125 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 32, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What brands appear most frequently among the top search terms?", + "instantiation_dict": {}, + "intent": "What brands appear most frequently among the top search terms?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + 
"Hollister" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hollister" + }, + "intent_template_id": 1001, + "old_task_id": 127 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 33, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "7" + }, + "intent": "What's the total number of items sold in the most recent 7 orders?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "25" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "25" + }, + "intent_template_id": 1002, + "old_task_id": 131 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 34, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Eric and Kilian", + "repo": "a11yproject", + "date": "1/3/2023" + }, + "intent": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 322, + "old_task_id": 135 + }, + { + "sites": [ + "map" + ], + "task_id": 35, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the hometown of Joe Biden", + "city2": 
"Bridgeport" + }, + "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3h 20min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3h 20min" + }, + "intent_template_id": 51, + "old_task_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 36, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "food", + "time": "from mid Jan to the end Jan 2023" + }, + "intent": "How much I spent on food shopping during from mid Jan to the end Jan 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 162, + "old_task_id": 144 + }, + { + "sites": [ + "shopping" + ], + "task_id": 37, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "color", + "product": "artifical plants", + "time": "Feb 2023" + }, + "intent": "What is the color configuration of the artifical plants I bought Feb 2023", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Green-vines" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Green-vines" + }, + "intent_template_id": 155, + "old_task_id": 149 + }, + { + 
"sites": [ + "map" + ], + "task_id": 38, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "Animal Rescue League of Pittsburgh", + "location2": "Schenley park" + }, + "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "9min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9min" + }, + "intent_template_id": 36, + "old_task_id": 155 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 39, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Checkout merge requests assigned to me", + "instantiation_dict": {}, + "intent": "Checkout merge requests assigned to me", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 290, + "old_task_id": 156 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 40, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show all customers", + "instantiation_dict": {}, + "intent": "Show all customers", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/customer/index/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 255, + "old_task_id": 157 + }, + { + "sites": [ + 
"shopping" + ], + "task_id": 41, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 40 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 40 cards", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171, + "old_task_id": 162 + }, + { + "sites": [ + "shopping" + ], + "task_id": 42, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html", + "geolocation": null, + "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The wireless connection works on a whim (about 40% of the time I've owned it)", + "It seems to constantly run out of ink", + "Cartridge prices are less than some printers I've had", + "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason", + "Scanner is as slow as my first scanner I ever owned in the mid-90's", + "For the $176 I paid, there isn't even a fax component on it. 
I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had, but now I understand why. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." 
+ }, + "intent_template_id": 136, + "old_task_id": 167 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 43, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "the most" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got the most stars?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "a11yproject.com", + "design" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "a11yproject.com, Primer/design" + }, + "intent_template_id": 289, + "old_task_id": 169 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 44, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "better" + }, + "intent": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": [ + "No, it is open" + ] + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/8", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "", + "url_note": "GOLD in PRED" + }, + "intent_template_id": 310, + "old_task_id": 173 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 45, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open my latest created 
issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "homepage content" + }, + "intent": "Open my latest created issue that has homepage content in its title to check if it is closed", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "exact_match": "Yes" + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", + "program_html": [], + "reference_answer_raw_annotation": "closed", + "string_note": "" + }, + "intent_template_id": 500, + "old_task_id": 182 + }, + { + "sites": [ + "shopping" + ], + "task_id": 46, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "complete" + }, + "intent": "Tell me the total cost of my latest complete order?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "65.32" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "65.32" + }, + "intent_template_id": 214, + "old_task_id": 190 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 47, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Compare the payment difference of the last {{N}} {{status_1}} orders and {{status_2}} orders", + "instantiation_dict": { + "status_1": "cancelled", + "status_2": "completed", + "N": "4" + }, + "intent": "Compare the payment difference of the last 4 cancelled orders and completed orders", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "194.25" + ] + }, + "reference_url": "", + "program_html": [], + 
"string_note": "", + "reference_answer_raw_annotation": "194.25" + }, + "intent_template_id": 367, + "old_task_id": 196 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 48, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "date", + "status": "most recent canlled" + }, + "intent": "Get the date of the most recent canlled order", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "May 23 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "May 23, 2023" + }, + "intent_template_id": 366, + "old_task_id": 202 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 49, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "date": "3/5/2023" + }, + "intent": "How many commits did kilian make on 3/5/2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 320, + "old_task_id": 205 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 50, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "8015551212" + }, + "intent": "Find the customer name and email with phone number 
8015551212", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sean Miller", + "sean.miller@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sean Miller, sean.miller@gmail.com" + }, + "intent_template_id": 364, + "old_task_id": 211 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 51, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Circe ice fleece" + }, + "intent": "What are the key aspects that the customers don't like about Circe ice fleece", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": ["N/A"] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 249, + "old_task_id": 215 + }, + { + "sites": [ + "map" + ], + "task_id": 52, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "Gardner Steel Conference Center,", + "n": 5 + }, + "intent": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Wyndham Pittsburgh University Cente: 375m", + "The Oaklander Hotel: 338m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Wyndham Pittsburgh University Cente: 375 m\nThe Oaklander 
Hotel: 338 m" + }, + "intent_template_id": 41, + "old_task_id": 220 + }, + { + "sites": [ + "map" + ], + "task_id": 53, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to the nearest {{location}} with different transportation methods?", + "instantiation_dict": { + "location": "USPS postal office" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Walk: 1 minute", + "Drive: less than 1 minute", + "Bike: less than 1 minute" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Walk: 1 minute to walk and\nDrive: less than 1 minute\nBike: less than 1 minute" + }, + "intent_template_id": 35, + "old_task_id": 221 + }, + { + "sites": [ + "shopping" + ], + "task_id": 54, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What do customers say about {{product_type}} from {{manufature}}", + "instantiation_dict": { + "product_type": "brush", + "manufature": "sephora" + }, + "intent": "What do customers say about brush from sephora", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "No reviews available for the sephora brushes." 
+ ] + }, + "reference_url": "", + "program_html": [], + "string_note": "The sephora brushes don't have reviews", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 135, + "old_task_id": 225 + }, + { + "sites": [ + "shopping" + ], + "task_id": 55, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "EYZUTAK" + }, + "intent": "What is the price range for products from EYZUTAK?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$9.99" + }, + "intent_template_id": 370, + "old_task_id": 227 + }, + { + "sites": [ + "shopping" + ], + "task_id": 56, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "under delivery" + }, + "intent": "Get the order number of my most recent under delivery order ", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "N/A" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "There is no under delivery order" + }, + "intent_template_id": 213, + "old_task_id": 235 + }, + { + "sites": [ + "map" + ], + "task_id": 57, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", + "instantiation_dict": { + "location": "pharmacy", + "location2": "Carnegie Mellon", + "condition": "I can walk within 
20mins" + }, + "intent": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Schiller's Pharmacy", + "811", + "South Aiken Avenue", + "Shadyside", + "Pittsburgh" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Schiller's Pharmacy, 811, South Aiken Avenue, Shadyside, Pittsburgh, Allegheny County, 15232, United States" + }, + "intent_template_id": 39, + "old_task_id": 236 + }, + { + "sites": [ + "shopping" + ], + "task_id": 58, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "competative swimwear" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from competative swimwear category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price&product_list_dir=desc", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138, + "old_task_id": 240 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 59, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "email address", + "product": "the style of Zoe products" + }, + "intent": "Show me the email address of the customer who is the most unhappy with the 
style of Zoe products", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "N/A" + }, + "reference_url": "Valorie doesn't have a email in the system", + "program_html": [], + "string_note": "There is no negative review for Zoe products", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 244, + "old_task_id": 247 + }, + { + "sites": [ + "map" + ], + "task_id": 60, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Apple Store near Pitt" + }, + "intent": "Tell me the coordinates of Apple Store near Pitt in DD format", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.451", + "-79.933" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4511693, -79.9334241" + }, + "intent_template_id": 46, + "old_task_id": 250 + }, + { + "sites": [ + "map" + ], + "task_id": 61, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Western Pennsylvania Hospital", + "information": "phone number" + }, + "intent": "What is the phone number of Western Pennsylvania Hospital", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "+1 412 578 5000" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4125785000" + }, + "intent_template_id": 501, + "old_task_id": 254 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 62, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", 
+ "geolocation": null, + "intent_template": "See all public projects", + "instantiation_dict": {}, + "intent": "See all public projects", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/explore", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 325, + "old_task_id": 258 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 63, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Get me my RSS feed token", + "instantiation_dict": {}, + "intent": "Get me my RSS feed token", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "TMN_bBn9Z48qVbUFZV45" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "TMN_bBn9Z48qVbUFZV45" + }, + "intent_template_id": 312, + "old_task_id": 259 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 64, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?", + "instantiation_dict": { + "city": "Vinalhaven, ME" + }, + "intent": "What's the closest national park to Vinalhaven, ME? 
How long does it take to bike there?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park" + ], + "fuzzy_match": [ + "10h 33min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n10h 33min" + }, + "intent_template_id": 85, + "old_task_id": 268 + }, + { + "sites": [ + "shopping" + ], + "task_id": 65, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "30", + "product_category": "men shoes" + }, + "intent": "Show me products under $30 in \"men shoes\" category", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139, + "old_task_id": 270 + }, + { + "sites": [ + "shopping" + ], + "task_id": 66, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "switch accessories" + }, + "intent": "Search for \"switch accessories\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212, + "old_task_id": 276 + }, + { + "sites": [ + "shopping" + ], + "task_id": 67, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": 
"Look up the most recent models of XBox controllers released between 2020-2021?", + "instantiation_dict": {}, + "intent": "Look up the most recent models of XBox controllers released between 2020-2021?", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 210, + "old_task_id": 283 + }, + { + "sites": [ + "shopping" + ], + "task_id": 68, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "switch card holder", + "min_storage": "15 cards" + }, + "intent": "Show the least expensive switch card holder with a minimum storage capacity of 15 cards.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207, + "old_task_id": 285 + }, + { + "sites": [ + "map" + ], + "task_id": 69, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?", + "instantiation_dict": {}, + "intent": "How much time does it take from Pittsburgh to Philadelphia by car?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "8h 33min" + ] + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5h 47min" + }, + "intent_template_id": 47, + "old_task_id": 287 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 70, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "name" + }, + "intent": "Tell me the name of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Samantha Jones" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Samantha Jones" + }, + "intent_template_id": 234, + "old_task_id": 288 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 71, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "the best GAN python implementation" + }, + "intent": "Show me the command to clone the best GAN python implementation with SSH.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "ssh://git@localhost:2222/eriklindernoren/PyTorch-GAN.git" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "ssh://git@localhost:2222/eriklindernoren/PyTorch-GAN.git" + }, + "intent_template_id": 329, + "old_task_id": 296 + }, + { + "sites": [ + "shopping" + ], + "task_id": 72, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order", 
+ "instantiation_dict": { + "status": "pending" + }, + "intent": "Show the most recent pending order", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/189/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180, + "old_task_id": 300 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 73, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "Pytorch GAN" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Erik Linder-Norén" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Erik Linder-Norén" + }, + "intent_template_id": 323, + "old_task_id": 311 + }, + { + "sites": [ + "shopping" + ], + "task_id": 74, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Which number to call for the customer service?", + "instantiation_dict": {}, + "intent": "Which number to call for the customer service?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no phone number in the website", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 134, + "old_task_id": 313 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 75, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", 
+ "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "2019-nCov", + "attribute": "last names" + }, + "intent": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lo", + "Chen", + "Chu" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lo, Chen, Chu" + }, + "intent_template_id": 324, + "old_task_id": 318 + }, + { + "sites": [ + "shopping" + ], + "task_id": 76, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", + "instantiation_dict": { + "time": "2022" + }, + "intent": "How much refund I should expect from my order canlled in 2022, including shipping fee", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "3053.97" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3053.97" + }, + "intent_template_id": 160, + "old_task_id": 321 + }, + { + "sites": [ + "shopping" + ], + "task_id": 77, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "chairs", + "sorting_order": "ascending price" + }, + "intent": "Show me the \"chairs\" listings by ascending price.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + 
"reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208, + "old_task_id": 324 + }, + { + "sites": [ + "shopping" + ], + "task_id": 78, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much did I spend on shopping at One Stop Market {{time}}? They gave me a 20% discount on the total amount for orders exceeding $200 in cash", + "instantiation_dict": { + "time": "on November 2022" + }, + "intent": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "359.546" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "359.546" + }, + "intent_template_id": 147, + "old_task_id": 333 + }, + { + "sites": [ + "shopping" + ], + "task_id": 79, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "body butter" + }, + "intent": "Tell me when I last ordered my body butter?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January 16th 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January 16th 2023" + }, + "intent_template_id": 169, + "old_task_id": 335 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 80, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": 
"__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "in May 2023" + }, + "intent": "How many reviews our shop received in May 2023?", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 248, + "old_task_id": 348 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 81, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Who else have access to my repo {{repo}}, show me their usernames", + "instantiation_dict": { + "repo": "gimmiethat.space" + }, + "intent": "Who else have access to my repo gimmiethat.space, show me their usernames", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": "yjlou" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "yjlou" + }, + "intent_template_id": 298, + "old_task_id": 349 + }, + { + "sites": [ + "shopping" + ], + "task_id": 82, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "living room furtniture", + "order": "descending" + }, + "intent": "List products from living room furtniture category by descending price", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", + "program_html": [], + "url_note": "GOLD in 
PRED" + }, + "intent_template_id": 137, + "old_task_id": 354 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 83, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Checkout merge requests requiring my review", + "instantiation_dict": {}, + "intent": "Checkout merge requests requiring my review", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 291, + "old_task_id": 357 + }, + { + "sites": [ + "shopping" + ], + "task_id": 84, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "order statuses", + "order_number": "170 and 189" + }, + "intent": "Show me the order statuses for order number 170 and 189.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "170: cancelled", + "189: pending" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "170: cancelled, 189: pending" + }, + "intent_template_id": 206, + "old_task_id": 361 + }, + { + "sites": [ + "map" + ], + "task_id": 85, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon University", + "location/address_2": "CVS (closet one)" + }, + "intent": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking", + "require_reset": false, + 
"eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "1.4km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.4km" + }, + "intent_template_id": 58, + "old_task_id": 367 + }, + { + "sites": [ + "shopping" + ], + "task_id": 86, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "find discounted items.", + "instantiation_dict": {}, + "intent": "find discounted items.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no function to show only discount items", + "reference_answer_raw_annotation": "There is no function to show only discount items." + }, + "intent_template_id": 188, + "old_task_id": 368 + }, + { + "sites": [ + "map" + ], + "task_id": 87, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Carnegie Music Hall" + }, + "intent": "Pull up the description page of Carnegie Music Hall on Map", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Music Hall" + ] + } + } + ] + }, + "intent_template_id": 52, + "old_task_id": 369 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 88, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Preview the {{name}} theme for my shop", + 
"instantiation_dict": { + "name": "Magento Blank" + }, + "intent": "Preview the Magento Blank theme for my shop", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 266, + "old_task_id": 374 + }, + { + "sites": [ + "shopping" + ], + "task_id": 89, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Summarize customer reviews for {{product}}.", + "instantiation_dict": { + "product": "Amazon Echo Dot 3rd generation" + }, + "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Serena: It was the exact same one that came with my echo show that my puppy", + "Amazon Customer: It doesn’t work .", + "BlessedGirl: Great replacement for Echo Dot.", + "Jeff Sederquist: Great product no issues. It's for a Echo show 5", + "melvin m.: Very good charger but I had to buy this one because when I bought an echo show 5, I came with an echo show 10 charger in the package with the device", + "Jack Smith: Looks like pictures", + "Ignacio: Muy bueno de buena calidad", + "BusyB: My kids bent the original cord for their Echo show 5 and this replacement cord is just like the one it came with. It works just fine. Been using it for 2 weeks now and no problems.", + "Kee Williams: Great value works better than original", + "Howard A. Sinclair: Great product!!! Works exactly as it is supposed to." 
+ ] + }, + "reference_url": "", + "program_html": [] + }, + "intent_template_id": 182, + "old_task_id": 376 + }, + { + "sites": [ + "map" + ], + "task_id": 90, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "Carnegie Music Hall", + "space": "hotel" + }, + "intent": "Find the hotel around Carnegie Music Hall", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59, + "old_task_id": 381 + }, + { + "sites": [ + "map" + ], + "task_id": 91, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "instantiation_dict": {}, + "intent": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "N/A" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no USCIS nearby", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 781, + "old_task_id": 382 + }, + { + "sites": [ + "map" + ], + "task_id": 92, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am arriving at Pittsburgh Airport. 
Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", + "instantiation_dict": {}, + "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport", + "Giant Eagle", + "ALDI" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Giant Eagle, ALDI" + }, + "intent_template_id": 782, + "old_task_id": 383 + }, + { + "sites": [ + "shopping" + ], + "task_id": 93, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases", + "instantiation_dict": {}, + "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Lee", + "Evelyn Kurver", + "Amanda", + "N Randall" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lisa Lee, Evelyn Kurver, Amanda, N Randall" + }, + "intent_template_id": 666, + "old_task_id": 384 + }, + { + "sites": [ + "shopping" + ], + "task_id": 94, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the rating of {{product}}", + "instantiation_dict": { + "product": "Ugreen lightning to 3.5mm cable" + }, + "intent": "What is the rating of Ugreen lightning to 3.5mm cable. 
Please round to the nearest whole number", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "65 |OR| 3" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Around 3.1 stars" + }, + "intent_template_id": 1355, + "old_task_id": 386 + }, + { + "sites": [ + "shopping" + ], + "task_id": 95, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": { + "stars": "4 or 5 stars" + }, + "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "MH", + "Misba009", + "Amanda", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "MH, Misba009, Amanda, Amazon Customer, Cally, Bethany Robertson, Bethany Robertson" + }, + "intent_template_id": 1356, + "old_task_id": 387 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 96, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "color ulitity", + "repo": "a11yproject.com", + "content": "Good idea" + }, + "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1071", + "program_html": [ + { + "url": "last", + "locator": 
"document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "Good idea" + } + } + ] + }, + "intent_template_id": 348, + "old_task_id": 392 + }, + { + "sites": [ + "reddit" + ], + "task_id": 97, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Seeking SDE positions" + }, + "intent": "Change my reddit bio to \"Seeking SDE positions\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Seeking SDE positions" + } + } + ] + }, + "intent_template_id": 6, + "old_task_id": 401 + }, + { + "sites": [ + "reddit" + ], + "task_id": 98, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Upvote the newest post in books subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/books/124260/adults-reading-to-each-other-out-loud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22, + "old_task_id": 404 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 99, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + 
"start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "verification functions" + }, + "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/merge_requests/40", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "@davepgreene" + } + } + ] + }, + "intent_template_id": 360, + "old_task_id": 415 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 100, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Enjoying life" + }, + "intent": "Set my gitlab status as Enjoying life.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Enjoying life" + } + } + ] + }, + "intent_template_id": 361, + "old_task_id": 419 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 101, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": 
"Mark all {{brand}} shirts on sale", + "instantiation_dict": { + "brand": "Hollister" + }, + "intent": "Mark all Hollister shirts on sale", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/126/", + "locator": "document.querySelector('input[name=\"product[sale]\"]').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 237, + "old_task_id": 423 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 102, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks" + }, + "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Somerset County" + ] + } + } + ] + }, + "intent_template_id": 371, + "old_task_id": 426 + }, + { + "sites": [ + "shopping" + ], + "task_id": 103, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html |AND| __SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html |AND| 
__SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "SPAAS White Taper Candles - 4 Pack |OR| 10 Inch Tall Candles, Scent-Free Premium Wax Candle Sticks |OR| 8 Hour Long Burning White Candlesticks for Home Decoration, Wedding, Holiday and Parties" + ] + } + } + ] + }, + "intent_template_id": 145, + "old_task_id": 431 + }, + { + "sites": [ + "shopping" + ], + "task_id": 104, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a make up removal kit", + "time": "during summer 2022" + }, + "intent": "I previously ordered some a make up removal kit during summer 2022 and later cancelled. 
Can you reorder it for me?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B0738JQG6Q" + ] + } + } + ] + }, + "intent_template_id": 156, + "old_task_id": 440 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 105, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "https://egg.tart.com" + }, + "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "egg.tart.com" + } + } + ] + }, + "intent_template_id": 331, + "old_task_id": 448 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 106, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Ryker Tee Crew Neck" + }, + "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/478/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + 
"required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242, + "old_task_id": 454 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 107, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "$5", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by $5", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "27.00" + } + } + ] + }, + "intent_template_id": 247, + "old_task_id": 458 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 108, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "Update the product description of Antonia Racer Tank to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../antonia-racer-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "This is in regular rotation at the gym", + "Its colorful and looks kinda 
cute under my exercise tanks", + "it's very stylish for yoga or something else low impact" + ] + } + } + ] + }, + "intent_template_id": 251, + "old_task_id": 464 + }, + { + "sites": [ + "shopping" + ], + "task_id": 109, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" + }, + "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" + ] + } + } + ] + }, + "intent_template_id": 186, + "old_task_id": 466 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 110, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "302" + }, + "intent": "Cancel order 302", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257, + "old_task_id": 470 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 111, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + 
"intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_llm_reading" + }, + "intent": "Set up a new, empty repository with the name awesome_llm_reading?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_llm_reading", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_llm_reading" + ] + } + } + ] + }, + "intent_template_id": 292, + "old_task_id": 476 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 112, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Vinta" + }, + "intent": "Vinta wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'vinta')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294, + "old_task_id": 485 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 113, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Home Page", + "heading": "This is the home page!! Leave here!!" + }, + "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! 
Leave here!!\".", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/2/", + "locator": "document.querySelector('input[name=\"title\"]').value", + "required_contents": { + "exact_match": "This is the home page!! Leave here!!" + } + } + ] + }, + "intent_template_id": 275, + "old_task_id": 488 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 114, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Sarah Miller", + "message": "the order is ready to be shipped soon!" + }, + "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "System message: We cannot add order history." 
+ }, + "intent_template_id": 280, + "old_task_id": 491 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 115, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "24353446464", + "order": "307", + "service": "DHL" + }, + "intent": "Update order #307 with the DHL tracking number 24353446464", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/307/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 24353446464 for DHL assigned" + ] + } + } + ] + }, + "intent_template_id": 284, + "old_task_id": 497 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 116, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Aeno capri" + }, + "intent": "Make all Aeno capri as out of stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1861/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287, + "old_task_id": 505 + }, + { + "sites": [ + "shopping" + ], + "task_id": 117, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest 
rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "meat substitute", + "dollar_value": "between 100 and 200" + }, + "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B01CTR3DLE" + ] + } + } + ] + }, + "intent_template_id": 172, + "old_task_id": 506 + }, + { + "sites": [ + "shopping" + ], + "task_id": 118, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "instantiation_dict": { + "category": "Men's shoe" + }, + "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B01J4MM3KO" + ] + } + } + ] + }, + "intent_template_id": 216, + "old_task_id": 509 + }, + { + "sites": [ + "shopping" + ], + "task_id": 119, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "white desk" + }, + "intent": "Add a white 
desk to my wish list.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "white", + "desk" + ] + } + } + ] + }, + "intent_template_id": 189, + "old_task_id": 514 + }, + { + "sites": [ + "shopping" + ], + "task_id": 120, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch" + ] + } + } + ] + }, + "intent_template_id": 196, + "old_task_id": 516 + }, + { + "sites": [ + "shopping" + ], + "task_id": 121, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Subscribe to the newsletter of OneStopMarket", + "instantiation_dict": {}, + "intent": "Subscribe to the newsletter of OneStopMarket", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/newsletter/manage/", + "locator": "document.querySelector('[title=\"General Subscription\"').checked.toString()", + "required_contents": { + "exact_match": "true" + } + } + ] + }, + 
"intent_template_id": 199, + "old_task_id": 521 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 122, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "eight" + }, + "intent": "Star the top eight most stared repos in Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync", + "Pytorch-GAN", + "administrate", + "keycloak" + ] + } + } + ] + }, + "intent_template_id": 354, + "old_task_id": 524 + }, + { + "sites": [ + "shopping" + ], + "task_id": 123, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": { + "product": "phone screen protector", + "time": "March 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What’s on your mind?\"').value", + "required_contents": { + "fuzzy_match": [ + "refund", + "it broke after three days of use", + "000000180", + "12.99" + ] + } + } + ] + }, + "intent_template_id": 154, + "old_task_id": 528 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 124, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub Klinkovský", + "Koushik", + "Vinta Chen" + ] + }, + "intent": "Follow ['Jakub Klinkovský', 'Koushik', 'Vinta Chen'] on Gitlab", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@koush", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 330, + "old_task_id": 534 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 125, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "299", + "address": "456 Oak Avenue, New York, NY, 10001" + }, + "intent": "Modify the address of order #299 to 456 Oak Avenue, New York, NY, 10001", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299", + 
"locator": "", + "required_contents": { + "must_include": [ + "456 Oak Avenue", + "New York", + "10001" + ] + } + } + ] + }, + "intent_template_id": 240, + "old_task_id": 538 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 126, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "color", + "value": "blue", + "base_setting": "size S and M", + "product": "Frankie Sweatshirt" + }, + "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/110/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Sweatshirt-M-Blue", + "Sweatshirt-S-Blue" + ] + } + } + ] + }, + "intent_template_id": 252, + "old_task_id": 548 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 127, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "Do it myself", + "num": 8 + }, + "intent": "create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/Do-it-myself/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + 
"__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question", + "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "__REDDIT__/f/DIY/118866/paver-base-for-shed" + ] + } + } + ] + }, + "intent_template_id": 88, + "old_task_id": 566 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 128, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Jakub Klinkovský and BenoĆ®t Blanchon", + "repo": "gimmiethat.space" + }, + "intent": "Invite Jakub Klinkovský and BenoĆ®t Blanchon as collaborator to gimmiethat.space repo", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 293, + "old_task_id": 567 + }, + { + "sites": [ + "shopping" + ], + "task_id": 129, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "111 Magnolia Path, Atlanta, GA, 30303" + }, + "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information 
on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "111 Magnolia Path", + "Atlanta, Georgia, 30303" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "111 Magnolia Path", + "Atlanta, Georgia, 30303" + ] + } + } + ] + }, + "intent_template_id": 165, + "old_task_id": 574 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 130, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "GitHub timeline item management extension", + "role": "maintainer", + "user_list": [ + "abisubramanya27", + "lahwaacz" + ] + }, + "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Maintainer" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Maintainer" + ] + } + } + ] + 
}, + "intent_template_id": 351, + "old_task_id": 577 + }, + { + "sites": [ + "reddit" + ], + "task_id": 131, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "Cyberpunk", + "description": "Welcome to the future", + "sidebar_list": [ + "Games", + "Books", + "Movies", + "Future" + ] + }, + "intent": "Create a new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Cyberpunk/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Welcome to the future" + ] + } + }, + { + "url": "__REDDIT__/f/Cyberpunk/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "Games", + "Books", + "Movies", + "Future" + ] + } + } + ] + }, + "intent_template_id": 7, + "old_task_id": 582 + }, + { + "sites": [ + "reddit" + ], + "task_id": 132, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "machine learning" + }, + "intent": "Open the thread of a trending post on the forum \"machine learning\" and subscribe.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/MachineLearning", + "program_html": [ + { + "url": "__REDDIT__/", + 
"locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "MachineLearning" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4, + "old_task_id": 599 + }, + { + "sites": [ + "reddit" + ], + "task_id": 133, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "is car necessary in NYC" + }, + "intent": "Post my question, \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "is car necessary in NYC" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765, + "old_task_id": 601 + }, + { + "sites": [ + "reddit" + ], + "task_id": 134, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "gaming consoles", + "question": "what is the recommended console to buy these days" + }, + "intent": "Find a subreddit focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/consoles", + "program_html": [ + { + "url": 
"func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the recommended console to buy these days" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5, + "old_task_id": 605 + }, + { + "sites": [ + "reddit" + ], + "task_id": 135, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "big little lies", + "content": "can't stop it" + }, + "intent": "Post a review of my recent reading \"big little lies\" in the r/books with my comment \"can't stop it\".", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "big little lies", + "can't stop it" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9, + "old_task_id": 612 + }, + { + "sites": [ + "reddit" + ], + "task_id": 136, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "costume contest", + "subreddit": "funny" + }, + "intent": "Re-post the image of costume contest in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + 
"program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11, + "old_task_id": 619 + }, + { + "sites": [ + "reddit" + ], + "task_id": 137, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Iphone 14" + }, + "intent": "Create a discussion post about \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Iphone 14" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13, + "old_task_id": 626 + }, + { + "sites": [ + "reddit" + ], + "task_id": 138, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of 
{{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "running shoes", + "price": "$100", + "subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "100", + "running", + "shoes" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15, + "old_task_id": 631 + }, + { + "sites": [ + "reddit" + ], + "task_id": 139, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Harry Potter", + "date": "July 8th", + "subreddit": "books" + }, + "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "fuzzy_match": [ + "Harry Potter", + "July 8th", + "virtual meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16, + "old_task_id": 641 + }, + { + "sites": [ + "reddit" + ], + "task_id": 140, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit 
about what could machine learning help the correpong field.", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Post in books subreddit about what could machine learning help the correpong field.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "fuzzy_match": [ + "machine learning", + "help" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19, + "old_task_id": 645 + }, + { + "sites": [ + "reddit" + ], + "task_id": 141, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", + "geolocation": null, + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "???" + }, + "intent": "Reply to the post with my comment \"???\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "???" 
+ } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23, + "old_task_id": 652 + }, + { + "sites": [ + "shopping" + ], + "task_id": 142, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "speaker", + "order_id": "148" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What’s on your mind?\"').value", + "required_contents": { + "fuzzy_match": [ + "refund", + "broke after three days of use", + "148", + "B003FVW3VA" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153, + "old_task_id": 657 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 143, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "a11yproject.com/redesign", + "target_branch": "master", + "reviewer": "Roshan Jossy" + }, + "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into master branch, assign Roshan Jossy as the reviewer", + 
"require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "redesign" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Roshan Jossy" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "old_task_id": 668 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 144, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Racing Wheel Overdrive for Xbox X", + "rating": "1 star" + }, + "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Racing Wheel Overdrive for Xbox X" + } + }, + { + 
"url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Unable to set neutral steering", + "Doesn’t work with PC", + "Crazy problems in automatic mode", + "pedals stopped working", + "Only works with certain games" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101, + "old_task_id": 673 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 145, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "canceled" + }, + "intent": "Lookup orders that are canceled", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Canceled" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253, + "old_task_id": 678 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 146, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "chatGPT" + }, + "intent": "Find a GitLab repository related to chatGPT and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| __REDDIT__/f/deeplearning |OR| 
__REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "chatgpt", + "__GITLAB__/convexegg/chatgpt" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116, + "old_task_id": 682 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 147, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "auth0/angular-storage", + "subreddit": "technology" + }, + "intent": "Promote auth0/angular-storage to subreddit technology with the description from the repo itself.", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "A storage library for AngularJS done right", + "__GITLAB__/auth0/angular-storage" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117, + "old_task_id": 686 + }, + { + "sites": [ + "shopping" + ], + "task_id": 148, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "my refund is suppoed to be replaced by a coupon" + }, + "intent": "Draft an email to the shop owner via their contact us function for a 
coupon as my refund is suppoed to be replaced by a coupon", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact/index/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163, + "old_task_id": 693 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 149, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "sales order report", + "time_span": "for last month" + }, + "intent": "Today is 3/15/2023, generate a sales order report for last month", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "2/1/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "2/28/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268, + "old_task_id": 704 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 150, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "shipping", + "start_date": "08/05/2022", + "end_date": "03/01/2023" + }, + "intent": "Create a shipping report from 08/05/2022 to 03/01/2023", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING_ADMIN__/reports/report_sales/shipping", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "8/5/22" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/1/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271, + "old_task_id": 710 + }, + { + "sites": [ + "reddit" + ], + "task_id": 151, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "subreddit": "gadgets", + "k": "1" + }, + "intent": "Thumbs down the top 1 post ever in gadgets.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/gadgets/19459/a-custom-gaming-pc-built-inside-a-vintage-1940s-motorola", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24, + "old_task_id": 714 + }, + { + "sites": [ + "reddit" + ], + "task_id": 152, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "earthporn", + "user": "CameronKelsey" + }, + "intent": "Like all submissions created by CameronKelsey in subreddit earthporn", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__REDDIT__/f/EarthPorn/98332/my-favorite-place-on-the-planet-henry-s-fork-of-the-snake", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/EarthPorn/98297/2-years-later-this-is-still-one-of-the-most-incredible", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/EarthPorn/98256/i-can-t-wait-for-all-this-green-to-start-coming-back-little", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25, + "old_task_id": 720 + }, + { + "sites": [ + "reddit" + ], + "task_id": 153, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "AdamCannon", + "subreddit": "UpliftingNews" + }, + "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/UpliftingNews/16087/same-sex-marriage-is-now-legal-in-all-of-mexico-s-states", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510, + "old_task_id": 729 + }, + { + "sites": [ + "reddit" + ], + "task_id": 154, + "require_login": true, + "storage_state": 
"./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Star Trek Starfleet Academy series", + "content": "Every watch makes me feel like a kid again" + }, + "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135201/star-trek-starfleet-academy-series-from-alex-kurtzman-and", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Every watch makes me feel like a kid again" + } + } + ] + }, + "intent_template_id": 27, + "old_task_id": 733 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 155, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Boston home NBA team", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team ", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "TD Garden", + "Boston", + "Massachusetts" + ] + } + }, + { 
+ "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94, + "old_task_id": 741 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 156, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome-llms", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome-llms/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 332, + "old_task_id": 745 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 157, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "web_agent_android_xl", + "template": "Android", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": 
null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'Android' project template" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 2100, + "old_task_id": 748 + }, + { + "sites": [ + "map", + "shopping_admin" + ], + "task_id": 158, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "instantiation_dict": { + "city1": "Allentown, PA", + "city2": "the city where my E-commerce customer Amanda Kim lives" + }, + "intent": "Show me the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Allentown" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Hoboken", + "New Jersey" + ] + } + } + ] + }, + "intent_template_id": 42, + "old_task_id": 760 + }, + { + "sites": [ + "map" + ], + "task_id": 159, + 
"require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + "instantiation_dict": { + "location/address_1": "Carnegie Music Hall in NYC", + "location/address_2": "Carnegie Mellon University", + "transportation": "driving" + }, + "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Hall", + "West 57th Street", + "Manhattan", + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + } + ] + }, + "intent_template_id": 54, + "old_task_id": 762 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 160, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{quantity}} {{product}} arrived, update the stock", + "instantiation_dict": { + "quantity": "5", + "product": "blue Cronus yoga pants with size 33" + }, + "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", + "locator": 
"document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "5" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 241, + "old_task_id": 768 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 161, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make a folder named {{directory}} on the {{gitlab_repo}} repo and include a file called urls.txt that consists of the links to the 5 most recent posts from {{subreddit}}.", + "instantiation_dict": { + "directory": "car", + "subreddit": "cars", + "gitlab_repo": "gimmiethat.space" + }, + "intent": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "the subreddit does not exist", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 84, + "old_task_id": 791 + }, + { + "sites": [ + "shopping" + ], + "task_id": 162, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "77 Massachusetts Ave, Cambridge, MA" + }, + "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.", + "require_reset": false, + "eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + 
"must_include": [ + "N/A" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191, + "old_task_id": 798 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 163, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "a11yproject", + "issue": "404 for many URLs", + "account": "myself", + "due": "2030-1-3" + }, + "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. Set due date to be 2030-1-3", + "require_reset": false, + "eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "404 for many URLs" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Jan 3, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327, + "old_task_id": 809 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 164, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": { + 
"repo": "a11yproject", + "issue": 404, + "account": "myself" + }, + "intent": "Assign the issue regarding 404 in a11yproject to myself.", + "require_reset": false, + "eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + "required_contents": { + "must_include": [ + "404s, bad host, timeouts, bad urls for URLs linked from website" + ] + } + } + ] + }, + "intent_template_id": 999, + "old_task_id": 811 + } +] \ No newline at end of file diff --git a/dev/requirements.txt b/dev/requirements.txt index c406ac7fc..4ea56ecf6 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -11,5 +11,6 @@ tenacity -e ../browsergym/visualwebarena # local package -e ../browsergym/experiments # local package -e ../browsergym/assistantbench # local package +-e ../browsergym/webarenalite # local package browsergym-workarena weblinx_browsergym diff --git a/tests/experiments/test_benchmark.py b/tests/experiments/test_benchmark.py index 9222be11c..14e598dff 100644 --- a/tests/experiments/test_benchmark.py +++ b/tests/experiments/test_benchmark.py @@ -50,6 +50,7 @@ def test_build_benchmarks(): "miniwob_tiny_test": 2 * 2, "webarena": 812, "webarena_tiny": 6, + "webarena_lite": 165, "visualwebarena": 910, "visualwebarena_tiny": 4, "workarena_l1": 33 * 10,