diff --git a/.changeset/pretty-jokes-own.md b/.changeset/pretty-jokes-own.md index b667e20b..31216b71 100644 --- a/.changeset/pretty-jokes-own.md +++ b/.changeset/pretty-jokes-own.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -Properly handle images in evaluator + clean up response parsing logic +Properly handle images in evaluator + clean up response parsing logic
diff --git a/evals/core/summary.ts b/evals/core/summary.ts
new file mode 100644
index 00000000..ae67c3ec
--- /dev/null
+++ b/evals/core/summary.ts
@@ -0,0 +1,91 @@
+import fs from "fs";
+import { tasksByName } from "../taskConfig";
+import type { SummaryResult } from "@/types/evals";
+
+type Tally = { total: number; success: number };
+
+/** Adds one run to the tally stored under `key`, creating it on first use. */
+const record = (tallies: Map<string, Tally>, key: string, ok: boolean) => {
+  const tally = tallies.get(key) ?? { total: 0, success: 0 };
+  tally.total += 1;
+  if (ok) tally.success += 1;
+  tallies.set(key, tally);
+};
+
+/** Whole-number success percentage; every stored tally has total >= 1. */
+const toPercent = ({ success, total }: Tally) =>
+  Math.round((success / total) * 100);
+
+/**
+ * Summarizes eval results and writes the summary to `eval-summary.json`
+ * (a relative path, so it is resolved against the current working directory).
+ *
+ * The summary contains the passed and failed runs plus success percentages
+ * per category and per model. Categories whose tasks produced no results are
+ * left out: dividing 0 by 0 would give NaN, which JSON serializes as `null`.
+ *
+ * @param results - Eval runs to summarize.
+ * @param experimentName - Copied verbatim into the summary.
+ */
+export const generateSummary = async (
+  results: SummaryResult[],
+  experimentName: string,
+): Promise<void> => {
+  // Results for a task missing from the config get no categories rather
+  // than aborting the summary with a TypeError.
+  const toEntry = (r: SummaryResult) => ({
+    eval: r.input.name,
+    model: r.input.modelName,
+    categories: tasksByName[r.input.name]?.categories ?? [],
+  });
+
+  const passed = results.filter((r) => r.output._success).map(toEntry);
+  const failed = results.filter((r) => !r.output._success).map(toEntry);
+
+  // Single pass over the results; Maps keep first-seen order, so models are
+  // listed in the order they first appear in `results`.
+  const byTask = new Map<string, Tally>();
+  const byModel = new Map<string, Tally>();
+  for (const r of results) {
+    const ok = Boolean(r.output._success);
+    record(byTask, r.input.name, ok);
+    record(byModel, r.input.modelName, ok);
+  }
+
+  // Walk the task config so category keys follow config order.
+  const byCategory = new Map<string, Tally>();
+  for (const taskName of Object.keys(tasksByName)) {
+    const taskTally = byTask.get(taskName);
+    if (!taskTally) continue; // task has no results, so it adds nothing
+    for (const cat of tasksByName[taskName].categories) {
+      const tally = byCategory.get(cat) ?? { total: 0, success: 0 };
+      tally.total += taskTally.total;
+      tally.success += taskTally.success;
+      byCategory.set(cat, tally);
+    }
+  }
+
+  const categories: Record<string, number> = {};
+  for (const [cat, tally] of byCategory) {
+    categories[cat] = toPercent(tally);
+  }
+
+  const models: Record<string, number> = {};
+  for (const [model, tally] of byModel) {
+    models[model] = toPercent(tally);
+  }
+
+  const formattedSummary = {
+    experimentName,
+    passed,
+    failed,
+    categories,
+    models,
+  };
+
+  fs.writeFileSync(
+    "eval-summary.json",
+    JSON.stringify(formattedSummary, null, 2),
+  );
+  console.log("Evaluation summary written to eval-summary.json");
+};
diff --git a/evals/datasets/gaia/GAIA_web.jsonl b/evals/datasets/gaia/GAIA_web.jsonl new file mode 100644 index 00000000..b6f122b6 --- /dev/null +++ b/evals/datasets/gaia/GAIA_web.jsonl @@ -0,0 +1,90 @@ +{"task_id": "e1fc63a2-da7a-432f-be78-7c4a95598703", "Level": 1, "Final answer": "17", "id": "level1-0", "web": "https://www.google.com/", "ques": "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary."} +{"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "Level": 1, "Final answer": "3", "id": "level1-1", "web": "https://www.google.com/", "ques": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?
You can use the latest 2022 version of english wikipedia."} +{"task_id": "5d0080cb-90d7-4712-bc33-848150e917d3", "Level": 1, "Final answer": "0.1777", "id": "level1-2", "web": "https://www.google.com/", "ques": "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon\u2019s Diet?\""} +{"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "Level": 1, "Final answer": "3", "id": "level1-3", "web": "https://www.google.com/", "ques": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"} +{"task_id": "46719c30-f4c3-4cad-be07-d5cb21eee6bb", "Level": 1, "Final answer": "Mapping Human Oriented Information to Software Agents for Online Systems Usage", "id": "level1-4", "web": "https://www.google.com/", "ques": "Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus, Which Is Better?\" in 2015, what was the title of the first paper authored by the one that had authored prior papers?"} +{"task_id": "4b6bb5f7-f634-410e-815d-e673ab7f8632", "Level": 1, "Final answer": "THE CASTLE", "id": "level1-5", "web": "https://www.google.com/", "ques": "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? 
Give the setting exactly as it appears in the first scene heading."} +{"task_id": "b816bfce-3d80-4913-a07d-69b752ce6377", "Level": 1, "Final answer": "fluffy", "id": "level1-6", "web": "https://www.google.com/", "ques": "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?"} +{"task_id": "72e110e7-464c-453c-a309-90a95aed6538", "Level": 1, "Final answer": "Guatemala", "id": "level1-7", "web": "https://www.google.com/", "ques": "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?"} +{"task_id": "b415aba4-4b68-4fc6-9b89-2c812e55a3e1", "Level": 1, "Final answer": "diamond", "id": "level1-8", "web": "https://www.google.com/", "ques": "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? 
Don't use the prefix nano in your answer if there is one."} +{"task_id": "935e2cff-ae78-4218-b3f5-115589b19dae", "Level": 1, "Final answer": "research", "id": "level1-9", "web": "https://www.google.com/", "ques": "In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?"} +{"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8", "Level": 1, "Final answer": "FunkMonk", "id": "level1-10", "web": "https://www.google.com/", "ques": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"} +{"task_id": "5188369a-3bbe-43d8-8b94-11558f909a08", "Level": 1, "Final answer": "Annie Levin", "id": "level1-11", "web": "https://www.google.com/", "ques": "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?"} +{"task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2", "Level": 1, "Final answer": "Extremely", "id": "level1-12", "web": "https://www.google.com/", "ques": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\""} +{"task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91", "Level": 1, "Final answer": "Louvrier", "id": "level1-13", "web": "https://www.google.com/", "ques": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?"} +{"task_id": "d0633230-7067-47a9-9dbf-ee11e0a2cdd6", "Level": 1, "Final answer": "BaseLabelPropagation", "id": "level1-14", "web": "https://www.google.com/", "ques": "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? 
Just give the name, not a path."} +{"task_id": "0383a3ee-47a7-41a4-b493-519bdefe0488", "Level": 1, "Final answer": "Rockhopper penguin", "id": "level1-15", "web": "https://www.google.com/", "ques": "On the BBC Earth YouTube video of the Top 5 Silliest Animal Moments, what species of bird is featured?"} +{"task_id": "11af4e1a-5f45-467d-9aeb-46f4bb0bf034", "Level": 1, "Final answer": "6", "id": "level1-16", "web": "https://www.google.com/", "ques": "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?"} +{"task_id": "7673d772-ef80-4f0f-a602-1bf4485c9b43", "Level": 1, "Final answer": "inference", "id": "level1-17", "web": "https://www.google.com/", "ques": "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has \"witnesses\" in the most titles as of 2021?"} +{"task_id": "c365c1c7-a3db-4d5e-a9a1-66f56eae7865", "Level": 1, "Final answer": "Braintree, Honolulu", "id": "level1-18", "web": "https://www.google.com/", "ques": "Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? 
Give them to me in alphabetical order, in a comma-separated list"} +{"task_id": "7d4a7d1d-cac6-44a8-96e8-ea9584a70825", "Level": 1, "Final answer": "22", "id": "level1-19", "web": "https://www.google.com/", "ques": "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?"} +{"task_id": "dc22a632-937f-4e6a-b72f-ba0ff3f5ff97", "Level": 1, "Final answer": "Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them", "id": "level1-20", "web": "https://www.google.com/", "ques": "What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title."} +{"task_id": "3f57289b-8c60-48be-bd80-01f8099ca449", "Level": 1, "Final answer": "519", "id": "level1-21", "web": "https://www.google.com/", "ques": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?"} +{"task_id": "23dd907f-1261-4488-b21c-e9185af91d5e", "Level": 1, "Final answer": "2", "id": "level1-22", "web": "https://www.google.com/", "ques": "In Audre Lorde\u2019s poem \u201cFather Son and Holy Ghost\u201d, what is the number of the stanza in which some lines are indented?"} +{"task_id": "840bfca7-4f7b-481a-8794-c560c340185d", "Level": 1, "Final answer": "80GSFC21M0002", "id": "level1-23", "web": "https://www.google.com/", "ques": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. 
Arendt supported by?"} +{"task_id": "a0068077-79f4-461a-adfe-75c1a4148545", "Level": 1, "Final answer": "90", "id": "level1-24", "web": "https://www.google.com/", "ques": "What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?"} +{"task_id": "bda648d7-d618-4883-88f4-3466eabd860e", "Level": 1, "Final answer": "Saint Petersburg", "id": "level1-25", "web": "https://www.google.com/", "ques": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations."} +{"task_id": "c61d22de-5f6c-4958-a7f6-5e9707bd3466", "Level": 2, "Final answer": "egalitarian", "id": "level2-0", "web": "https://www.google.com/", "ques": "A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?"} +{"task_id": "17b5a6a3-bc87-42e8-b0fb-6ab0781ef2cc", "Level": 2, "Final answer": "34689", "id": "level2-1", "web": "https://www.google.com/", "ques": "I\u2019m researching species that became invasive after people who kept them as pets released them. There\u2019s a certain species of fish that was popularized as a pet by being the main character of the movie Finding Nemo. According to the USGS, where was this fish found as a nonnative species, before the year 2020? 
I need the answer formatted as the five-digit zip codes of the places the species was found, separated by commas if there is more than one place."} +{"task_id": "04a04a9b-226c-43fd-b319-d5e89743676f", "Level": 2, "Final answer": "41", "id": "level2-2", "web": "https://www.google.com/", "ques": "If we assume all articles published by Nature in 2020 (articles, only, not book reviews/columns, etc) relied on statistical significance to justify their findings and they on average came to a p-value of 0.04, how many papers would be incorrect as to their claims of statistical significance? Round the value up to the next integer."} +{"task_id": "14569e28-c88c-43e4-8c32-097d35b9a67d", "Level": 2, "Final answer": "backtick", "id": "level2-3", "web": "https://www.google.com/", "ques": "In Unlambda, what exact charcter or text needs to be added to correct the following code to output \"For penguins\"? If what is needed is a character, answer with the name of the character. If there are different names for the character, use the shortest. The text location is not needed. Code:\n\n`r```````````.F.o.r. .p.e.n.g.u.i.n.si"} +{"task_id": "3627a8be-a77f-41bb-b807-7e1bd4c0ebdf", "Level": 2, "Final answer": "142", "id": "level2-4", "web": "https://www.google.com/", "ques": "The object in the British Museum's collection with a museum number of 2012,5015.17 is the shell of a particular mollusk species. 
According to the abstract of a research article published in Science Advances in 2021, beads made from the shells of this species were found that are at least how many thousands of years old?"} +{"task_id": "7619a514-5fa8-43ef-9143-83b66a43d7a4", "Level": 2, "Final answer": "04/15/18", "id": "level2-5", "web": "https://www.google.com/", "ques": "According to github, when was Regression added to the oldest closed numpy.polynomial issue that has the Regression label in MM/DD/YY?"} +{"task_id": "2a649bb1-795f-4a01-b3be-9a01868dae73", "Level": 2, "Final answer": "3.1.3.1; 1.11.1.7", "id": "level2-6", "web": "https://www.google.com/", "ques": "What are the EC numbers of the two most commonly used chemicals for the virus testing method in the paper about SPFMV and SPCSV in the Pearl Of Africa from 2016? Return the semicolon-separated numbers in the order of the alphabetized chemicals."} +{"task_id": "87c610df-bef7-4932-b950-1d83ef4e282b", "Level": 2, "Final answer": "Morarji Desai", "id": "level2-7", "web": "https://www.google.com/", "ques": "In April of 1977, who was the Prime Minister of the first place mentioned by name in the Book of Esther (in the New International Version)?"} +{"task_id": "624cbf11-6a41-4692-af9c-36b3e5ca3130", "Level": 2, "Final answer": "So we had to let it die.", "id": "level2-8", "web": "https://www.google.com/", "ques": "What's the last line of the rhyme under the flavor name on the headstone visible in the background of the photo of the oldest flavor's headstone in the Ben & Jerry's online flavor graveyard as of the end of 2022?"} +{"task_id": "dd3c7503-f62a-4bd0-9f67-1b63b94194cc", "Level": 2, "Final answer": "6", "id": "level2-9", "web": "https://www.google.com/", "ques": "Use density measures from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023.\n\nI have a gallon of honey and a gallon of mayonnaise at 25C. 
I remove one cup of honey at a time from the gallon of honey. How many times will I need to remove a cup to have the honey weigh less than the mayonaise? Assume the containers themselves weigh the same."} +{"task_id": "f0f46385-fc03-4599-b5d3-f56496c3e69f", "Level": 2, "Final answer": "Indonesia, Myanmar", "id": "level2-10", "web": "https://www.google.com/", "ques": "In terms of geographical distance between capital cities, which 2 countries are the furthest from each other within the ASEAN bloc according to wikipedia? Answer using a comma separated list, ordering the countries by alphabetical order."} +{"task_id": "e4e91f1c-1dcd-439e-9fdd-cb976f5293fd", "Level": 2, "Final answer": "cloak", "id": "level2-11", "web": "https://www.google.com/", "ques": "I need to fact-check a citation. This is the citation from the bibliography:\n\nGreetham, David. \"Uncoupled: OR, How I Lost My Author(s).\" Textual Cultures: Texts, Contexts, Interpretation, vol. 3 no. 1, 2008, p. 45-46. Project MUSE, doi:10.2979/tex.2008.3.1.44.\n\nAnd this is the in-line citation:\n\nOur relationship with the authors of the works we read can often be \u201cobscured not by a \"cloak of print\" but by the veil of scribal confusion and mis-transmission\u201d (Greetham 45-46).\n\nDoes the quoted text match what is actually in the article? 
If Yes, answer Yes, otherwise, give me the word in my citation that does not match with the correct one (without any article)."} +{"task_id": "56137764-b4e0-45b8-9c52-1866420c3df5", "Level": 2, "Final answer": "Li Peng", "id": "level2-12", "web": "https://www.google.com/", "ques": "Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?"} +{"task_id": "8b3379c0-0981-4f5b-8407-6444610cb212", "Level": 2, "Final answer": "1.8", "id": "level2-13", "web": "https://www.google.com/", "ques": "What is the maximum length in meters of #9 in the first National Geographic short on YouTube that was ever released according to the Monterey Bay Aquarium website? Just give the number."} +{"task_id": "0ff53813-3367-4f43-bcbd-3fd725c1bf4b", "Level": 2, "Final answer": "beta geometric", "id": "level2-14", "web": "https://www.google.com/", "ques": "What two-word type of model did Manash Pratim Kashyap's and PS Fader's studies in customer retention studies published during 2018-2019 have in common (no punctuation)?"} +{"task_id": "a7feb290-76bb-4cb7-8800-7edaf7954f2f", "Level": 2, "Final answer": "31", "id": "level2-15", "web": "https://www.google.com/", "ques": "How many High Energy Physics - Lattice articles listed in January 2020 on Arxiv had ps versions available?"} +{"task_id": "b4cc024b-3f5e-480e-b96a-6656493255b5", "Level": 2, "Final answer": "Russian-German Legion", "id": "level2-16", "web": "https://www.google.com/", "ques": "The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? 
Answer without using articles."} +{"task_id": "33d8ea3b-6c6b-4ff1-803d-7e270dea8a57", "Level": 2, "Final answer": "2", "id": "level2-17", "web": "https://www.google.com/", "ques": "What is the minimum number of page links a person must click on to go from the english Wikipedia page on The Lord of the Rings (the book) to the english Wikipedia page on A Song of Ice and Fire (the book series)? In your count, include each link you would click on to get to the page. Use the pages as they appeared at the end of the day on July 3, 2023."} +{"task_id": "e8cb5b03-41e0-4086-99e5-f6806cd97211", "Level": 2, "Final answer": "shrimp", "id": "level2-18", "web": "https://www.google.com/", "ques": "I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu. Using the Wayback Machine, can you help me figure out which main course was on the dinner menu for Virtue on March 22, 2021 but not April 21, 2021? Answer using the singular form, without articles."} +{"task_id": "f46b4380-207e-4434-820b-f32ce04ae2a4", "Level": 2, "Final answer": "Harbinger, Tidal", "id": "level2-19", "web": "https://www.google.com/", "ques": "It is 1999. Before you party like it is 1999, please assist me in settling a bet.\n\nFiona Apple and Paula Cole released albums prior to 1999. Of these albums, which didn't receive a letter grade from Robert Christgau? 
Provide your answer as a comma delimited list of album titles, sorted alphabetically."} +{"task_id": "05407167-39ec-4d3a-a234-73a9120c325d", "Level": 2, "Final answer": "Format Document", "id": "level2-20", "web": "https://www.google.com/", "ques": "In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?"} +{"task_id": "b9763138-c053-4832-9f55-86200cb1f99c", "Level": 2, "Final answer": "3", "id": "level2-21", "web": "https://www.google.com/", "ques": "Compute the check digit the Tropicos ID for the Order Helotiales would have if it were an ISBN-10 number."} +{"task_id": "16d825ff-1623-4176-a5b5-42e0f5c2b0ac", "Level": 2, "Final answer": "6:41 PM", "id": "level2-22", "web": "https://www.google.com/", "ques": "What time was the Tri-Rail train that carried the most passengers on May 27, 2019 scheduled to arrive in Pompano Beach? Express your answer in the 12-hour digital clock format without leading zero if any, and include whether it is AM or PM."} +{"task_id": "544b7f0c-173a-4377-8d56-57b36eb26ddf", "Level": 2, "Final answer": "A Nightmare on Elm Street", "id": "level2-23", "web": "https://www.google.com/", "ques": "In Valentina Re\u2019s contribution to the 2017 book \u201cWorld Building: Transmedia, Fans, Industries\u201d, what horror movie does the author cite as having popularized metalepsis between a dream world and reality? Use the complete name with article if any."} +{"task_id": "6b078778-0b90-464d-83f6-59511c811b01", "Level": 2, "Final answer": "Alfonso Visconti", "id": "level2-24", "web": "https://www.google.com/", "ques": "The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. 
Of the consecrators and co-consecrators of this portrait's subject as a bishop, what is the name of the one who never became pope?"} +{"task_id": "08cae58d-4084-4616-b6dd-dd6534e4825b", "Level": 2, "Final answer": "2018", "id": "level2-25", "web": "https://www.google.com/", "ques": "According to Google Finance, when was the first year the Apple stock went above $50 (without adjusting for stock split)?"} +{"task_id": "2dfc4c37-fec1-4518-84a7-10095d30ad75", "Level": 2, "Final answer": "6", "id": "level2-26", "web": "https://www.google.com/", "ques": "According to Box Office Mojo's 2020 Worldwide Box Office list, how many of the top 10 highest-grossing worldwide movies are also on the top 10 highest-grossing domestic movies? Your answer should be a numerical integer value."} +{"task_id": "9f41b083-683e-4dcf-9185-ccfeaa88fa45", "Level": 2, "Final answer": "0", "id": "level2-27", "web": "https://www.google.com/", "ques": "How many pages if the 2023 IPCC report (85 pages version) mentions nuclear energy?"} +{"task_id": "ecbc4f94-95a3-4cc7-b255-6741a458a625", "Level": 2, "Final answer": "13", "id": "level2-28", "web": "https://www.google.com/", "ques": "How many images are there in the latest 2022 Lego english wikipedia article?"} +{"task_id": "71345b0a-9c7d-4b50-b2bf-937ec5879845", "Level": 2, "Final answer": "Here be dragons", "id": "level2-29", "web": "https://www.google.com/", "ques": "On a leap day before the year 2008, a joke was removed from the Wikipedia page for \u201cDragon\u201d. What was the phrase that was removed? 
Give the phrase as it appeared on the page, but without punctuation."} +{"task_id": "7b5377b0-3f38-4103-8ad2-90fe89864c04", "Level": 2, "Final answer": "563.9", "id": "level2-30", "web": "https://www.google.com/", "ques": "Find the value of x to the nearest tenth: Lx = (d/dx * (A * x-squared)) + 4-thousand'n'ninety-7 minus C\nWhere L is the last two digits of the year of the Venezuelan Declaration of Independence,\nA is the number of colors in the TikTok logo as of July 2023, excluding black and white,\nand C is the height of the average woman in the Philippines according to a July 2023 Business Insider article, rounded to the nearest whole centimeter"} +{"task_id": "114d5fd0-e2ae-4b6d-a65a-870da2d19c08", "Level": 2, "Final answer": "4", "id": "level2-31", "web": "https://www.google.com/", "ques": "In the endnote found in the second-to-last paragraph of page 11 of the book with the doi 10.2307/j.ctv9b2xdv, what date in November was the Wikipedia article accessed? Just give the day of the month."} +{"task_id": "ad37a656-079a-49f9-a493-7b739c9167d1", "Level": 2, "Final answer": "Bravo", "id": "level2-32", "web": "https://www.google.com/", "ques": "On July 15, 2008, Phys.org published an article about a catastrophe. Find the explosive force of this catastrophe according to Encyclopedia Britannica, then find the name of the US nuclear test that had the same yield. Your answer should only be the last word of the name of the test."} +{"task_id": "f3917a3d-1d17-4ee2-90c5-683b072218fe", "Level": 2, "Final answer": "2732", "id": "level2-33", "web": "https://www.google.com/", "ques": "How many edits were made to the Wikipedia page on Antidisestablishmentarianism from its inception until June of 2023?"} +{"task_id": "48eb8242-1099-4c26-95d4-ef22b002457a", "Level": 2, "Final answer": "6", "id": "level2-34", "web": "https://www.google.com/", "ques": "How many nonindigenous crocodiles were found in Florida from the year 2000 through 2020? 
You can get the data from the USGS Nonindigenous Aquatic Species database."} +{"task_id": "c8b7e059-c60d-472e-ad64-3b04ae1166dc", "Level": 2, "Final answer": "8", "id": "level2-35", "web": "https://www.google.com/", "ques": "The work referenced in footnote 397 of Federico Lauria's 2014 dissertation is also the source for the titles of two paintings in the Smithsonian American Art Museum's collection, as of August 2023. What is the absolute difference between the chapter numbers of the chapters that the titles of these two paintings quote?"} +{"task_id": "d1af70ea-a9a4-421a-b9cc-94b5e02f1788", "Level": 2, "Final answer": "736455", "id": "level2-36", "web": "https://www.google.com/", "ques": "As of the 2020 census, what was the population difference between the largest county seat and smallest county seat, by land area of the county seat, in Washington state? For population figures, please use the official data from data.census.gov. Please report the integer difference."} +{"task_id": "ded28325-3447-4c56-860f-e497d6fb3577", "Level": 2, "Final answer": "Picnic is in Ploybius Plaza.", "id": "level2-37", "web": "https://www.google.com/", "ques": "This is a secret message my friend gave me. It says where we should meet for our picnic on Friday. The only problem is, it\u2019s encrypted in the Caesar cipher, so I can\u2019t read it. Can you tell me what it says? This is the message:\n\nZsmxsm sc sx Zyvilsec Zvkjk."} +{"task_id": "d700d50d-c707-4dca-90dc-4528cddd0c80", "Level": 2, "Final answer": "Roger Miller", "id": "level2-38", "web": "https://www.google.com/", "ques": "Who composed the song that was performed by a rooster and a hamster in separate animated videos at separate tempos with different lyrics? 
Answer using the format First name Last name."} +{"task_id": "0a3cd321-3e76-4622-911b-0fda2e5d6b1a", "Level": 2, "Final answer": "Brunei, China, Morocco, Singapore", "id": "level2-39", "web": "https://www.google.com/", "ques": "According to the World Bank, which countries had gross savings of over 35% of GDP for every year in the period 2001-2010? Give your answer as a comma-separated list of countries in alphabetical order. Use the countries most common names in english when answering."} +{"task_id": "f2feb6a4-363c-4c09-a804-0db564eafd68", "Level": 2, "Final answer": "900000", "id": "level2-40", "web": "https://www.google.com/", "ques": "I\u2019m thinking about selling my home, so I want to learn more about how homes in my area sold recently. I live in Pearl City, Hawaii, which is on the island of Oahu. I know two homes near me that sold in 2022 were 2072 Akaikai Loop, and 2017 Komo Mai Drive. Find which of those homes sold for more in 2022, and tell me how much it sold for. Don\u2019t put commas or decimal places in the answer."} +{"task_id": "0b260a57-3f3a-4405-9f29-6d7a1012dbfb", "Level": 2, "Final answer": "0.269", "id": "level2-41", "web": "https://www.google.com/", "ques": "On ScienceDirect, what is the difference to 3 decimal places in the sample standard deviations of the number of Reference Works in each Life Science domain compared to Health Sciences as of 2022?"} +{"task_id": "ed58682d-bc52-4baa-9eb0-4eb81e1edacc", "Level": 2, "Final answer": "stare", "id": "level2-42", "web": "https://www.google.com/", "ques": "What is the last word before the second chorus of the King of Pop's fifth single from his sixth studio album?"} +{"task_id": "023e9d44-96ae-4eed-b912-244ee8c3b994", "Level": 2, "Final answer": "8", "id": "level2-43", "web": "https://www.google.com/", "ques": "It's May 2023, and I'm about to drive across the U.S. from California to Maine. 
I always recycle my water bottles at the end of a trip, and I drink 5 12-ounce water bottles for every 100 miles I travel, rounded to the nearest 100. Assuming I follow I-40 from Los Angeles to Cincinnati, then take I-90 from Cincinnati to Augusta, how many dollars will I get back according to Wikipedia?"} +{"task_id": "0e9e85b8-52b9-4de4-b402-5f635ab9631f", "Level": 2, "Final answer": "1927", "id": "level2-44", "web": "https://www.google.com/", "ques": "What is the latest chronological year date written in the image on the webpage found when following the first citation reference link on the latest version of Carl Nebel's Wikipedia page as of August 2023?"} +{"task_id": "20194330-9976-4043-8632-f8485c6c71b2", "Level": 2, "Final answer": "4", "id": "level2-45", "web": "https://www.google.com/", "ques": "The YouTube channel Game Grumps began a Let\u2019s Play of the game Sonic the Hedgehog (2006) in the year 2012. Thirty seconds into the first episode, a phrase is shown on the screen in white letters on a red background. How many times does the letter \"E\" appear in this phrase?"} +{"task_id": "65638e28-7f37-4fa7-b7b9-8c19bb609879", "Level": 2, "Final answer": "Kleinpaul", "id": "level2-46", "web": "https://www.google.com/", "ques": "The book with the doi 10.1353/book.24372 concerns a certain neurologist. According to chapter 2 of the book, what author influenced this neurologist\u2019s belief in \u201cendopsychic myths\u201d? Give the last name only."} +{"task_id": "3ff6b7a9-a5bd-4412-ad92-0cd0d45c0fee", "Level": 2, "Final answer": "56000", "id": "level2-47", "web": "https://www.google.com/", "ques": "The longest-lived vertebrate is named after an island. 
According to Wikipedia as of January 1, 2021, what is the 2020 estimated population of that island, to the nearest thousand?"} +{"task_id": "708b99c5-e4a7-49cb-a5cf-933c8d46470d", "Level": 2, "Final answer": "Citations", "id": "level2-48", "web": "https://www.google.com/", "ques": "On the DeepFruits fruit detection graph on Connected Papers from 2016, what feature caused the largest bubble to be the size it is?"} +{"task_id": "0a65cb96-cb6e-4a6a-8aae-c1084f613456", "Level": 2, "Final answer": "Holabird", "id": "level2-49", "web": "https://www.google.com/", "ques": "During the first week of August 2015, one of the NASA Astronomy Pictures of the Day shows the lights of a city on the horizon. The namesake of this city also has a landmark building in Chicago named after him. What is the name of the architectural firm that designed this landmark building? Give the first name appearing in the name of the firm as of June 2023."} +{"task_id": "65da0822-a48a-4a68-bbad-8ed1b835a834", "Level": 2, "Final answer": "Santa Clara, Boston", "id": "level2-50", "web": "https://www.google.com/", "ques": "All of the individuals who formally held the position of United States secretary of homeland security prior to April 2019, excluding those who held the position in an acting capacity, have a bachelor's degree. Of the universities that these bachelor's degrees were from, which is the westernmost university and which is the easternmost university? 
Give them to me as a comma-separated list, I only want the name of the cities where the universities are located, with the westernmost city listed first."} +{"task_id": "73c1b9fe-ee1d-4cf4-96ca-35c08f97b054", "Level": 2, "Final answer": "1954", "id": "level2-51", "web": "https://www.google.com/", "ques": "According to the USGS, in what year was the American Alligator first found west of Texas (not including Texas)?"} +{"task_id": "e2d69698-bc99-4e85-9880-67eaccd66e6c", "Level": 2, "Final answer": "Michele Fitzgerald", "id": "level2-52", "web": "https://www.google.com/", "ques": "As of August 2023, who is the only winner of the US version of Survivor to be born in the month of May?"} +{"task_id": "a56f1527-3abf-41d6-91f8-7296d6336c3f", "Level": 2, "Final answer": "185", "id": "level2-53", "web": "https://www.google.com/", "ques": "The cover of the August 2021 issue of Vogue shows a famous landmark in the background behind some trees. How tall is this monument in yards, rounded to the nearest yard? Give the number only."} +{"task_id": "42d4198c-5895-4f0a-b0c0-424a66465d83", "Level": 2, "Final answer": "60", "id": "level2-54", "web": "https://www.google.com/", "ques": "I'm curious about how much information is available for popular video games before their release. Find the Wikipedia page for the 2019 game that won the British Academy Games Awards. 
How many revisions did that page have before the month listed as the game's release date on that Wikipedia page (as of the most recent entry from 2022)?"} +{"task_id": "a26649c6-1cb2-470a-871e-6910c64c3e53", "Level": 2, "Final answer": "116", "id": "level2-55", "web": "https://www.google.com/", "ques": "What is the absolute difference in tens of thousands between the population of chinstrap penguins on the Wikipedia page for penguin species populations as of the end of 2018 and the population recorded in the Nature.com \"global population assessment of the Chinstrap penguin\" article from 2020, assuming two penguins per breeding pair?"} +{"task_id": "d5141ca5-e7a0-469f-bf3e-e773507c86e2", "Level": 2, "Final answer": "19/02/2009", "id": "level2-56", "web": "https://www.google.com/", "ques": "When was a picture of St. Thomas Aquinas first added to the Wikipedia page on the Principle of double effect? Answer using the format DD/MM/YYYY."} +{"task_id": "1dcc160f-c187-48c2-b68e-319bd4354f3d", "Level": 2, "Final answer": "3", "id": "level2-57", "web": "https://www.google.com/", "ques": "According to Openreview.net, at the NeurIPS 2022 Conference, how many papers by an author named Yuri were accepted with a \"certain\" recommendation?"} +{"task_id": "e0c10771-d627-4fd7-9694-05348e54ee36", "Level": 2, "Final answer": "234.9", "id": "level2-58", "web": "https://www.google.com/", "ques": "Take the gender split from the 2011 Bulgarian census about those who have completed tertiary education. Subtract the smaller number from the larger number, then return the difference in thousands of women. So if there were 30.1 thousand more men, you'd give \"30.1\""} +{"task_id": "e29834fd-413a-455c-a33e-c3915b07401c", "Level": 2, "Final answer": "21", "id": "level2-59", "web": "https://www.google.com/", "ques": "I'd like to learn more about some popular reality television competition shows. 
As of the end of the 44th season of the American version of Survivor, how many more unique winners have there been compared to the number of winners of American Idol?"} +{"task_id": "08c0b6e9-1b43-4c2e-ae55-4e3fce2c2715", "Level": 2, "Final answer": "orange, white", "id": "level2-60", "web": "https://www.google.com/", "ques": "In the film Goldfinger, what color was the object that James Bond concealed himself and his companion Pussy Galore at the end of the film? If there are multiple colors, put them in a comma-separated list in alphabetical order."} +{"task_id": "db4fd70a-2d37-40ea-873f-9433dc5e301f", "Level": 2, "Final answer": "10", "id": "level2-61", "web": "https://www.google.com/", "ques": "As of May 2023, how many stops are between South Station and Windsor Gardens on MBTA\u2019s Franklin-Foxboro line (not included)?"} +{"task_id": "853c8244-429e-46ca-89f2-addf40dfb2bd", "Level": 2, "Final answer": "11", "id": "level2-62", "web": "https://www.google.com/", "ques": "In the 2015 Metropolitan Museum of Art exhibition titled after the Chinese zodiac animal of 2015, how many of the \"twelve animals of the Chinese zodiac\" have a hand visible?"} +{"task_id": "7a4a336d-dcfa-45a0-b014-824c7619e8de", "Level": 2, "Final answer": "1:41.614", "id": "level2-63", "web": "https://www.google.com/", "ques": "At the two-minute mark in the YouTube video uploaded by the channel \u201cGameGrumps\u201d on May 14, 2017 as part of their playthrough of the game Mario Kart 8 Deluxe, the shows\u2019 hosts are competing on one of the game\u2019s racetracks. What was the world record time for that track in the game\u2019s 150cc mode as of June 7, 2023? Express your answer in minutes and seconds, rounding the seconds to the nearest hundredth, e.g. 
1:01.001."} diff --git a/evals/datasets/webvoyager/WebVoyager_data.jsonl b/evals/datasets/webvoyager/WebVoyager_data.jsonl new file mode 100644 index 00000000..7ba2ebd2 --- /dev/null +++ b/evals/datasets/webvoyager/WebVoyager_data.jsonl @@ -0,0 +1,643 @@ +{"web_name": "Allrecipes", "id": "Allrecipes--0", "ques": "Provide a recipe for vegetarian lasagna with more than 100 reviews and a rating of at least 4.5 stars suitable for 6 people.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--1", "ques": "Find a recipe for a vegetarian lasagna that has at least a four-star rating and uses zucchini.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--2", "ques": "Find a recipe for a vegetarian lasagna under 600 calories per serving that has a prep time of less than 1 hour.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--3", "ques": "Locate a recipe for vegan chocolate chip cookies with over 60 reviews and a rating of at least 4.5 stars on Allrecipes.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--4", "ques": "Find a recipe for Baked Salmon that takes less than 30 minutes to prepare and has at least a 4 star rating based on user reviews.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--5", "ques": "Search for a popular Pasta Sauce with more than 1000 reviews and a rating above 4 stars. 
Create a shopping list of ingredients for this recipe.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--6", "ques": "Search for a vegetarian lasagna recipe that has at least a four-star rating and over 500 reviews.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--7", "ques": "Find a popular recipe for a chocolate chip cookie and list the ingredients and preparation steps.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--8", "ques": "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--9", "ques": "Find a high-rated recipe for vegetarian lasagna, list the key ingredients required, and include the total preparation and cook time stated on the recipe.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--10", "ques": "Find The Most Popular Recipes of the 1960s, noting the recipe name, preparation time and total time of the second recipe in this collection.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--11", "ques": "Discover a suitable chocolate cupcake recipe on Allrecipes that has a preparation time of under 1 hour and at least 100 user reviews.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--12", "ques": "Search for a popular cookie recipe on Allrecipes with more than 1000 reviews and a rating of 4.5 stars or better. 
Provide the list of ingredients needed.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--13", "ques": "Find a recipe with over 100 reviews for Fried Fish on Allrecipes, list the Full Nutrition Label and tell me the amount of Iron per Serving.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--14", "ques": "Search for a recipe that includes \"chicken breast\" and \"quinoa\" with preparation time under 30 minutes on Allrecipes.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--15", "ques": "Choose a dessert recipe on Allrecipes with a prep time of less than 30 minutes, has chocolate as an ingredient, and has a user rating of 4 stars or higher. Provide the name of the recipe, ingredients list, and step-by-step instructions.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--16", "ques": "Find a five-star rated chocolate chip cookie recipe that takes less than 1 hour to make on Allrecipes. Note how many reviews the recipe has and the main ingredients required.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--17", "ques": "Find the Easy Vegetarian Spinach Lasagna recipe on Allrecipes and tell me what the latest review says.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--18", "ques": "Find a recipe for a vegetarian lasagna that has over 300 reviews and an average rating of 4.5 or higher on Allrecipes.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--19", "ques": "Find a vegan lasagna recipe on Allrecipes that requires 10 ingredients or less and has feedback of more than 200 reviews. 
Provide a brief overview of the ingredient list and the total prep and cook time.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--20", "ques": "Find a recipe for a cauliflower pizza crust that has a preparation time of under 30 minutes and a rating of at least 4 stars on Allrecipes. Include the number of calories per serving.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--21", "ques": "Locate a high-rated recipe for gluten-free brownies on Allrecipes with at least 50 reviews. List the main ingredients and the total time required for preparation and cooking.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--22", "ques": "Find a recipe for a healthy avocado salad on Allrecipes that has a preparation time of less than 20 minutes and more than 30 user reviews. Include the nutritional information per serving.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--23", "ques": "Search Allrecipes for a baked lemon chicken recipe that has a prep time under 45 minutes, with at least a 4.5-star rating based on user reviews, and over 200 reviews. List the primary ingredients required.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--24", "ques": "Locate a recipe for an eggplant Parmesan on Allrecipes with a rating of at least 4.5 stars and over 50 reviews. Include the preparation time and the number of servings provided by the recipe.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--25", "ques": "Find a popular quinoa salad recipe on Allrecipes with more than 500 reviews and a rating above 4 stars. 
Create a shopping list of ingredients for this recipe and include the total cooking and preparation time.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--26", "ques": "Search for a high-protein vegetarian chili recipe on Allrecipes that has at least 50 reviews and a rating of 4 stars or higher. Provide the ingredient list, cooking time, and a brief description of the cooking steps.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--27", "ques": "Locate a chicken curry recipe on Allrecipes that has been reviewed more than 30 times and has a rating of at least 4 stars. Provide a summary of the recipe including ingredients, preparation time, and cooking instructions.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--28", "ques": "On Allrecipes, find a vegan brownie recipe that has at least 40 reviews and a rating of 4.5 or higher. Include the list of ingredients, total prep and cook time, and a brief overview of the preparation steps.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--29", "ques": "Search for a Mediterranean-style grilled fish recipe on Allrecipes that includes ingredients like olives, has at least a 4-star rating, and more than 25 reviews. Detail the ingredients, cooking method, and total time required for preparation and cooking.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--30", "ques": "Find a recipe for a vegan smoothie bowl on Allrecipes that includes bananas and leaves, has more than 20 reviews, and a rating of at least 4 stars. Provide a list of ingredients, preparation time, and a summary of the recipe steps.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--31", "ques": "Search for a seafood paella recipe on Allrecipes with a minimum of 4.5 stars rating and at least 50 reviews. 
The recipe should include shrimp and mussels. Provide the ingredients, total time, and an overview of the preparation steps.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--32", "ques": "Find a high-rated beef stew recipe on Allrecipes that requires a slow cooker and has at least 30 reviews. Detail the cooking time and the first five ingredients listed in the recipe.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--33", "ques": "Find a recipe for a low-carb breakfast on Allrecipes with at least 25 reviews. Show the Nutrition Facts and the total carbohydrate content per serving.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--34", "ques": "Locate a baked salmon recipe on Allrecipes that has at least 50 reviews and a rating of 4.5 stars or higher. Note the primary seasoning or herb used and the estimated cooking time.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--35", "ques": "Search for an Italian-style meatball recipe on Allrecipes that has more than 100 reviews. Detail the type of meat used and the overall cooking time required.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--36", "ques": "Locate a recipe for an American apple pie on Allrecipes with a rating of at least 4 stars and more than 50 reviews. Note the maximum temperature mentioned in the Directions.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--37", "ques": "Search for a Greek salad recipe on Allrecipes that has a prep time of under 25 minutes and more than 15 reviews. Include the primary cheese used and the type of dressing recommended.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--38", "ques": "Find a French ratatouille recipe on Allrecipes with a 4-star rating or higher and at least 15 reviews. 
Note the variety of vegetables included and the overall cooking time.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--39", "ques": "Locate a recipe for sushi rolls on Allrecipes with a minimum of 20 reviews. Show the Nutrition Facts and the main ingredients. Tell me how to store these rolls.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--40", "ques": "Browse the about us section of Allrecipes for a brief introduction to The Allrecipes Allstars.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--41", "ques": "List 3 recommended dinner recipes in the Allrecipes Dinners section.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--42", "ques": "Find a recipe for banana bread with more than 200 reviews and a rating of at least 4.0 stars on Allrecipes.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--43", "ques": "Find a recipe for a vegan pumpkin pie on Allrecipes with a minimum four-star rating and a total cook time exceeding 1 hour.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--44", "ques": "List at least 6 holiday recipes sections mentioned in the Occasions section of Allrecipes.", "web": "https://www.allrecipes.com/"} +{"web_name": "Amazon", "id": "Amazon--0", "ques": "Search an Xbox Wireless controller with green color and rated above 4 stars.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--1", "ques": "Search for women's golf polos in m size, priced between 50 to 75 dollars, and save the lowest priced among results.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--2", "ques": "Find a gaming desktop with Windows 11 Home, and the disk size should be 1TB.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--3", "ques": "Find climbing gears and sort the results by price 
high to low. Answer the first 3 results after sorting.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--4", "ques": "Find the used Nintendo Switch Lite on Amazon then filter by 'Used - Good', tell me the cheapest one that is 'Used - Good'.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--5", "ques": "Find a Blue iPhone 12 Pro 128gb and add to cart.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--6", "ques": "Browse black strollers within $100 to $200 on Amazon. Then find one Among these black strollers with over 20,000 reviews and a rating greater than 4 star.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--7", "ques": "Browse the women's hiking boots on Amazon and filter the results to show only those that are waterproof and have a rating of at least 4 stars and size 6.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--8", "ques": "Find the cheapest Samsung-made Android tablet with screen between 10-10.9 inches on Amazon. Only answer the cheapest one.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--9", "ques": "Find a dog bed on Amazon that is washable and has a length of at least 30 inches.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--10", "ques": "Find the cost of a 2-year protection for PS4 on Amazon.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--11", "ques": "Find a stainless steel kitchen sink with double bowls on Amazon. Sort the results and find the cheapest one with FREE delivery.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--12", "ques": "Check reviews for a Ride On Car with 100+ reviews & 4+ stars rating on Amazon. 
Give me the top review about this Ride On Car.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--13", "ques": "Browse best selling black hoodies in mens size Big and Tall that is between $25 and $50 on Amazon.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--14", "ques": "Find the new surge protector on Amazon with 6 to 8 outlets under 25 dollars with customer reviews above 4+ stars.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--15", "ques": "Find a pair of mens running shoes in black, size 7, 4+ stars and under $50 and add them to my cart on Amazon.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--16", "ques": "Find the Return Policy for Mens Rhinestone Skull Graphic Shirt on Amazon. Color: Black, Size: XX-Large. If Free return is avaliable, tell me how to return this item.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--17", "ques": "Show me the list of baby products that are on sale and under 10 dollars on Amazon. Provide at least 2 on sale products", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--18", "ques": "Open Amazon's home page and tell me what the deal is that is going on at the moment, list the names of at least 2 items that are on offer and tell me what percent off they are.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--19", "ques": "Look for an English language book on roman empire history in the Amazon Kindle store. Sort by newests arrivals and look for a title that will be released within a month.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--20", "ques": "Search for a wireless ergonomic keyboard with backlighting and a rating of at least 4 stars. The price should be between $40 to $60. 
Save the product with the 500+ customer reviews.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--21", "ques": "Find a stainless steel, 12-cup programmable coffee maker on Amazon. The price range should be between $100 to $200. Report the one with the 4+ customer rating.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--22", "ques": "Search for a set of non-stick, oven-safe cookware on Amazon. The set should include at least 10 pieces and be priced under $150.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--23", "ques": "Look for a men's waterproof digital sports watch with a heart rate monitor on Amazon. It should be priced between $50 to $100.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--24", "ques": "Browse for a compact air fryer on Amazon with a capacity of 2 to 3 quarts. It should have a digital display, auto shutoff and be priced under $100.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--25", "ques": "Search for a queen-sized, hypoallergenic mattress topper on Amazon. It should have a memory foam material and be priced between $50 to $100.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--26", "ques": "Find a portable Bluetooth speaker on Amazon with a water-resistant design, under $50. It should have a minimum battery life of 10 hours.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--27", "ques": "Look for a USB-C hub on Amazon compatible with MacBook Pro, featuring at least 4 ports, including HDMI and SD card reader. The price should be under $50. Select the one after sorting by Best Sellers.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--28", "ques": "Search for a yoga mat on Amazon that is at least 6mm thick, non-slip, and eco-friendly. 
The price should be under $50.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--29", "ques": "Find a set of solar-powered garden lights on Amazon with a minimum pack of 10 lights. They should be LED and priced under $50.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--30", "ques": "Locate the highest-rated fiction book released in 2024 on Amazon, with a minimum of 50 customer reviews.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--31", "ques": "Find a compact digital camera on Amazon with a zoom capability of at least 10x, rated 4 stars or higher, and priced between $100 to $300.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--32", "ques": "Search for an electric kettle on Amazon with a capacity of at least 1.5 liters, made of stainless steel, and with a customer rating of 4 stars or above.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--33", "ques": "Search for a portable air conditioner on Amazon suitable for a room size of 300 sq ft, with energy efficiency rating, and compare the prices of the top three search results.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--34", "ques": "Find a beginner's acrylic paint set on Amazon, with at least 24 colors, suitable for canvas painting, and priced under $40.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--35", "ques": "Find a men's leather wallet on Amazon with RFID blocking, at least 6 card slots, and priced below $50. 
Check if it's available for FREE delivery.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--36", "ques": "Search for a children's science experiment kit on Amazon suitable for ages 8-13, with at least a 4-star rating and priced under $30.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--37", "ques": "Locate a queen-sized bedspread on Amazon with a floral pattern, and check if it's available in blue color.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--38", "ques": "Find a bird feeder on Amazon suitable for small birds, with an anti-squirrel mechanism, and check if it's available with free shipping.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--39", "ques": "Locate a travel guide book on Amazon for Japan, published in 2024, with at least 20 customer reviews.", "web": "https://www.amazon.com/"} +{"web_name": "Amazon", "id": "Amazon--40", "ques": "Locate a women's yoga mat in purple, with a thickness of at least 5mm, rated 4+ stars, and priced under $30 on Amazon. 
Check how many colors are available in total, and what is the return and delivery policy.", "web": "https://www.amazon.com/"} +{"web_name": "Apple", "id": "Apple--0", "ques": "Compare the prices of the latest models of MacBook Air available on Apple's website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--1", "ques": "Research the new features of the iOS 17 on Apple support and check its compatibility with the iPhone 12.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--2", "ques": "Compare the prices and chips for the iPhone 14 Pro and iPhone 15 Pro models directly from Apple's website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--3", "ques": "Find the latest model of the iPhone and compare the price and screen size between the pro and pro max.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--4", "ques": "How much does it cost to buy a Macbook pro, 16-inch, Apple M3 Max chip with 16-core CPU, 40-core GPU, 64GB unified memory, 1TB SSD.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--5", "ques": "Check the release date and price for the latest version of the iPhone.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--6", "ques": "Find AirPods on Apple and how many types are currently available.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--7", "ques": "When and where the Apple Vision Pro will be released.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--8", "ques": "Identify and list the specifications of the latest iPad model released by Apple, including its storage options, processor type, and display features.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--9", "ques": "Check the Apple Store for the availability of the latest iPhone model and schedule an in-store pickup at the nearest Apple Store for January 10, 2024.", "web": "https://www.apple.com/"} 
+{"web_name": "Apple", "id": "Apple--10", "ques": "Find information on the latest (as of today's date) MacBook model, including its key features such as processor type, memory size, and storage capacity.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--11", "ques": "Get information about the latest iPad model released by Apple, including its release date, base storage capacity, and starting price available on Apple's official website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--12", "ques": "What Apple Repair ways are mentioned on apple website, answer 2 of them.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--13", "ques": "How many colors does the latest MacBook Air come in?", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--14", "ques": "Identify the upgrade options available for the cheapest base model of the MacBook Pro 14-inch with M3 chip, and calculate the total price difference from the base model to the maximum upgrade (no Pre-Installed Software) offered by Apple.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--15", "ques": "On Apple's website, how many different types of keyboards are available when customizing your 14-inch MacBook Pro?", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--16", "ques": "Find on Apple website how many types of AirPods (3rd generation) are available and what is the price difference.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--17", "ques": "Search Apple for the accessory Smart Folio for iPad and check the closest pickup availability next to zip code 90038.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--18", "ques": "Check if there are trade-in offers for the latest model of iPhone.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--19", "ques": "On Apple's website, what is the slogan for the Mac and what is the 
slogan for the Macbook pro.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--20", "ques": "Check the price for an Apple iPhone 14 Plus with 256GB storage in Purple color.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--21", "ques": "Identify the available storage options for the latest iPad Pro on the Apple website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--22", "ques": "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--23", "ques": "Determine the price difference between the latest series of Apple Watch and Apple Watch SE on the Apple website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--24", "ques": "Find out the starting price for the most recent model of the iMac on the Apple website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--25", "ques": "On the Apple website, look up the processor for the latest model of the Apple TV.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--26", "ques": "Find the maximum video recording resolution supported by the latest iPad mini on the Apple website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--27", "ques": "On Apple's website, check if the HomePod mini in store is available in multiple colors and list them.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--28", "ques": "On the Apple website, find out if the Mac Mini can be configured with a GPU larger than 16-core.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--29", "ques": "On Apple's website, check the estimated battery life of the latest MacBook Air during web browsing in Tech Specs.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--30", "ques": "Check the storage options and prices for the latest iPad Pro models on 
Apple's website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--31", "ques": "On Apple's website, what is the slogan for the latest Apple Watch Series.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--32", "ques": "Investigate the trade-in value for an iPhone 11 Pro Max on Apple's website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--33", "ques": "Look for the color options available for the newest iMac.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--34", "ques": "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--35", "ques": "How many types of Apple Pencil are currently available on the Apple's website? Which one supports Wireless pairing and charging.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--36", "ques": "Browse Apple Music on the entertainment section of the Apple's website, and see which singers' names are included in the pictures on this page.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--37", "ques": "Compare the color options of iPhone 13 Pro, iPhone 14 Pro and iPhone 15 Pro.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--38", "ques": "Explore accessories for Apple Vision Pro, list at least three accessories.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--39", "ques": "Find solutions on Apple's website if you forgot your Apple ID password.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--40", "ques": "Find information on Apple website, and tell me the device weight of Apple Vision Pro and list 5 Built-in Apps it supports.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--41", "ques": "How much does it cost to buy an ipad mini with 64GB storage and Wi-Fi + Cellular connectivity? 
(no engraving, no apple pencil, no smart folio, no apple trade-in).", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--42", "ques": "Find updates for Apple Watch Series 7,8,9 on Apple's website.", "web": "https://www.apple.com/"} +{"web_name": "ArXiv", "id": "ArXiv--0", "ques": "Search for the latest preprints about 'quantum computing'.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--1", "ques": "Search for the latest research papers on quantum computing submitted to ArXiv within the last two days.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--2", "ques": "Look up the most recent papers related to 'cs.CL', select one and show its abstract.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--3", "ques": "Locate the most recent research paper about 'Algebraic Topology' under Mathematics published on ArXiv. Provide the title of the paper, the name of the authors, and the abstract.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--4", "ques": "Find the most recent research papers in Astrophysics of Galaxies. How many papers have been announced in the last day?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--5", "ques": "Search papers about \"quantum computing\" which has been submitted to the Quantum Physics category on ArXiv. How many results in total. 
What if search in all archives?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--6", "ques": "How many figures and tables are in the paper \"On the Sentence Embeddings from Pre-trained Language Models\"?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--7", "ques": "Find the most recent paper submitted on machine learning in the Computer Science category posted on ArXiv.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--8", "ques": "What is the latest news on ArXiv?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--9", "ques": "Find the latest research paper about neural networks published on ArXiv which has been submitted within the last week.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--10", "ques": "Visit ArXiv Help on how to withdraw an article if the submission is not yet announced.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--11", "ques": "For Non-English submissions, do I need to provide a multi-language abstract, if need, answer the separator between the multiple abstracts.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--12", "ques": "Find store in arXiv Help, tell me how many styles of arXiv Logo Shirt are available?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--13", "ques": "How many articles on ArXiv with 'SimCSE' in the title?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--14", "ques": "On ArXiv, how many articles have 'SimCSE' in the article and are originally announced in October 2023?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--15", "ques": "Searching Chinese Benchmark on ArXiv, how many papers announced in December 2023 mention being accepted for AAAI 2024?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--16", "ques": "Locate the latest research about gravitational waves that were uploaded to ArXiv this week and provide a brief 
summary of one article's main findings.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--17", "ques": "Find the paper 'GPT-4 Technical Report', when was v3 submitted?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--18", "ques": "Download the paper 'Dense Passage Retrieval for Open-Domain Question Answering'. How many formulas are in the article and which one is the loss function?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--19", "ques": "Which university maintains and manages ArXiv. Accessing the university's website from ArXiv, how many underegraduate students are currently at the university.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--20", "ques": "Find the latest paper on 'machine learning in the Statistics section of ArXiv and provide its abstract.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--21", "ques": "Search for papers on 'neural networks for image processing' in the Computer Science category on ArXiv and report how many were submitted in the last week.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--22", "ques": "Locate the ArXiv Help section and find instructions on how to subscribe to daily listing emails for new submissions in a specific category.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--23", "ques": "Determine how many articles with the keyword 'autonomous vehicles' were published in the 'Electrical Engineering and Systems Science' section of ArXiv yesterday.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--24", "ques": "Identify the most recent paper related to 'graph neural networks' on ArXiv and determine the affiliation of the first author.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--25", "ques": "Browse the ArXiv store and let me know how many different types of merchandise are available.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": 
"ArXiv--26", "ques": "Search for papers related to 'climate change modeling' on ArXiv and find out how many have been published in the Earth and Planetary Astrophysics (astro-ph.EP) category in the last week.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--27", "ques": "On ArXiv, what categories does Economics include, and what are their abbreviations?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--28", "ques": "Search 'Poly encoder' by title on ArXiv and check whether the articles in the search results provide HTML access.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--29", "ques": "On ArXiv, search for papers with 'Neural Network Optimization' in the title published in 2023, and provide the number of such papers.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--30", "ques": "Look up the submission guidelines on ArXiv for submitting a paper and tell me the formats for figures.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--31", "ques": "Search ArXiv for papers with 'Graph Neural Networks' in the abstract that were submitted between Jan 1, 2024, and Jan 3, 2024, and determine how many of these papers have more than five authors.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--32", "ques": "Locate the latest paper on ArXiv within the 'Nonlinear Sciences - Chaotic Dynamics' category, summarize the abstract and note the submission date.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--33", "ques": "Query ArXiv for the latest research article in the category of Systems and Control under Computer Science. Summarize the main objective or hypothesis presented in the paper and provide the names of the authors.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--34", "ques": "Search for the most recent paper related to non-commutative geometry submitted by an author with the first name John. 
Provide the title and the abstract.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--35", "ques": "Retrieve the latest research paper in Quantum Physics from ArXiv and provide the title, author(s), and date of submission.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--36", "ques": "Search 'CVPR 2023' and 'CVPR2023' through journal ref on ArXiv to see how many results there are respectively.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--37", "ques": "Find the names of people in ArXiv's Leadership Team.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--38", "ques": "Find the ArXiv Blog on the ArXiv website and summarize the content of its latest article.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--39", "ques": "Search the title 'GPT-4 Technical Report' and access this paper through HTML format. Read the paper on this page and tell me what is 'one of the main goals of developing such models' mentioned in the Introduction.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--40", "ques": "How many articles are there on each of the three most recent announce days in the Solar and Stellar Astrophysics section of ArXiv. Choose one at random and answer its title and when the first version was uploaded?", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--41", "ques": "Find the button to share arxiv non-profit store and follow the QR code to share the shop. 
Then add arXiv Forever short sleeve (XL) to your cart.", "web": "https://arxiv.org/"} +{"web_name": "ArXiv", "id": "ArXiv--42", "ques": "Find an article published between 1 January 2000 and 1 January 2005 that requires Support Vector Machines in the title and its Journey ref is ACL Workshop.", "web": "https://arxiv.org/"} +{"web_name": "BBC News", "id": "BBC News--0", "ques": "Find a report on the BBC News website about recent developments in renewable energy technologies in the UK.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--1", "ques": "Read the latest health-related news article published on BBC News and summarize the key points discussed.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--2", "ques": "Read the latest article regarding the environmental impacts of deforestation published within the last two days.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--3", "ques": "Check the leaderboard for Golf's DP World Tour in the SPORT section, what was the name of the most recent tournament, and how many teams have a Total of -10 strokes.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--4", "ques": "Find the latest article regarding the economic implications of climate change in Europe as reported by BBC News and summarize the central points.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--5", "ques": "Find the article \"What is climate change? A really simple guide\" and use it to answer what human activities are causing climate change.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--6", "ques": "Find the top story from BBC News in the technology section for today.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--7", "ques": "Find a AI-related story under Technology of Business. 
What is in the first picture in the story?", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--8", "ques": "Get a brief overview of the economic implications of the UK's latest trade deal posted on BBC News and the date when the article was published.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--9", "ques": "Find out which musician made the headlines in Music News.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--10", "ques": "Identify the main headlines covering the UK's plan to tackle climate change on BBC News.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--11", "ques": "Find out how many teams are in the Scottish Premiership of the Football Tournament and when did the Hibernian team's most recent match start?", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--12", "ques": "Find a picture in the travel section that contains food, tell me what the food is called and what region it comes from.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--13", "ques": "Search for recent news related to Trump and summarize the main points.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--14", "ques": "Find a news article on BBC News about the impact of the recent tech industry layoffs on the global economy. 
Summarize the key points and the name of the author, and provide the date of publication.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--15", "ques": "What does the current headline in Natural Wonders tell about.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--16", "ques": "Identify the most recent development or update in Brexit negotiations as reported on BBC News and report the key points and any stated impacts on European economies.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--17", "ques": "How many War related sections are currently in BBC News.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--18", "ques": "Visit BBC News Audio, What are the best PodCasts for 2023? List 2 of them.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--19", "ques": "Visit the Athletics calendar for the date of the next earliest game.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--20", "ques": "Find the latest article in the Green Living section on BBC News and provide a summary of its main points.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--21", "ques": "Identify the top headline in the World News section on BBC News and describe the region it is related to.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--22", "ques": "Determine the current top business story on BBC News and give a brief overview of its economic implications.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--23", "ques": "Identify the latest health-related news on BBC News and summarize the main findings or recommendations.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--24", "ques": "Search the latest article about space exploration on BBC News and summarize its key points.", "web": 
"https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--25", "ques": "Find the most recent sports analysis article on BBC News related to the English Premier League and summarize its key insights.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--26", "ques": "Locate the latest report on BBC News about the impact of recent natural disasters in Asia and summarize the key points and areas affected.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--27", "ques": "Find the most recent article on BBC News about archaeological discoveries and summarize the main findings and their significance.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--28", "ques": "Find the Market Data section on BBC News and tell me which company the data comes from.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--29", "ques": "Visit BBC News Audio and find out which podcast episode is currently featured as the \"New Releases\".", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--30", "ques": "In the Culture section, identify the latest film release reviewed and provide a brief summary of the review.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--31", "ques": "Check the Sports section for the result of the most recent Manchester United football match.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--32", "ques": "Find the artificial intelligence section, what is the top headline at this time, and which companies are involved?", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--33", "ques": "In the World News section, find the latest war situations of Middle East and provide a brief summary.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--34", "ques": "Find The SpeciaList section in Travel and browse the 
page to see which cities are mentioned.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--35", "ques": "In the Asia section, browse and identify the most recent report about technological advancements and summarize its content.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--36", "ques": "Look up recent articles in the Africa news section in World, summarize what topics most of these news are about", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--37", "ques": "Identify the latest book review featured in the Culture section and provide the title and author of the book.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--38", "ques": "Find news related to the storm in Weather section and indicate where and when the severe weather occurred.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--39", "ques": "Check the Horse Racing results in Sport section, browse all the games that took place yesterday and see which one had the highest number of runners.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--40", "ques": "Read and summarise a recent story on BBC News about people being injured or killed in wars.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--41", "ques": "Find Golf in BBC News, check the Leaderboard at this point in Women's Majors and count which country has the most players in the top 20? Which player has the best score amongst the Australian players and in what place.", "web": "https://www.bbc.com/news/"} +{"web_name": "Booking", "id": "Booking--0", "ques": "Find a Mexico hotel with deals for December 25-26.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--1", "ques": "Find the cheapest available hotel room for a three night stay from 1st Jan in Jakarta. 
The room is for 2 adults, just answer the cheapest hotel room and the price.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--2", "ques": "Find a hotel in Ohio From December 20th to December 23th for 3 adults and 2 rooms.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--3", "ques": "Find a hotel with 4 star and above rating in Los Angeles for 3 days from Dec 18th.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--4", "ques": "Search for the cheapest Hotel near Kashi Vishwanath Temple that offer breakfast from Dec 25th - Dec 26th.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--5", "ques": "Search a hotel with free WiFi and air conditioning in Bali from Jan 1 to Jan 4, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--6", "ques": "Book one room which provides breakfast, and airport shuttle from Jan 22 to 25 in Los Angeles.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--7", "ques": "Find a hotel room on January 3-6 that is closest to National University of Singapore and costs less than $500", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--8", "ques": "Get the hotel with highest review score and free cancelation in Chennai for 20/12/2023 - 21/12/2023.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--9", "ques": "Find hotels for 2 adults in London with a price less than 250 dollars for four days starting from December 25. 
You must browse the page and offer at least 3 options.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--10", "ques": "Find a well-reviewed hotel in Paris with available bookings suitable for a couple (2 adults) on Valentine's Day week, February 14-21, 2024, that offers free cancellation options.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--11", "ques": "Reserve a hotel in downtown Chicago with a rating of 9 or higher for a stay from March 20-27, 2024, which offers free cancellation and includes a fitness center.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--12", "ques": "Find a hotel in Paris with a customer review score of 8 or higher, free Wi-Fi, and available for a 5-night stay starting on January 5th, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--13", "ques": "Find and book a hotel in Paris with suitable accommodations for a family of four (two adults and two children) offering free cancellation for the dates of February 14-21, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--14", "ques": "Book a highly-rated hotel with a swimming pool and free WiFi near the Louvre Museum in Paris for the weekend of March 3-5, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--15", "ques": "Find the highest-rated luxury hotel in Rome available for booking from January 10, 2024, to January 20, 2024, for 2 adults. Include the cost, amenities offered, and customer rating.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--16", "ques": "Look for a hotel in Paris with a user rating of 9 or higher and available for a 5-night stay starting January 15, 2024. The hotel should also offer free Wi-Fi and breakfast included in the price. 
Provide the name, location, and price per night.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--17", "ques": "Find a hotel in Paris with a fitness center and a rating of 8 or higher available for a 5-night stay starting from February 14, 2024, and sort the results by best reviewed.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--18", "ques": "Search a hotel in London with a user rating of 8 or higher for a stay between February 14th, 2024, and February 21st, 2024, suitable for a couple. Provide the name and a short description of the hotel.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--19", "ques": "Look for a hotel with customer ratings above an 8.0 in Paris, France for a weekend stay from March 18, 2024, to March 20, 2024, and list top three suggestions based on user reviews.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--20", "ques": "Locate a hotel in Rome with a good rating (7 or above) that offers free cancellation and breakfast included, for a three-night stay from February 28 to March 2, 2024, for two adults.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--21", "ques": "Find a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on March 10, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--22", "ques": "Search for a hotel in Amsterdam with a customer review score of 9 or higher, offering bicycle rentals, for a week-long stay from March 15 to March 22, 2024, for two adults.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--23", "ques": "Identify a hotel in Tokyo with a spa and wellness center, rated 9 or above, with availability for a five-night stay starting on February 20, 2024. 
Check if free cancellation is offered.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--24", "ques": "Find a hotel in Barcelona for a stay from February 25-28, 2024. Please sort the results by distance from the beach and make sure they offer free Wi-Fi and breakfast.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--25", "ques": "Search for a hotel in Lisbon with airport shuttle, rated 8.5 or above, available for a six-night stay from March 1 to March 7, 2024, for two adults, breakfast included.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--26", "ques": "Check Booking.com for a 3-star hotel or higher in Paris with a guest rating above 8.0 and available parking for dates February 20-23, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--27", "ques": "Locate a hotel in Melbourne offering free parking and free WiFi, for a stay from February 28 to March 4, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--28", "ques": "Find a hotel in Dubai with a swimming pool, for a week-long stay from February 22 to February 29, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--29", "ques": "Search for a hotel in Toronto with a fitness center and a rating of 8+, available for a two-night stay from March 5 to March 7, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--30", "ques": "Search for hotels in London from March 20 to March 23, 2024, on Booking. 
How many hotels are left after applying the Breakfast included and Fitness center filters?", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--31", "ques": "Search for hotels in Rio de Janeiro from March 1-7, 2024, check the Brands filter to see which brand has the most hotels and which brand has the fewest.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--32", "ques": "Look for hotels in Sydney from February 24 to February 27, 2024, on Booking. Once the Swimming Pool and Airport Shuttle filters are applied, what is the total number of hotels available?", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--33", "ques": "Find the Customer Service on the Booking website, browse the questions about cancellation, and tell me 'how do I know whether my booking has been cancelled'.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--34", "ques": "Search for a hotel in Berlin available for a three-night stay from March 15 to March 18, 2024, for one adult. Tell me the price in USD and CNY for the three-night stay.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--35", "ques": "Browse the booking website to get inspiration for your next trip, and summarize at least three places mentioned in one of the travel articles.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--36", "ques": "Search for a budget hotel in Rome under $100 per night for one adult from March 20 to March 23, 2024. 
Sort the results by price, identify if any of top three results offer breakfast.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--37", "ques": "Search for a resort (not hotel) in Bali, detailing the available dates between March 20, 2024, and March 25, 2024, and checking any provided tour or cultural experiences.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--38", "ques": "Look up Vienna hotel options with availability for a 4-night stay from February 28 to March 4, 2024, with amenities that include a Parking, breakfast included, and a rating of 8+ on Booking.com.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--39", "ques": "Find a pet-friendly hotel with parking available in downtown Toronto for the stay of February 24-26, 2024.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--40", "ques": "I need to choose a hotel in Shenzhen, please select date (6 March to 8 March 2024) and click the search button. 
How much it costs when convert the price to Chinese Yuan on the page.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--41", "ques": "Browse Booking's homepage to find out which company it belongs to.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--42", "ques": "Search for a hotel in Hokkaido for the period March 1 to March 7, 2024, with a rating of 9+, check out its user reviews, which categories are greater than 9 and which are less than 9?", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--43", "ques": "Search for properties in Los Angeles, browse the results page to see what filters are available, list some of them.", "web": "https://www.booking.com/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--0", "ques": "Look up the pronunciation and definition of the word \"sustainability\" on the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--1", "ques": "Find the pronunciation, definition, and a sample sentence for the word 'serendipity'.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--2", "ques": "Look up the pronunciation, definition, and example sentence for the word \"ubiquitous\" in UK and US English.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--3", "ques": "Look up the definition, pronunciation, and examples of the word \"zeitgeist.\"", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--4", "ques": "Look for the British English pronunciation of the word \"innovate\" and write down the International Phonetic Alphabet (IPA) notation, then find one example sentence provided in the Cambridge Dictionary that uses this word.", "web": "https://dictionary.cambridge.org/"} +{"web_name": 
"Cambridge Dictionary", "id": "Cambridge Dictionary--5", "ques": "Learn the UK and US pronunciation of the word \"procrastination\", and find one example sentence that reflects its use in context.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--6", "ques": "Search for the word \"sustainability\" on the Cambridge Dictionary, what is the translation of sustainability into Chinese and French in the dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--7", "ques": "Look up the meaning, pronunciation, and an example sentence of the word \"gestalt\" using the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--8", "ques": "Find three different meanings of \"dog\" in Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--9", "ques": "Look up the British pronunciation of the word \"euphoria\" and find an example sentence using that word on the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--10", "ques": "Look up the definition and pronunciation of the word \"impeccable\" and also find an example sentence using that word.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--11", "ques": "Look up the pronunciation and definition of the word \"ameliorate,\" and provide an example sentence using the word.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--12", "ques": "Find the pronunciation, definition, and a sample sentence for the word \"resilience\" in the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": 
"Cambridge Dictionary--13", "ques": "Find one word, one phase and one idiom related to euphoria in Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--14", "ques": "Use the Cambridge Dictionary to find the pronunciation, definition, and one example sentence for the word \"concatenate\".", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--15", "ques": "Find the pronunciation and a sample sentence for the word \"pandemic.\"", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--16", "ques": "Look up the definition of \"cryptocurrency\" on Cambridge Dictionary, provide the pronunciation, and use it in two example sentences that illustrate different contexts.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--17", "ques": "How many meanings of \"unblemished\" are given in Cambridge Dictionary? 
Please browse the page and give the number directly.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--18", "ques": "Search for \"to behave well\" in Cambridge Dictionary's Thesaurus and see which synonyms the dictionary gives.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--19", "ques": "Try a Cambridge Dictionary translation and tell me which company provided the translation.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--20", "ques": "Look up the definition, pronunciation (both UK and US), and find one example sentence for the word \"altruism\" in the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--21", "ques": "Search for the word \"ephemeral\" on Cambridge Dictionary and find its translation into Spanish.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--22", "ques": "Use the Cambridge Dictionary to find the definition, UK pronunciation, and an example sentence for the word \"quintessential.\"", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--23", "ques": "Find the US English pronunciation of the word \"meticulous\" using the Cambridge Dictionary and note the International Phonetic Alphabet (IPA) notation, then find one example sentence provided in the dictionary using this word.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--24", "ques": "Look up the definition and both UK and US pronunciation of the word \"reverie,\" and provide an example sentence using the word from Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge 
Dictionary--25", "ques": "Find two different meanings of the word \"harmony\" in the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--26", "ques": "Search for the word \"nostalgia\" in the Cambridge Dictionary and report the translation of this word into Chinese.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--27", "ques": "Look up the meaning, pronunciation, and an example sentence of the word \"solitude\" using the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--28", "ques": "Search for \"feel giddy\" in Cambridge Dictionary's Thesaurus and list the synonyms the dictionary provides.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--29", "ques": "Go to the Plus section of Cambridge Dictionary, find Image quizzes and do an easy quiz about Animals and tell me your final score.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--30", "ques": "Find the grammar for present perfect simple uses in English, including examples of affirmative, negative, and interrogative sentences, on the Cambridge Dictionary website.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--31", "ques": "Look up the use of modal verbs in grammar section for expressing possibility (e.g., 'might', 'could', 'may') and find examples of their usage in sentences on the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--32", "ques": "Search for the differences between \"fewer\" and \"less\" in grammar section, and provide examples illustrating their correct usage from the Cambridge Dictionary.", "web": 
"https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--33", "ques": "Find explanations and examples of the passive voice in Grammar on the Cambridge Dictionary website.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--34", "ques": "Use the Cambridge Dictionary to understand the rules for forming and using comparative and superlative adjectives in English Grammar, including example sentences.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--35", "ques": "Find the most common prepositions that consist of groups of words on the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--36", "ques": "Search for guidelines on using indirect speech in English, with examples of how to change direct speech to indirect speech, on the Cambridge Dictionary.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--37", "ques": "Use Cambridge Dictionary to understand the use of articles ('a', 'an', 'the') in English Grammar, including examples of usage with both countable and uncountable nouns.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--38", "ques": "Go to the Plus section of Cambridge Dictionary, finish a recommended Grammar quiz without login and tell me your final score.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--39", "ques": "Try the Word Scramble game in the Plus section, Can you beat the clock by unscrambling the letters to spell the word? 
(Just try the first example.)", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--40", "ques": "Look up the definition, pronunciation in UK English, and at least one example using the word 'mitigate'.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--41", "ques": "Find and browse Cambridge Dictionary Shop section, listing 3 items.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Cambridge Dictionary", "id": "Cambridge Dictionary--42", "ques": "Convert the Cambridge Dictionary homepage from English (UK) to Deutsch.", "web": "https://dictionary.cambridge.org/"} +{"web_name": "Coursera", "id": "Coursera--0", "ques": "Find a beginner-level online course about '3d printing' which lasts 1-3 months, and is provided by a renowned university.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--1", "ques": "Search for a beginner-level online course about Python programming, suitable for someone who has no programming experience on Coursera.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--2", "ques": "Find a Beginner's Spanish Specialization on Coursera and show all the courses in this Specialization.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--3", "ques": "Identify a new course or Specialization on Coursera related to Python Data Science, sort the courses by newest, what the first course is and which institution offers it.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--4", "ques": "Identify a course or Specialization on Coursera that helps business process management with with a rating 4.7.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--5", "ques": "Identify a Specialization on Coursera that teaches C++ programming for beginners, provide the name and what the learning outcomes 
are.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--6", "ques": "Identify a course on Coursera related to 'Artificial Intelligence for Healthcare' and note the course duration along with the number of quizzes in Assessments.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--7", "ques": "Find a course on Coursera that teaches Reinforcement Learning for Intermediate with a rating of at least 4.5. Provide the name of the course, the institution offering it, and the number of reviews it has received.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--8", "ques": "Find a free course related to 'R for Data Science' available on Coursera. Scroll to find a course with the Free tag. What language the course is taught in?", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--9", "ques": "Identify a Coursera course on artificial intelligence ethics that has a duration of less than 20 hours to complete and has been rated 4+ stars by participants.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--10", "ques": "Locate an introductory course related to artificial intelligence on Coursera, ensuring it's suitable for beginners and contains at least one module discussing Ethical Considerations.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--11", "ques": "Search for a Specialization on Coursera about project management that is produced by a university, show a testimonial for this Specialization.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--12", "ques": "Look for a Coursera course (not Specialization) that teaches Java programming basics.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--13", "ques": "Look for a Specialization on Coursera that teaches Python programming, and identify the skills you will learn by taking this 
Specialization.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--14", "ques": "Find a course on Coursera related to Introductory Project Management that includes modules on Agile methodology.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--15", "ques": "Find a course on Coursera named 'Introduction to Mathematical Thinking' offered by Stanford, what is the percentage (rounded) of 5 star ratings in reviews and which level has the least percentage?.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--16", "ques": "Identify a course on Coursera named 'Introduction to Finance: The Basics', who is the course instructor and what other courses does he/she teach.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--17", "ques": "How many results are there for a search on Coursera for Machine Learning, then filtered by Credit Eligible and 1-4 Years duration?", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--18", "ques": "Identify a Coursera course that teaches JavaScript, which is beginner-friendly and includes a certificate upon completion.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--19", "ques": "Identify a course on Coursera that provides an introduction to Psychology, list the instructor's name, the institution offering it, and how many hours it will approximately take to complete.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--20", "ques": "Find an Intermediate-level online course on Coursera about 'Blockchain Technology' which lasts between 1 to 4 weeks, and is provided by a well-known institution. 
Also, note the course's main goals and the instructor's name.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--21", "ques": "Search for an online course on Coursera about 'Digital Marketing', suitable for beginner-level learners. Specify the course duration, the main learning outcomes, and the institution offering the course.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--22", "ques": "Identify a Specialization on Coursera that focuses on 'Human Resource', list the courses included in this Specialization, and the institution offering it.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--23", "ques": "Find a course on Coursera about 'Artificial Intelligence Ethics', which has a duration of less than 5 weeks and has been rated 4.5 stars or higher. Provide the course name and the instructor's name.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--24", "ques": "Locate an online course on Coursera related to 'Sustainability' that belongs to Physical Science and Engineering subject. The course should include a module on Measuring Sustainability. Note the course duration and the offering institution.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--25", "ques": "Find a course on Coursera about 'Relativity' for beginners. List the course's main topics and the estimated time (in hours) required to complete it.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--26", "ques": "Identify a Specialization on Coursera that offers an overview of 'Renewable Energy'. The Specialization should be beginner-level and include a course on Renewable Energy Futures. 
Note the instructor's name and the number of weeks required to complete the course if I spend 5 hours a week.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--27", "ques": "Search for a Specialization on Coursera about 'Data Visualization' that includes a project. Provide the name of the Specialization, the institution offering it, and the skills that will be developed by completing it.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--28", "ques": "Locate a Coursera Guided project related to 'Astrophysics' suitable for advanced learners. Mention the course duration, the institution offering it, and the main subjects covered in the course.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--29", "ques": "Browse the Coursera website and find the price required for one year of Coursera Plus. How much is the discount? Then list 3 companies that work with Coursera.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--30", "ques": "Locate the course 'Modern Art & Ideas' on Coursera offered by The Museum of Modern Art. Find out the percentage (rounded) of 3-star ratings in the reviews and note which star level has the lowest percentage.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--31", "ques": "Search for the course 'Exploring Quantum Physics' on Coursera, offered by the University of Maryland, College Park. Identify the percentage (rounded) of 5-star ratings in the reviews.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--32", "ques": "Search for 'Data Analysis' courses on Coursera. Apply filters to find courses that are 'Beginner Level' and have a duration ranging from 1 to 3 months. 
Determine the total count of courses that match these specifications.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--33", "ques": "Find a beginner level Coursera course related to \"Internet of Things (IoT)\" with a high rating. Provide the course name, instructor's name, and a brief summary of the skills that will be taught.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--34", "ques": "Find the course on Coursera named 'Essentials of Global Health'. Determine the instructor of this course and summarize his bio, note if there are any additional courses he offers on Coursera.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--35", "ques": "Find a Coursera course on Sustainable Agriculture practices, and detail the course's objectives and the background of the lead instructor.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--36", "ques": "Browse Coursera, which universities offer Master of Advanced Study in Engineering degrees? Tell me what is the latest application deadline for this degree?", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--37", "ques": "Browse the Coursera homepage and list at least three free courses.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--38", "ques": "Browse Coursera, which universities and companies from Australia are partners of Coursera? List all of them.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--39", "ques": "Find the Space Safety course offered by TUM on Coursera. How many videos are there in module 2? 
What is the name of each video?", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--40", "ques": "Browse Coursera for Business and Coursera for Teams and summarise some of their advantages.", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--41", "ques": "Browse online degrees section on Coursera and list 3 Bachelor's degree programmes.", "web": "https://www.coursera.org/"} +{"web_name": "ESPN", "id": "ESPN--0", "ques": "Look up the current standings for the NBA Eastern Conference on ESPN.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--1", "ques": "Check the latest articles on ESPN for updates on any trades that occurred in the NBA within the past 2 days.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--2", "ques": "Show the scores and main highlight of the Milwaukee Bucks game that took place within the last 2 days on ESPN.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--3", "ques": "Retrieve the final score from the most recent NBA game broadcast on ESPN, including the playing teams' names and the date of the match.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--4", "ques": "Check ESPN for the final scores of NBA games that were played yesterday.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--5", "ques": "Identify the top scorer in the NBA from the latest completed game and note down the points scored, the team they play for, and their position on the team.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--6", "ques": "Find the result of the latest basketball game between the Los Angeles Lakers and the Boston Celtics, including the final score and top scorer from the match.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--7", "ques": "Retrieve the final score and a brief summary of the latest NBA game played by the Los Angeles Lakers as reported on ESPN.", "web": 
"https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--8", "ques": "Find information on ESPN about the top three scoring leaders in the NBA as of the last day of the regular season, and note which teams they play for.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--9", "ques": "Search on ESPN for how many teams have Los Angeles in their name and how many of them are NBA.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--10", "ques": "Check ESPN for the score and a brief recap of the latest college football championship game.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--11", "ques": "How many NBA teams are there and list all the teams with 'New' in their name.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--12", "ques": "The first three Top Headlines in the current ESPN home page correspond to which sports leagues?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--13", "ques": "Identify today's top headline in the Basketball section of ESPN, and summarize the main points of that article.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--14", "ques": "Find the latest news about NBA trades or player movements on ESPN and report the most recent trade deal OR player acquisition.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--15", "ques": "Check the scores of the NBA games played on December 25, 2023.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--16", "ques": "Check the schedule for the NBA game on December 25, 2023, and provide the teams that are playing and their current standings in their respective conferences.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--17", "ques": "Check out the NBA Basketball Power Index 2023-24 to see which teams are in first place and which are in last place.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--18", "ques": "How many 
sports leagues can you choose from on the ESPN home page?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--19", "ques": "Who has the highest salary in Boston Celtics Roster 2023-24?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--20", "ques": "Look up the current leaders in rebounds and assists in the NBA Western Conference on ESPN.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--21", "ques": "Show the scores and main highlight of the Denver Nuggets game that occurred within the last 3 days on ESPN.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--22", "ques": "Find the latest Team transactions in the NBA within the past week.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--23", "ques": "Find the result of the latest basketball game between the Miami Heat and the New York Knicks, including the final score and top rebounder from the match.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--24", "ques": "Find the final score from the most recent NFL game broadcast on ESPN, including the teams' names and the date of the match.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--25", "ques": "Identify the player with the most assists in the latest NBA game and show me the assists, the team they play for, and their position.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--26", "ques": "Find information on ESPN NBA schedule. 
Tell me yesterday's matchups in which the loser high was higher than the winner high.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--27", "ques": "Search on ESPN for how many teams have 'Golden' in their name and how many of them are in the NHL.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--28", "ques": "How many MLB teams are there and list all the teams with 'City' in their name.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--29", "ques": "Identify today's top headline in the Soccer section of ESPN, and summarize the main points of that article.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--30", "ques": "Check out the NHL Standings 2023-24 on ESPN to see which teams are at the top and which are at the bottom in Eastern and Western Conference. What about the situation in Division.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--31", "ques": "Who has the heaviest weight among infielders in the New York Yankees Roster 2023-24?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--32", "ques": "Review yesterday's NHL game results on ESPN, focusing on teams' performance.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--33", "ques": "Locate the latest ESPN articles discussing potential MVP candidates in the NFL for 2023 season.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--34", "ques": "Visit ESPN to view the Philadelphia 76ers' latest injuries.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--35", "ques": "Browse ESPN to find out when the next game of the Los Angeles Lakers will start. 
Then navigate to the ticket purchasing website from ESPN, what is the cheapest ticket available.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--36", "ques": "Search for Lionel Messi's last 5 games, which teams has he played for, and what are the results?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--37", "ques": "Check out LeBron James' Stats to see how many games he has played in his career so far.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--38", "ques": "Check Los Angeles Lakers Stats 2023-24, calculate Anthony Davis' games played (GP) percentage, tell me if there are other players with the same games played percentage as Anthony Davis.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--39", "ques": "Check the New York Jets Depth Chart in the NFL section of ESPN and identify the players listed as injured in the 2ND position.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--40", "ques": "Browse the ESPN+ page from ESPN for a brief summary of what ESPN+ Tools is used for.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--41", "ques": "Find out which four teams the NFC North contains in the NFL on ESPN.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--42", "ques": "Check out NCAAM standings on ESPN, what are the teams with equal wins and losses in the America East Conference currently?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--43", "ques": "Check out NCAAW recruiting on ESPN, what colleges are the top three players from?", "web": "https://www.espn.com/"} +{"web_name": "GitHub", "id": "GitHub--0", "ques": "Search for an open-source project related to 'climate change data visualization' on GitHub and report the project with the most stars.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--1", "ques": "Search for an open-source repository for machine learning in Python, 
specifically focused on decision trees, updated within the last 2 days.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--2", "ques": "Look for the trending Python repositories on GitHub with most stars.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--3", "ques": "Find out how much more package storage the Enterprise version has over Team in GitHub Pricing.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--4", "ques": "Find a popular JavaScript repository created in the last 30 days on GitHub with a Readme file.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--5", "ques": "Find a Python repository on GitHub that has been updated in the past 2 days and has at least 500 stars.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--6", "ques": "Search for an open-source project related to 'cryptocurrency wallet' updated in the past 30 days and provide the top three contributors.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--7", "ques": "Find the official GitHub repository for ALBERT and show me what files the repo changed in the most recent commit.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--8", "ques": "Look up the latest stable release version of Vuex and find out when it was published.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--9", "ques": "Locate a repository on GitHub that was created in the last week and has 50 or more stars. 
Provide brief details about the project's purpose and its programming language.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--10", "ques": "If I start using Copilot Individual, how much US dollars will it cost per year and what features does it have?", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--11", "ques": "Find a newly created open-source project on GitHub related to 'climate change' that has been initiated in January 2023; check the main programming language used and the project's description.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--12", "ques": "Retrieve the latest release from the 'electron/electron' repository on GitHub and note down the release version number and date.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--13", "ques": "Identify the latest top-trending open-source project in the category of 'Machine Learning' on GitHub, and check the number of stars it has received.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--14", "ques": "Locate the repository for the open-source project \"vscode\" and identify the top three contributors.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--15", "ques": "Locate a repository on GitHub related to 'quantum computing' that has been updated within the last week and has at least 50 stars. 
Provide a brief description of the project.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--16", "ques": "Find the GitHub Skill section and how many courses are under the 'First day on GitHub' heading.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--17", "ques": "Locate a C++ project on GitHub that has been recently updated in the last week and has at least 500 stars, then describe its main purpose.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--18", "ques": "Identify and report the most popular (in terms of stars) open-source image processing tool on GitHub.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--19", "ques": "Look up the most recently updated Python repository on GitHub that is tagged with 'web scraping' and has over 100 stars.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--20", "ques": "Open GitHub Copilot's FAQs to find the official answer to when Copilot chat can be used on mobile.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--21", "ques": "Find the Security topic in GitHub Resources and answer the role of GitHub Advanced Security.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--22", "ques": "Find an open-source repository on GitHub focused on natural language processing in Ruby, updated within the last week.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--23", "ques": "Find the wiki page of ohmyzsh on GitHub and tell me how to change the theme of zsh to agnoster.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--24", "ques": "Locate the GitHub repository for the open-source project \"angular\" and identify the last three issues closed.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--25", "ques": "Search for a 'virtual reality' related repository on GitHub updated in the last 10 days with at least 200 stars and summarize 
its main objective.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--26", "ques": "Find the Resolve merge conflicts course in GitHub Skills and what actions learners will perform in this course.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--27", "ques": "Find a Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--28", "ques": "Identify the most starred JavaScript repositories on GitHub that were created after 2023-12-29.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--29", "ques": "Compare the maximum number of private repositories allowed in the Free and Pro plans in GitHub Pricing.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--30", "ques": "Search for an open-source project related to 'blockchain technology' on GitHub updated in the past 15 days and list the top five contributors.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--31", "ques": "Find the official GitHub repository for TensorFlow and list the files changed in the last commit. 
Tell me the name of changed files, total additions and total deletion.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--32", "ques": "Discover the latest C# repository on GitHub related to 'game development' and having over 150 stars, and describe its main features.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--33", "ques": "Find Customer Stories on the GitHub page and list the 2 stories that appear on the web page.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--34", "ques": "Search for an open-source project on GitHub related to 'Protein prediction' and identify the project with the highest number of forks.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--35", "ques": "Check the latest release version of React and the date it was published on GitHub.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--36", "ques": "Identify a new open-source project on GitHub related to 'AI agriculture' that created in 2022, and note its main programming language and description.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--37", "ques": "List the 3 features mentioned in GitHub's Copilot product page.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--38", "ques": "Identify and report the most popular (by stars) open-source repo related to cybersecurity on GitHub.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--39", "ques": "Browse the GitHub Trending and find out which developer is currently ranked first this month and the corresponding repository.", "web": "https://github.com/"} +{"web_name": "GitHub", "id": "GitHub--40", "ques": "Select Sign up on the GitHub homepage to see if email 'test123@gmail.com' already exists.", "web": "https://github.com/"} +{"web_name": "Google Flights", "id": "Google Flights--0", "ques": "Book a journey with return option on same day from Edinburg to Manchester on December 
28th and show me the lowest price option available.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--1", "ques": "Show me the list of one-way flights today (February 17, 2024) from Chicago to Paris.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--2", "ques": "Find the lowest fare from all eligible one-way flights for 1 adult from JFK to Heathrow on Jan. 22.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--3", "ques": "Search for the one-way flight available from Calgary to New York on Jan. 1st with the lowest carbon dioxide emissions.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--4", "ques": "Search for one-way flights from New York to London on Dec. 26th and filter the results to show only non-stop flights.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--5", "ques": "Find flights from Chicago to London on 20 December and return on 23 December.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--6", "ques": "Search for a flight on December 19 and return on December 26 from Tel Aviv to Venice and Select First Class.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--7", "ques": "Find a round trip from Phoenix to Miami (Dec. 25th - Dec. 
28th), show the First Class plane tickets for me that do not exceed $1320..", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--8", "ques": "Search a one-way filght from Dublin To Athens Greece for 1 Adult that leaves on December 30 and analyse the price graph for the next 2 months.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--9", "ques": "Find a one way economy flight from Pune to New York in Jan. 15th and show me how long it will take for flight transfer.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--10", "ques": "Locate the cheapest round-trip flights from New York to Tokyo leaving on January 25, 2024, and returning on February 15, 2024.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--11", "ques": "Compare the prices for round-trip flights from New York to Tokyo for a departure on February 10, 2024, and a return on February 24, 2024, and select the option with the least number of stops.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--12", "ques": "Find the best-priced round-trip flight from New York to London leaving on December 25, 2023, and returning on January 5, 2024, with one stop or fewer.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--13", "ques": "Find the cheapest round-trip flight option from New York City to Tokyo for a departure on January 10, 2024, and a return on January 24, 2024.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--14", "ques": "Compare flight options and find the lowest round trip fare from New York to London departing on January 10, 2024, and returning on January 17, 2024.", "web": "https://www.google.com/travel/flights/"} 
+{"web_name": "Google Flights", "id": "Google Flights--15", "ques": "Compare the prices and total duration of non-stop flights from New York to Tokyo Narita Airport departing on February 12th, 2024, and returning on February 26th, 2024.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--16", "ques": "Find the cheapest one-way flight from New York to Tokyo departing on January 15, 2024, and provide the airline and total flight duration.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--17", "ques": "Find the cheapest round-trip flight from New York to Paris leaving on December 27, 2023, and returning on January 10, 2024.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--18", "ques": "Compare flight options from New York to Tokyo for a round trip leaving on January 25, 2024, and returning on February 15, 2024, for one adult. Prioritize the comparisons by the shortest travel time.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--19", "ques": "Find the cheapest one-way flight from London to Paris, departing on January 25, 2024. 
Include the airline, total travel time, and layovers for the chosen flight.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--20", "ques": "Book a round-trip flight from San Francisco to Berlin, departing on March 5, 2024, and returning on March 12, 2024, and find the option with the shortest total travel time.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--21", "ques": "Locate the lowest-priced one-way flight from Tokyo to Sydney for an adult, departing on February 25, 2024, and include the flight duration and number of layovers.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--22", "ques": "Find a round-trip flight from Rio de Janeiro to Los Angeles, leaving on March 15, 2024, and returning on March 22, 2024, and select the option with the least carbon dioxide emissions.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--23", "ques": "Search for a one-way flight from Mumbai to Vancouver on February 28, 2024, filtering the results to show only 1-stop flights.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--24", "ques": "Compare prices for economy class round-trip flights from Dubai to Rome, departing on March 1, 2024, and returning on March 8, 2024, and select the option with the fewest stops.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--25", "ques": "Find a one-way business class flight from Buenos Aires to Amsterdam on March 10, 2024, and provide the details of the flight with the shortest duration.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--26", "ques": "Search for the cheapest round-trip flights from Bangkok to Madrid, leaving on February 26, 2024, and 
returning on February 28, 2024, and provide options under $1000.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--27", "ques": "Locate a one-way flight from Johannesburg to Toronto on March 30, 2024, for one adult, and analyze the price trends for the following month.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--28", "ques": "Find the best-priced round-trip flight from Seattle to Paris, departing on February 27, 2024, and returning on March 1, 2024, with a maximum of one stop.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--29", "ques": "Compare the prices and total travel time of non-stop flights from Mexico City to Frankfurt, departing on March 5, 2024, and returning on March 15, 2024.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--30", "ques": "Find the most affordable one-way flight from Cape Town to Singapore, departing on March 20, 2024, and include the airline and total number of layovers.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--31", "ques": "Find a one-way economy flight from Auckland to Honolulu on March 25, 2024, browse the full page and display a flight option with the most stops.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--32", "ques": "Search for round-trip flights from Stockholm to Toronto, departing on March 3, 2024, and returning on March 10, 2024, and sort the results to find the shortest total travel time.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--33", "ques": "Find a one-way flight from Shanghai to Vancouver on February 27, 2024, and compare the options based on carbon dioxide emissions.", "web": 
"https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--34", "ques": "Compare business class flight options from Lisbon to Singapore for a one-way trip on March 15, 2024, select one of the flights and see which websites offer its booking options. Which one is the cheapest.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--35", "ques": "Find the lowest-priced one-way flight from Cairo to Montreal on February 21, 2024, including the total travel time and number of stops.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--36", "ques": "Search for round-trip flights from Helsinki to New Delhi, departing on March 28, 2024, and returning on April 4, 2024, and filter the results to show only flights under $1000.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--37", "ques": "Locate a round-trip flight from Buenos Aires to Beijing, leaving on February 28, 2024, and returning on March 3, 2024, check out one of the options and tell me if the airline for my return flight is the same as my departure flight.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--38", "ques": "Compare the prices and flight durations for economy class flights from Oslo to Dubai, departing on March 8, 2024, and show the options with no more than two layovers.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--39", "ques": "Find a one-way flight from Prague to a city in Japan on March 20, 2024, which city in Japan is cheaper to go to, Tokyo or a certain city in Hokkaido?", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--40", "ques": "Browse destinations on the Google Flights homepage from Seattle, look at destinations on a map, and 
recommend some famous places to travel that are within a reasonable distance and price.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Flights", "id": "Google Flights--41", "ques": "Choose one way business class ticket from Hong Kong to Glacier National Park on 8 March 2024, offering a 1 stop ticket.", "web": "https://www.google.com/travel/flights/"} +{"web_name": "Google Map", "id": "Google Map--0", "ques": "Find 5 beauty salons with ratings greater than 4.8 in Seattle, WA.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--1", "ques": "Tell me one bus stop that is nearest to the intersection of main street and Amherst street in Altavista.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--2", "ques": "Find Apple Stores close to zip code 90028", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--3", "ques": "The least amount of walking from Central Park Zoo to the Broadway Theater in New York.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--4", "ques": "Plan a trip from Boston Logan Airport to North Station.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--5", "ques": "Search for a parking garage near Thalia Hall in Chicago that isn't open 24 hours.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--6", "ques": "Find all Uniqlo locations in Chicago, IL.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--7", "ques": "Find bus stops in Alanson, MI", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--8", "ques": "Find a place to climb within 2 miles of zip code 90028.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--9", "ques": "Find the art gallery that is nearest to Los Angeles Hindu Temple.", "web": 
"https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--10", "ques": "Search for a park in the state of California called Castle Mountains National Monument and find out it's Basic Information.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--11", "ques": "Locate a large store in Washington that has kids' and maternity products, also check if it has a parking lot.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--12", "ques": "Find 5 places that serve burgers near 44012 zip code and sort these 5 places by highest rating.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--13", "ques": "Find a parking lot in Gloucester and book a ride from there to North Plymouth, view the map to understand the route better.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--14", "ques": "Find motorcycle parking near Radio City Music Hall.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--15", "ques": "Find daytime only parking nearest to Madison Square Garden. Summarize what people are saying about it. ", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--16", "ques": "Find EV charging supported parking closest to Smithsonian museum.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--17", "ques": "Search for locksmiths open now but not open 24 hours in Texas City.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--18", "ques": "Find a route between Chicago to Los Angeles, then print the route details.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--19", "ques": "I will arrive Pittsburgh Airport soon. Provide the name of the Hilton hotel closest to the airport. 
Then, tell me the the walking time to the nearest supermarket from the hotel.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--20", "ques": "Find Tesla Destination Charger closest to the National Air and Space Museum.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--21", "ques": "Identify the nearest bus stop to the corner of Elm Street and Oak Street in Massachusetts.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--22", "ques": "Find a Best Buy store near zip code 33139.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--23", "ques": "Determine the shortest walking route from The Metropolitan Museum of Art to Times Square in New York.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--24", "ques": "Plan a journey from San Francisco International Airport to Union Square via driving.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--25", "ques": "Search for a parking facility near the Fox Theater in Detroit that closes at night.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--26", "ques": "Search for Los Angeles on Google Map, try to print the map as PDF and summarize the information on the map.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--27", "ques": "Locate the Target stores in Atlanta, GA. 
How many results are shown on the map.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--28", "ques": "Find the search settings for Google Map, what options are shown on that page?", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--29", "ques": "Identify bus stops in Ypsilanti, MI, list three of them.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--30", "ques": "Locate a parking lot near the Brooklyn Bridge that open 24 hours. Review the user comments about it.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--31", "ques": "First search New York's Central Park Zoo on Google Map, and then find the way to share the map. What is the generated sharing link?", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--32", "ques": "Search for plumbers available now but not open 24 hours in Orlando, FL.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--33", "ques": "Check out Denver International Airport's information and tell me: 1) which level has the least proportion in reviews; 2) what are its Accessibility and Amenities.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--34", "ques": "Find a hiking trail within 2 miles of zip code 80202.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--35", "ques": "Search for a natural reserve in Texas called Big Bend National Park and gather its Basic Information.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--36", "ques": "Identify 5 restaurants serving pizza near the 30309 zip code and rank them by their ratings.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--37", "ques": "Locate a parking area in Salem and find a route from there to Marblehead, including map 
directions for better understanding.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--38", "ques": "Search for bicycle parking near the Empire State Building.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--39", "ques": "Find a route from Miami to New Orleans, and provide the detailed route information.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Map", "id": "Google Map--40", "ques": "Find a restaurant in Boston that eats Boston lobster and asks for a rating of 4.6 or higher, and check out what a one-star review says.", "web": "https://www.google.com/maps/"} +{"web_name": "Google Search", "id": "Google Search--0", "ques": "Find the initial release date for Guardians of the Galaxy Vol. 3 the movie.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--1", "ques": "Find Kevin Durant's bio", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--2", "ques": "Search for the latest news title about the NBA team the Los Angeles Lakers.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--3", "ques": "Show me a list of comedy movies, sorted by user ratings. Show me the Top 5 movies.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--4", "ques": "Show most played games in Steam. 
And tell me the number of players in In game at this time", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--5", "ques": "find the score of the latest nba game played by the phoenix suns.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--6", "ques": "Browse the monthly trending searches in Columbus.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--7", "ques": "Find the software requirements for iPhones that support AirDrop's ability to continue transmitting over the web when out of range.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--8", "ques": "Find the video on YouTube: 'Oscars 2023: Must-See Moments!'. Tell me who the first comment displayed under that video belongs to, and how many thumbs up and replies it has.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--9", "ques": "Show the rating of Prometheus movie on IMDb and Rotten Tomatoes.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--10", "ques": "Find the no. 
1 weekly charts ranked artist based on Billboard and tell me 10 most played song by this artist until now.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--11", "ques": "According to FlightAware, tell me the busiest airport last week and its total arrivals and departures last week.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--12", "ques": "Find the year that Tom Brady had the most touchdowns in a single seasson.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--13", "ques": "What are Jerry Trainor's upcoming projects?", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--14", "ques": "Find the retired players the year before last named James Smith and tell me which club he has been a member of from 2020\u20132021.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--15", "ques": "Please try to log in to twitter with email: webagenttest@testmail.com and password: test123456. 
Let me know if the login was successful.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--16", "ques": "How many members are there in the OpenAI community on Reddit, and what is the hottest news right now?", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--17", "ques": "Tell me the names of Trump's kids", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--18", "ques": "When and where the most recent World Cup was held, and which team was the winner?", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--19", "ques": "What are the first 7 bits of the SHA of the Bert's latest commit on GitHub, and what exactly was changed in that commit.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--20", "ques": "Find the release date for the latest \"Fast & Furious\" movie.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--21", "ques": "Show a list of the top 5 highest-grossing animated movies, sorted by box office earnings.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--22", "ques": "Browse and list the top three trending topics this month in New York City.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--23", "ques": "Retrieve a short biography of LeBron James.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--24", "ques": "What is the name of the star system closest to the Solar System, and what are the discovered planets in it?", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--25", "ques": "Get the latest news headline about the English Premier League football club Manchester United.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--26", "ques": "Identify 
the hardware requirements for using the latest version of Adobe Photoshop on a Mac.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--27", "ques": "Check the current air quality index in Paris.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--28", "ques": "Check the IMDb and Metacritic scores of the movie \"Inception.\"", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--29", "ques": "Find out the current world record for the men's 100m sprint.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--30", "ques": "Find the current number one artist on the Spotify Global Top 50 chart and list his/her top 10 songs as of now.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--31", "ques": "Discover which year Cristiano Ronaldo scored the most goals in a single season.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--32", "ques": "Find out where and when the most recent UEFA Champions League final was held, and which team won.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--33", "ques": "Find and copy the SHA of the latest commit in the TensorFlow repository on GitHub, then find a textbox to paste and tell me what the SHA is.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--34", "ques": "Determine the distance from Earth to Mars as of today's date.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--35", "ques": "Look up the latest research paper related to black holes published in the journal \"Nature Astronomy\".", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--36", "ques": "Search for the most recent Nobel Prize winner in Physics and their contribution to the field.", "web": 
"https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--37", "ques": "Find the current top 3 super-earth planets and give a brief introduction to them.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--38", "ques": "Search for the next visible solar eclipse in North America and its expected date, and what about the one after that.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--39", "ques": "Identify the top-10 trending travel destination for 2024 through a blog, how many of them are in Asian.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--40", "ques": "Look up the elevation of Mount Kilimanjaro on Google Search.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--41", "ques": "Look up the current statistics of air pollution level in Los Angeles using Google Search.", "web": "https://www.google.com/"} +{"web_name": "Google Search", "id": "Google Search--42", "ques": " Use Google Search to find an article that explains the major differences between American English and British English.", "web": "https://www.google.com/"} +{"web_name": "Huggingface", "id": "Huggingface--0", "ques": "Find a pre-trained natural language processing model on Hugging Face that can perform sentiment analysis, and make sure the model's last update is within March 2023.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--1", "ques": "Use the Huggingface Inference API to generate a short story about a dragon and a wizard.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--2", "ques": "Discover three new and popular open-source NLP models for language translation released in the past month on Huggingface.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--3", "ques": "Look up a model with a license of cc-by-sa-4.0 
with the most likes on Hugging face.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--4", "ques": "Locate an open-source conversational AI model on Hugging Face, trained in English and list its main features and applications.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--5", "ques": "Find a model released on Hugging Face for recipe generation. Retrieve the information of the model, including its name, model size and tensor type.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--6", "ques": "Find the model sentence-transformers/all-MiniLM-L6-v2 and use the Inference API on the webpage to get the similarity of the following two sentences: 'Tomorrow is Sunday', 'Eat a burger on Sunday'.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--7", "ques": "Which is the most downloaded audio related dataset on Hugging face currently.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--8", "ques": "Retrieve an example of a pre-trained language model in natural language processing and identify the tasks it is specifically designed for, like translation or text summarization.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--9", "ques": "Find the most download machine translation model on Huggingface which focuses on English and Japanese (en-ja) and report the evaluation metrics stated for it.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--10", "ques": "Open space: argilla/notux-chat-ui and interact with it by asking it 'which team trained you'. 
What is its answer.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--11", "ques": "Identify the latest updated image to video model available on Huggingface and summarize its main features.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--12", "ques": "Find the most recently updated machine learning model on Huggingface which focuses on Error Correction.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--13", "ques": "Search for LLaMA in the huggingface doc, what type is the spaces_between_special_tokens parameter in LlamaTokenizer and what is its default value.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--14", "ques": "How much is the Pro account of Hugging face for a month and what are the features?", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--15", "ques": "Identify the most downloaded models on Hugging face that use the PaddlePaddle library.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--16", "ques": "Find information on the latest (as of today's date) pre-trained language model on Huggingface suitable for text classification and briefly describe its intended use case and architecture.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--17", "ques": "Find the most recently updated open-source project related to natural language processing on the Huggingface platform. 
Provide the project's name, creator, and a brief description of its functionality.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--18", "ques": "Look up TRL's forward modelling in the hugging face documentation on how to add a margin to a loss.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--19", "ques": "Explore and summarize the features of the most recent open-source NLP model released by Hugging Face for English text summarization.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--20", "ques": "Locate a pre-trained natural language processing model on Hugging Face that specializes in named entity recognition (NER), confirm that the model was last updated in 2022 and has 1M+ downloads.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--21", "ques": "Look up the tour about how to use the 'pipeline' feature in the Hugging Face Transformers library for sentiment analysis, and identify the default model it uses.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--22", "ques": "Identify the steps to convert a PyTorch model to TensorFlow using the Hugging Face Transformers library as described in their documentation.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--23", "ques": "Identify three innovative and widely recognized open-source NLP models for automatic speech recognition released in the past month on Huggingface.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--24", "ques": "Search for a model on Hugging Face with an Apache-2.0 license that has received the highest number of likes.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--25", "ques": "In the Hugging Face documentation, find the tutorial on loading adapters with PEFT, tell me how to load in 8bit or 4bit.", "web": 
"https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--26", "ques": "Identify a model on Hugging Face designed for generating travel chats. Obtain information about the model, including its name, size and training framwork.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--27", "ques": "Determine the most downloaded dataset related to Text Retrieval in NLP on Hugging Face.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--28", "ques": "Retrieve an example of a pre-trained model on Hugging Face that is optimized for question answering tasks and detail the languages it supports.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--29", "ques": "Summarize the description of the recent open-source NLP model released on Hugging Face for medical summarization.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--30", "ques": "Identify the most downloaded English-Chinese (en-zh) machine translation model on Huggingface and report its latest performance metrics and usage guidelines.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--31", "ques": "Identify the latest machine learning model on Huggingface that specializes in detecting fake news, including the date of its last update.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--32", "ques": "On the Hugging Face website, search for the model 'GPT-J-6B' and find the 'temperature' parameter in its settings. What is the default value of this parameter?", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--33", "ques": "List three hugging face docs. 
How many GitHub stars have they earned so far?", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--34", "ques": "List the benefits of hugging face classroom mentioned on Hugging face website.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--35", "ques": "Find the latest Diffusion-related blog on Hugging Face, and read its intro or overview section to roughly summarize the content of the blog.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--36", "ques": "Summarize all the payment plans and their advantages in huggingface pricing.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--37", "ques": "Browse the daily paper on Hugging Face. What is the title of the first article, how many upvotes has it received, and is there any related model or data release?", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--38", "ques": "Investigate the 'transformers' library in the Hugging Face documentation, focusing on how to add new tokens to a tokenizer.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--39", "ques": "Investigate in the Hugging Face documentation how to utilize the 'Trainer' API for training a model on a custom dataset, and note the configurable parameters of the Trainer class.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--40", "ques": "Check out Text Embeddings Inference in Hugging face's Doc to summarise the strengths of the toolkit.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--41", "ques": "What is the current Text-to-3D model with the highest number of downloads and tell me are there Spaces that use the model.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--42", "ques": "Check the Dataset Viewer for ai2lumos/lumos_complex_qa_plan_onetime on 
Hugging face. what is the content corresponding to user in the first message?", "web": "https://huggingface.co/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--0", "ques": "derivative of x^2 when x=5.6", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--1", "ques": "Give a constraint on the set of inequalities for the inner region of the pentagram.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--2", "ques": "Calculate 3^71 and retain 5 significant figures in scientific notation.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--3", "ques": "Let g(x) be the integral of x^2 cos(2x). Write the expression of g(x).", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--4", "ques": "Pack 24 circles in a circle radius r. Compare Densest known packing and Square packing. Then tell me the radius of the inner circles.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--5", "ques": "Show the solution of y\"(z) + sin(y(z)) = 0 from wolframalpha.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--6", "ques": "Simplify x^5-20x^4+163x^3-676x^2+1424x-1209 so that it has fewer items.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--7", "ques": "Give the final angle and final length after 6s of a Spring pendulum with spring equilibrium length=0.12m, initial length=0.24m, initial angle=80deg, mass=1kg, spring constant=120 N/m .", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--8", "ques": "Give 12 lbs of 4-cyanoindole, converted to molar and indicate the percentage of C, H, N.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--9", "ques": "Annual energy production of Diablo Canyon 2 in 
2010.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--10", "ques": "Give the geomagnetic field on June 20, 2023 in Oslo.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--11", "ques": "Show the electrical resistivity of UNS A92024 and UNS G10800 at 20 degrees Celsius.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--12", "ques": "Which character in unicode 8900 to 8920 looks like a snowflake", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--13", "ques": "What is 10,000 US dollars worth now in 1980 and in 1970?", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--14", "ques": "Compare the total Calories: whopper vs baconator vs big mac. Assume that each serving of food is 300g.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--15", "ques": "Show the blood relationship fraction between you and your father's mother's sister's son.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--16", "ques": "Weight lose for a male with current weight 90 kg, 40 year old, 175 cm. If he intakes 1500 calories every day, how long will it take to lose 17 kg.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--17", "ques": "Show the average price of movie ticket in Providence, Nashville, Boise in 2023.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--18", "ques": "Plot Albert Einstein curve with Parametric equations.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--19", "ques": "Standing in the sun from 11:00 am with SPF 5 in Australia. 
Approximate time to sunburn for each skin type.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--20", "ques": "Compute the integral of 3e^(2x) from x=0 to x=5.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--21", "ques": "Calculate (1+0.1*i)^8 + (1\u22120.2*i)^8 where i is a complex number.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--22", "ques": "Determine the area of a regular hexagon with a side length of 7 cm.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--23", "ques": "Calculate the population growth rate of Canada from 2020 to 2023 using Wolfram Alpha.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--24", "ques": "Solve the differential equation y''(t) - 2y'(t) + 10y(t) = 0 and display its general solution.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--25", "ques": "Calculate the final position and velocity of a projectile launched at 45 degrees with an initial speed of 30 m/s after 3 seconds.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--26", "ques": "Convert 15 kilograms of sulfuric acid to moles and display the percentage composition of H, S, and O by weight.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--27", "ques": "Display the thermal conductivity of Copper (Cu) and Aluminum (Al) at 25 degrees Celsius.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--28", "ques": "Identify the character in Unicode range 9632 to 9650 that represents a hollow parallelogram.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--29", "ques": "Create a plot of cat curve using wolfram alpha.", "web": 
"https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--30", "ques": "Calculate the estimated time to sunburn for different skin types when exposed to the sun at 1:00 pm with SPF 1 in Brazil.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--31", "ques": "Using Wolfram Alpha, determine the current temperature and wind speed in Chicago, IL.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--32", "ques": "Print all prime numbers between 1000 and 1200 using Wolfram alpha.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--33", "ques": "Identify the electrical energy output of a hydroelectric power plant named Itaipu Dam in 2023 using Wolfram Alpha.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--34", "ques": "Calculate the mass of Jupiter compared to Earth using Wolfram Alpha. Also, find the length of one day on Jupiter.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--35", "ques": "Calculate the determinant of a 6x6 Hilbert matrix.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--36", "ques": "Determine the convergence or divergence of the series \u03a3 (n=1 to \u221e) of 1/(n^3 + 1).", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--37", "ques": "How many days are there between February 12, 2024 and August 9, 2050?", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--38", "ques": "Compute the length of a curve defined by y = 2x^3 - 3x^2 + 4x - 5 from x = 0 to x = 3.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--39", "ques": "Use Wolfram alpha to write the expression of the ellipse x^2 + 3 y^2 = 4 rotated 33 degrees counterclockwise.", 
"web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--40", "ques": "Approximate amount of fat burned by a 28yo, 172cm tall, 70kg woman running for 30min at a pace of 6min/mile.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--41", "ques": "What is the approximate Heart Rate Reserve of a 50 year old man who has a heart rate of 60bpm at rest.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--42", "ques": "What is the raw memory of a 100.2\" * 123.5\" true colour picture at 72 ppi?", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--43", "ques": "A polyominoes of order 6 means you have 6 identical squares to combine different shapes (2-sided). How many combinations are there? Looking at all the shapes in the result, how many of them have only 2 rows in total?", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--44", "ques": "Solve the ODE, g' + cos(g) = 0, if there is a constant in the result, determine the value of the constant by the condition that g(0) = 1.", "web": "https://www.wolframalpha.com/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--45", "ques": "A 175cm tall, 85kg, 40yo man climbs 2500 steps at about 18cm per step and 40 steps per minute. 
summarise the Metabolic properties.", "web": "https://www.wolframalpha.com/"} \ No newline at end of file diff --git a/evals/evals.config.json b/evals/evals.config.json index 06ed6459..a8596409 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -419,5 +419,15 @@ "name": "agent/sign_in", "categories": ["agent"] } + , + { + "name": "agent/webarena_gaia", + "categories": ["agent"] + } + , + { + "name": "agent/webvoyager", + "categories": ["agent"] + } ] } diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 69ac9166..91668a87 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -12,7 +12,6 @@ * - Runs each selected task against each selected model in parallel, collecting results. * - Saves a summary of the evaluation results to `eval-summary.json`. */ -import fs from "fs"; import path from "path"; import process from "process"; import { @@ -24,7 +23,7 @@ import { generateExperimentName } from "./utils"; import { exactMatch, errorMatch } from "./scoring"; import { tasksByName, tasksConfig, getModelList } from "./taskConfig"; import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust"; -import { SummaryResult, Testcase } from "@/types/evals"; +import { SummaryResult, Testcase, EvalInput } from "@/types/evals"; import { EvalLogger } from "./logger"; import { AvailableModel, LLMClient } from "@browserbasehq/stagehand"; import { env } from "./env"; @@ -37,6 +36,9 @@ import { AISdkClient } from "@/examples/external_clients/aisdk"; import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider"; import { loadApiKeyFromEnv } from "@/lib/utils"; import { LogLine } from "@/types/log"; +import { generateSummary } from "./core/summary"; +import { buildGAIATestcases } from "./suites/gaia"; +import { buildWebVoyagerTestcases } from "./suites/webvoyager"; dotenv.config(); @@ -54,88 +56,6 @@ const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT const USE_API: boolean = (process.env.USE_API ?? 
"").toLowerCase() === "true"; -/** - * generateSummary: - * After all evaluations have finished, aggregate the results into a summary. - * This summary includes: - * - Which tasks passed or failed (with model and categories). - * - Category-wise success percentages. - * - Model-wise success percentages. - * - * The summary is written to `eval-summary.json` for further analysis. - */ -const generateSummary = async ( - results: SummaryResult[], - experimentName: string, -) => { - // Determine passed testcases (those with _success: true) - const passed = results - .filter((r) => r.output._success) - .map((r) => ({ - eval: r.input.name, - model: r.input.modelName, - categories: tasksByName[r.input.name].categories, - })); - - // Determine failed testcases (those with _success: false) - const failed = results - .filter((r) => !r.output._success) - .map((r) => ({ - eval: r.input.name, - model: r.input.modelName, - categories: tasksByName[r.input.name].categories, - })); - - // Calculate success counts for each category - const categorySuccessCounts: Record< - string, - { total: number; success: number } - > = {}; - for (const taskName of Object.keys(tasksByName)) { - const taskCategories = tasksByName[taskName].categories; - const taskResults = results.filter((r) => r.input.name === taskName); - const successCount = taskResults.filter((r) => r.output._success).length; - - for (const cat of taskCategories) { - if (!categorySuccessCounts[cat]) { - categorySuccessCounts[cat] = { total: 0, success: 0 }; - } - categorySuccessCounts[cat].total += taskResults.length; - categorySuccessCounts[cat].success += successCount; - } - } - - // Compute percentage success per category - const categories: Record = {}; - for (const [cat, counts] of Object.entries(categorySuccessCounts)) { - categories[cat] = Math.round((counts.success / counts.total) * 100); - } - - // Compute percentage success per model - const models: Record = {}; - const allModels = [...new Set(results.map((r) => 
r.input.modelName))]; - for (const model of allModels) { - const modelResults = results.filter((r) => r.input.modelName === model); - const successCount = modelResults.filter((r) => r.output._success).length; - models[model] = Math.round((successCount / modelResults.length) * 100); - } - - // Format and write the summary to a JSON file - const formattedSummary = { - experimentName, - passed, - failed, - categories, - models, - }; - - fs.writeFileSync( - "eval-summary.json", - JSON.stringify(formattedSummary, null, 2), - ); - console.log("Evaluation summary written to eval-summary.json"); -}; - /** * generateFilteredTestcases: * Based on the chosen filters (category or specific eval name) and environment, @@ -187,8 +107,25 @@ const generateFilteredTestcases = (): Testcase[] => { currentModels, ); - // Create a list of all testcases using the determined task names and models - let allTestcases = currentModels.flatMap((model) => + // Special handling: fan out GAIA (WebVoyager) dataset for agent/webarena_gaia + const isGAIATaskIncluded = taskNamesToRun.includes("agent/webarena_gaia"); + // Special handling: fan out WebVoyager dataset for agent/webvoyager + const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager"); + + let allTestcases: Testcase[] = []; + + if (isGAIATaskIncluded) { + taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webarena_gaia"); + allTestcases.push(...buildGAIATestcases(currentModels)); + } + + if (isWebVoyagerTaskIncluded) { + taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager"); + allTestcases.push(...buildWebVoyagerTestcases(currentModels)); + } + + // Create a list of all remaining testcases using the determined task names and models + const regularTestcases = currentModels.flatMap((model) => taskNamesToRun.map((testName) => ({ input: { name: testName, modelName: model as AvailableModel }, name: testName, @@ -202,12 +139,13 @@ const generateFilteredTestcases = (): Testcase[] => { metadata: { model: 
model as AvailableModel, test: testName, - categories: tasksConfig.find((t) => t.name === testName)?.categories, }, expected: true, })), ); + allTestcases = [...allTestcases, ...regularTestcases]; + // This filtering step might now be redundant if taskNamesToRun is already filtered if (filterByCategory) { allTestcases = allTestcases.filter((testcase) => @@ -227,7 +165,7 @@ const generateFilteredTestcases = (): Testcase[] => { allTestcases .map( (t, i) => - `${i}: ${t.name} (${t.input.modelName}): ${t.metadata.categories}`, + `${i}: ${t.name} (${t.input.modelName}): ${tasksByName[t.name].categories}`, ) .join("\n"), ); @@ -266,7 +204,7 @@ const generateFilteredTestcases = (): Testcase[] => { experimentName, data: generateFilteredTestcases, // Each test is a function that runs the corresponding task module - task: async (input: { name: string; modelName: AvailableModel }) => { + task: async (input: EvalInput) => { const logger = new EvalLogger(); try { // Dynamically import the task based on its name @@ -367,6 +305,8 @@ const generateFilteredTestcases = (): Testcase[] => { modelName: input.modelName, }); } + // Attach per-test parameters (for data-driven tasks) + taskInput.taskParams = input.params; let result; try { result = await taskFunction(taskInput); diff --git a/evals/suites/gaia.ts b/evals/suites/gaia.ts new file mode 100644 index 00000000..ae406ae4 --- /dev/null +++ b/evals/suites/gaia.ts @@ -0,0 +1,120 @@ +import fs from "fs"; +import path from "path"; +import type { Testcase, EvalInput } from "@/types/evals"; +import type { AvailableModel } from "@/types/model"; +import { tasksConfig } from "../taskConfig"; + +export const buildGAIATestcases = (models: string[]): Testcase[] => { + const gaiaFilePath = + process.env.EVAL_GAIA_FILE || + path.join(__dirname, "..", "datasets", "gaia", "GAIA_web.jsonl"); + + let gaiaLines: string[] = []; + try { + const content = fs.readFileSync(gaiaFilePath, "utf-8"); + gaiaLines = content.split(/\r?\n/).filter((l) => 
l.trim().length > 0); + } catch (e) { + console.warn( + `Could not read GAIA file at ${gaiaFilePath}. Set EVAL_GAIA_FILE to override. Error: ${e instanceof Error ? e.message : String(e)}`, + ); + gaiaLines = []; + } + + const levelFilter = process.env.EVAL_GAIA_LEVEL + ? Number(process.env.EVAL_GAIA_LEVEL) + : undefined; + const maxCases = process.env.EVAL_GAIA_LIMIT + ? Number(process.env.EVAL_GAIA_LIMIT) + : 25; + const sampleCount = process.env.EVAL_GAIA_SAMPLE + ? Number(process.env.EVAL_GAIA_SAMPLE) + : undefined; + + type GaiaRow = { + id: string; + Level?: number; + web: string; + ques: string; + [key: string]: unknown; + }; + + const gaiaRows: GaiaRow[] = []; + const candidates: GaiaRow[] = []; + for (const line of gaiaLines) { + try { + const parsed = JSON.parse(line) as GaiaRow; + if ( + typeof parsed.id === "string" && + typeof parsed.web === "string" && + typeof parsed.ques === "string" + ) { + if (!levelFilter || parsed.Level === levelFilter) { + candidates.push(parsed); + } + } + } catch { + // skip invalid lines + } + } + if (sampleCount && sampleCount > 0) { + gaiaRows.push(...sampleUniform(candidates, sampleCount)); + } else { + for (const row of candidates) { + gaiaRows.push(row); + if (gaiaRows.length >= maxCases) break; + } + } + + const allTestcases: Testcase[] = []; + for (const model of models) { + for (const row of gaiaRows) { + const finalAnswer = (row as Record)[ + "Final answer" + ] as unknown; + const input: EvalInput = { + name: "agent/webarena_gaia", + modelName: model as AvailableModel, + params: { + id: row.id, + level: row.Level, + web: row.web, + ques: row.ques, + expected: typeof finalAnswer === "string" ? finalAnswer : undefined, + }, + }; + allTestcases.push({ + input, + name: input.name, + tags: [ + model, + input.name, + ...( + tasksConfig.find((t) => t.name === input.name)?.categories || [] + ).map((x) => `category/${x}`), + `gaia/id/${row.id}`, + row.Level ? 
`gaia/level/${row.Level}` : "gaia/level/unknown", + ], + metadata: { + model: model as AvailableModel, + test: `${input.name}:${row.id}`, + }, + expected: true, + }); + } + } + + return allTestcases; +}; + +function sampleUniform(arr: T[], k: number): T[] { + const n = arr.length; + if (k >= n) return arr.slice(); + const copy = arr.slice(); + for (let i = n - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = copy[i]; + copy[i] = copy[j]; + copy[j] = tmp; + } + return copy.slice(0, k); +} diff --git a/evals/suites/webvoyager.ts b/evals/suites/webvoyager.ts new file mode 100644 index 00000000..ef3cd412 --- /dev/null +++ b/evals/suites/webvoyager.ts @@ -0,0 +1,114 @@ +import fs from "fs"; +import path from "path"; +import type { Testcase, EvalInput } from "@/types/evals"; +import type { AvailableModel } from "@/types/model"; +import { tasksConfig } from "../taskConfig"; + +export const buildWebVoyagerTestcases = (models: string[]): Testcase[] => { + const voyagerFilePath = path.join( + __dirname, + "..", + "datasets", + "webvoyager", + "WebVoyager_data.jsonl", + ); + + let lines: string[] = []; + try { + const content = fs.readFileSync(voyagerFilePath, "utf-8"); + lines = content.split(/\r?\n/).filter((l) => l.trim().length > 0); + } catch (e) { + console.warn( + `Could not read WebVoyager file at ${voyagerFilePath}. Error: ${e instanceof Error ? e.message : String(e)}`, + ); + lines = []; + } + + const maxCases = process.env.EVAL_WEBVOYAGER_LIMIT + ? Number(process.env.EVAL_WEBVOYAGER_LIMIT) + : 25; + const sampleCount = process.env.EVAL_WEBVOYAGER_SAMPLE + ? 
Number(process.env.EVAL_WEBVOYAGER_SAMPLE) + : undefined; + + type VoyagerRow = { + id: string; + web: string; + ques: string; + web_name?: string; + [key: string]: unknown; + }; + + const rows: VoyagerRow[] = []; + const candidates: VoyagerRow[] = []; + for (const line of lines) { + try { + const parsed = JSON.parse(line) as VoyagerRow; + if ( + typeof parsed.id === "string" && + typeof parsed.web === "string" && + typeof parsed.ques === "string" + ) { + candidates.push(parsed); + } + } catch { + // skip invalid + } + } + if (sampleCount && sampleCount > 0) { + rows.push(...sampleUniform(candidates, sampleCount)); + } else { + for (const row of candidates) { + rows.push(row); + if (rows.length >= maxCases) break; + } + } + + const allTestcases: Testcase[] = []; + for (const model of models) { + for (const row of rows) { + const input: EvalInput = { + name: "agent/webvoyager", + modelName: model as AvailableModel, + params: { + id: row.id, + web: row.web, + ques: row.ques, + web_name: row.web_name, + }, + }; + allTestcases.push({ + input, + name: input.name, + tags: [ + model, + input.name, + ...( + tasksConfig.find((t) => t.name === input.name)?.categories || [] + ).map((x) => `category/${x}`), + `webvoyager/id/${row.id}`, + ], + metadata: { + model: model as AvailableModel, + test: `${input.name}:${row.id}`, + }, + expected: true, + }); + } + } + + return allTestcases; +}; + +function sampleUniform(arr: T[], k: number): T[] { + const n = arr.length; + if (k >= n) return arr.slice(); + const copy = arr.slice(); + for (let i = n - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + const tmp = copy[i]; + copy[i] = copy[j]; + copy[j] = tmp; + } + return copy.slice(0, k); +} diff --git a/evals/tasks/agent/webarena_gaia.ts b/evals/tasks/agent/webarena_gaia.ts new file mode 100644 index 00000000..7e1a5d60 --- /dev/null +++ b/evals/tasks/agent/webarena_gaia.ts @@ -0,0 +1,78 @@ +import { EvalFunction } from "@/types/evals"; + +/** + * Data-driven GAIA 
(WebVoyager) agent eval + * - Expects per-test params injected via eval runner: { id, level, web, ques } + * - Starts at `web`, runs the agent with `ques` as instruction + * - Requires the agent to output a final answer in the form: "Final Answer: " + * - Marks success if such an answer string is present (exact matching against dataset can be layered later) + */ +export const webarena_gaia: EvalFunction = async ({ + stagehand, + logger, + debugUrl, + sessionUrl, + modelName, + taskParams, +}) => { + try { + const params = (taskParams || {}) as { + id?: string; + level?: number; + web?: string; + ques?: string; + }; + + if (!params.web || !params.ques) { + return { + _success: false, + error: `Missing GAIA params (web, ques). Got: ${JSON.stringify(params)}`, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + await stagehand.page.goto(params.web); + + const agent = stagehand.agent({ + model: modelName, + provider: modelName.startsWith("claude") ? "anthropic" : "openai", + instructions: `You are a helpful assistant that must solve the task by browsing. You must produce a single line at the end like: "Final Answer: ". Do not ask follow up questions. Current page: ${await stagehand.page.title()}`, + }); + + const result = await agent.execute({ + instruction: params.ques, + maxSteps: 20, + }); + + const message = result?.message || ""; + const hasFinal = + typeof message === "string" && /Final Answer\s*:\s*(.+)/i.test(message); + const providedAnswer = hasFinal + ? (message.match(/Final Answer\s*:\s*(.+)/i)?.[1] || "").trim() + : ""; + + const expected = (params as Record).expected as + | string + | undefined; + const success = expected + ? 
hasFinal && providedAnswer.trim() === expected.trim() + : hasFinal; + + return { + _success: !!success, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } +}; diff --git a/evals/tasks/agent/webvoyager.ts b/evals/tasks/agent/webvoyager.ts new file mode 100644 index 00000000..e4b21dad --- /dev/null +++ b/evals/tasks/agent/webvoyager.ts @@ -0,0 +1,61 @@ +import { EvalFunction } from "@/types/evals"; + +export const webvoyager: EvalFunction = async ({ + stagehand, + logger, + debugUrl, + sessionUrl, + modelName, + taskParams, +}) => { + try { + const params = (taskParams || {}) as { + id?: string; + web?: string; + ques?: string; + web_name?: string; + }; + + if (!params.web || !params.ques) { + return { + _success: false, + error: `Missing WebVoyager params (web, ques). Got: ${JSON.stringify(params)}`, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + await stagehand.page.goto(params.web); + + const agent = stagehand.agent({ + model: modelName, + provider: modelName.startsWith("claude") ? "anthropic" : "openai", + instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: " summarizing the requested result (e.g., score, list, or text). 
Current page: ${await stagehand.page.title()}`, + }); + + const result = await agent.execute({ + instruction: params.ques, + maxSteps: 20, + }); + + const message = result?.message || ""; + const success = + typeof message === "string" && /Final Answer\s*:/i.test(message); + + return { + _success: !!success, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } +}; diff --git a/types/evals.ts b/types/evals.ts index d97e2796..14e6ea54 100644 --- a/types/evals.ts +++ b/types/evals.ts @@ -13,6 +13,8 @@ export type StagehandInitResult = { sessionUrl: string; stagehandConfig: ConstructorParams; modelName: AvailableModel; + // Optional per-test parameters to pass into a task + taskParams?: Record; }; export type EvalFunction = (taskInput: StagehandInitResult) => Promise<{ @@ -40,18 +42,20 @@ export type EvalCategory = z.infer; export interface EvalInput { name: string; modelName: AvailableModel; + // Optional per-test parameters, used by data-driven tasks + params?: Record; } export interface Testcase extends EvalCase< EvalInput, unknown, - { model: AvailableModel; test: string } + { model: AvailableModel; test: string; categories?: string[] } > { input: EvalInput; name: string; tags: string[]; - metadata: { model: AvailableModel; test: string }; + metadata: { model: AvailableModel; test: string; categories?: string[] }; expected: unknown; }