import concurrent.futures

from datasets import DatasetDict, load_dataset
from langchain.llms import OpenAI


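# Map the raw ELI5 fields onto the column names the evaluation expects:
# "selftext" is the post body used as context, "title" the question, and
# answers["text"] the list of human-written reference answers.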
def format_for_belar(row):
    row["context"] = row["selftext"]
    row["prompt"] = row["title"]
    row["ground_truth"] = row["answers"]["text"]
    return row


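# Pull ELI5 from the Hugging Face hub; the dataset exposes its ELI5 test
# split under the "test_eli5" key.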
d = load_dataset("eli5")
assert isinstance(d, DatasetDict)
ds = d["test_eli5"].map(format_for_belar, batched=False)
ds = ds.select_columns(["context", "prompt", "ground_truth"])

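# Fix the shuffle seed so the same 500-example subset is drawn on every run.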
ds = ds.shuffle(seed=42).select(range(500))
print(ds.shape, ds.column_names)


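# The langchain OpenAI wrapper reads its key from the OPENAI_API_KEY
# environment variable; export it before running this script.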
llm = OpenAI()  # type: ignore
prompt_template = """
{context}
with the above context explain like I'm five: {prompt}
"""


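# Called with batched=True below, so `batch` maps each column name to a
# list of values. A thread pool fans the blocking OpenAI completion calls
# out concurrently within each batch of 10.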
def get_answers(batch):
    qs, cs = batch["prompt"], batch["context"]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = executor.map(
            llm,
            [prompt_template.format(context=c, prompt=q) for q, c in zip(qs, cs)],
        )
        generated_answers = list(results)

    batch["generated_answers"] = generated_answers
    return batch


ds = ds.map(get_answers, batched=True, batch_size=10)
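# Optionally persist the generated answers so the paid completion calls are
# not repeated on later runs; the path here is just an illustrative example.
# ds.save_to_disk("eli5-500-with-answers")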