@@ -1,10 +1,14 @@
 import json
 import logging
+import math
+import random
 from pathlib import Path
 
 from azure.ai.generative.synthetic.qa import QADataGenerator, QAType
 from azure.search.documents import SearchClient
 
+from . import service_setup
+
 logger = logging.getLogger("scripts")
 
 
@@ -50,3 +54,85 @@ def generate_test_qa_data(
     with open(output_file, "w", encoding="utf-8") as f:
         for item in qa:
             f.write(json.dumps(item) + "\n")
+
+
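+# Ask the chat model for `num_questions` new questions, optionally seeding the
+# prompt with existing questions so the model can riff on (or avoid) their topics.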
+def generate_based_on_questions(openai_client, model: str, qa: list, num_questions: int, prompt: str):
+    existing_questions = ""
+    if qa:
+        qa = random.sample(qa, len(qa))  # Shuffle questions for some randomness
+        existing_questions = "\n".join([item["question"] for item in qa])
+
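+    # One chat completion returns all of the questions, one per line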
+    gpt_response = openai_client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": f"{prompt} Only generate {num_questions} TOTAL. Separate each question by a new line. \n{existing_questions}",  # noqa: E501
+            }
+        ],
+        n=1,
+        max_tokens=num_questions * 50,
+        temperature=0.3,
+    )
+
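+    # Keep at most num_questions non-empty lines from the response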
+    qa = []
+    questions = [line.strip() for line in gpt_response.choices[0].message.content.split("\n") if line.strip()]  # noqa: E501
+    for question in questions[:num_questions]:
+        qa.append({"question": question, "truth": f"Generated from this prompt: {prompt}"})
+    return qa
+
+
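+# Build "don't know" test questions in four flavors: related but unanswerable,
+# similar keywords, entirely unrelated, and nonsensical.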
+def generate_dontknows_qa_data(openai_config: dict, num_questions_total: int, input_file: Path, output_file: Path):
+    logger.info("Generating off-topic questions based on %s", input_file)
+    with open(input_file, encoding="utf-8") as f:
+        qa = [json.loads(line) for line in f]
+
+    openai_client = service_setup.get_openai_client(openai_config)
+    dontknows_qa = []
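+    # Ask for a quarter of the total per category (rounded up); the final
+    # "nonsensical" batch below just tops up whatever remains.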
+    num_questions_each = math.ceil(num_questions_total / 4)
+    dontknows_qa += generate_based_on_questions(
+        openai_client,
+        openai_config["model"],
+        qa,
+        num_questions_each,
+        f"Given these questions, suggest {num_questions_each} questions that are very related but are not directly answerable by the same sources. Do not simply ask for other examples of the same thing - your question should be standalone.",  # noqa: E501
+    )
+    dontknows_qa += generate_based_on_questions(
+        openai_client,
+        openai_config["model"],
+        qa,
+        num_questions_each,
+        f"Given these questions, suggest {num_questions_each} questions with similar keywords that are about publicly known facts.",  # noqa: E501
+    )
+    dontknows_qa += generate_based_on_questions(
+        openai_client,
+        openai_config["model"],
+        qa,
+        num_questions_each,
+        f"Given these questions, suggest {num_questions_each} questions that are not related to these topics at all but have well known answers.",  # noqa: E501
+    )
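+    # The ceil() rounding above can overshoot, so clamp the remainder at zero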
+    remaining = max(0, num_questions_total - len(dontknows_qa))
+    dontknows_qa += generate_based_on_questions(
+        openai_client,
+        openai_config["model"],
+        qa=None,
+        num_questions=remaining,
+        prompt=f"Suggest {remaining} questions that are nonsensical and would result in confusion if you asked them.",  # noqa: E501
+    )
+
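+    # Write the questions as JSONL, creating the output directory if needed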
+    logger.info("Writing %d off-topic questions to %s", len(dontknows_qa), output_file)
+    directory = Path(output_file).parent
+    directory.mkdir(parents=True, exist_ok=True)
+    with open(output_file, "w", encoding="utf-8") as f:
+        for item in dontknows_qa:
+            f.write(json.dumps(item) + "\n")
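
Usage sketch (not part of this change; the module path, config shape, and file names below are assumptions, not confirmed by this diff):

    from pathlib import Path

    from scripts.generate import generate_dontknows_qa_data  # module path assumed

    generate_dontknows_qa_data(
        openai_config={"model": "gpt-35-turbo"},  # assumed shape; only "model" is read directly above
        num_questions_total=40,
        input_file=Path("example_input/qa.jsonl"),  # hypothetical ground-truth JSONL
        output_file=Path("example_input/dontknows.jsonl"),  # hypothetical output path
    )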