diff --git a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb index 92369d6..69bdb54 100644 --- a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb +++ b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb @@ -600,6 +600,37 @@ "NUM_SEED_EXAMPLES = 7" ] }, + { + "cell_type": "markdown", + "id": "32e13a94-1c5e-4310-9500-6940368ec2ea", + "metadata": {}, + "source": [ + "#### [OPTIONAL] Prompt customization for Q&A Generation\n", + "\n", + "Optionally insert your own stylistic customization statement below. If `customization_str` is `None`, there will be no customization attempted and the default QA generation prompt will be used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78b51f57-7b7b-4d53-a129-29c291939dae", + "metadata": {}, + "outputs": [], + "source": [ + "customization_str = None \n", + "\n", + "# Example: \n", + "# customization_str = \"Write at the fifth grade level.\"" + ] + }, + { + "cell_type": "markdown", + "id": "d9d5191d-cbd7-4c4e-b5e1-748ed8684eaf", + "metadata": {}, + "source": [ + "### Run QA Generation" + ] + }, { "cell_type": "code", "execution_count": null, @@ -621,7 +652,8 @@ " NUM_SEED_EXAMPLES,\n", " API_KEY,\n", " ENDPOINT_URL,\n", - " MODEL_NAME)\n", + " MODEL_NAME,\n", + " customization_str)\n", " print(f\"qna.yaml saved to: {qna_output_path}\")\n" ] }, diff --git a/notebooks/instructlab-knowledge/utils/qna_gen.py b/notebooks/instructlab-knowledge/utils/qna_gen.py index a277ae4..bd1a0d4 100644 --- a/notebooks/instructlab-knowledge/utils/qna_gen.py +++ b/notebooks/instructlab-knowledge/utils/qna_gen.py @@ -3,6 +3,8 @@ import random from pathlib import Path + +from docling_sdg.qa.prompts.generation_prompts import QaPromptTemplate from pydantic import SecretStr from textwrap import wrap @@ -11,6 +13,32 @@ from docling_sdg.qa.generate import Generator from docling_sdg.qa.base import GenerateOptions, LlmProvider 
+CUSTOM_COMBINED_QUESTION_PROMPT = ( + "I will provide you a text passage. I need you to generate three questions that " + "must be answered only with information contained in this passage, and nothing " + "else.\n" + 'The first question is of type "fact_single", which means that the answer to this ' + "question is a simple, single piece of factual information contained in the " + "context.\n" + 'The second question is of type "summary", which means that the answer to this ' + "question summarizes different pieces of factual information contained in the " + "context.\n" + 'The third question is of type "reasoning", which is a question that requires the ' + "reader to think critically and make an inference or draw a conclusion based on " + "the information provided in the passage.\n" + "Make sure that the three questions are different. Make sure that every question " + "has a provided answer.\n\n" + "{customization_str}\n\n" + "You will format your generation as a python dictionary, such as:\n\n" + '{{"fact_single": <the "fact_single" type question you thought of>, ' + '"fact_single_answer": <answer to the "fact_single" question>, "summary": <the "summary" type question you thought of>, "summary_answer": <answer to the "summary" question>, "reasoning": <the "reasoning" type question you thought of>, "reasoning_answer": <answer to the "reasoning" question>}}\n\n' + "Only provide the python dictionary as your output. 
Make sure you provide an answer for each question.\n\n" + "Context: {context_str}" +) + chunk_filter = [ lambda chunk: len(str(chunk.text)) > 500 ] @@ -32,7 +60,7 @@ class IndentedDumper(yaml.Dumper): def increase_indent(self, flow=False, indentless=False): return super(IndentedDumper, self).increase_indent(flow, False) -def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str) -> Path: +def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str, customization_str: str | None = None) -> Path: """ Creates a seed dataset from a path Args: @@ -44,6 +72,7 @@ def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, outp api_key (str): API key for the model used to generate questions and answers from contexts api_url (str): Endpoint for the model used to generate questions and answers from contexts model_id (str): Name of the model used to generate questions and answers from contexts + customization_str (str | None): A directive for how to stylistically customize the generated QAs Returns: qna_output_path (pathlib.Path): Path to the generated seed example file """ @@ -89,6 +118,15 @@ def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, outp generate_options.url = api_url generate_options.model_id = model_id generate_options.generated_file = output_dir / f"qagen-{contribution_name}.json" + + if customization_str is not None: + generate_options.prompts = [QaPromptTemplate( + template=CUSTOM_COMBINED_QUESTION_PROMPT, + keys=["context_str", "customization_str"], + labels=["fact_single", "summary", "reasoning"], + type_="question", + )] + gen = Generator(generate_options=generate_options) Path.unlink(generate_options.generated_file, missing_ok=True)