Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,37 @@
"NUM_SEED_EXAMPLES = 7"
]
},
{
"cell_type": "markdown",
"id": "32e13a94-1c5e-4310-9500-6940368ec2ea",
"metadata": {},
"source": [
"#### [OPTIONAL] Prompt customization for Q&A Generation\n",
"\n",
"Optionally insert your own stylistic customization statement below. If `customization_str` is `None`, there will be no customization attempted and the default QA generation prompt will be used."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78b51f57-7b7b-4d53-a129-29c291939dae",
"metadata": {},
"outputs": [],
"source": [
"customization_str = None \n",
"\n",
"# Example: \n",
"# customization_str = \"Write at the fifth grade level.\""
]
},
{
"cell_type": "markdown",
"id": "d9d5191d-cbd7-4c4e-b5e1-748ed8684eaf",
"metadata": {},
"source": [
"### Run QA Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -621,7 +652,8 @@
" NUM_SEED_EXAMPLES,\n",
" API_KEY,\n",
" ENDPOINT_URL,\n",
" MODEL_NAME)\n",
" MODEL_NAME,\n",
" customization_str)\n",
" print(f\"qna.yaml saved to: {qna_output_path}\")\n"
]
},
Expand Down
40 changes: 39 additions & 1 deletion notebooks/instructlab-knowledge/utils/qna_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import random

from pathlib import Path

from docling_sdg.qa.prompts.generation_prompts import QaPromptTemplate
from pydantic import SecretStr
from textwrap import wrap

Expand All @@ -11,6 +13,32 @@
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProvider

# Prompt template for generating three question/answer pairs (fact_single,
# summary, reasoning) from one passage, with room for a caller-supplied
# stylistic directive. It carries two named placeholders,
# {customization_str} and {context_str}; every other brace belongs to the
# example dictionary shown to the model.
# NOTE(review): the example dictionary uses single, unescaped braces. That is
# only safe if the consumer fills in the named placeholders without running
# plain str.format over the whole template -- confirm against docling_sdg.
CUSTOM_COMBINED_QUESTION_PROMPT = (
    "I will provide you a text passage. I need you to generate three questions that "
    "must be answered only with information contained in this passage, and nothing "
    "else.\n"
    'The first question is of type "fact_single", which means that the answer to this '
    "question is a simple, single piece of factual information contained in the "
    "context.\n"
    'The second question is of type "summary", which means that the answer to this '
    "question summarizes different pieces of factual information contained in the "
    "context.\n"
    'The third question is of type "reasoning", which is a question that requires the '
    "reader to think critically and make an inference or draw a conclusion based on "
    "the information provided in the passage.\n"
    "Make sure that the three questions are different. Make sure that every question "
    "has a provided answer.\n\n"
    "{customization_str}\n\n"
    "You will format your generation as a python dictionary, such as:\n\n"
    '{"fact_single": <The "fact_single" type question you thought of>, '
    # Every key in the example is fully quoted so the model sees a
    # well-formed dictionary to imitate.
    '"fact_single_answer": <Answer to the "fact_single" question>, "summary": <the '
    '"summary" type question you thought of>, "summary_answer": <Answer to the '
    '"summary" question>, "reasoning": <the "reasoning" type question you thought '
    'of>, "reasoning_answer": <Answer to the "reasoning" question>}\n\n'
    "Only provide the python dictionary as your output. Make sure you provide an answer for each question.\n\n"
    "Context: {context_str}"
)

# List of predicate callables over a chunk. The single rule accepts a chunk
# only when its text, converted to a string, is longer than 500 characters.
chunk_filter = [lambda chunk: 500 < len(str(chunk.text))]
Expand All @@ -32,7 +60,7 @@ class IndentedDumper(yaml.Dumper):
def increase_indent(self, flow=False, indentless=False):
    """Delegate to the parent dumper, always with ``indentless`` turned off.

    The caller's ``indentless`` value is accepted for signature
    compatibility but deliberately ignored; ``flow`` is forwarded as given.
    """
    parent = super(IndentedDumper, self)
    return parent.increase_indent(flow, False)

def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str) -> Path:
def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str, customization_str: str | None = None) -> Path:
"""
Creates a seed dataset from a path
Args:
Expand All @@ -44,6 +72,7 @@ def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, outp
api_key (str): API key for the model used to generate questions and answers from contexts
api_url (str): Endpoint for the model used to generate questions and answers from contexts
model_id (str): Name of the model used to generate questions and answers from contexts
customization_str (str | None) A directive for how to stylistically customize the generated QAs
Returns:
qna_output_path (pathlib.Path): Path to the generated seed example file
"""
Expand Down Expand Up @@ -89,6 +118,15 @@ def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, outp
generate_options.url = api_url
generate_options.model_id = model_id
generate_options.generated_file = output_dir / f"qagen-{contribution_name}.json"

if customization_str is not None:
generate_options.prompts = [QaPromptTemplate(
template=CUSTOM_COMBINED_QUESTION_PROMPT,
keys=["context_str", "customization_str"],
labels=["fact_single", "summary", "reasoning"],
type_="question",
)]

gen = Generator(generate_options=generate_options)

Path.unlink(generate_options.generated_file, missing_ok=True)
Expand Down