diff --git a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb index fb9bc6c..742c1ff 100644 --- a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb +++ b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb @@ -675,9 +675,9 @@ "id": "f3490c8a-5ee8-44cd-ae5e-26a6ca7b4017", "metadata": {}, "source": [ - "### Install docling-sdg\n", + "#### Install docling-sdg\n", "\n", - "[Docling-sdg](https://github.com/docling-project/docling-sdg) project is used to generate question and answer pairs for seed examples." + "This notebook uses [Docling-sdg](https://github.com/docling-project/docling-sdg) to generate question and answer pairs for each chunk." ] }, { @@ -692,16 +692,50 @@ }, { "cell_type": "markdown", - "id": "d65ec755-e3de-40ab-bf3a-23ebb29a705d", + "id": "1497c44e-7b82-4646-b00f-e910688bfb3d", "metadata": {}, "source": [ - "### Initialize QA generator model & Number of Seed examples\n", + "### Select the chunks for the seed examples\n", "\n", - "To generate seed examples you need to set: \n", + "Chunks for seed examples should be diverse in style. These can be selected by hand or selecting diverse chunks from all of the chunks using the [subset selection notebook](https://github.com/instructlab/examples/blob/main/notebooks/instructlab-knowledge/subset-selection.ipynb).\n", + "\n", + "If users are selecting chunks by hand, chunks should be taken directly from lines in `chunks.jsonl`. These lines have `chunk`, `file`, and `metadata` fields for each entry.\n", + "\n", + "The below code randomly selects a preset number of chunks and saves them in a jsonl file for the next step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fec2c039-a4bb-46cd-92aa-1b24dafb018e", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.qna_gen import save_random_chunk_selection\n", + "\n", + "NUM_SEED_EXAMPLES = 7\n", + "\n", + "for contribution in contributions:\n", + " chunks_jsonl_path = contribution[\"dir\"] / CHUNKING_DIR / \"chunks.jsonl\"\n", + " authoring_path = contribution[\"dir\"] / AUTHORING_DIR\n", + "\n", + " selected_chunks_jsonl = save_random_chunk_selection(chunks_jsonl_path,\n", + " authoring_path,\n", + " NUM_SEED_EXAMPLES)\n", + " print(f\"selected_chunks.jsonl saved to: {selected_chunks_jsonl}\")" + ] + }, + { + "cell_type": "markdown", + "id": "051ae1b9-5351-4051-ab26-733e9d05a791", + "metadata": {}, + "source": [ + "### Generate the questions and answers for each chunk\n", + "\n", + "To generate questions and answers you need to set: \n", "1. The the Open AI compatible endpoint for the model generating question and answer pairs\n", "2. The model's API key\n", - "3. The model's name\n", - "4. The number of seed example you wish to generate for each contribution" + "3. The model's name" ] }, { @@ -715,8 +749,7 @@ "\n", "API_KEY = os.getenv(\"MODEL_API_KEY\") or \"\" # the API access key for your account (cannot be empty)\n", "ENDPOINT_URL = os.getenv(\"MODEL_ENDPOINT_URL\") or \"\" # the URL of your model's API. 
URL can end in \"/v1\"\n", - "MODEL_NAME = os.getenv(\"MODEL_NAME\") or \"mistralai/Mixtral-8x7B-Instruct-v0.1\" # the name of your model\n", - "NUM_SEED_EXAMPLES = 7" + "MODEL_NAME = os.getenv(\"MODEL_NAME\") or \"mistralai/Mixtral-8x7B-Instruct-v0.1\" # the name of your model" ] }, { @@ -744,10 +777,10 @@ }, { "cell_type": "markdown", - "id": "d9d5191d-cbd7-4c4e-b5e1-748ed8684eaf", + "id": "d54cf5e5-339f-44ec-af46-7a023f94e994", "metadata": {}, "source": [ - "### Run QA Generation" + "#### Generate questions and answers and create qna.yaml file" ] }, { @@ -760,18 +793,17 @@ "from utils.qna_gen import generate_seed_examples\n", "\n", "for contribution in contributions:\n", - " chunks_jsonl_path = contribution[\"dir\"] / CHUNKING_DIR / \"chunks.jsonl\"\n", " authoring_path = contribution[\"dir\"] / AUTHORING_DIR\n", + " selected_chunks_path = authoring_path / \"selected_chunks.jsonl\"\n", "\n", " qna_output_path = generate_seed_examples(contribution[\"name\"],\n", - " chunks_jsonl_path,\n", + " selected_chunks_path,\n", " authoring_path,\n", - " contribution[\"domain\"],\n", - " contribution[\"summary\"],\n", - " NUM_SEED_EXAMPLES,\n", " API_KEY,\n", " ENDPOINT_URL,\n", " MODEL_NAME,\n", + " contribution[\"domain\"],\n", + " contribution[\"summary\"], \n", " customization_str)\n", " print(f\"qna.yaml saved to: {qna_output_path}\")\n" ] @@ -783,8 +815,29 @@ "source": [ "### Review and Revise Seed Examples\n", "\n", - "A quality set of seed examples has diverse contexts and question-and-answer pairs across every seed example. You can asses the `qna.yaml` files in your preferred text editor to ensure the quality, diversity, and style of generated questions and answers, and modify them accordingly.\n", + "A quality set of seed examples has diverse contexts and question-and-answer pairs across every seed example. 
You can assess the `qna.yaml` files in your preferred text editor to ensure the quality, diversity, and style of generated questions and answers, and modify them accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c80d9d9f-4796-4768-890e-5535939fa082", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.qna_gen import view_seed_example\n", + "\n", + "index = 0 # index of seed example to view. Value must be lower than number of seed examples\n", "\n", + "# pass in path to qna.yaml file and seed example index to view single seed example\n", + "view_seed_example(qna_output_path, index)" + ] + }, + { + "cell_type": "markdown", + "id": "6272356c-5ab1-4864-80f0-bab393b65cc1", + "metadata": {}, + "source": [ "After assessment, the `qna.yaml` files can be quickly reviewed to ensure they includes the required elements and correct number of each. It is recommended to have at least 5 seed examples. Each seed example must have 3 question and answer pairs." 
def save_random_chunk_selection(chunks_jsonl_path: Path, output_dir: Path, num_seed_examples: int) -> Path:
    """
    Randomly selects chunks from a chunks.jsonl file and saves them to selected_chunks.jsonl
    Args:
        chunks_jsonl_path (Path): Path to the chunks.jsonl file
        output_dir (Path): Path to output dir for selected_chunks.jsonl (created if missing)
        num_seed_examples (int): Number of chunks user wishes to randomly select
    Returns:
        selected_chunks_file_path (pathlib.Path): Path to the written selected_chunks.jsonl file
    Raises:
        ValueError: If chunks.jsonl does not exist, or if num_seed_examples is negative
                    or larger than the number of chunks in the file
    """
    if not chunks_jsonl_path.exists():
        raise ValueError(f"chunks.jsonl does not exist but should at {chunks_jsonl_path}")

    # Read with an explicit encoding (the output below is written as UTF-8 too) and
    # skip blank lines so a trailing newline in the JSONL file does not break parsing.
    with open(chunks_jsonl_path, "r", encoding="utf-8") as file:
        chunks = [json.loads(line) for line in file if line.strip()]

    # random.sample would raise a ValueError here as well, but without saying
    # how many chunks are actually available or which file was read.
    if not 0 <= num_seed_examples <= len(chunks):
        raise ValueError(
            f"num_seed_examples ({num_seed_examples}) must be between 0 and the number "
            f"of chunks ({len(chunks)}) in {chunks_jsonl_path}"
        )

    selected_chunks = random.sample(chunks, num_seed_examples)

    # Make sure the destination exists before opening the output file for writing.
    output_dir.mkdir(parents=True, exist_ok=True)

    selected_chunks_file_path = output_dir / "selected_chunks.jsonl"
    with open(selected_chunks_file_path, "w", encoding="utf-8") as file:
        for chunk in selected_chunks:
            json.dump(chunk, file)
            file.write("\n")

    return selected_chunks_file_path
def view_seed_example(qna_output_path: Path, seed_example_num: int) -> None:
    """
    View a specific seed example in a qna.yaml
    Args:
        qna_output_path (Path): Path to the qna.yaml file
        seed_example_num (int): index of seed example to view
    Returns:
        None
    Raises:
        ValueError: If seed_example_num is outside the range of seed examples in the file
    """
    with open(qna_output_path, "r", encoding="utf-8") as yaml_file:
        yaml_data = yaml.safe_load(yaml_file)

    # An empty file loads as None and the key may be absent; treat both as "no seed examples"
    # so the range check below reports the problem instead of a TypeError on len(None).
    seed_examples = (yaml_data or {}).get("seed_examples") or []

    # Negative indices keep their usual Python meaning (-1 is the last seed example);
    # anything outside the valid range is reported as a ValueError.
    if not -len(seed_examples) <= seed_example_num < len(seed_examples):
        raise ValueError(f"seed_example_num must be less than number of seed examples {len(seed_examples)}")

    seed_example = seed_examples[seed_example_num]
    print("Context:")
    # Single quotes inside the f-string replacement fields: reusing the enclosing
    # double quotes is only valid syntax on Python 3.12+.
    print(f"{seed_example['context']}\n")
    for qna in seed_example["questions_and_answers"]:
        print(f"Question: {qna['question']}")
        print(f"Answer: {qna['answer']}\n")