9 changes: 5 additions & 4 deletions .github/workflows/instructlab-knowledge-e2e.yml
@@ -22,14 +22,15 @@ jobs:
runs-on: ubuntu-latest
env:
# customize the workflow here
- API_URL: "http://127.0.0.1:11434/v1"
- MODEL_ID: "Mixtral-8x7B" # must be OpenAI-compatible when using inference mock
+ MODEL_ENDPOINT_URL: "http://127.0.0.1:11434/v1"
+ MODEL_API_KEY: "none"
+ MODEL_NAME: "Mixtral-8x7B" # must be OpenAI-compatible when using inference mock
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
with:
- python-version: '3.11'
+ python-version: '3.12'
Contributor:
Just FYI, I successfully ran it end-to-end on 3.11 with just one minor adjustment. On notebooks/instructlab-knowledge/instructlab-knowledge.ipynb#232, use contribution['name'] (single quotes).

Contributor Author:
I made that adjustment, but I was hoping to use this PR as an opportunity to bump the Python version for the notebook entirely. I'm not sure how high or low a user could go. If we say we're going to support Python 3.11 and 3.12, then we should have a CI job for each one.
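The single-quote detail above most likely matters because reusing the outer quote character inside an f-string expression only became valid in Python 3.12 (PEP 701), so a cell written against 3.12 can raise a SyntaxError on 3.11. A minimal sketch of the difference, using a hypothetical contribution dict rather than the notebook's actual cell:

contribution = {"name": "my-contribution"}  # hypothetical stand-in for the notebook's variable

# Legal on Python 3.12+ (PEP 701), but a SyntaxError on 3.11 because the inner
# double quotes terminate the f-string early:
#   f"Contribution: {contribution["name"]}"

# Portable form that parses on both 3.11 and 3.12:
print(f"Contribution: {contribution['name']}")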

cache: pip
- name: Install Testing Tools
run: |
@@ -50,4 +51,4 @@ jobs:
- name: Run End To End Tests
working-directory: ./notebooks/instructlab-knowledge
# NOTE: for this to work, cells with parameters need to be tagged as parameters in the target notebooks
- run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp -p API_URL $API_URL -p MODEL_ID $MODEL_ID
+ run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp
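With the -p flags gone from the papermill invocation, the notebook presumably picks up its model configuration from the job-level environment variables defined above. A minimal sketch of what such a parameters cell could look like; the env-var lookup and the default values are assumptions, not taken from the notebook:

# Hypothetical parameters cell (tagged "parameters" so papermill can still override it).
import os

MODEL_ENDPOINT_URL = os.environ.get("MODEL_ENDPOINT_URL", "http://127.0.0.1:11434/v1")
MODEL_API_KEY = os.environ.get("MODEL_API_KEY", "none")
MODEL_NAME = os.environ.get("MODEL_NAME", "Mixtral-8x7B")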
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.DS_Store
notebooks/instructlab-knowledge/workspaces/*
!notebooks/instructlab-knowledge/workspaces/.gitkeep
686 changes: 344 additions & 342 deletions notebooks/instructlab-knowledge/instructlab-knowledge.ipynb

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
68 changes: 28 additions & 40 deletions notebooks/instructlab-knowledge/utils/create_seed_dataset.py
@@ -9,7 +9,7 @@
from transformers import AutoTokenizer
import yaml

def get_seed_dataset(path: str) -> Dataset:
def get_seed_dataset(chunks_path: Path, seed_examples_path: Path) -> Dataset:
"""
Creates a seed dataset from a path
Args:
@@ -19,64 +19,52 @@ def get_seed_dataset(path: str) -> Dataset:
of seed data for the knowledge generation pipeline in
SDG.
"""
valid_path = is_dir_valid(path)
ds = create_dataset_from_dir(valid_path)
if not chunks_path.is_dir():
raise ValueError(f"Path to chunks {chunks_path} must be a directory")
if not seed_examples_path.is_dir():
raise ValueError(f"Path to seed examples {seed_examples_path} must be a directory")

return ds
files = list(seed_examples_path.iterdir())
has_qna = any(f.name == 'qna.yaml' for f in files)
files = list(chunks_path.iterdir())
has_chunks_jsonl = any(f.name == 'chunks.jsonl' for f in files)

def is_dir_valid(path: str) -> Path:
"""
Returns whether or not a directory contains a qna.yaml and one or more .txt chunks
Args:
path (str): Path to directory of qna.yaml and chunks
Returns:
base_path (Path): pathlib.Path to a directory that can create a jsonl
of seed data
"""
base_path = Path(path)
if not base_path.is_dir():
raise ValueError("Base path must be a directory")
if not has_qna:
raise ValueError(f"Seed examples dir {seed_examples_path} does not contain a qna.yaml")

files = list(base_path.iterdir())
has_qna = any(f.name == 'qna.yaml' for f in files)
has_txt = any(f.suffix == '.txt' for f in files)
if not has_qna or not has_txt:
raise ValueError("Directory does not contain a qna.yaml and chunks")
if not has_chunks_jsonl:
raise ValueError(f"Chunks dir {chunks_path} does not contain a chunks.jsonl")

return base_path
ds = create_dataset_from_dir(chunks_path, seed_examples_path)

def read_chunks(path: Path) -> Dict[str, str]:
return ds

def read_chunks(chunks_path: Path) -> Dict[str, str]:
"""
Returns a dictionary with all of the .txt chunks in a directory
Returns a dictionary with all of the chunks in a chunks.jsonl
The chunks may originate from one or more different files
Args:
path (Path): Path to directory of chunks
chunks_path (Path): Path to a directory containing a chunks.jsonl file
Returns:
chunks_dict (Dict[str, str]): Dictionary with key of the original file name
and a list of chunks as the value
"""
chunk_files = path.glob('*.txt')

chunks_jsonl_path = chunks_path / "chunks.jsonl"
chunks_dict = {}
for file in chunk_files:
chunks = []
match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name)
if match:
orig_filename = match.group(1)

with file.open('r', encoding='utf-8') as f:
chunk = f.read()
with open(chunks_jsonl_path, 'r') as file:
for line in file:
entry = yaml.safe_load(line)
orig_filename = entry.get("file")

if orig_filename not in chunks_dict:
chunks_dict[orig_filename] = []
chunks_dict[orig_filename].append(chunk)

else:
print(f"Ignoring .txt file {file}, file name is not the right format")
chunks_dict[orig_filename].append(entry.get("chunk"))

return chunks_dict

def create_dataset_from_dir(path: Path) -> Dataset:
def create_dataset_from_dir(chunks_path: Path, seed_examples_path: Path) -> Dataset:
"""
Process a directory with chunks and a qna.yaml and return a dataset.
Args:
@@ -85,7 +73,7 @@ def create_dataset_from_dir(path: Path) -> Dataset:
Dataset: Dataset object.
"""

qna_yaml_path = path / "qna.yaml"
qna_yaml_path = seed_examples_path / "qna.yaml"

with open(qna_yaml_path, 'r') as f:
qna_yaml = yaml.safe_load(f)
@@ -94,7 +82,7 @@
if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']):
raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields")

chunks_dict = read_chunks(path)
chunks_dict = read_chunks(chunks_path)

datasets = []
for filename in chunks_dict.keys():
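Both touched utilities now consume the same intermediate artifact: a chunks.jsonl file whose lines are JSON objects with (at least) "file", "chunk", and "metadata" fields, judging from the keys accessed in read_chunks and in qna_gen.py below. A small sketch of that format and an equivalent reader; the field contents are illustrative, since the notebook cell that writes the file is not part of this diff:

import json
from collections import defaultdict
from pathlib import Path

# One line of a hypothetical chunks.jsonl; keys mirror what read_chunks() and qna_gen.py access:
# {"file": "my-document.pdf", "chunk": "Text of one chunk...", "metadata": {...}}

def load_chunks(chunks_dir: Path) -> dict:
    """Group chunk texts by originating file, mirroring the new read_chunks()."""
    grouped = defaultdict(list)
    with open(chunks_dir / "chunks.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            grouped[entry["file"]].append(entry["chunk"])
    return dict(grouped)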
179 changes: 179 additions & 0 deletions notebooks/instructlab-knowledge/utils/qna_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import json
import yaml
import random

from pathlib import Path
from pydantic import SecretStr
from textwrap import wrap

from docling_core.transforms.chunker.hierarchical_chunker import DocChunk, DocMeta
from docling_sdg.qa.utils import get_qa_chunks
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProvider

chunk_filter = [
lambda chunk: len(str(chunk.text)) > 500
]

def str_presenter(dumper, data):
if len(data.splitlines()) > 1: # check for multiline string
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
elif len(data) > 80:
data = "\n".join(wrap(data, 80))
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
return dumper.represent_scalar('tag:yaml.org,2002:str', data)

yaml.add_representer(str, str_presenter)

# to use with safe_dump:
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
def increase_indent(self, flow=False, indentless=False):
return super(IndentedDumper, self).increase_indent(flow, False)

def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str) -> Path:
"""
Generates seed examples (a qna.yaml) for a contribution from its chunks.jsonl
Args:
contribution_name (str): Name of the contribution
chunks_jsonl_path (Path): Path to the chunks/chunks.jsonl file
output_dir (Path): Path to output dir for the qna.yaml and intermediate outputs by docling-sdg
domain (str): Domain of the contribution
summary (str): Summary of the contribution
num_seed_examples (int): Number of seed examples the user wishes to generate for the contribution
api_key (str): API key for the model used to generate questions and answers from contexts
api_url (str): Endpoint for the model used to generate questions and answers from contexts
model_id (str): Name of the model used to generate questions and answers from contexts
Returns:
qna_output_path (pathlib.Path): Path to the generated seed example file
"""
dataset = {}
dataset[contribution_name] = {}
dataset[contribution_name]["chunks"] = []

if not chunks_jsonl_path.exists():
raise ValueError(f"chunks.jsonl does not exist but should at {chunks_jsonl_path}")

docs = []

with open(chunks_jsonl_path, 'r') as file:
for line in file:
file_in_docs = False
entry = json.loads(line)
meta = DocMeta(**entry['metadata'])
chunk = DocChunk(text=entry['chunk'], meta=meta)
for doc in docs:
if doc["file"] == entry['file']:
doc["chunk_objs"].append(chunk)
file_in_docs = True
break

if not file_in_docs:
doc = dict(file=entry['file'], chunk_objs=[chunk])
docs.append(doc)

for doc in docs:
print(f"Filtering smaller chunks out of document {doc['file']}")

qa_chunks = get_qa_chunks(doc["file"], doc["chunk_objs"], chunk_filter)
dataset[contribution_name]["chunks"].extend(list(qa_chunks))


all_chunks = dataset[contribution_name]["chunks"]
# random.sample raises ValueError if fewer chunks than num_seed_examples survive the filter
selected_chunks = random.sample(all_chunks, num_seed_examples)

generate_options = GenerateOptions(project_id="project_id")
generate_options.provider = LlmProvider.OPENAI_LIKE
generate_options.api_key = SecretStr(api_key)
generate_options.url = api_url
generate_options.model_id = model_id
generate_options.generated_file = output_dir / f"qagen-{contribution_name}.json"
gen = Generator(generate_options=generate_options)

Path.unlink(generate_options.generated_file, missing_ok=True)
results = gen.generate_from_chunks(selected_chunks) # automatically saves to file

print(f"Status for Q&A generation for {contribution_name} is: {results.status}")

qnas = {}
chunk_id_to_text = {}
with open(generate_options.generated_file, "rt") as f:
for line in f.readlines():
entry = json.loads(line)
chunk_id = entry['chunk_id']
if chunk_id not in chunk_id_to_text:
chunk_id_to_text[chunk_id] = entry['context']
if chunk_id not in qnas:
qnas[chunk_id] = []
qnas[chunk_id].append({'question': entry['question'], 'answer': entry['answer']})

qna_output_path = output_dir / "qna.yaml"

data = {'seed_examples': []}
for chunk_id, context in chunk_id_to_text.items():
data['seed_examples'].append({
'context': context,
'questions_and_answers': [
{
'question': example['question'],
'answer': example['answer'],
} for example in qnas[chunk_id]
]
})


data['document_outline'] = summary
data['domain'] = domain

Path.unlink(qna_output_path, missing_ok=True) # remove any stale qna.yaml left over from a previous run
with open(qna_output_path, 'w') as yaml_file:
yaml.dump(data, yaml_file, Dumper=IndentedDumper, default_flow_style=False, sort_keys=False, width=80)

return qna_output_path

def review_seed_examples_file(seed_examples_path: Path, min_seed_examples: int = 5, num_qa_pairs: int = 3) -> None:
with open(seed_examples_path, 'r') as yaml_file:
yaml_data = yaml.safe_load(yaml_file)
errors = []
print(f"Reviewing seed examples file at {seed_examples_path.resolve()}")

# Check for document_outline
if 'document_outline' not in yaml_data:
errors.append("Missing contribution summary in 'document_outline'")
else:
# contribution summary is called document_outline internally
print(f"Found contribution summary...")

# Check for domain
if 'domain' not in yaml_data:
errors.append("Missing 'domain'")
else:
print(f"Found 'domain'...")

# Check seed_examples
seed_examples = yaml_data.get('seed_examples')
if not seed_examples:
errors.append("'seed_examples' section is missing or empty.")
elif len(seed_examples) < min_seed_examples:
errors.append(f"'seed_examples' should contain at least {min_seed_examples} examples, found {len(seed_examples)}. Please add {min_seed_examples - len(seed_examples)} more seed example(s)")
else:
print(f"Found {len(seed_examples)} 'contexts' in 'sed_examples'. Minimum expected number is {min_seed_examples}...")

if seed_examples:
for i, example in enumerate(seed_examples, start=1):
qa_pairs = example.get('questions_and_answers')
if not qa_pairs:
errors.append(f"Seed Example {i} is missing 'questions_and_answers' section.")
elif len(qa_pairs) != num_qa_pairs:
errors.append(f"Seed Example {i} should contain {num_qa_pairs} question-answer pairs, found {len(qa_pairs)}. Please add {num_qa_pairs - len(qa_pairs)} more question-answer pair(s) to seed example {i}")
else:
print(f"Seed Example {i} contains expected number ({num_qa_pairs}) of 'question_and_answers'...")

if errors:
print("\n\033[31mERROR! Seed Examples validation failed with the following issues:\033[0m")
for err in errors:
print(f"- {err}")
else:
print(f"Seed Examples YAML {seed_examples_path.resolve()} is valid :)")
print(f"\n")