9 changes: 5 additions & 4 deletions .github/workflows/instructlab-knowledge-e2e.yml
@@ -22,14 +22,15 @@ jobs:
runs-on: ubuntu-latest
env:
# customize the workflow here
- API_URL: "http://127.0.0.1:11434/v1"
- MODEL_ID: "Mixtral-8x7B" # must be OpenAI-compatible when using inference mock
+ MODEL_ENDPOINT_URL: "http://127.0.0.1:11434/v1"
+ MODEL_API_KEY: "none"
+ MODEL_NAME: "Mixtral-8x7B" # must be OpenAI-compatible when using inference mock
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
with:
- python-version: '3.11'
+ python-version: '3.12'
Contributor:
Just FYI, I successfully ran it end-to-end on 3.11 with just one minor adjustment. On notebooks/instructlab-knowledge/instructlab-knowledge.ipynb#232, use contribution['name'] (single quotes).

Contributor Author:
I made that adjustment, but I was hoping to use this PR as an opportunity to bump the Python version for the notebook entirely. I'm not sure how high or low a user could go. If we say we're going to support Python 3.11 and 3.12, then we should have a CI job for each one.
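The single-quote detail above most likely matters because reusing the outer quote character inside an f-string expression only became valid in Python 3.12 (PEP 701), so a cell written against 3.12 can raise a SyntaxError on 3.11. A minimal sketch of the difference, using a hypothetical contribution dict rather than the notebook's actual cell:

contribution = {"name": "my-contribution"}  # hypothetical stand-in for the notebook's variable

# Legal on Python 3.12+ (PEP 701), but a SyntaxError on 3.11 because the inner
# double quotes terminate the f-string early:
#   f"Contribution: {contribution["name"]}"

# Portable form that parses on both 3.11 and 3.12:
print(f"Contribution: {contribution['name']}")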

cache: pip
- name: Install Testing Tools
run: |
@@ -50,4 +51,4 @@ jobs:
- name: Run End To End Tests
working-directory: ./notebooks/instructlab-knowledge
# NOTE: for this to work, cells with parameters need to be tagged as parameters in the target notebooks
- run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp -p API_URL $API_URL -p MODEL_ID $MODEL_ID
+ run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp
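With the -p flags gone from the papermill invocation, the notebook presumably picks up its model configuration from the job-level environment variables defined above. A minimal sketch of what such a parameters cell could look like; the env-var lookup and the default values are assumptions, not taken from the notebook:

# Hypothetical parameters cell (tagged "parameters" so papermill can still override it).
import os

MODEL_ENDPOINT_URL = os.environ.get("MODEL_ENDPOINT_URL", "http://127.0.0.1:11434/v1")
MODEL_API_KEY = os.environ.get("MODEL_API_KEY", "none")
MODEL_NAME = os.environ.get("MODEL_NAME", "Mixtral-8x7B")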
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.DS_Store
notebooks/instructlab-knowledge/workspaces/*
!notebooks/instructlab-knowledge/workspaces/.gitkeep
686 changes: 344 additions & 342 deletions notebooks/instructlab-knowledge/instructlab-knowledge.ipynb

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
68 changes: 28 additions & 40 deletions notebooks/instructlab-knowledge/utils/create_seed_dataset.py
@@ -9,7 +9,7 @@
from transformers import AutoTokenizer
import yaml

def get_seed_dataset(path: str) -> Dataset:
def get_seed_dataset(chunks_path: Path, seed_examples_path: Path) -> Dataset:
"""
Creates a seed dataset from a path
Args:
@@ -19,64 +19,52 @@ def get_seed_dataset(path: str) -> Dataset:
of seed data for the knowledge generation pipeline in
SDG.
"""
valid_path = is_dir_valid(path)
ds = create_dataset_from_dir(valid_path)
if not chunks_path.is_dir():
raise ValueError(f"Path to chunks {chunks_path} must be a directory")
if not seed_examples_path.is_dir():
raise ValueError(f"Path to seed examples {seed_examples_path} must be a directory")

return ds
files = list(seed_examples_path.iterdir())
has_qna = any(f.name == 'qna.yaml' for f in files)
files = list(chunks_path.iterdir())
has_chunks_jsonl = any(f.name == 'chunks.jsonl' for f in files)

def is_dir_valid(path: str) -> Path:
"""
Returns whether or not a directory contains a qna.yaml and one or more .txt chunks
Args:
path (str): Path to directory of qna.yaml and chunks
Returns:
base_path (Path): pathlib.Path to a directory that can create a jsonl
of seed data
"""
base_path = Path(path)
if not base_path.is_dir():
raise ValueError("Base path must be a directory")
if not has_qna:
raise ValueError(f"Seed examples dir {seed_examples_path} does not contain a qna.yaml")

files = list(base_path.iterdir())
has_qna = any(f.name == 'qna.yaml' for f in files)
has_txt = any(f.suffix == '.txt' for f in files)
if not has_qna or not has_txt:
raise ValueError("Directory does not contain a qna.yaml and chunks")
if not has_chunks_jsonl:
raise ValueError(f"Chunks dir {chunks_path} does not contain a chunks.jsonl")

return base_path
ds = create_dataset_from_dir(chunks_path, seed_examples_path)

def read_chunks(path: Path) -> Dict[str, str]:
return ds

def read_chunks(chunks_path: Path) -> Dict[str, str]:
"""
Returns a dictionary with all of the .txt chunks in a directory
Returns a dictionary with all of the chunks in a chunks.jsonl
The chunks may originate from one or more different files
Args:
path (Path): Path to directory of chunks
chunks_path (Path): Path to a directory containing a chunks.jsonl file
Returns:
chunks_dict (Dict[str, str]): Dictionary with key of the original file name
and a list of chunks as the value
"""
chunk_files = path.glob('*.txt')

chunks_jsonl_path = chunks_path / "chunks.jsonl"
chunks_dict = {}
for file in chunk_files:
chunks = []
match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name)
if match:
orig_filename = match.group(1)

with file.open('r', encoding='utf-8') as f:
chunk = f.read()
with open(chunks_jsonl_path, 'r') as file:
for line in file:
entry = yaml.safe_load(line)
orig_filename = entry.get("file")

if orig_filename not in chunks_dict:
chunks_dict[orig_filename] = []
chunks_dict[orig_filename].append(chunk)

else:
print(f"Ignoring .txt file {file}, file name is not the right format")
chunks_dict[orig_filename].append(entry.get("chunk"))

return chunks_dict

def create_dataset_from_dir(path: Path) -> Dataset:
def create_dataset_from_dir(chunks_path: Path, seed_examples_path: Path) -> Dataset:
"""
Process a directory with chunks and a qna.yaml and return a dataset.
Args:
@@ -85,7 +73,7 @@ def create_dataset_from_dir(path: Path) -> Dataset:
Dataset: Dataset object.
"""

qna_yaml_path = path / "qna.yaml"
qna_yaml_path = seed_examples_path / "qna.yaml"

with open(qna_yaml_path, 'r') as f:
qna_yaml = yaml.safe_load(f)
@@ -94,7 +82,7 @@
if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']):
raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields")

chunks_dict = read_chunks(path)
chunks_dict = read_chunks(chunks_path)

datasets = []
for filename in chunks_dict.keys():
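Both touched utilities now consume the same intermediate artifact: a chunks.jsonl file whose lines are JSON objects with (at least) "file", "chunk", and "metadata" fields, judging from the keys accessed in read_chunks and in qna_gen.py below. A small sketch of that format and an equivalent reader; the field contents are illustrative, since the notebook cell that writes the file is not part of this diff:

import json
from collections import defaultdict
from pathlib import Path

# One line of a hypothetical chunks.jsonl; keys mirror what read_chunks() and qna_gen.py access:
# {"file": "my-document.pdf", "chunk": "Text of one chunk...", "metadata": {...}}

def load_chunks(chunks_dir: Path) -> dict:
    """Group chunk texts by originating file, mirroring the new read_chunks()."""
    grouped = defaultdict(list)
    with open(chunks_dir / "chunks.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            grouped[entry["file"]].append(entry["chunk"])
    return dict(grouped)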
179 changes: 179 additions & 0 deletions notebooks/instructlab-knowledge/utils/qna_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import json
import yaml
import random

from pathlib import Path
from pydantic import SecretStr
from textwrap import wrap

from docling_core.transforms.chunker.hierarchical_chunker import DocChunk, DocMeta
from docling_sdg.qa.utils import get_qa_chunks
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProvider

chunk_filter = [
lambda chunk: len(str(chunk.text)) > 500
]

def str_presenter(dumper, data):
if len(data.splitlines()) > 1: # check for multiline string
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
elif len(data) > 80:
data = "\n".join(wrap(data, 80))
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
return dumper.represent_scalar('tag:yaml.org,2002:str', data)

yaml.add_representer(str, str_presenter)

# to use with safe_dump:
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
def increase_indent(self, flow=False, indentless=False):
return super(IndentedDumper, self).increase_indent(flow, False)

def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str) -> Path:
"""
Generates seed examples (a qna.yaml) for a contribution from its chunks.jsonl
Args:
contribution_name (str): Name of the contribution
chunks_jsonl_path (Path): Path to the chunks/chunks.jsonl file
output_dir (Path): Path to output dir for the qna.yaml and intermediate outputs by docling-sdg
domain (str): Domain of the contribution
summary (str): Summary of the contribution
num_seed_examples (int): Number of seed examples the user wishes to generate for the contribution
api_key (str): API key for the model used to generate questions and answers from contexts
api_url (str): Endpoint for the model used to generate questions and answers from contexts
model_id (str): Name of the model used to generate questions and answers from contexts
Returns:
qna_output_path (pathlib.Path): Path to the generated seed example file
"""
dataset = {}
dataset[contribution_name] = {}
dataset[contribution_name]["chunks"] = []

if not chunks_jsonl_path.exists():
raise ValueError(f"chunks.jsonl does not exist but should at {chunks_jsonl_path}")

docs = []

with open(chunks_jsonl_path, 'r') as file:
for line in file:
file_in_docs = False
entry = json.loads(line)
meta = DocMeta(**entry['metadata'])
chunk = DocChunk(text=entry['chunk'], meta=meta)
for doc in docs:
if doc["file"] == entry['file']:
doc["chunk_objs"].append(chunk)
file_in_docs = True
break

if not file_in_docs:
doc = dict(file=entry['file'], chunk_objs=[chunk])
docs.append(doc)

for doc in docs:
print(f"Filtering smaller chunks out of document {doc['file']}")

qa_chunks = get_qa_chunks(doc["file"], doc["chunk_objs"], chunk_filter)
dataset[contribution_name]["chunks"].extend(list(qa_chunks))


all_chunks = dataset[contribution_name]["chunks"]
# random.sample raises ValueError if fewer chunks than num_seed_examples survive the filter
selected_chunks = random.sample(all_chunks, num_seed_examples)

generate_options = GenerateOptions(project_id="project_id")
generate_options.provider = LlmProvider.OPENAI_LIKE
generate_options.api_key = SecretStr(api_key)
generate_options.url = api_url
generate_options.model_id = model_id
generate_options.generated_file = output_dir / f"qagen-{contribution_name}.json"
gen = Generator(generate_options=generate_options)

Path.unlink(generate_options.generated_file, missing_ok=True)
results = gen.generate_from_chunks(selected_chunks) # automatically saves to file

print(f"Status for Q&A generation for {contribution_name} is: {results.status}")

qnas = {}
chunk_id_to_text = {}
with open(generate_options.generated_file, "rt") as f:
for line in f.readlines():
entry = json.loads(line)
chunk_id = entry['chunk_id']
if chunk_id not in chunk_id_to_text:
chunk_id_to_text[chunk_id] = entry['context']
if chunk_id not in qnas:
qnas[chunk_id] = []
qnas[chunk_id].append({'question': entry['question'], 'answer': entry['answer']})

qna_output_path = output_dir / "qna.yaml"

data = {'seed_examples': []}
for chunk_id, context in chunk_id_to_text.items():
data['seed_examples'].append({
'context': context,
'questions_and_answers': [
{
'question': example['question'],
'answer': example['answer'],
} for example in qnas[chunk_id]
]
})


data['document_outline'] = summary
data['domain'] = domain

Path.unlink(qna_output_path, missing_ok=True) # remove any stale qna.yaml left over from a previous run
with open(qna_output_path, 'w') as yaml_file:
yaml.dump(data, yaml_file, Dumper=IndentedDumper, default_flow_style=False, sort_keys=False, width=80)

return qna_output_path

def review_seed_examples_file(seed_examples_path: Path, min_seed_examples: int = 5, num_qa_pairs: int = 3) -> None:
with open(seed_examples_path, 'r') as yaml_file:
yaml_data = yaml.safe_load(yaml_file)
errors = []
print(f"Reviewing seed examples file at {seed_examples_path.resolve()}")

# Check for document_outline
if 'document_outline' not in yaml_data:
errors.append("Missing contribution summary in 'document_outline'")
else:
# contribution summary is called document_outline internally
print(f"Found contribution summary...")

# Check for domain
if 'domain' not in yaml_data:
errors.append("Missing 'domain'")
else:
print(f"Found 'domain'...")

# Check seed_examples
seed_examples = yaml_data.get('seed_examples')
if not seed_examples:
errors.append("'seed_examples' section is missing or empty.")
elif len(seed_examples) < min_seed_examples:
errors.append(f"'seed_examples' should contain at least {min_seed_examples} examples, found {len(seed_examples)}. Please add {min_seed_examples - len(seed_examples)} more seed example(s)")
else:
print(f"Found {len(seed_examples)} 'contexts' in 'sed_examples'. Minimum expected number is {min_seed_examples}...")

if seed_examples:
for i, example in enumerate(seed_examples, start=1):
qa_pairs = example.get('questions_and_answers')
if not qa_pairs:
errors.append(f"Seed Example {i} is missing 'questions_and_answers' section.")
elif len(qa_pairs) != num_qa_pairs:
errors.append(f"Seed Example {i} should contain {num_qa_pairs} question-answer pairs, found {len(qa_pairs)}. Please add {num_qa_pairs - len(qa_pairs)} more question-answer pair(s) to seed example {i}")
else:
print(f"Seed Example {i} contains expected number ({num_qa_pairs}) of 'question_and_answers'...")

if errors:
print("\n\033[31mERROR! Seed Examples validation failed with the following issues:\033[0m")
for err in errors:
print(f"- {err}")
else:
print(f"Seed Examples YAML {seed_examples_path.resolve()} is valid :)")
print(f"\n")