9 changes: 5 additions & 4 deletions .github/workflows/instructlab-knowledge-e2e.yml
@@ -22,14 +22,15 @@ jobs:
runs-on: ubuntu-latest
env:
# customize the workflow here
API_URL: "http://127.0.0.1:11434/v1"
MODEL_ID: "Mixtral-8x7B" # must be OpenAI-compatible when using inference mock
MODEL_ENDPOINT_URL: "http://127.0.0.1:11434/v1"
MODEL_API_KEY: "none"
MODEL_NAME: "Mixtral-8x7B" # must be OpenAI-compatible when using inference mock
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
python-version: '3.12'
Contributor comment:

Just FYI, I successfully ran it end-to-end on 3.11 with just one minor adjustment. On notebooks/instructlab-knowledge/instructlab-knowledge.ipynb#232, use contribution['name'] (single quotes).

Contributor (author) reply:

I made that adjustment, but I was hoping to use this PR as an opportunity to bump the Python version for the notebook entirely. I'm not sure how high or low a user could go. If we say we're going to support Python 3.11 and 3.12, then we should have a CI job for each one.
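
For context on the single-quote adjustment discussed above: before Python 3.12 (PEP 701), an f-string expression could not reuse the quote character that delimits the f-string itself, so contribution["name"] inside a double-quoted f-string is a SyntaxError on 3.11 but parses on 3.12. A minimal sketch of the difference (the dict contents and the surrounding line are illustrative assumptions, not taken from the notebook):

    contribution = {"name": "my-contribution"}  # hypothetical value, for illustration only

    # Parses only on Python 3.12+ (PEP 701 allows reusing the outer quote character):
    # print(f"Chunks live in workspaces/{contribution["name"]}/chunks")

    # Parses on Python 3.11 and 3.12 (inner quotes differ from the outer ones):
    print(f"Chunks live in workspaces/{contribution['name']}/chunks")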

cache: pip
- name: Install Testing Tools
run: |
@@ -50,4 +51,4 @@ jobs:
- name: Run End To End Tests
working-directory: ./notebooks/instructlab-knowledge
# NOTE: for this to work, cells with parameters need to be tagged as parameters in the target notebooks
run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp -p API_URL $API_URL -p MODEL_ID $MODEL_ID
run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp
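
For reference, the workflow no longer passes -p overrides to papermill; the model settings now live only in the job's env block, so the notebook has to pick them up itself. A minimal sketch of a parameters-tagged cell that would do so (the cell contents and defaults are assumptions based on the env block above, not the actual notebook code):

    # Hypothetical "parameters" cell in instructlab-knowledge.ipynb
    import os

    MODEL_ENDPOINT_URL = os.environ.get("MODEL_ENDPOINT_URL", "http://127.0.0.1:11434/v1")
    MODEL_API_KEY = os.environ.get("MODEL_API_KEY", "none")
    MODEL_NAME = os.environ.get("MODEL_NAME", "Mixtral-8x7B")
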
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.DS_Store
notebooks/instructlab-knowledge/workspaces/*
!notebooks/instructlab-knowledge/workspaces/.gitkeep
669 changes: 326 additions & 343 deletions notebooks/instructlab-knowledge/instructlab-knowledge.ipynb

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
55 changes: 31 additions & 24 deletions notebooks/instructlab-knowledge/utils/create_seed_dataset.py
@@ -9,7 +9,7 @@
from transformers import AutoTokenizer
import yaml

def get_seed_dataset(path: str) -> Dataset:
def get_seed_dataset(chunks_path: Path, seed_examples_path: Path) -> Dataset:
"""
Creates a seed dataset from a path
Args:
@@ -19,12 +19,27 @@ def get_seed_dataset(path: str) -> Dataset:
of seed data for the knowledge generation pipeline in
SDG.
"""
valid_path = is_dir_valid(path)
ds = create_dataset_from_dir(valid_path)
if not chunks_path.is_dir():
raise ValueError(f"Path to chunks {chunks_path} must be a directory")
if not seed_examples_path.is_dir():
raise ValueError(f"Path to seed examples {seed_examples_path} must be a directory")

files = list(seed_examples_path.iterdir())
has_qna = any(f.name == 'qna.yaml' for f in files)
files = list(chunks_path.iterdir())
has_chunks_jsonl = any(f.name == 'chunks.jsonl' for f in files)

if not has_qna:
raise ValueError(f"Seed examples dir {seed_examples_path} does not contain a qna.yaml")

if not has_chunks_jsonl:
raise ValueError(f"Chunks dir {chunks_path} does not contain a chunks.jsonl")

ds = create_dataset_from_dir(chunks_path, seed_examples_path)

return ds

def is_dir_valid(path: str) -> Path:
def is_dir_valid(path: str) -> None:
"""
Returns whether or not a directory contains a qna.yaml and one or more .txt chunks
Args:
@@ -43,40 +58,32 @@ def is_dir_valid(path: str) -> Path:
if not has_qna or not has_txt:
raise ValueError("Directory does not contain a qna.yaml and chunks")

return base_path

def read_chunks(path: Path) -> Dict[str, str]:
def read_chunks(chunks_path: Path) -> Dict[str, str]:
"""
Returns a dictionary with all of the .txt chunks in a directory
Returns a dictionary with all of the chunks in a chunks.jsonl
The chunks may originate from one or more different files
Args:
path (Path): Path to directory of chunks
chunks_path (Path): Path to the directory containing a chunks.jsonl file
Returns:
chunks_dict (Dict[str, str]): Dictionary with key of the original file name
and a list of chunks as the value
"""
chunk_files = path.glob('*.txt')

chunks_jsonl_path = chunks_path / "chunks.jsonl"
chunks_dict = {}
for file in chunk_files:
chunks = []
match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name)
if match:
orig_filename = match.group(1)

with file.open('r', encoding='utf-8') as f:
chunk = f.read()
with open(chunks_jsonl_path, 'r') as file:
for line in file:
entry = yaml.safe_load(line)
orig_filename = entry.get("file")

if orig_filename not in chunks_dict:
chunks_dict[orig_filename] = []
chunks_dict[orig_filename].append(chunk)

else:
print(f"Ignoring .txt file {file}, file name is not the right format")
chunks_dict[orig_filename].append(entry.get("chunk"))

return chunks_dict

def create_dataset_from_dir(path: Path) -> Dataset:
def create_dataset_from_dir(chunks_path: Path, seed_examples_path: Path) -> Dataset:
"""
Process a directory with chunks and a qna.yaml and return a dataset.
Args:
@@ -85,7 +92,7 @@ def create_dataset_from_dir(path: Path) -> Dataset:
Dataset: Dataset object.
"""

qna_yaml_path = path / "qna.yaml"
qna_yaml_path = seed_examples_path / "qna.yaml"

with open(qna_yaml_path, 'r') as f:
qna_yaml = yaml.safe_load(f)
@@ -94,7 +101,7 @@
if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']):
raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields")

chunks_dict = read_chunks(path)
chunks_dict = read_chunks(chunks_path)

datasets = []
for filename in chunks_dict.keys():
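
For reference, read_chunks now expects a single chunks.jsonl with one JSON object per line, keyed by the originating file and the chunk text, and groups chunks by source file. A minimal usage sketch (the file names, chunk text, and workspace path below are illustrative assumptions):

    # Hypothetical chunks.jsonl contents, one JSON object per line:
    #   {"file": "phoenix.pdf", "chunk": "The phoenix is a mythical bird that ..."}
    #   {"file": "phoenix.pdf", "chunk": "Legend holds that it rises from its own ashes ..."}
    #   {"file": "geology.md", "chunk": "Granite is a coarse-grained igneous rock ..."}

    from pathlib import Path
    from utils.create_seed_dataset import read_chunks  # import path assumed from this repo's layout

    chunks_dict = read_chunks(Path("workspaces/my-contribution/chunks"))
    # chunks_dict -> {"phoenix.pdf": [<chunk>, <chunk>], "geology.md": [<chunk>]}
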
178 changes: 178 additions & 0 deletions notebooks/instructlab-knowledge/utils/qna_gen.py
@@ -0,0 +1,178 @@
import json
import yaml
import random

from pathlib import Path
from pydantic import SecretStr
from textwrap import wrap

from docling_core.transforms.chunker.hierarchical_chunker import DocChunk, DocMeta
from docling_sdg.qa.utils import get_qa_chunks
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProvider

chunk_filter = [
lambda chunk: len(str(chunk.text)) > 500
]

def str_presenter(dumper, data):
if len(data.splitlines()) > 1: # check for multiline string
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
elif len(data) > 80:
data = "\n".join(wrap(data, 80))
return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
return dumper.represent_scalar('tag:yaml.org,2002:str', data)

yaml.add_representer(str, str_presenter)

# to use with safe_dump:
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
def increase_indent(self, flow=False, indentless=False):
return super(IndentedDumper, self).increase_indent(flow, False)

def generate_seed_examples(contribution_name: str, contribution_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str) -> Path:
"""
Generates seed examples (a qna.yaml) for a contribution from its chunks
Args:
contribution_name (str): Name of the contribution
contribution_dir (Path): Path to the artifacts for the contribution. contribution_dir should contain
a chunks/chunks.jsonl file
domain (str): Domain of the contribution
summary (str): Summary of the contribution
num_seed_examples (int): Number of seed examples the user wishes to generate for the contribution
api_key (str): API key for the model used to generate questions and answers from contexts
api_url (str): Endpoint for the model used to generate questions and answers from contexts
model_id (str): Name of the model used to generate questions and answers from contexts
Returns:
qna_output_path (pathlib.Path): Path to the generated seed example file
"""
dataset = {}
dataset[contribution_name] = {}
dataset[contribution_name]["chunks"] = []

chunks_jsonl_path = contribution_dir / "chunks" / "chunks.jsonl"
if not chunks_jsonl_path.exists():
raise ValueError(f"chunks.jsonl does not exist but should at {chunks_jsonl_path}")

docs = []

with open(chunks_jsonl_path, 'r') as file:
for line in file:
file_in_docs = False
entry = json.loads(line)
meta = DocMeta(**entry['metadata'])
chunk = DocChunk(text=entry['chunk'], meta=meta)
for doc in docs:
if doc["file"] == entry['file']:
doc["chunk_objs"].append(chunk)
file_in_docs = True
break

if not file_in_docs:
doc = dict(file=entry['file'], chunk_objs=[chunk])
docs.append(doc)

for doc in docs:
print(f"Filtering smaller chunks out of document {doc['file']}")

qa_chunks = get_qa_chunks(doc["file"], doc["chunk_objs"], chunk_filter)
dataset[contribution_name]["chunks"].extend(list(qa_chunks))


all_chunks = dataset[contribution_name]["chunks"]
selected_chunks = random.sample(all_chunks, num_seed_examples)

generate_options = GenerateOptions(project_id="project_id")
generate_options.provider = LlmProvider.OPENAI_LIKE
generate_options.api_key = SecretStr(api_key)
generate_options.url = api_url
generate_options.model_id = model_id
generate_options.generated_file = contribution_dir / "authoring" / f"qagen-{contribution_name}.json"
gen = Generator(generate_options=generate_options)

Path.unlink(generate_options.generated_file, missing_ok=True)
results = gen.generate_from_chunks(selected_chunks) # automatically saves to file

print(f"Status for Q&A generation for {contribution_name} is: {results.status}")

qnas = {}
chunk_id_to_text = {}
with open(generate_options.generated_file, "rt") as f:
for line in f.readlines():
entry = json.loads(line)
chunk_id = entry['chunk_id']
if chunk_id not in chunk_id_to_text:
chunk_id_to_text[chunk_id] = entry['context']
if chunk_id not in qnas:
qnas[chunk_id] = []
qnas[chunk_id].append({'question': entry['question'], 'answer': entry['answer']})

qna_output_path = contribution_dir / "authoring" / "qna.yaml"

data = {'seed_examples': []}
for chunk_id, context in chunk_id_to_text.items():
data['seed_examples'].append({
'context': context,
'questions_and_answers': [
{
'question': example['question'],
'answer': example['answer'],
} for example in qnas[chunk_id]
]
})

data['document_outline'] = summary
data['domain'] = domain

Path.unlink(qna_output_path, missing_ok=True) # shouldn't be necessary but was. jupyter caching thing?
with open(qna_output_path, 'w') as yaml_file:
yaml.dump(data, yaml_file, Dumper=IndentedDumper, default_flow_style=False, sort_keys=False, width=80)

return qna_output_path

def review_seed_examples_file(seed_examples_path: Path, num_seed_examples: int = 5, num_qa_pairs: int = 3) -> None:
with open(seed_examples_path, 'r') as yaml_file:
yaml_data = yaml.safe_load(yaml_file)
errors = []
print(f"Reviewing seed examples file at {seed_examples_path.resolve()}")

# Check for document_outline
if 'document_outline' not in yaml_data:
errors.append("Missing contribution summary in 'document_outline'")
else:
# contribution summary is called document_outline internally
print(f"Found contribution summary in 'document_outline'...")

# Check for domain
if 'domain' not in yaml_data:
errors.append("Missing 'domain'")
else:
print(f"Found 'domain'...")

# Check seed_examples
seed_examples = yaml_data.get('seed_examples')
if not seed_examples:
errors.append("'seed_examples' section is missing or empty.")
elif len(seed_examples) != num_seed_examples:
errors.append(f"'seed_examples' should contain {num_seed_examples} examples, found {len(seed_examples)}.")

if seed_examples:
print(f"Found expected number ({num_seed_examples}) of 'seed_examples'...")
for i, example in enumerate(seed_examples, start=1):
qa_pairs = example.get('questions_and_answers')
if not qa_pairs:
errors.append(f"Seed Example {i} is missing 'questions_and_answers'.")
elif len(qa_pairs) != num_qa_pairs:
errors.append(f"Seed Example {i} should contain {num_qa_pairs} question-answer pairs, found {len(qa_pairs)}.")
else:
print(f"Seed Example {i} contains expected number ({num_qa_pairs}) of 'questions_and_answers'...")

if errors:
print("\nERROR! Seed Examples validation failed with the following issues:")
for err in errors:
print(f"- {err}")
else:
print(f"Seed Examples YAML {seed_examples_path.resolve()} is valid :)")
print(f"\n")
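
A minimal sketch of how these helpers might be called from the notebook (the workspace layout, contribution values, and model endpoint below are assumptions; only the function signatures come from this file):

    from pathlib import Path
    from utils.qna_gen import generate_seed_examples, review_seed_examples_file  # import path assumed

    # contribution_dir is assumed to already contain chunks/chunks.jsonl and an authoring/ subdirectory
    contribution_dir = Path("workspaces/my-contribution")
    qna_path = generate_seed_examples(
        contribution_name="my-contribution",
        contribution_dir=contribution_dir,
        domain="mythology",
        summary="An overview of phoenix legends across cultures",
        num_seed_examples=5,
        api_key="none",
        api_url="http://127.0.0.1:11434/v1",
        model_id="Mixtral-8x7B",
    )
    review_seed_examples_file(qna_path, num_seed_examples=5, num_qa_pairs=3)
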
Empty file.