Commit 32e1661

Merge pull request #18 from alimaredia/multiple-source-and-qna-files

Add ability for `instructlab-knowledge` notebook to take multiple source and qna files

2 parents 0f9ec2c + 0841462 commit 32e1661

File tree

8 files changed: +559 -386 lines changed

.github/workflows/instructlab-knowledge-e2e.yml

Lines changed: 5 additions & 4 deletions

@@ -22,14 +22,15 @@ jobs:
     runs-on: ubuntu-latest
     env:
       # customize the workflow here
-      API_URL: "http://127.0.0.1:11434/v1"
-      MODEL_ID: "Mixtral-8x7B" # must be open-AI compatible when using inference mock
+      MODEL_ENDPOINT_URL: "http://127.0.0.1:11434/v1"
+      MODEL_API_KEY: "none"
+      MODEL_NAME: "Mixtral-8x7B" # must be open-AI compatible when using inference mock
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: '3.11'
+        python-version: '3.12'
         cache: pip
     - name: Install Testing Tools
       run: |
@@ -50,4 +51,4 @@ jobs:
     - name: Run End To End Tests
       working-directory: ./notebooks/instructlab-knowledge
       # NOTE: for this to work, cells with parameters need to be tagged as parameters in the target notebooks
-      run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp -p API_URL $API_URL -p MODEL_ID $MODEL_ID
+      run: papermill ${{ matrix.notebooks_to_test }} ${{ matrix.notebooks_to_test }}.tmp
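The papermill invocation no longer passes `-p` overrides, so the notebook has to pick up its model configuration some other way. Below is a minimal sketch of a papermill-tagged parameters cell that falls back to the environment variables the workflow now exports; the exact mechanism and the defaults are assumptions, not shown in this diff:

```python
# Hypothetical "parameters" cell in the target notebook (tagged for papermill).
# Assumption: with the -p overrides gone, the notebook reads the MODEL_* values
# the workflow exports; the fallback defaults below are illustrative only.
import os

MODEL_ENDPOINT_URL = os.environ.get("MODEL_ENDPOINT_URL", "http://127.0.0.1:11434/v1")
MODEL_API_KEY = os.environ.get("MODEL_API_KEY", "none")
MODEL_NAME = os.environ.get("MODEL_NAME", "Mixtral-8x7B")
```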

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+.DS_Store
+notebooks/instructlab-knowledge/workspaces/*
+!notebooks/instructlab-knowledge/workspaces/.gitkeep

notebooks/instructlab-knowledge/instructlab-knowledge.ipynb

Lines changed: 344 additions & 342 deletions
Large diffs are not rendered by default.
Binary file not shown.
Binary file not shown.

notebooks/instructlab-knowledge/workspaces/default/source_documents/2502.01618v3.pdf renamed to notebooks/instructlab-knowledge/sample-pdfs/inference-time-scaling.pdf

File renamed without changes.

notebooks/instructlab-knowledge/utils/create_seed_dataset.py

Lines changed: 28 additions & 40 deletions

@@ -9,7 +9,7 @@
 from transformers import AutoTokenizer
 import yaml

-def get_seed_dataset(path: str) -> Dataset:
+def get_seed_dataset(chunks_path: Path, seed_examples_path: Path) -> Dataset:
     """
     Creates a seed dataset from a path
     Args:
@@ -19,64 +19,52 @@ def get_seed_dataset(path: str) -> Dataset:
     of seed data for the knowledge generation pipeline in
     SDG.
     """
-    valid_path = is_dir_valid(path)
-    ds = create_dataset_from_dir(valid_path)
+    if not chunks_path.is_dir():
+        raise ValueError(f"Path to chunks {chunks_path} must be a directory")
+    if not seed_examples_path.is_dir():
+        raise ValueError(f"Path to seed examples {seed_examples_path} must be a directory")

-    return ds
+    files = list(seed_examples_path.iterdir())
+    has_qna = any(f.name == 'qna.yaml' for f in files)
+    files = list(chunks_path.iterdir())
+    has_chunks_jsonl = any(f.name == 'chunks.jsonl' for f in files)

-def is_dir_valid(path: str) -> Path:
-    """
-    Returns whether or not a directory contains a qna.yaml and one or more .txt chunks
-    Args:
-        path (str): Path to directory of qna.yaml and chunks
-    Returns:
-        base_path (Path): pathlib.Path to a directory that can create a jsonl
-        of seed data
-    """
-    base_path = Path(path)
-    if not base_path.is_dir():
-        raise ValueError("Base path must be a directory")
+    if not has_qna:
+        raise ValueError(f"Seed examples dir {seed_examples_path} does not contain a qna.yaml")

-    files = list(base_path.iterdir())
-    has_qna = any(f.name == 'qna.yaml' for f in files)
-    has_txt = any(f.suffix == '.txt' for f in files)
-    if not has_qna or not has_txt:
-        raise ValueError("Directory does not contain a qna.yaml and chunks")
+    if not has_chunks_jsonl:
+        raise ValueError(f"Chunks dir {chunks_path} does not contain a chunks.jsonl")

-    return base_path
+    ds = create_dataset_from_dir(chunks_path, seed_examples_path)

-def read_chunks(path: Path) -> Dict[str, str]:
+    return ds
+
+def read_chunks(chunks_path: Path) -> Dict[str, str]:
     """
-    Returns a dictionary with all of the .txt chunks in a directory
+    Returns a dictionary with all of the chunks in a chunks.jsonl
     The chunks may originate from one or more different files
     Args:
-        path (Path): Path to directory of chunks
+        path (Path): Path to directory of chunks in a file called chunks.jsonl
     Returns:
         chunks_dict (Dict[str,str]: Dictionary with key of the original file name
         and a list of chunks as the value
     """
-    chunk_files = path.glob('*.txt')
-
+    chunks_jsonl_path = chunks_path / "chunks.jsonl"
     chunks_dict = {}
-    for file in chunk_files:
-        chunks = []
-        match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name)
-        if match:
-            orig_filename = match.group(1)

-            with file.open('r', encoding='utf-8') as f:
-                chunk = f.read()
+    with open(chunks_jsonl_path, 'r') as file:
+        for line in file:
+            entry = yaml.safe_load(line)
+            orig_filename = entry.get("file")

             if orig_filename not in chunks_dict:
                 chunks_dict[orig_filename] = []
-            chunks_dict[orig_filename].append(chunk)

-        else:
-            print(f"Ignoring .txt file {file}, file name is not the right format")
+            chunks_dict[orig_filename].append(entry.get("chunk"))

     return chunks_dict

-def create_dataset_from_dir(path: Path) -> Dataset:
+def create_dataset_from_dir(chunks_path: Path, seed_examples_path: Path) -> Dataset:
     """
     Process a directory with chunks and a qna.yaml return a dataset.
     Args:
@@ -85,7 +73,7 @@ def create_dataset_from_dir(path: Path) -> Dataset:
     Dataset: Dataset object.
     """

-    qna_yaml_path = path / "qna.yaml"
+    qna_yaml_path = seed_examples_path / "qna.yaml"

     with open(qna_yaml_path, 'r') as f:
         qna_yaml = yaml.safe_load(f)
@@ -94,7 +82,7 @@ def create_dataset_from_dir(path: Path) -> Dataset:
     if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']):
         raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields")

-    chunks_dict = read_chunks(path)
+    chunks_dict = read_chunks(chunks_path)

     datasets = []
     for filename in chunks_dict.keys():
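For orientation, here is a hedged sketch of how the reworked loader might be invoked. The directory layout is hypothetical, and the shape of a chunks.jsonl record is inferred from read_chunks above (it reads entry.get("file") and entry.get("chunk")):

```python
# Hypothetical usage of the new two-directory API; paths are illustrative.
from pathlib import Path

from utils.create_seed_dataset import get_seed_dataset

chunks_dir = Path("workspaces/default/chunks")                # must contain chunks.jsonl
seed_examples_dir = Path("workspaces/default/seed_examples")  # must contain qna.yaml

# Each line of chunks.jsonl is one JSON object, roughly:
#   {"file": "inference-time-scaling.pdf", "chunk": "<chunk text>", "metadata": {...}}
ds = get_seed_dataset(chunks_dir, seed_examples_dir)
print(ds)
```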
Lines changed: 179 additions & 0 deletions

@@ -0,0 +1,179 @@
+import json
+import yaml
+import random
+
+from pathlib import Path
+from pydantic import SecretStr
+from textwrap import wrap
+
+from docling_core.transforms.chunker.hierarchical_chunker import DocChunk, DocMeta
+from docling_sdg.qa.utils import get_qa_chunks
+from docling_sdg.qa.generate import Generator
+from docling_sdg.qa.base import GenerateOptions, LlmProvider
+
+chunk_filter = [
+    lambda chunk: len(str(chunk.text)) > 500
+]
+
+def str_presenter(dumper, data):
+    if len(data.splitlines()) > 1:  # check for multiline string
+        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
+    elif len(data) > 80:
+        data = "\n".join(wrap(data, 80))
+        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
+    return dumper.represent_scalar('tag:yaml.org,2002:str', data)
+
+yaml.add_representer(str, str_presenter)
+
+# to use with safe_dump:
+yaml.representer.SafeRepresenter.add_representer(str, str_presenter)
+
+class IndentedDumper(yaml.Dumper):
+    def increase_indent(self, flow=False, indentless=False):
+        return super(IndentedDumper, self).increase_indent(flow, False)
+
+def generate_seed_examples(contribution_name: str, chunks_jsonl_path: Path, output_dir: Path, domain: str, summary: str, num_seed_examples: int, api_key: str, api_url: str, model_id: str) -> Path:
+    """
+    Generates a qna.yaml of seed examples from a chunks.jsonl file
+    Args:
+        contribution_name (str): Name of the contribution
+        chunks_jsonl_path (Path): Path to the chunks/chunks.jsonl file
+        output_dir (Path): Path to output dir for the qna.yaml and intermediate outputs by docling-sdg
+        domain (str): Domain of the contribution
+        summary (str): Summary of the contribution
+        num_seed_examples (int): Number of seed examples the user wishes to generate for the contribution
+        api_key (str): API key for the model used to generate questions and answers from contexts
+        api_url (str): Endpoint for the model used to generate questions and answers from contexts
+        model_id (str): Name of the model used to generate questions and answers from contexts
+    Returns:
+        qna_output_path (pathlib.Path): Path to the generated seed example file
+    """
+    dataset = {}
+    dataset[contribution_name] = {}
+    dataset[contribution_name]["chunks"] = []
+
+    if not chunks_jsonl_path.exists():
+        raise ValueError(f"chunks.jsonl does not exist but should at {chunks_jsonl_path}")
+
+    docs = []
+
+    # group chunks by the source file they came from
+    with open(chunks_jsonl_path, 'r') as file:
+        for line in file:
+            file_in_docs = False
+            entry = json.loads(line)
+            meta = DocMeta(**entry['metadata'])
+            chunk = DocChunk(text=entry['chunk'], meta=meta)
+            for doc in docs:
+                if doc["file"] == entry['file']:
+                    doc["chunk_objs"].append(chunk)
+                    file_in_docs = True
+                    break
+
+            if not file_in_docs:
+                doc = dict(file=entry['file'], chunk_objs=[chunk])
+                docs.append(doc)
+
+    for doc in docs:
+        print(f"Filtering smaller chunks out of document {doc['file']}")
+
+        qa_chunks = get_qa_chunks(doc["file"], doc["chunk_objs"], chunk_filter)
+        dataset[contribution_name]["chunks"].extend(list(qa_chunks))
+
+    all_chunks = dataset[contribution_name]["chunks"]
+    selected_chunks = random.sample(all_chunks, num_seed_examples)
+
+    generate_options = GenerateOptions(project_id="project_id")
+    generate_options.provider = LlmProvider.OPENAI_LIKE
+    generate_options.api_key = SecretStr(api_key)
+    generate_options.url = api_url
+    generate_options.model_id = model_id
+    generate_options.generated_file = output_dir / f"qagen-{contribution_name}.json"
+    gen = Generator(generate_options=generate_options)
+
+    Path.unlink(generate_options.generated_file, missing_ok=True)
+    results = gen.generate_from_chunks(selected_chunks)  # automatically saves to file
+
+    print(f"Status for Q&A generation for {contribution_name} is: {results.status}")
+
+    qnas = {}
+    chunk_id_to_text = {}
+    with open(generate_options.generated_file, "rt") as f:
+        for line in f.readlines():
+            entry = json.loads(line)
+            chunk_id = entry['chunk_id']
+            if chunk_id not in chunk_id_to_text:
+                chunk_id_to_text[chunk_id] = entry['context']
+            if chunk_id not in qnas:
+                qnas[chunk_id] = []
+            qnas[chunk_id].append({'question': entry['question'], 'answer': entry['answer']})
+
+    qna_output_path = output_dir / "qna.yaml"
+
+    data = {'seed_examples': []}
+    for chunk_id, context in chunk_id_to_text.items():
+        data['seed_examples'].append({
+            'context': context,
+            'questions_and_answers': [
+                {
+                    'question': example['question'],
+                    'answer': example['answer'],
+                } for example in qnas[chunk_id]
+            ]
+        })
+
+    data['document_outline'] = summary
+    data['domain'] = domain
+
+    Path.unlink(qna_output_path, missing_ok=True)  # shouldn't be necessary but was; Jupyter caching?
+    with open(qna_output_path, 'w') as yaml_file:
+        yaml.dump(data, yaml_file, Dumper=IndentedDumper, default_flow_style=False, sort_keys=False, width=80)
+
+    return qna_output_path
+
+def review_seed_examples_file(seed_examples_path: Path, min_seed_examples: int = 5, num_qa_pairs: int = 3) -> None:
+    with open(seed_examples_path, 'r') as yaml_file:
+        yaml_data = yaml.safe_load(yaml_file)
+    errors = []
+    print(f"Reviewing seed examples file at {seed_examples_path.resolve()}")
+
+    # Check for document_outline
+    if 'document_outline' not in yaml_data:
+        errors.append("Missing contribution summary in 'document_outline'")
+    else:
+        # contribution summary is called document_outline internally
+        print("Found contribution summary...")
+
+    # Check for domain
+    if 'domain' not in yaml_data:
+        errors.append("Missing 'domain'")
+    else:
+        print("Found 'domain'...")
+
+    # Check seed_examples
+    seed_examples = yaml_data.get('seed_examples')
+    if not seed_examples:
+        errors.append("'seed_examples' section is missing or empty.")
+    elif len(seed_examples) < min_seed_examples:
+        errors.append(f"'seed_examples' should contain at least {min_seed_examples} examples, found {len(seed_examples)}. Please add {min_seed_examples - len(seed_examples)} more seed example(s)")
+    else:
+        print(f"Found {len(seed_examples)} 'contexts' in 'seed_examples'. Minimum expected number is {min_seed_examples}...")
+
+    if seed_examples:
+        for i, example in enumerate(seed_examples, start=1):
+            qa_pairs = example.get('questions_and_answers')
+            if not qa_pairs:
+                errors.append(f"Seed Example {i} is missing 'questions_and_answers' section.")
+            elif len(qa_pairs) != num_qa_pairs:
+                errors.append(f"Seed Example {i} should contain {num_qa_pairs} question-answer pairs, found {len(qa_pairs)}. Please add {num_qa_pairs - len(qa_pairs)} more question-answer pair(s) to seed example {i}")
+            else:
+                print(f"Seed Example {i} contains expected number ({num_qa_pairs}) of 'questions_and_answers'...")
+
+    if errors:
+        print("\n\033[31mERROR! Seed Examples validation failed with the following issues:\033[0m")
+        for err in errors:
+            print(f"- {err}")
+    else:
+        print(f"Seed Examples YAML {seed_examples_path.resolve()} is valid :)")
+    print("\n")
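Taken together with create_seed_dataset.py, the new helpers suggest a two-step flow: generate a qna.yaml, then validate it. Below is a sketch of that flow under stated assumptions: the module path (the new file's name is not shown in this view), the workspace paths, and the model settings are all illustrative, with the model values borrowed from the CI workflow above:

```python
# Hypothetical driver for the new helpers; module path, workspace layout,
# and model settings are assumptions, not part of the commit.
from pathlib import Path

from utils.generate_seed_examples import generate_seed_examples, review_seed_examples_file

qna_path = generate_seed_examples(
    contribution_name="inference-time-scaling",
    chunks_jsonl_path=Path("workspaces/default/chunks/chunks.jsonl"),
    output_dir=Path("workspaces/default/seed_examples"),
    domain="machine learning",
    summary="Paper on inference-time scaling of large language models",
    num_seed_examples=5,
    api_key="none",
    api_url="http://127.0.0.1:11434/v1",
    model_id="Mixtral-8x7B",
)
review_seed_examples_file(qna_path)  # prints validation results for the generated qna.yaml
```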
