Skip to content

Commit 146b25a

Browse files
committed
Split generate_data into multiple discrete steps
This doesn't move things out into separate files yet, but it does split the existing functionality of `generate_data` into multiple discrete steps and changes `generate_data` to just call those steps. This is a step towards cleaner separation between the steps and creating top-level Python APIs for each discrete step for advanced use-cases that don't just want an entire single-step generation pipeline. Signed-off-by: Ben Browning <bbrownin@redhat.com>
1 parent a80a3f7 commit 146b25a

File tree

8 files changed

+367
-186
lines changed

8 files changed

+367
-186
lines changed

src/instructlab/sdg/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
"FULL_PIPELINES_PACKAGE",
3030
"SIMPLE_PIPELINES_PACKAGE",
3131
"generate_data",
32-
"taxonomy_to_samples",
32+
"preprocess_taxonomy",
3333
)
3434

3535
# Local
@@ -62,6 +62,6 @@
6262
PipelineContext,
6363
)
6464
from .registry import BlockRegistry, PromptRegistry
65-
from .taxonomy import taxonomy_to_samples
65+
from .taxonomy import preprocess_taxonomy
6666
from .utils import GenerateException
6767
from .utils.taxonomy import TaxonomyReadingException

src/instructlab/sdg/cli/taxonomy_to_samples.py renamed to src/instructlab/sdg/cli/preprocess_taxonomy.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
DEFAULT_CHUNK_WORD_COUNT,
99
DEFAULT_SERVER_CTX_SIZE,
1010
DEFAULT_TAXONOMY_BASE,
11-
taxonomy_to_samples,
11+
preprocess_taxonomy,
1212
)
1313
from instructlab.sdg.utils.logging import setup_logger
1414

@@ -68,7 +68,7 @@
6868

6969
args = parser.parse_args()
7070
setup_logger(args.log_level)
71-
taxonomy_to_samples(
71+
preprocess_taxonomy(
7272
args.taxonomy_path,
7373
args.output_dir,
7474
chunk_word_count=args.chunk_word_count,
@@ -78,5 +78,5 @@
7878
)
7979

8080
"""
81-
python -m instructlab.sdg.cli.taxonomy_to_samples --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
81+
python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
8282
"""

src/instructlab/sdg/datamixing.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
160160
Create the final mixed dataset by loading, sampling, and
161161
concatenating all datasets in this recipe
162162
"""
163-
if not self.dataset_added:
163+
if not self.datasets:
164164
logger.error("No dataset added to the recipe")
165165

166166
mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
726726
sampling_size=self.NUM_SYNTH_SKILLS,
727727
)
728728

729+
def _write_mixed_recipe(self, recipe, output_file_recipe):
730+
"""
731+
Write the recipes created during data mixing without writing the actual
732+
mixed datasets to disk.
733+
"""
734+
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
735+
recipe.save_recipe(full_recipe_path)
736+
729737
def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
730738
"""
731739
Mix the generated leaf node data into a single dataset and write it to
732740
disk. The heavy lifting is delegated to the Recipe class.
733741
"""
742+
self._write_mixed_recipe(recipe, output_file_recipe)
734743
if recipe.dataset_added:
735-
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
736-
recipe.save_recipe(full_recipe_path)
737744
recipe.save_mixed_dataset(
738745
os.path.join(self.output_dir, output_file_data),
739746
self.num_procs,
740747
)
741748

749+
def write_recipes(self):
750+
self._write_mixed_recipe(
751+
self.knowledge_recipe,
752+
self.output_file_knowledge_recipe,
753+
)
754+
self._write_mixed_recipe(
755+
self.skills_recipe,
756+
self.output_file_skills_recipe,
757+
)
758+
742759
def generate(self):
743760
self._gen_mixed_data(
744761
self.knowledge_recipe,

0 commit comments

Comments
 (0)