33
44from functools import partial
55import logging
6+ from pathlib import Path
67import random
78from typing import Callable
89
1617)
1718from data_designer .engine .dataset_builders .multi_column_configs import SamplerMultiColumnConfig
1819from data_designer .engine .processing .utils import concat_datasets
20+ from data_designer .engine .resources .errors import ManagedAssetMissingError , ManagedAssetsPathNotSetError
1921from data_designer .engine .resources .managed_dataset_generator import ManagedDatasetGenerator
20- from data_designer .engine .resources .resource_provider import ResourceType
2122from data_designer .engine .sampling_gen .data_sources .sources import SamplerType
2223from data_designer .engine .sampling_gen .entities .person import load_person_data_sampler
2324from data_designer .engine .sampling_gen .generator import DatasetGenerator as SamplingDatasetGenerator
@@ -32,7 +33,7 @@ def metadata() -> GeneratorMetadata:
3233 name = "sampler_column_generator" ,
3334 description = "Generate columns using sampling-based method." ,
3435 generation_strategy = GenerationStrategy .FULL_COLUMN ,
35- required_resources = [ ResourceType . BLOB_STORAGE ] ,
36+ required_resources = None ,
3637 )
3738
3839 def generate (self , data : pd .DataFrame ) -> pd .DataFrame :
@@ -52,7 +53,32 @@ def _needs_person_generator(self) -> bool:
5253 def _person_generator_loader (self ) -> Callable [[bool ], ManagedDatasetGenerator ]:
5354 return partial (load_person_data_sampler , blob_storage = self .resource_provider .blob_storage )
5455
56+ def _check_managed_assets_exist_if_needed (self ) -> None :
57+ if self ._needs_person_generator :
58+ if (
59+ self .resource_provider .blob_storage is None
60+ or not self .resource_provider .blob_storage .root_path .exists ()
61+ ):
62+ raise ManagedAssetsPathNotSetError (
63+ "🛑 The managed assets path does not exist. If you are using the Person Sampler, "
64+ "You must have a managed assets directory that contains the Nemotron-Personas dataset "
65+ "for each locale you want to sample from."
66+ )
67+ is_missing = []
68+ for c in [c for c in self .config .columns if c .sampler_type == SamplerType .PERSON ]:
69+ locale_file_path = self .resource_provider .blob_storage .root_path / f"datasets/{ c .params .locale } .parquet"
70+ if not Path (locale_file_path ).exists () or not Path (locale_file_path ).is_file ():
71+ is_missing .append ([c .params .locale , locale_file_path ])
72+ if len (is_missing ) > 0 :
73+ raise ManagedAssetMissingError (
74+ "🛑 The Nemotron-Personas dataset is missing for the following locales: "
75+ f"{ ', ' .join ([f'{ locale } ' for locale , _ in is_missing ])} . "
76+ "Please ensure the files exist at the following paths: "
77+ f"{ ', ' .join ([f'{ str (file_path )!r} ' for _ , file_path in is_missing ])} "
78+ )
79+
5580 def _create_sampling_dataset_generator (self ) -> SamplingDatasetGenerator :
81+ self ._check_managed_assets_exist_if_needed ()
5682 return SamplingDatasetGenerator (
5783 sampler_columns = self .config ,
5884 person_generator_loader = (self ._person_generator_loader if self ._needs_person_generator else None ),
0 commit comments