
Commit 7cba674

chore: split person samplers and use parameters in sql exec (#48)
* split person samplers
* pass catalogs
* remove locale validation on dataset-based person sampler
* use parameters in sql execution
* not using that
* update tests
* add temp assets path
* update error when too few samples exist
* update error message name
* add expected params to test
1 parent cb0b1c6 commit 7cba674

33 files changed (+310, -517 lines)

pyproject.toml

Lines changed: 0 additions & 2 deletions
@@ -34,7 +34,6 @@ dependencies = [
     "rich>=13.7.1",
     "typer>=0.12.0",
     "anyascii>=0.3.3,<1.0",
-    "boto3==1.35.74",
     "datasets>=4.0.0",
     "duckdb==1.1.3",
     "faker==20.1.0",
@@ -48,7 +47,6 @@ dependencies = [
     "networkx==3.0",
     "pydantic[email]>=2.9.2",
     "scipy>=1.11.0",
-    "smart-open==7.0.5",
     "sqlfluff==3.2.0",
     "tiktoken>=0.8.0",
     "ruff==0.12.3",

src/data_designer/cli/commands/list.py

Lines changed: 4 additions & 4 deletions
@@ -6,7 +6,7 @@
 from data_designer.cli.repositories.model_repository import ModelRepository
 from data_designer.cli.repositories.provider_repository import ProviderRepository
 from data_designer.cli.ui import console, print_error, print_header, print_info, print_warning
-from data_designer.config.utils.constants import DATA_DESIGNER_HOME_DIR, NordColor
+from data_designer.config.utils.constants import DATA_DESIGNER_HOME, NordColor
 
 
 def list_command() -> None:
@@ -17,12 +17,12 @@ def list_command() -> None:
     """
     # Determine config directory
     print_header("Data Designer Configurations")
-    print_info(f"Configuration directory: {DATA_DESIGNER_HOME_DIR}")
+    print_info(f"Configuration directory: {DATA_DESIGNER_HOME}")
     console.print()
 
     # Display providers
-    display_providers(ProviderRepository(DATA_DESIGNER_HOME_DIR))
-    display_models(ModelRepository(DATA_DESIGNER_HOME_DIR))
+    display_providers(ProviderRepository(DATA_DESIGNER_HOME))
+    display_models(ModelRepository(DATA_DESIGNER_HOME))
 
 
 def display_providers(provider_repo: ProviderRepository) -> None:

src/data_designer/cli/commands/models.py

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from data_designer.cli.controllers.model_controller import ModelController
-from data_designer.config.utils.constants import DATA_DESIGNER_HOME_DIR
+from data_designer.config.utils.constants import DATA_DESIGNER_HOME
 
 
 def models_command() -> None:
-    controller = ModelController(DATA_DESIGNER_HOME_DIR)
+    controller = ModelController(DATA_DESIGNER_HOME)
     controller.run()

src/data_designer/cli/commands/providers.py

Lines changed: 2 additions & 2 deletions
@@ -2,10 +2,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from data_designer.cli.controllers.provider_controller import ProviderController
-from data_designer.config.utils.constants import DATA_DESIGNER_HOME_DIR
+from data_designer.config.utils.constants import DATA_DESIGNER_HOME
 
 
 def providers_command() -> None:
     """Configure model providers interactively."""
-    controller = ProviderController(DATA_DESIGNER_HOME_DIR)
+    controller = ProviderController(DATA_DESIGNER_HOME)
     controller.run()

src/data_designer/cli/commands/reset.py

Lines changed: 4 additions & 4 deletions
@@ -14,20 +14,20 @@
     print_success,
     print_text,
 )
-from data_designer.config.utils.constants import DATA_DESIGNER_HOME_DIR
+from data_designer.config.utils.constants import DATA_DESIGNER_HOME
 
 
 def reset_command() -> None:
     """Reset configuration files by deleting them after confirmation."""
     print_header("Reset Configuration")
 
     # Determine configuration directory
-    print_info(f"Configuration directory: {DATA_DESIGNER_HOME_DIR}")
+    print_info(f"Configuration directory: {DATA_DESIGNER_HOME}")
     console.print()
 
     # Create repositories
-    provider_repo = ProviderRepository(DATA_DESIGNER_HOME_DIR)
-    model_repo = ModelRepository(DATA_DESIGNER_HOME_DIR)
+    provider_repo = ProviderRepository(DATA_DESIGNER_HOME)
+    model_repo = ModelRepository(DATA_DESIGNER_HOME)
 
     # Check which config files exist
     provider_exists = provider_repo.exists()

src/data_designer/config/sampler_params.py

Lines changed: 78 additions & 38 deletions
@@ -15,7 +15,6 @@
     LOCALES_WITH_MANAGED_DATASETS,
     MAX_AGE,
     MIN_AGE,
-    US_STATES_AND_MAJOR_TERRITORIES,
 )
 
 
@@ -27,6 +26,7 @@ class SamplerType(str, Enum):
     DATETIME = "datetime"
     GAUSSIAN = "gaussian"
     PERSON = "person"
+    PERSON_FROM_FAKER = "person_from_faker"
     POISSON = "poisson"
     SCIPY = "scipy"
     SUBCATEGORY = "subcategory"
@@ -219,8 +219,10 @@ class PersonSamplerParams(ConfigBase):
     locale: str = Field(
         default="en_US",
         description=(
-            "Locale string, determines the language and geographic locale "
-            "that a synthetic person will be sampled from. E.g, en_US, en_GB, fr_FR, ..."
+            "Locale that determines the language and geographic location "
+            "that a synthetic person will be sampled from. Must be a locale supported by "
+            "a managed Nemotron Personas dataset. Managed datasets exist for the following locales: "
+            f"{', '.join(LOCALES_WITH_MANAGED_DATASETS)}."
         ),
     )
     sex: Optional[SexT] = Field(
@@ -237,36 +239,96 @@ class PersonSamplerParams(ConfigBase):
         min_length=2,
         max_length=2,
     )
-
-    state: Optional[Union[str, list[str]]] = Field(
+    select_field_values: Optional[dict[str, list[str]]] = Field(
         default=None,
         description=(
-            "Only supported for 'en_US' locale. If specified, then only synthetic people "
-            "from these states will be sampled. States must be given as two-letter abbreviations."
+            "Sample synthetic people with the specified field values. This is meant to be a flexible argument for "
+            "selecting a subset of the population from the managed dataset. Note that this sampler does not support "
+            "rare combinations of field values and will likely fail if your desired subset is not well-represented "
+            "in the managed Nemotron Personas dataset. We generally recommend using the `sex`, `city`, and `age_range` "
+            "arguments to filter the population when possible."
         ),
+        examples=[
+            {"state": ["NY", "CA", "OH", "TX", "NV"], "education_level": ["high_school", "some_college", "bachelors"]}
+        ],
     )
 
     with_synthetic_personas: bool = Field(
         default=False,
         description="If True, then append synthetic persona columns to each generated person.",
     )
 
-    sample_dataset_when_available: bool = Field(
-        default=True,
-        description="If True, sample person data from managed dataset when available. Otherwise, use Faker.",
+    @property
+    def generator_kwargs(self) -> list[str]:
+        """Keyword arguments to pass to the person generator."""
+        return [f for f in list(PersonSamplerParams.model_fields) if f != "locale"]
+
+    @property
+    def people_gen_key(self) -> str:
+        return f"{self.locale}_with_personas" if self.with_synthetic_personas else self.locale
+
+    @field_validator("age_range")
+    @classmethod
+    def _validate_age_range(cls, value: list[int]) -> list[int]:
+        msg_prefix = "'age_range' must be a list of two integers, representing the min and max age."
+        if value[0] < MIN_AGE:
+            raise ValueError(
+                f"{msg_prefix} The first integer (min age) must be greater than or equal to {MIN_AGE}, "
+                f"but the first integer provided was {value[0]}."
+            )
+        if value[1] > MAX_AGE:
+            raise ValueError(
+                f"{msg_prefix} The second integer (max age) must be less than or equal to {MAX_AGE}, "
+                f"but the second integer provided was {value[1]}."
+            )
+        if value[0] >= value[1]:
+            raise ValueError(
+                f"{msg_prefix} The first integer (min age) must be less than the second integer (max age), "
+                f"but the first integer provided was {value[0]} and the second integer provided was {value[1]}."
+            )
+        return value
+
+    @model_validator(mode="after")
+    def _validate_locale_with_managed_datasets(self) -> Self:
+        if self.locale not in LOCALES_WITH_MANAGED_DATASETS:
+            raise ValueError(
+                "Person sampling from managed datasets is only supported for the following "
+                f"locales: {', '.join(LOCALES_WITH_MANAGED_DATASETS)}."
+            )
+        return self
+
+
+class PersonFromFakerSamplerParams(ConfigBase):
+    locale: str = Field(
+        default="en_US",
+        description=(
+            "Locale string, determines the language and geographic locale "
+            "that a synthetic person will be sampled from. E.g, en_US, en_GB, fr_FR, ..."
+        ),
+    )
+    sex: Optional[SexT] = Field(
+        default=None,
+        description="If specified, then only synthetic people of the specified sex will be sampled.",
+    )
+    city: Optional[Union[str, list[str]]] = Field(
+        default=None,
+        description="If specified, then only synthetic people from these cities will be sampled.",
+    )
+    age_range: list[int] = Field(
+        default=DEFAULT_AGE_RANGE,
+        description="If specified, then only synthetic people within this age range will be sampled.",
+        min_length=2,
+        max_length=2,
     )
 
     @property
     def generator_kwargs(self) -> list[str]:
         """Keyword arguments to pass to the person generator."""
-        return [f for f in list(PersonSamplerParams.model_fields) if f != "locale"]
+        return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f != "locale"]
 
     @property
     def people_gen_key(self) -> str:
-        if self.locale in LOCALES_WITH_MANAGED_DATASETS and self.sample_dataset_when_available:
-            return f"{self.locale}_with_personas" if self.with_synthetic_personas else self.locale
-        else:
-            return f"{self.locale}_faker"
+        return f"{self.locale}_faker"
 
     @field_validator("age_range")
     @classmethod
@@ -298,35 +360,13 @@ def _validate_locale(cls, value: str) -> str:
             )
         return value
 
-    @model_validator(mode="after")
-    def _validate_state(self) -> Self:
-        if self.state is not None:
-            orig_state_value = self.state
-            if self.locale != "en_US":
-                raise ValueError("'state' is only supported for 'en_US' locale.")
-            if not isinstance(self.state, list):
-                self.state = [self.state]
-            self.state = [state.upper() for state in self.state]
-            for state in self.state:
-                if state not in US_STATES_AND_MAJOR_TERRITORIES:
-                    raise ValueError(f"State {orig_state_value!r} is not a supported state.")
-        return self
-
-    @model_validator(mode="after")
-    def _validate_with_synthetic_personas(self) -> Self:
-        if self.with_synthetic_personas and self.locale not in LOCALES_WITH_MANAGED_DATASETS:
-            raise ValueError(
-                "'with_synthetic_personas' is only supported for the following "
-                f"locales: {', '.join(LOCALES_WITH_MANAGED_DATASETS)}."
-            )
-        return self
-
 
 SamplerParamsT: TypeAlias = Union[
     SubcategorySamplerParams,
     CategorySamplerParams,
     DatetimeSamplerParams,
     PersonSamplerParams,
+    PersonFromFakerSamplerParams,
     TimeDeltaSamplerParams,
     UUIDSamplerParams,
     BernoulliSamplerParams,
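
For orientation, here is a minimal usage sketch of the split sampler params. It assumes the data_designer package is installed and that both classes are importable from data_designer.config.sampler_params (matching the file path above); the field values are purely illustrative.

# Hedged usage sketch of the two person samplers introduced in this commit.
import pydantic

from data_designer.config.sampler_params import (
    PersonFromFakerSamplerParams,
    PersonSamplerParams,
)

# Dataset-backed sampler: the locale must be covered by a managed Nemotron Personas
# dataset, and select_field_values narrows the sampled subset of the population.
dataset_params = PersonSamplerParams(
    locale="en_US",
    age_range=[25, 65],
    select_field_values={"state": ["NY", "CA"]},
)
print(dataset_params.people_gen_key)  # "en_US" (or "en_US_with_personas")

# Faker-backed sampler: no managed-dataset locale restriction; key is suffixed with "_faker".
faker_params = PersonFromFakerSamplerParams(locale="fr_FR", age_range=[18, 80])
print(faker_params.people_gen_key)  # "fr_FR_faker"

# The shared age_range validator rejects out-of-order bounds.
try:
    PersonFromFakerSamplerParams(age_range=[70, 30])
except pydantic.ValidationError as err:
    print(err)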

src/data_designer/config/utils/constants.py

Lines changed: 8 additions & 4 deletions
@@ -260,17 +260,21 @@ class NordColor(Enum):
     "zu_ZA",
 ]
 
-DATA_DESIGNER_HOME_DIR_ENV_VAR = "DATA_DESIGNER_HOME_DIR"
+DATA_DESIGNER_HOME_ENV_VAR = "DATA_DESIGNER_HOME"
 
-DATA_DESIGNER_HOME_DIR = Path(os.getenv(DATA_DESIGNER_HOME_DIR_ENV_VAR, Path.home() / ".data-designer"))
+DATA_DESIGNER_HOME = Path(os.getenv(DATA_DESIGNER_HOME_ENV_VAR, Path.home() / ".data-designer"))
+
+MANAGED_ASSETS_PATH_ENV_VAR = "DATA_DESIGNER_MANAGED_ASSETS_PATH"
+
+MANAGED_ASSETS_PATH = Path(os.getenv(MANAGED_ASSETS_PATH_ENV_VAR, DATA_DESIGNER_HOME / "managed-assets"))
 
 MODEL_CONFIGS_FILE_NAME = "model_configs.yaml"
 
-MODEL_CONFIGS_FILE_PATH = DATA_DESIGNER_HOME_DIR / MODEL_CONFIGS_FILE_NAME
+MODEL_CONFIGS_FILE_PATH = DATA_DESIGNER_HOME / MODEL_CONFIGS_FILE_NAME
 
 MODEL_PROVIDERS_FILE_NAME = "model_providers.yaml"
 
-MODEL_PROVIDERS_FILE_PATH = DATA_DESIGNER_HOME_DIR / MODEL_PROVIDERS_FILE_NAME
+MODEL_PROVIDERS_FILE_PATH = DATA_DESIGNER_HOME / MODEL_PROVIDERS_FILE_NAME
 
 NVIDIA_PROVIDER_NAME = "nvidia"
 
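
Below is a self-contained sketch (standard library only) of the environment-variable override pattern these constants follow: the variable wins when set, otherwise the path falls back to a default rooted under the Data Designer home directory. The variable names are copied from the diff above; the printed paths depend on your environment.

import os
from pathlib import Path

DATA_DESIGNER_HOME_ENV_VAR = "DATA_DESIGNER_HOME"
DATA_DESIGNER_HOME = Path(os.getenv(DATA_DESIGNER_HOME_ENV_VAR, Path.home() / ".data-designer"))

MANAGED_ASSETS_PATH_ENV_VAR = "DATA_DESIGNER_MANAGED_ASSETS_PATH"
MANAGED_ASSETS_PATH = Path(os.getenv(MANAGED_ASSETS_PATH_ENV_VAR, DATA_DESIGNER_HOME / "managed-assets"))

if __name__ == "__main__":
    # e.g. run with: DATA_DESIGNER_MANAGED_ASSETS_PATH=/tmp/assets python sketch.py
    print(f"home:   {DATA_DESIGNER_HOME}")
    print(f"assets: {MANAGED_ASSETS_PATH}")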

src/data_designer/engine/resources/managed_dataset_generator.py

Lines changed: 4 additions & 6 deletions
@@ -17,21 +17,19 @@ def generate_samples(
         self,
         size: int = 1,
         evidence: dict[str, Any | list[Any]] = {},
-        seed: int | None = None,
     ) -> pd.DataFrame:
+        parameters = []
         query = f"select * from {self.dataset_name}"
-        # Build the WHERE clause if there are filters
-        # NOTE: seed is not used because it's not straightforward
-        # to make randomization both fast and repeatable
         if evidence:
             where_conditions = []
             for column, values in evidence.items():
                 if values:
                     values = values if isinstance(values, list) else [values]
-                    formatted_values = [f"'{val}'" for val in values]
+                    formatted_values = ["?"] * len(values)
                     condition = f"{column} IN ({', '.join(formatted_values)})"
                     where_conditions.append(condition)
+                    parameters.extend(values)
             if where_conditions:
                 query += " where " + " and ".join(where_conditions)
         query += f" order by random() limit {size}"
-        return self.managed_datasets.query(query)
+        return self.managed_datasets.query(query, parameters)
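
The sketch below shows the same parameterized-query pattern the generator now uses in isolation: one "?" placeholder per filter value, with the values passed to DuckDB separately instead of being quoted into the SQL string. The "people" table and its columns are made up for illustration; only column names and the limit are interpolated, never values.

import duckdb
import pandas as pd

con = duckdb.connect()
con.execute(
    "CREATE TABLE people AS "
    "SELECT * FROM (VALUES ('Ada', 'NY', 'Female'), ('Bob', 'CA', 'Male'), ('Cyd', 'TX', 'Female')) "
    "AS t(name, state, sex)"
)


def sample_people(evidence: dict[str, list[str]], size: int = 2) -> pd.DataFrame:
    parameters: list[str] = []
    query = "select * from people"
    where_conditions = []
    for column, values in evidence.items():
        if values:
            placeholders = ", ".join(["?"] * len(values))  # one placeholder per value
            where_conditions.append(f"{column} IN ({placeholders})")
            parameters.extend(values)
    if where_conditions:
        query += " where " + " and ".join(where_conditions)
    query += f" order by random() limit {size}"
    return con.execute(query, parameters).df()


print(sample_people({"state": ["NY", "CA"], "sex": ["Female"]}))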

src/data_designer/engine/resources/managed_dataset_repository.py

Lines changed: 8 additions & 6 deletions
@@ -9,6 +9,7 @@
 import tempfile
 import threading
 import time
+from typing import Any
 
 import duckdb
 import pandas as pd
@@ -60,7 +61,7 @@ def name(self) -> str:
 
 class ManagedDatasetRepository(ABC):
     @abstractmethod
-    def query(self, sql: str) -> pd.DataFrame: ...
+    def query(self, sql: str, parameters: list[Any]) -> pd.DataFrame: ...
 
     @property
     @abstractmethod
@@ -129,7 +130,7 @@ def _register_datasets(self):
         for table in self.data_catalog:
             key = table.source if table.schema == "main" else f"{table.schema}/{table.source}"
             if self._use_cache:
-                tmp_root = Path(tempfile.gettempdir()) / "gretel_ds_cache"
+                tmp_root = Path(tempfile.gettempdir()) / "dd_cache"
                 local_path = tmp_root / key
                 local_path.parent.mkdir(parents=True, exist_ok=True)
                 if not local_path.exists():
@@ -160,7 +161,7 @@ def _register_datasets(self):
         # Signal that registration is complete so any waiting queries can proceed.
         self._registration_event.set()
 
-    def query(self, sql: str) -> pd.DataFrame:
+    def query(self, sql: str, parameters: list[Any]) -> pd.DataFrame:
         # Ensure dataset registration has completed. Possible future optimization:
         # pull datasets in parallel and only wait here if the query requires a
         # table that isn't cached.
@@ -173,7 +174,7 @@ def query(self, sql: str) -> pd.DataFrame:
         # more details here: https://duckdb.org/docs/stable/guides/python/multiple_threads.html
         cursor = self.db.cursor()
         try:
-            df = cursor.sql(sql).df()
+            df = cursor.execute(sql, parameters).df()
         finally:
             cursor.close()
         return df
@@ -183,10 +184,11 @@ def data_catalog(self) -> DataCatalog:
         return self._data_catalog
 
 
-def load_managed_dataset_repository(blob_storage: ManagedBlobStorage) -> ManagedDatasetRepository:
+def load_managed_dataset_repository(blob_storage: ManagedBlobStorage, locales: list[str]) -> ManagedDatasetRepository:
     return DuckDBDatasetRepository(
         blob_storage,
-        {"threads": 1, "memory_limit": "2 gb"},
+        config={"threads": 1, "memory_limit": "2 gb"},
+        data_catalog=[Table(f"{locale}.parquet") for locale in locales],
         # Only cache if not using local storage.
         use_cache=not isinstance(blob_storage, LocalBlobStorageProvider),
     )
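
For context, a small sketch of the cursor-per-thread pattern the repository relies on (see the DuckDB multi-threading guide linked in the diff), combined with parameterized execution: one shared connection, a fresh cursor per querying thread, and values passed to execute() rather than spliced into the SQL. The "en_US" table here is a stand-in for a registered managed dataset.

import threading

import duckdb
import pandas as pd

db = duckdb.connect()
db.execute("CREATE TABLE en_US AS SELECT * FROM (VALUES ('Ada', 34), ('Bob', 58)) AS t(name, age)")


def query(sql: str, parameters: list) -> pd.DataFrame:
    cursor = db.cursor()  # each thread gets its own cursor; the parent connection is shared
    try:
        df = cursor.execute(sql, parameters).df()
    finally:
        cursor.close()
    return df


def worker(min_age: int) -> None:
    df = query("select * from en_US where age >= ? order by random() limit 1", [min_age])
    print(df.to_dict(orient="records"))


threads = [threading.Thread(target=worker, args=(age,)) for age in (30, 50)]
for t in threads:
    t.start()
for t in threads:
    t.join()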
