|
| 1 | +# --- |
| 2 | +# jupyter: |
| 3 | +# jupytext: |
| 4 | +# text_representation: |
| 5 | +# extension: .py |
| 6 | +# format_name: percent |
| 7 | +# format_version: '1.3' |
| 8 | +# jupytext_version: 1.18.1 |
| 9 | +# kernelspec: |
| 10 | +# display_name: .venv |
| 11 | +# language: python |
| 12 | +# name: python3 |
| 13 | +# --- |
| 14 | + |
| 15 | +# %% [markdown] |
| 16 | +# # 🎨 Data Designer 101: The Basics |
| 17 | +# |
| 18 | +# [Click here](https://raw.githubusercontent.com/NVIDIA-NeMo/DataDesigner/refs/heads/main/docs/notebooks/1-the-basics.ipynb) to download this notebook to your computer. |
| 19 | +# |
| 20 | +# #### 📚 What you'll learn |
| 21 | +# |
| 22 | +# This notebook demonstrates the basics of Data Designer by generating a simple product review dataset. |
| 23 | +# |
| 24 | + |
| 25 | +# %% [markdown] |
| 26 | +# ### 📦 Import the essentials |
| 27 | +# |
| 28 | +# - The `essentials` module provides quick access to the most commonly used objects. |
| 29 | +# |
| 30 | + |
| 31 | +# %% |
| 32 | +from data_designer.essentials import ( |
| 33 | + CategorySamplerParams, |
| 34 | + DataDesigner, |
| 35 | + DataDesignerConfigBuilder, |
| 36 | + InferenceParameters, |
| 37 | + LLMTextColumnConfig, |
| 38 | + ModelConfig, |
| 39 | + PersonFromFakerSamplerParams, |
| 40 | + SamplerColumnConfig, |
| 41 | + SamplerType, |
| 42 | + SubcategorySamplerParams, |
| 43 | + UniformSamplerParams, |
| 44 | +) |
| 45 | + |
| 46 | +# %% [markdown] |
| 47 | +# ### ⚙️ Initialize the Data Designer interface |
| 48 | +# |
| 49 | +# - `DataDesigner` is the main object responsible for managing the data generation process. |
| 50 | +# |
| 51 | +# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/models/default-model-settings/) are used. |
| 52 | +# |
| 53 | + |
| 54 | +# %% |
# No arguments are passed, so the default model providers are used.
data_designer = DataDesigner()
| 56 | + |
| 57 | +# %% [markdown] |
| 58 | +# ### 🎛️ Define model configurations |
| 59 | +# |
| 60 | +# - Each `ModelConfig` defines a model that can be used during the generation process. |
| 61 | +# |
| 62 | +# - The "model alias" is used to reference the model in the Data Designer config (as we will see below). |
| 63 | +# |
| 64 | +# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/models/default-model-settings/) docs for more details). |
| 65 | +# |
| 66 | +# - By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. |
| 67 | +# |
| 68 | + |
| 69 | +# %% |
# Provider name, as set in the model provider configuration.
MODEL_PROVIDER = "nvidia"

# Model identifier taken from build.nvidia.com.
MODEL_ID = "nvidia/nvidia-nemotron-nano-9b-v2"

# Descriptive alias used to refer to this model from column configs.
MODEL_ALIAS = "nemotron-nano-v2"

# System prompt that sets reasoning to False for the nemotron-nano-v2 model.
SYSTEM_PROMPT = "/no_think"

# Sampling settings attached to the model config below.
inference_parameters = InferenceParameters(temperature=0.5, top_p=1.0, max_tokens=1024)

nemotron_nano = ModelConfig(
    alias=MODEL_ALIAS,
    model=MODEL_ID,
    provider=MODEL_PROVIDER,
    inference_parameters=inference_parameters,
)

# The config builder expects a list of model configs.
model_configs = [nemotron_nano]
| 94 | + |
| 95 | +# %% [markdown] |
| 96 | +# ### 🏗️ Initialize the Data Designer Config Builder |
| 97 | +# |
| 98 | +# - The Data Designer config defines the dataset schema and generation process. |
| 99 | +# |
| 100 | +# - The config builder provides an intuitive interface for building this configuration. |
| 101 | +# |
| 102 | +# - The list of model configs is provided to the builder at initialization. |
| 103 | +# |
| 104 | + |
| 105 | +# %% |
# Hand the model configs to the builder up front; columns refer to them by alias.
config_builder = DataDesignerConfigBuilder(model_configs=model_configs)
| 107 | + |
| 108 | +# %% [markdown] |
| 109 | +# ## 🎲 Getting started with sampler columns |
| 110 | +# |
| 111 | +# - Sampler columns offer non-LLM based generation of synthetic data. |
| 112 | +# |
| 113 | +# - They are particularly useful for **steering the diversity** of the generated data, as we demonstrate below. |
| 114 | +# |
| 115 | +# <br> |
| 116 | +# |
| 117 | +# You can view available samplers using the config builder's `info` property: |
| 118 | +# |
| 119 | + |
| 120 | +# %% |
# Show the samplers that are available through the config builder.
config_builder.info.display("samplers")
| 122 | + |
| 123 | +# %% [markdown] |
| 124 | +# Let's start designing our product review dataset by adding product category and subcategory columns. |
| 125 | +# |
| 126 | + |
| 127 | +# %% |
# Two-level product taxonomy: each category maps to its list of subcategories.
product_taxonomy = {
    "Electronics": [
        "Smartphones",
        "Laptops",
        "Headphones",
        "Cameras",
        "Accessories",
    ],
    "Clothing": [
        "Men's Clothing",
        "Women's Clothing",
        "Winter Coats",
        "Activewear",
        "Accessories",
    ],
    "Home & Kitchen": [
        "Appliances",
        "Cookware",
        "Furniture",
        "Decor",
        "Organization",
    ],
    "Books": [
        "Fiction",
        "Non-Fiction",
        "Self-Help",
        "Textbooks",
        "Classics",
    ],
    "Home Office": [
        "Desks",
        "Chairs",
        "Storage",
        "Office Supplies",
        "Lighting",
    ],
}

# Top-level category column; its values are the taxonomy keys in declaration order.
category_column = SamplerColumnConfig(
    name="product_category",
    sampler_type=SamplerType.CATEGORY,
    params=CategorySamplerParams(values=list(product_taxonomy)),
)
config_builder.add_column(category_column)

# Subcategory column, conditioned on the `product_category` column.
subcategory_column = SamplerColumnConfig(
    name="product_subcategory",
    sampler_type=SamplerType.SUBCATEGORY,
    params=SubcategorySamplerParams(
        category="product_category",
        values=product_taxonomy,
    ),
)
config_builder.add_column(subcategory_column)
| 190 | + |
# Target age bracket for the product. The labels are mutually exclusive so that
# every age maps to exactly one bracket (the earlier labels shared their
# boundary ages, e.g. 25 appeared in both "18-25" and "25-35").
config_builder.add_column(
    SamplerColumnConfig(
        name="target_age_range",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(values=["18-25", "26-35", "36-50", "51-65", "66+"]),
    )
)

# Optionally validate that the columns are configured correctly.
config_builder.validate()
| 201 | + |
| 202 | +# %% [markdown] |
| 203 | +# Next, let's add samplers to generate data related to the customer and their review. |
| 204 | +# |
| 205 | + |
| 206 | +# %% |
# Synthetic customer persona; its first_name, city, state and age fields are
# referenced from the review prompt.
customer_column = SamplerColumnConfig(
    name="customer",
    sampler_type=SamplerType.PERSON_FROM_FAKER,
    params=PersonFromFakerSamplerParams(age_range=[18, 70], locale="en_US"),
)

# Star rating: a uniform sample between 1 and 5, converted to an integer.
# NOTE(review): whether the conversion rounds or truncates decides how often the
# endpoints 1 and 5 appear — confirm against the sampler documentation.
stars_column = SamplerColumnConfig(
    name="number_of_stars",
    sampler_type=SamplerType.UNIFORM,
    params=UniformSamplerParams(low=1, high=5),
    convert_to="int",
)

# Review style; "brief" and "detailed" are given twice the weight of the others.
style_column = SamplerColumnConfig(
    name="review_style",
    sampler_type=SamplerType.CATEGORY,
    params=CategorySamplerParams(
        values=["rambling", "brief", "detailed", "structured with bullet points"],
        weights=[1, 2, 2, 1],
    ),
)

# Register the columns in the same order as they are declared above.
for column in (customer_column, stars_column, style_column):
    config_builder.add_column(column)

config_builder.validate()
| 236 | + |
| 237 | +# %% [markdown] |
| 238 | +# ## 🦜 LLM-generated columns |
| 239 | +# |
| 240 | +# - The real power of Data Designer comes from leveraging LLMs to generate text, code, and structured data. |
| 241 | +# |
| 242 | +# - When prompting the LLM, we can use Jinja templating to reference other columns in the dataset. |
| 243 | +# |
| 244 | +# - As we see below, nested JSON fields can be accessed using dot notation. |
| 245 | +# |
| 246 | + |
| 247 | +# %% |
# Prompt for the product name; the Jinja placeholders pull in the sampled
# category, subcategory and target age range columns.
product_name_prompt = (
    "You are a helpful assistant that generates product names. DO NOT add quotes around the product name.\n\n"
    "Come up with a creative product name for a product in the '{{ product_category }}' category, focusing "
    "on products related to '{{ product_subcategory }}'. The target age range of the ideal customer is "
    "{{ target_age_range }} years old. Respond with only the product name, no other text."
)

# Prompt for the review; nested fields of the `customer` column are reached
# with dot notation, and `product_name` refers to the LLM column defined here.
customer_review_prompt = (
    "You are a customer named {{ customer.first_name }} from {{ customer.city }}, {{ customer.state }}. "
    "You are {{ customer.age }} years old and recently purchased a product called {{ product_name }}. "
    "Write a review of this product, which you gave a rating of {{ number_of_stars }} stars. "
    "The style of the review should be '{{ review_style }}'."
)

product_name_column = LLMTextColumnConfig(
    name="product_name",
    prompt=product_name_prompt,
    system_prompt=SYSTEM_PROMPT,
    model_alias=MODEL_ALIAS,
)
config_builder.add_column(product_name_column)

customer_review_column = LLMTextColumnConfig(
    name="customer_review",
    prompt=customer_review_prompt,
    system_prompt=SYSTEM_PROMPT,
    model_alias=MODEL_ALIAS,
)
config_builder.add_column(customer_review_column)

config_builder.validate()
| 277 | + |
| 278 | +# %% [markdown] |
| 279 | +# ### 🔁 Iteration is key – preview the dataset! |
| 280 | +# |
| 281 | +# 1. Use the `preview` method to generate a sample of records quickly. |
| 282 | +# |
| 283 | +# 2. Inspect the results for quality and format issues. |
| 284 | +# |
| 285 | +# 3. Adjust column configurations, prompts, or parameters as needed. |
| 286 | +# |
| 287 | +# 4. Re-run the preview until satisfied. |
| 288 | +# |
| 289 | + |
| 290 | +# %% |
# Generate a sample of records from the current configuration for quick inspection.
preview = data_designer.preview(config_builder)

# %%
# Run this cell multiple times to cycle through the 10 preview records.
preview.display_sample_record()

# %%
# The preview dataset is available as a pandas DataFrame.
preview.dataset
| 300 | + |
| 301 | +# %% [markdown] |
| 302 | +# ### 📊 Analyze the generated data |
| 303 | +# |
| 304 | +# - Data Designer automatically generates a basic statistical analysis of the generated data. |
| 305 | +# |
| 306 | +# - This analysis is available via the `analysis` property of generation result objects. |
| 307 | +# |
| 308 | + |
| 309 | +# %% |
# Print the statistical analysis of the preview data as a table.
preview.analysis.to_report()
| 312 | + |
| 313 | +# %% [markdown] |
| 314 | +# ### 🆙 Scale up! |
| 315 | +# |
| 316 | +# - Happy with your preview data? |
| 317 | +# |
| 318 | +# - Use the `create` method to submit larger Data Designer generation jobs. |
| 319 | +# |
| 320 | + |
| 321 | +# %% |
# Submit a generation job for 20 records using the same configuration.
results = data_designer.create(config_builder, num_records=20)

# %%
# Load the generated dataset as a pandas DataFrame.
dataset = results.load_dataset()

# Show the first rows of the generated dataset.
dataset.head()

# %%
# Load the analysis results into memory.
analysis = results.load_analysis()

# Print the analysis as a report.
analysis.to_report()
| 335 | + |
| 336 | +# %% [markdown] |
| 337 | +# ## ⏭️ Next Steps |
| 338 | +# |
| 339 | +# Now that you've seen the basics of Data Designer, check out the following notebooks to learn more about: |
| 340 | +# |
| 341 | +# - [Structured outputs and Jinja expressions](/notebooks/2-structured-outputs-and-jinja-expressions/) |
| 342 | +# |
| 343 | +# - [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/) |
| 344 | +# |
0 commit comments